#!/usr/bin/env bash
# Launch QwenVLA co-training on the LIBERO-Object data mix via `accelerate`
# with DeepSpeed ZeRO-2 (8 GPUs by default, 1 process in DEBUG mode).
set -euo pipefail

export HF_HOME=/mnt/petrelfs/share/yejinhui/Models/huggingface_cache

# NCCL settings: pin the socket interface / IB HCAs used for communication
# during checkpoint saving, and fail loudly instead of hanging.
export NCCL_SOCKET_IFNAME=bond0
export NCCL_IB_HCA=mlx5_2,mlx5_3
export NCCL_BLOCKING_WAIT=1
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_TIMEOUT=1000  # collective timeout in seconds (~17 minutes)

# Abort early if the project root is missing (set -e does not cover `cd` in
# all shells when used bare inside conditionals; be explicit).
cd /mnt/petrelfs/yujunqiu/code/vla-baseline/llavavla-00hf1 \
  || { echo "project root not found" >&2; exit 1; }

# MODEL_PATH=/mnt/petrelfs/yejinhui/Projects/llavavla/playground/Pretrained_models/Qwen2.5-VL-3B-Instruct
# (must be a local path, because the simpler eval may run elsewhere)
# data_root_dir=./playground/Datasets/OXE_LEROBOT_DATASET
run_root_dir=./playground/Checkpoints
task_name=libero_object
run_id=0911_${task_name}_augsteps_0_wo_flash_attention_wo_augsteps_two_view_action_chunk_8_pretrained_vlm_20k

export WANDB_MODE=disabled

DEBUG=False
# DEBUG=True
if [ "$DEBUG" = True ]; then
  num_processes=1
  run_id=debug
else
  num_processes=8
fi

# NOTE(fix): output_dir must be derived AFTER the DEBUG branch, otherwise in
# debug mode the directory created (and the script archived into it) would not
# match the run_id actually passed to the trainer.
output_dir=${run_root_dir}/${run_id}
mkdir -p "${output_dir}"
# Archive this launch script next to the checkpoints for reproducibility.
cp "$0" "${output_dir}/"

# export CUDA_VISIBLE_DEVICES=4,5,6,7
# --pretrained_checkpoint ${MODEL_PATH} \
# --datasets.vla_data.data_mix libero_goal \
# --framework.framework_py qwenpi \
accelerate launch \
  --config_file scripts/run_scripts/deepspeed_zero2.yaml \
  --num_processes "${num_processes}" \
  llavavla/training/train_qwenvla.py \
  --config_yaml ./llavavla/config/lerobot_data/qwenvla_cotrain_libero.yaml \
  --datasets.vla_data.per_device_batch_size 16 \
  --datasets.vla_data.data_mix "${task_name}" \
  --framework.action_model.future_action_window_size 7 \
  --trainer.max_train_steps 100_000 \
  --trainer.save_interval 10_000 \
  --run_root_dir "${run_root_dir}" \
  --run_id "${run_id}" \
  --wandb_project Internvla \
  --wandb_entity michaelyu-1101-fudanuniversity \
  --is_debug "${DEBUG}" \
  --framework.qwenvl.base_vlm /mnt/phwfile/efm_t/zhuyangkun_tmp_need_del/exp/exp_08_09/manip_sys2_qwen25_3b_onevision_molmo_a0all_refsp20/checkpoint-20000/
# --framework.qwenvl.base_vlm ${MODEL_PATH} \
# --data_root_dir ${data_root_dir} \
# --is_debug True