File size: 2,084 Bytes
1a2f2af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Hugging Face cache directory used by this run.
export HF_HOME=/mnt/petrelfs/share/yejinhui/Models/huggingface_cache

# Network interface and InfiniBand adapters handed to NCCL.
export NCCL_SOCKET_IFNAME=bond0 NCCL_IB_HCA=mlx5_2,mlx5_3

# Original author's note: these settings are for the communication that
# happens while checking/saving checkpoints.
export NCCL_BLOCKING_WAIT=1 NCCL_ASYNC_ERROR_HANDLING=1
# NOTE(review): the original comment said "timeout set to 1 hour (unit:
# seconds)", but 1000 s is roughly 16.7 minutes; 3600 would be one hour.
# Confirm which value is intended.
export NCCL_TIMEOUT=1000

# Enter the project checkout; every relative path used below is resolved
# against it, so abort instead of continuing from the wrong directory.
cd /mnt/petrelfs/yujunqiu/code/vla-baseline/llavavla-00hf1 || {
  printf 'error: cannot cd to %s\n' \
    /mnt/petrelfs/yujunqiu/code/vla-baseline/llavavla-00hf1 >&2
  exit 1
}

# MODEL_PATH=/mnt/petrelfs/yejinhui/Projects/llavavla/playground/Pretrained_models/Qwen2.5-VL-3B-Instruct # must be a local path, due to simpler will run in other where
# data_root_dir=./playground/Datasets/OXE_LEROBOT_DATASET
# Checkpoint root, the dataset mix to train on, and the run identifier
# derived from it.
run_root_dir=./playground/Checkpoints
task_name=libero_spatial
run_id=0903_${task_name}_augsteps_0_wo_flash_attention_wo_augsteps_two_view_action_chunk_16_pretrained_vlm


# Weights & Biases mode for this run.
export WANDB_MODE=disabled

  # --pretrained_checkpoint ${MODEL_PATH} \
# export CUDA_VISIBLE_DEVICES=4,5,6,7

  # --datasets.vla_data.data_mix libero_goal \
  # --framework.framework_py qwenpi \

# Debug switch: a debug run uses one process and the fixed run id "debug".
DEBUG=False
# DEBUG=True

if [ "$DEBUG" = True ]; then
  num_processes=1
  run_id=debug
else
  num_processes=8
fi

# Derive the output directory only after run_id is final, so that a debug
# run stores its script copy under the same run_id that is passed to training.
output_dir=${run_root_dir}/${run_id}
mkdir -p -- "${output_dir}"
# Copy this script into the output dir to record how the run was launched.
cp -- "$0" "${output_dir}/"


# Start training: `accelerate launch` runs llavavla/training/train_qwenvla.py
# with ${num_processes} processes, using the config files and overrides
# listed below. Variable expansions are quoted so that each value is passed
# as exactly one argument, with no word-splitting or globbing.
accelerate launch \
  --config_file scripts/run_scripts/deepspeed_zero2.yaml \
  --num_processes "${num_processes}" \
  llavavla/training/train_qwenvla.py \
  --config_yaml ./llavavla/config/lerobot_data/qwenvla_cotrain_libero.yaml \
  --datasets.vla_data.per_device_batch_size 16 \
  --datasets.vla_data.data_mix "${task_name}" \
  --framework.action_model.future_action_window_size 7 \
  --trainer.max_train_steps 100_000 \
  --trainer.save_interval 10_000 \
  --run_root_dir "${run_root_dir}" \
  --run_id "${run_id}" \
  --wandb_project Internvla \
  --wandb_entity michaelyu-1101-fudanuniversity \
  --is_debug "${DEBUG}" \
  --framework.qwenvl.base_vlm /mnt/phwfile/efm_t/zhuyangkun_tmp_need_del/exp/exp_08_09/manip_sys2_qwen25_3b_onevision_molmo_a0all_refsp20/checkpoint-20000

#   --framework.qwenvl.base_vlm ${MODEL_PATH} \
#   --data_root_dir ${data_root_dir} \

  # --is_debug True