# Launch train.py through torch.distributed.launch on a randomly chosen,
# currently unused local TCP port.
#
# Usage: <this script> NGPUS [train.py args...]
#   NGPUS  value passed to --nproc_per_node
#   rest   forwarded unchanged to train.py after "--launcher pytorch"
#
# Requires bash (uses RANDOM, arrays and "${@:2}").

# Trace every command so the chosen port and final launch line are visible.
set -x

# Fail early with a usage message instead of launching with an empty
# --nproc_per_node value.
NGPUS=${1:?usage: $0 NGPUS [train.py args...]}

# Keep the forwarded arguments in an array so that arguments containing
# whitespace or glob characters reach train.py intact.
PY_ARGS=("${@:2}")

# Pick random ports until one is found that nothing on 127.0.0.1 accepts
# connections on.
while true; do
  # RANDOM is 15 bits wide; combining two draws gives a 30-bit value, which
  # is then mapped into the range 10000..59151.
  PORT=$(( ((RANDOM << 15) | RANDOM) % 49152 + 10000 ))

  # "nc -z" only checks whether a connection can be made; a non-zero exit
  # status is taken to mean the port is free.
  if ! nc -z 127.0.0.1 "${PORT}" < /dev/null &> /dev/null; then
    break
  fi
done

echo "${PORT}"

python -m torch.distributed.launch \
  --nproc_per_node="${NGPUS}" \
  --rdzv_endpoint="localhost:${PORT}" \
  train.py --launcher pytorch "${PY_ARGS[@]}"