train.sh, executable file, 70 lines (60 loc) · 1.66 KB, forked from PaddlePaddle/Research
#!/bin/bash
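#
# Distributed training launcher for a Transformer-based NMT model built on
# PaddlePaddle: configures runtime/NCCL flags, then starts ./src/train.py
# on the trainer GPUs via ./src/launch.py.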
source ./env/env.sh
source ./env/utils.sh
source ./env/cloud_job_conf.conf
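
# The sourced files and/or the cluster scheduler are expected to provide
# the variables used below: NUM_CARDS, FUSE, NCCL_COMM_NUM,
# USE_HIERARCHICAL_ALLREDUCE, PADDLE_TRAINERS, PADDLE_TRAINER_ID, POD_IP
# (inferred from their use in this script; which file sets which is an
# assumption).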
# First command-line argument: comma-separated list of trainer node IPs.
iplist=$1
#iplist=`echo $nodelist | xargs | sed 's/ /,/g'`
# Create the log directory if it does not exist.
mkdir -p log
# Verbose logs (level 10) for the gradient-fusion graph passes.
export GLOG_vmodule=fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10
# With fusion enabled, cap each fused parameter block at 64 MB.
if [[ ${FUSE} == "1" ]]; then
    export FLAGS_fuse_parameter_memory_size=64  # MB
fi
# Trace commands and treat unset variables as errors from here on.
set -ux
# Validate the IP list (check_iplist is presumably defined in the sourced
# env scripts).
check_iplist
# Arguments for the distributed launcher: a single GPU for single-card
# runs, otherwise all 8 GPUs on every trainer node.
distributed_args=""
if [[ ${NUM_CARDS} == "1" ]]; then
    distributed_args="--selected_gpus 0"
else
    node_ips=${PADDLE_TRAINERS}
    distributed_args="--node_ips ${node_ips} --node_id ${PADDLE_TRAINER_ID} --current_node_ip ${POD_IP} --nproc_per_node 8 --selected_gpus 0,1,2,3,4,5,6,7"
fi
# Garbage-collect temporary tensors immediately and start with only 10% of
# GPU memory allocated (Paddle grows the pool on demand).
export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_fraction_of_gpu_memory_to_use=0.1
# NCCL InfiniBand settings and asynchronous allreduce.
export NCCL_IB_GID_INDEX=3
export NCCL_IB_RETRY_CNT=10
export FLAGS_sync_nccl_allreduce=0
BATCH_SIZE=1250
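
# Arguments prefixed with "--" below are named options for launch.py /
# train.py; the trailing bare "key value" pairs (learning_rate 2.0, ...)
# are passed positionally and are presumably parsed by train.py as
# hyperparameter overrides, so they intentionally carry no "--" prefix.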
python -u ./src/launch.py ${distributed_args} \
    ./src/train.py \
    --src_vocab_size 37007 \
    --tgt_vocab_size 37007 \
    --train_file_pattern 'data/translate-train-*' \
    --token_delimiter ' ' \
    --batch_size ${BATCH_SIZE} \
    --use_py_reader True \
    --use_delay_load True \
    --nccl_comm_num ${NCCL_COMM_NUM} \
    --use_hierarchical_allreduce ${USE_HIERARCHICAL_ALLREDUCE} \
    --fetch_steps 50 \
    --fuse ${FUSE} \
    --val_file_pattern 'testset/testfile' \
    --infer_batch_size 32 \
    --decode_alpha 0.3 \
    --beam_size 4 \
    --use_fp16 True \
    learning_rate 2.0 \
    warmup_steps 8000 \
    beta2 0.997 \
    d_model 1024 \
    d_inner_hid 4096 \
    n_head 16 \
    prepostprocess_dropout 0.3 \
    attention_dropout 0.1 \
    relu_dropout 0.1 \
    embedding_sharing True \
    pass_num 100 \
    max_length 256 \
    save_freq 5000 \
    model_dir 'output'
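
# Example invocation (hypothetical IP list; NUM_CARDS, FUSE, and the other
# settings above are expected to come from ./env/env.sh):
#   bash train.sh "10.255.0.1,10.255.0.2"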