-
Notifications
You must be signed in to change notification settings - Fork 319
/
train_ipadapter.sh
executable file
·71 lines (67 loc) · 2.68 KB
/
train_ipadapter.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
export NCCL_IB_GID_INDEX=3
export NCCL_IB_SL=3
export NCCL_CHECK_DISABLE=1
export NCCL_P2P_DISABLE=0
export NCCL_IB_DISABLE=0
export NCCL_LL_THRESHOLD=16384
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_SOCKET_IFNAME=bond1
export UCX_NET_DEVICES=bond1
export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6
export NCCL_COLLNET_ENABLE=0
export SHARP_COLL_ENABLE_SAT=0
export NCCL_NET_GDR_LEVEL=2
export NCCL_IB_QPS_PER_CONNECTION=4
export NCCL_IB_TC=160
export NCCL_PXN_DISABLE=1
task_flag="dit_g2_full_1024p" # the task flag is used to identify folders. # checkpoint root for resume
resume=ckpts/t2i/model
results_dir=./log_EXP # save root for results
batch_size=1 # training batch size
image_size=1024 # training image resolution
grad_accu_steps=1 # gradient accumulation
warmup_num_steps=0 # warm-up steps
lr=0.0001 # learning rate
ckpt_every=100 # create a ckpt every a few steps.
ckpt_latest_every=10000 # create a ckpt named `latest.pt` every a few steps.
ckpt_every_n_epoch=2 # create a ckpt every a few epochs.
epochs=8 # total training epochs
PYTHONPATH=. \
sh $(dirname "$0")/run_g_ipadapter.sh \
--task-flag ${task_flag} \
--noise-schedule scaled_linear --beta-start 0.00085 --beta-end 0.018 \
--predict-type v_prediction \
--multireso \
--reso-step 64 \
--uncond-p 0.22 \
--uncond-p-t5 0.22\
--uncond-p-img 0.05\
--index-file \
your data path \
--random-flip \
--lr ${lr} \
--batch-size ${batch_size} \
--image-size ${image_size} \
--global-seed 999 \
--grad-accu-steps ${grad_accu_steps} \
--warmup-num-steps ${warmup_num_steps} \
--use-flash-attn \
--use-fp16 \
--extra-fp16 \
--results-dir ${results_dir} \
--resume \
--resume-module-root ckpts/t2i/model/pytorch_model_module.pt \
--epochs ${epochs} \
--ckpt-every ${ckpt_every} \
--ckpt-latest-every ${ckpt_latest_every} \
--ckpt-every-n-epoch ${ckpt_every_n_epoch} \
--log-every 10 \
--deepspeed \
--use-zero-stage 2 \
--gradient-checkpointing \
--no-strict \
--training-parts ipadapter \
--is-ipa True \
--resume-ipa Ture \
--resume-ipa-root ckpts/t2i/model/ipa.pt \
"$@"