Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

trojai4 update-test logger #8

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 144 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
},
{
"type": "bashdb",
"request": "launch",
"name": "Bash-Debug (select script from list of sh files)",
"cwd": "${workspaceFolder}",
"program": "${command:SelectScriptName}",
"args": []
},
{
  // Debug launch of the ViT training entry point (scripts_vit/image_train_vit.py)
  // with a tiny batch size and DEBUG_MODE=1 set in the environment.
  "name": "PyTorch Debug",
  // "debugpy" matches the first configuration in this file; the legacy
  // "python" debug type is deprecated by the VS Code Python tooling.
  "type": "debugpy",
  "request": "launch",
  "program": "${workspaceFolder}/scripts_vit/image_train_vit.py",
  "console": "integratedTerminal",
  "env": {
    // Uncomment to pin the debug run to specific GPUs.
    // "CUDA_VISIBLE_DEVICES": "1,2,3,4",
    "DEBUG_MODE": "1"
  },
  // Each flag is listed together with its value; the array order is unchanged.
  "args": [
    "--batch_size", "2",
    "--exp_name", "debug_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16",
    "--data_dir", "/data4/share/xiaolong/dataset/ILSVRC2012_ldm_256_diffuser/train",
    // Alternative dataset location:
    // "--data_dir", "/data4/share/imagenet/train",
    "--image_size", "32",
    "--class_cond", "True",
    "--diffusion_steps", "1000",
    "--noise_schedule", "cosine",
    "--rescale_learned_sigmas", "False",
    "--lr", "1e-4",
    "--log_interval", "10",
    "--beta1", "0.99",
    "--beta2", "0.99",
    "--use_fp16", "True",
    "--weight_decay", "0.03",
    "--use_wandb", "True",
    "--model_name", "vit_base_patch2_32",
    "--depth", "12",
    "--predict_xstart", "True",
    "--warmup_steps", "0",
    "--lr_anneal_steps", "0",
    "--mse_loss_weight_type", "min_snr_5",
    "--clip_norm", "-1",
    "--in_chans", "3",
    "--drop_label_prob", "0.15",
    "--save_interval", "1000",
    "--schedule_sampler", "min_max_bin_sampler",
    // Passed as ONE argument (spaces and commas included), exactly as written.
    "--time_bins", "0 100, 300 400, 600 700, 900 1000"
    // To resume from a checkpoint, append:
    // "--resume_checkpoint", "exp/guided_diffusion/debug_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs3x64/model000440.pt"
  ]
},
{
  // Debug launch of the sampling entry point (scripts_vit/sampler_edm.py),
  // stepping into library code as well (justMyCode is off).
  "name": "sample",
  // "debugpy" matches the first configuration in this file; the legacy
  // "python" debug type is deprecated by the VS Code Python tooling.
  "type": "debugpy",
  "request": "launch",
  "justMyCode": false,
  "program": "${workspaceFolder}/scripts_vit/sampler_edm.py",
  "console": "integratedTerminal",
  "env": {
    "OPENAI_LOGDIR": "exp/guided_diffusion/xl_samples50000_step50_scale1.5",
    // NOTE(review): the entries from NUM_SAMPLES through CKPT repeat values
    // that are also passed explicitly in "args" below; confirm whether the
    // script reads them from the environment before relying on either copy.
    "NUM_SAMPLES": "50000",
    "IMG_SIZE": "32",
    "BATCH_SIZE": "3",
    "MODEL_NAME": "vit_xl_patch2_32",
    "DEPTH": "28",
    "GUIDANCE_SCALE": "1.5",
    "STEP": "50",
    "PRED_X0": "True",
    "CKPT": "exp/guided_diffusion/ema_0.9999_xl.pt",
    "DEBUG_MODE": "1",
    "CUDA_VISIBLE_DEVICES": "1,3,4,6"
  },
  // Each flag is listed together with its value; the array order is unchanged.
  "args": [
    "--model_path", "exp/guided_diffusion/ema_0.9999_xl.pt",
    "--class_cond", "True",
    // Env-substituted variant, currently disabled:
    // "--image_size", "${env:IMG_SIZE}",
    "--image_size", "32",
    "--model_name", "vit_xl_patch2_32",
    "--depth", "28",
    "--in_chans", "4",
    "--predict_xstart", "True",
    "--diffusion_steps", "1000",
    "--noise_schedule", "cosine",
    "--rescale_learned_sigmas", "False",
    "--rescale_timesteps", "False",
    "--batch_size", "3",
    "--num_samples", "50000",
    "--steps", "50",
    "--guidance_scale", "1.5"
  ]
}
]
}
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ BATCH_SIZE_PER_GPU=32
bash configs/in256/vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs8x32.sh $GPUS $BATCH_SIZE_PER_GPU
```


```bash
GPUS=1
BATCH_SIZE_PER_GPU=3
bash configs/in256/vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs8x32.sh $GPUS $BATCH_SIZE_PER_GPU
```


## Sampling with Pre-trained Models
For sampling for ImageNet-256, you could directly run
```bash
Expand Down
5 changes: 5 additions & 0 deletions code_progress.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- 调试代码scripts_vit/image_train_vit.py 搞清楚wandb中step的定义和作用

- 调整sample 方式来测试不同sample方式对fid的影响

- 测试读取ema.pt 来计算fid的可行性
48 changes: 48 additions & 0 deletions configs/in256/fid.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env bash
#
# Compute FID for already-generated sample folders using fid.py from
# NVlabs/edm.
#
# Run from the repository root: the script clones (if missing) and enters
# ./edm, and the sample folders are addressed relative to that directory.
#
# One-time setup, if not done yet:
#   pip install -r requirements.txt
#   pip install -e .

if [ ! -d edm ]; then
  # Nothing below can work without the edm checkout, so stop on failure.
  git clone https://github.com/NVlabs/edm.git || {
    echo "fid.sh: failed to clone NVlabs/edm" >&2
    exit 1
  }
fi

export NCCL_DEBUG=WARN
export CUDA_VISIBLE_DEVICES=1,3,4,5,6

GPUS=1
NUM_SAMPLES=50000
# Space-separated lists; one FID run is made per (scale, step) pair.
GUIDANCE_SCALES="1.5"
STEPS="50"

# Not referenced in this script; retained from the original settings block.
IMG_SIZE=32
BATCH_SIZE=32
MODEL_NAME="vit_xl_patch2_32"
DEPTH=28
PRED_X0=True

REF_STATS="https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/256/VIRTUAL_imagenet256_labeled.npz"

# ----------- scale loop ------------- #
# The two lists are intentionally left unquoted so they split on whitespace.
for GUIDANCE_SCALE in $GUIDANCE_SCALES; do
  for STEP in $STEPS; do

    # Derive the folder from the loop variables so that every (scale, step)
    # pair is scored against its own samples. With the defaults above this is
    # ../exp/guided_diffusion/xl_samples50000_step50_scale1.5/
    # The leading "../" is because fid.py runs from inside ./edm.
    OPENAI_LOGDIR="../exp/guided_diffusion/xl_samples${NUM_SAMPLES}_step${STEP}_scale${GUIDANCE_SCALE}/"

    # Subshell: the directory change cannot leak into later iterations, and a
    # failed cd can no longer be followed by a "cd .." out of the repository.
    (
      cd edm || exit 1
      torchrun --standalone --nproc_per_node="$GPUS" fid.py calc \
        --images="$OPENAI_LOGDIR" \
        --ref="$REF_STATS" \
        --num "$NUM_SAMPLES"
    ) || echo "fid.sh: FID computation failed for ${OPENAI_LOGDIR}" >&2

  done
done
# ----------- scale loop ------------- #

echo "----> DONE <----"


# -----------------------------------
# expected output
# -----------------------------------
# Calculating FID...
# 2.0559
22 changes: 13 additions & 9 deletions configs/in256/inference.sh
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@

pip install -r requirements.txt
pip install -e .
# pip install -r requirements.txt
# pip install -e .

if [ ! -d edm ]; then
git clone https://github.com/NVlabs/edm.git
fi

export NCCL_DEBUG=WARN
export CUDA_VISIBLE_DEVICES=0,1,3,4,5,6

GPUS=8
GPUS=6
IMG_SIZE=32
BATCH_SIZE=32
NUM_SAMPLES=50000
BATCH_SIZE=128
NUM_SAMPLES=2000
MODEL_NAME="vit_xl_patch2_32"
DEPTH=28
GUIDANCE_SCALES="1.5"
Expand All @@ -25,8 +26,10 @@ if [ -e $CKPT ]; then
echo "$CKPT exists."
else
# Print the checkpoint path itself; "$$" would expand to the shell's PID.
echo "$CKPT does not exist.";
sudo mkdir -p exp/guided_diffusion/;
sudo wget https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/releases/download/v0.0.0/ema_0.9999_xl.pt -O $CKPT;
# sudo mkdir -p exp/guided_diffusion/;
# sudo wget https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/releases/download/v0.0.0/ema_0.9999_xl.pt -O $CKPT;
mkdir -p exp/guided_diffusion/;
wget https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/releases/download/v0.0.0/ema_0.9999_xl.pt -O $CKPT;
fi

MODEL_FLAGS="--class_cond True --image_size $IMG_SIZE --model_name ${MODEL_NAME} --depth $DEPTH --in_chans 4 --predict_xstart $PRED_X0 "
Expand All @@ -42,8 +45,9 @@ do
SAMPLE_FLAGS="--batch_size $BATCH_SIZE --num_samples ${NUM_SAMPLES} --steps $STEP --guidance_scale $GUIDANCE_SCALE"

OPENAI_LOGDIR="exp/guided_diffusion/xl_samples${NUM_SAMPLES}_step${STEP}_scale${GUIDANCE_SCALE}"
sudo mkdir -p $OPENAI_LOGDIR && sudo chmod 777 $OPENAI_LOGDIR
OPENAI_LOGDIR=$OPENAI_LOGDIR torchrun --nproc_per_node=$GPUS --master_port=23456 scripts_vit/sampler_edm.py --model_path $CKPT $MODEL_FLAGS $DIFFUSION_FLAGS $SAMPLE_FLAGS
# sudo mkdir -p $OPENAI_LOGDIR && sudo chmod 777 $OPENAI_LOGDIR
mkdir -p $OPENAI_LOGDIR && chmod 777 $OPENAI_LOGDIR
OPENAI_LOGDIR=$OPENAI_LOGDIR torchrun --nproc_per_node=$GPUS --master_port=12349 scripts_vit/sampler_edm.py --model_path $CKPT $MODEL_FLAGS $DIFFUSION_FLAGS $SAMPLE_FLAGS

cd edm
torchrun --standalone --nproc_per_node=$GPUS fid.py calc --images=../$OPENAI_LOGDIR --ref=https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/256/VIRTUAL_imagenet256_labeled.npz --num $NUM_SAMPLES
Expand Down
7 changes: 7 additions & 0 deletions configs/in256/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Scratch check: turn a time-bins spec into a token that contains neither
# spaces nor commas, then print it.
TIME_BINS="0 100, 500 600, 900 1000"

# Replace spaces with underscores and remove commas
TIME_BINS="${TIME_BINS// /_}"
TIME_BINS="${TIME_BINS//,/}"

# Quoted printf: the value is printed verbatim, with no word splitting or
# glob expansion applied to it.
printf '%s\n' "$TIME_BINS"
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env bash
# set -ex
#
# Launch scripts_vit/image_train_vit.py through torchrun with the
# min_max_bin_sampler schedule sampler and the time bins configured below.
#
# One-time setup, if not done yet:
#   pip install -r requirements.txt
#   pip install -e .

# Settings to edit per run
USE_WANDB="True"
TIME_BINS="0 100, 500 600, 900 1000"

# DATA_DIR="../datasets/ILSVRC2012_ldm_256_diffuser/train/"
DATA_DIR="/data4/share/imagenet/train"

# Form of TIME_BINS used in the experiment name: spaces become underscores
# and commas are dropped ("0 100, 500 600" -> "0_100_500_600").
EXP_TIME_BINS="${TIME_BINS// /_}"
EXP_TIME_BINS="${EXP_TIME_BINS//,/}"

export CUDA_VISIBLE_DEVICES=1,3,4,5,6

# GPUS=$1
# BATCH_PER_GPU=$2
GPUS=4
BATCH_PER_GPU=128
EXP_NAME="time_bins_${EXP_TIME_BINS}_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs${GPUS}x${BATCH_PER_GPU}"

# Write under /mnt/external when that directory exists, otherwise under the
# current directory.
MODEL_BLOB="/mnt/external"
if [ ! -d "$MODEL_BLOB" ]; then
  MODEL_BLOB="."
fi

OPENAI_LOGDIR="${MODEL_BLOB}/exp/guided_diffusion/$EXP_NAME"
# if permission denied
# sudo mkdir -p $OPENAI_LOGDIR && sudo chmod 777 $OPENAI_LOGDIR
if ! mkdir -p "$OPENAI_LOGDIR"; then
  # Stop here with a clear message instead of launching a run that has no
  # log directory.
  echo "cannot create log directory: $OPENAI_LOGDIR (see the sudo variant above)" >&2
  exit 1
fi
# The chmod stays best-effort, as before: a failure does not block the run.
chmod 777 "$OPENAI_LOGDIR"

# Arguments for scripts_vit/image_train_vit.py, in the original order.
train_args=(
  --data_dir "$DATA_DIR" --image_size 32 --class_cond True --diffusion_steps 1000
  --noise_schedule cosine --rescale_learned_sigmas False
  --lr 1e-4 --batch_size "${BATCH_PER_GPU}" --log_interval 10 --beta1 0.99 --beta2 0.99
  --exp_name "$EXP_NAME" --use_fp16 True --weight_decay 0.03
  --use_wandb "$USE_WANDB" --model_name vit_base_patch2_32 --depth 12
  --predict_xstart True --warmup_steps 0 --lr_anneal_steps 0
  --mse_loss_weight_type min_snr_5 --clip_norm -1
  --in_chans 3 --drop_label_prob 0.15 --save_interval 1000
  --schedule_sampler min_max_bin_sampler --time_bins "$TIME_BINS"
  # To resume from a checkpoint, add:
  # --resume_checkpoint exp/guided_diffusion/debug_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs3x64/model000440.pt
)

# OPENAI_LOGDIR is handed to the training process through its environment.
OPENAI_LOGDIR="$OPENAI_LOGDIR" \
  torchrun --nproc_per_node="${GPUS}" --master_port=12457 scripts_vit/image_train_vit.py \
  "${train_args[@]}"
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
# set -ex

pip install -r requirements.txt
pip install -e .
# pip install -r requirements.txt
# pip install -e .

DATA_DIR="../datasets/ILSVRC2012_ldm_256_diffuser/train/"
# DATA_DIR="../datasets/ILSVRC2012_ldm_256_diffuser/train/"
DATA_DIR="/data4/share/imagenet/train"

GPUS=$1
BATCH_PER_GPU=$2
EXP_NAME=vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs${GPUS}x${BATCH_PER_GPU}
export CUDA_VISIBLE_DEVICES=0,1,4,6,7

# GPUS=$1
# BATCH_PER_GPU=$2
GPUS=2
BATCH_PER_GPU=32
EXP_NAME=debug_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs${GPUS}x${BATCH_PER_GPU}

MODEL_BLOB="/mnt/external"
if [ ! -d $MODEL_BLOB ]; then
Expand All @@ -16,8 +21,10 @@ fi

OPENAI_LOGDIR="${MODEL_BLOB}/exp/guided_diffusion/$EXP_NAME"
# if permission denied
sudo mkdir -p $OPENAI_LOGDIR && sudo chmod 777 $OPENAI_LOGDIR
# sudo mkdir -p $OPENAI_LOGDIR && sudo chmod 777 $OPENAI_LOGDIR
mkdir -p $OPENAI_LOGDIR && chmod 777 $OPENAI_LOGDIR
# Previous entry point, kept for reference. It must stay ABOVE the command:
# a comment line placed between "VAR=value \" and the command ends the line
# continuation, so torchrun would start without OPENAI_LOGDIR in its
# environment.
# torchrun --nproc_per_node=${GPUS} --master_port=23456 scripts_vit/image_train_vit_back.py
OPENAI_LOGDIR=$OPENAI_LOGDIR \
torchrun --nproc_per_node=${GPUS} --master_port=23456 scripts_vit/image_train_vit.py \
--data_dir $DATA_DIR --image_size 32 --class_cond True --diffusion_steps 1000 \
--noise_schedule cosine --rescale_learned_sigmas False \
Expand All @@ -26,4 +33,5 @@ OPENAI_LOGDIR=$OPENAI_LOGDIR \
--use_wandb False --model_name vit_base_patch2_32 --depth 12 \
--predict_xstart True --warmup_steps 0 --lr_anneal_steps 0 \
--mse_loss_weight_type min_snr_5 --clip_norm -1 \
--in_chans 4 --drop_label_prob 0.15
--in_chans 3 --drop_label_prob 0.15 --save_interval 1000 \
# --resume_checkpoint exp/guided_diffusion/debug_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs3x64/model000440.pt \
Loading