TiankaiHang · Jinxiaolong1129 · Feb 9, 2024 · Feb 18, 2024
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,144 @@
+{
+  "version": "0.2.0",
+  "configurations": [
+    {
+      "name": "Python Debugger: Current File",
+      "type": "debugpy",
+      "request": "launch",
+      "program": "${file}",
+      "console": "integratedTerminal"
+    },
+    {
+      "type": "bashdb",
+      "request": "launch",
+      "name": "Bash-Debug (select script from list of sh files)",
+      "cwd": "${workspaceFolder}",
+      "program": "${command:SelectScriptName}",
+      "args": []
+    },
+    {
+      // Debug launch of scripts_vit/image_train_vit.py with a tiny batch size.
+      // "debugpy" supersedes the deprecated "python" debug type and matches the first configuration.
+      "name": "PyTorch Debug",
+      "type": "debugpy",
+      "request": "launch",
+      "program": "${workspaceFolder}/scripts_vit/image_train_vit.py",
+      "console": "integratedTerminal",
+      "env": {
+        // "CUDA_VISIBLE_DEVICES": "1,2,3,4",
+        "DEBUG_MODE": "1"
+      },
+      "args": [
+        "--batch_size", "2",
+        "--exp_name",
+        "debug_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16",
+        "--data_dir",
+        "/data4/share/xiaolong/dataset/ILSVRC2012_ldm_256_diffuser/train",
+        // "--data_dir", "/data4/share/imagenet/train",
+        "--image_size", "32",
+        "--class_cond",
+        "True",
+        "--diffusion_steps",
+        "1000",
+        "--noise_schedule",
+        "cosine",
+        "--rescale_learned_sigmas",
+        "False",
+        "--lr",
+        "1e-4",
+        "--log_interval",
+        "10",
+        "--beta1",
+        "0.99",
+        "--beta2",
+        "0.99",
+        "--use_fp16",
+        "True",
+        "--weight_decay",
+        "0.03",
+        "--use_wandb",
+        "True",
+        "--model_name",
+        "vit_base_patch2_32",
+        "--depth",
+        "12",
+        "--predict_xstart",
+        "True",
+        "--warmup_steps",
+        "0",
+        "--lr_anneal_steps",
+        "0",
+        "--mse_loss_weight_type",
+        "min_snr_5",
+        "--clip_norm",
+        "-1",
+        "--in_chans",
+        "3",
+        "--drop_label_prob",
+        "0.15",
+        "--save_interval",
+        "1000",
+        "--schedule_sampler",
+        "min_max_bin_sampler",
+        "--time_bins",
+        "0 100, 300 400, 600 700, 900 1000"
+        // "--resume_checkpoint", "exp/guided_diffusion/debug_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs3x64/model000440.pt"
+      ]
+    },
+    {
+      // Debug launch of scripts_vit/sampler_edm.py; the args repeat the values listed under "env".
+      // "debugpy" supersedes the deprecated "python" debug type and matches the first configuration.
+      "name": "sample",
+      "type": "debugpy",
+      "request": "launch",
+      "justMyCode": false,
+      "program": "${workspaceFolder}/scripts_vit/sampler_edm.py",
+      "console": "integratedTerminal",
+      "env": {
+        "OPENAI_LOGDIR": "exp/guided_diffusion/xl_samples50000_step50_scale1.5",
+        "NUM_SAMPLES": "50000",
+        "IMG_SIZE": "32",
+        "BATCH_SIZE": "3",
+        "MODEL_NAME": "vit_xl_patch2_32",
+        "DEPTH": "28",
+        "GUIDANCE_SCALE": "1.5",
+        "STEP": "50",
+        "PRED_X0": "True",
+        "CKPT": "exp/guided_diffusion/ema_0.9999_xl.pt",
+        "DEBUG_MODE": "1",
+        "CUDA_VISIBLE_DEVICES": "1,3,4,6"
+      },
+      "args": [
+        "--model_path", "exp/guided_diffusion/ema_0.9999_xl.pt",
+        "--class_cond",
+        "True",
+        // "--image_size", "${env:IMG_SIZE}",
+        "--image_size", "32",
+        "--model_name",
+        "vit_xl_patch2_32",
+        "--depth",
+        "28",
+        "--in_chans",
+        "4",
+        "--predict_xstart",
+        "True",
+        "--diffusion_steps",
+        "1000",
+        "--noise_schedule",
+        "cosine",
+        "--rescale_learned_sigmas",
+        "False",
+        "--rescale_timesteps",
+        "False",
+        "--batch_size",
+        "3",
+        "--num_samples",
+        "50000",
+        "--steps",
+        "50",
+        "--guidance_scale",
+        "1.5"
+      ]
+    }
+  ]
+}
diff --git a/README.md b/README.md
@@ -29,6 +29,14 @@ BATCH_SIZE_PER_GPU=32
 bash configs/in256/vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs8x32.sh $GPUS $BATCH_SIZE_PER_GPU
 ```
 
+For a quick single-GPU run with a small per-GPU batch size:
+```bash
+GPUS=1
+BATCH_SIZE_PER_GPU=3
+bash configs/in256/vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs8x32.sh $GPUS $BATCH_SIZE_PER_GPU
+```
+
+
 ## Sampling with Pre-trained Models
 For sampling for ImageNet-256, you could directly run
 ```bash

diff --git a/code_progress.md b/code_progress.md
@@ -0,0 +1,5 @@
+- 调试代码scripts_vit/image_train_vit.py 搞清楚wandb中step的定义和作用
+
+- 调整sample 方式来测试不同sample方式对fid的影响
+
+- 测试读取ema.pt 来计算fid的可行性
diff --git a/configs/in256/fid.sh b/configs/in256/fid.sh
@@ -0,0 +1,48 @@
+# Compute FID with NVlabs/edm's fid.py for sample directories laid out like the
+# ones configs/in256/inference.sh writes. Run from the repository root:
+#   bash configs/in256/fid.sh
+# pip install -r requirements.txt
+# pip install -e .
+
+if [ ! -d edm ]; then
+    git clone https://github.com/NVlabs/edm.git || exit 1
+fi
+
+export NCCL_DEBUG=WARN
+# NOTE(review): five devices are exposed but only $GPUS process(es) are launched.
+export CUDA_VISIBLE_DEVICES=1,3,4,5,6
+
+GPUS=1
+NUM_SAMPLES=50000
+GUIDANCE_SCALES="1.5"
+STEPS="50"
+REF_STATS="https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/256/VIRTUAL_imagenet256_labeled.npz"
+
+# ----------- scale loop ------------- #
+# Word splitting on the two lists is intentional: each may hold several values.
+for GUIDANCE_SCALE in $GUIDANCE_SCALES
+do
+
+for STEP in $STEPS
+do
+
+# Relative to edm/ (where fid.py runs); the defaults resolve to xl_samples50000_step50_scale1.5/.
+OPENAI_LOGDIR="../exp/guided_diffusion/xl_samples${NUM_SAMPLES}_step${STEP}_scale${GUIDANCE_SCALE}/"
+
+# The subshell keeps the caller's working directory intact even when cd or fid.py fails.
+(
+    cd edm || exit 1
+    torchrun --standalone --nproc_per_node="$GPUS" fid.py calc --images="$OPENAI_LOGDIR" --ref="$REF_STATS" --num "$NUM_SAMPLES"
+) || exit 1
+
+done
+done
+# ----------- scale loop ------------- #
+
+echo "----> DONE <----"
+
+# -----------------------------------
+#          expected output
+# -----------------------------------
+# Calculating FID...
+# 2.0559
diff --git a/configs/in256/inference.sh b/configs/in256/inference.sh
@@ -1,17 +1,18 @@
 
-pip install -r requirements.txt
-pip install -e .
+# pip install -r requirements.txt
+# pip install -e .
 
 if [ ! -d edm ]; then
     git clone https://github.com/NVlabs/edm.git
 fi
 
 export NCCL_DEBUG=WARN
+export CUDA_VISIBLE_DEVICES=0,1,3,4,5,6
 
-GPUS=8
+GPUS=6
 IMG_SIZE=32
-BATCH_SIZE=32
-NUM_SAMPLES=50000
+BATCH_SIZE=128
+NUM_SAMPLES=2000
 MODEL_NAME="vit_xl_patch2_32"
 DEPTH=28
 GUIDANCE_SCALES="1.5"
@@ -25,8 +26,10 @@ if [ -e $CKPT ]; then
     echo "$CKPT exists."
 else
     echo "$$CKPT does not exist.";
-    sudo mkdir -p exp/guided_diffusion/;
-    sudo wget https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/releases/download/v0.0.0/ema_0.9999_xl.pt -O $CKPT;
+    # sudo mkdir -p exp/guided_diffusion/;
+    # sudo wget https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/releases/download/v0.0.0/ema_0.9999_xl.pt -O $CKPT;
+    mkdir -p exp/guided_diffusion/;
+    wget https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/releases/download/v0.0.0/ema_0.9999_xl.pt -O $CKPT;
 fi
 
 MODEL_FLAGS="--class_cond True --image_size $IMG_SIZE --model_name ${MODEL_NAME} --depth $DEPTH --in_chans 4 --predict_xstart $PRED_X0 "
@@ -42,8 +45,9 @@ do
 SAMPLE_FLAGS="--batch_size $BATCH_SIZE --num_samples ${NUM_SAMPLES} --steps $STEP --guidance_scale $GUIDANCE_SCALE"
 
 OPENAI_LOGDIR="exp/guided_diffusion/xl_samples${NUM_SAMPLES}_step${STEP}_scale${GUIDANCE_SCALE}"
-sudo mkdir -p $OPENAI_LOGDIR && sudo chmod 777 $OPENAI_LOGDIR
-OPENAI_LOGDIR=$OPENAI_LOGDIR torchrun --nproc_per_node=$GPUS --master_port=23456 scripts_vit/sampler_edm.py --model_path $CKPT $MODEL_FLAGS $DIFFUSION_FLAGS $SAMPLE_FLAGS
+# sudo mkdir -p $OPENAI_LOGDIR && sudo chmod 777 $OPENAI_LOGDIR
+mkdir -p $OPENAI_LOGDIR && chmod 777 $OPENAI_LOGDIR
+OPENAI_LOGDIR=$OPENAI_LOGDIR torchrun --nproc_per_node=$GPUS --master_port=12349 scripts_vit/sampler_edm.py --model_path $CKPT $MODEL_FLAGS $DIFFUSION_FLAGS $SAMPLE_FLAGS
 
 cd edm
 torchrun --standalone --nproc_per_node=$GPUS fid.py calc --images=../$OPENAI_LOGDIR --ref=https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/256/VIRTUAL_imagenet256_labeled.npz --num $NUM_SAMPLES 

diff --git a/configs/in256/test.sh b/configs/in256/test.sh
@@ -0,0 +1,7 @@
+# Scratch check of the TIME_BINS -> name-slug conversion (bash-only substitutions).
+TIME_BINS="0 100, 500 600, 900 1000"
+
+# Replace spaces with underscores and remove commas
+TIME_BINS="${TIME_BINS// /_}"
+TIME_BINS="${TIME_BINS//,/}"
+printf '%s\n' "$TIME_BINS"   # prints 0_100_500_600_900_1000
diff --git a/configs/in256/time_bins_sample_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs.sh b/configs/in256/time_bins_sample_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs.sh
@@ -0,0 +1,44 @@
+# set -ex
+
+# pip install -r requirements.txt
+# pip install -e .
+
+# Settings to edit per run.
+USE_WANDB="True"
+TIME_BINS="0 100, 500 600, 900 1000"
+
+# DATA_DIR="../datasets/ILSVRC2012_ldm_256_diffuser/train/"
+DATA_DIR="/data4/share/imagenet/train"
+# Experiment-name slug: spaces become underscores, commas are dropped.
+EXP_TIME_BINS="${TIME_BINS// /_}"
+EXP_TIME_BINS="${EXP_TIME_BINS//,/}"
+
+export CUDA_VISIBLE_DEVICES=1,3,4,5,6
+
+# Optional positional overrides: $1 = number of GPUs, $2 = batch size per GPU.
+# Without arguments the previous fixed values (4 and 128) are used.
+GPUS=${1:-4}
+BATCH_PER_GPU=${2:-128}
+EXP_NAME="time_bins_${EXP_TIME_BINS}_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs${GPUS}x${BATCH_PER_GPU}"
+
+MODEL_BLOB="/mnt/external"
+if [ ! -d "$MODEL_BLOB" ]; then
+    MODEL_BLOB="."
+fi
+
+OPENAI_LOGDIR="${MODEL_BLOB}/exp/guided_diffusion/$EXP_NAME"
+# if permission denied
+# sudo mkdir -p $OPENAI_LOGDIR && sudo chmod 777 $OPENAI_LOGDIR
+mkdir -p "$OPENAI_LOGDIR" && chmod 777 "$OPENAI_LOGDIR"
+OPENAI_LOGDIR="$OPENAI_LOGDIR" \
+    torchrun --nproc_per_node="${GPUS}" --master_port=12457 scripts_vit/image_train_vit.py \
+    --data_dir "$DATA_DIR" --image_size 32 --class_cond True --diffusion_steps 1000 \
+    --noise_schedule cosine --rescale_learned_sigmas False \
+    --lr 1e-4 --batch_size "${BATCH_PER_GPU}" --log_interval 10 --beta1 0.99 --beta2 0.99 \
+    --exp_name "$EXP_NAME" --use_fp16 True --weight_decay 0.03 \
+    --use_wandb "$USE_WANDB" --model_name vit_base_patch2_32 --depth 12 \
+    --predict_xstart True --warmup_steps 0 --lr_anneal_steps 0 \
+    --mse_loss_weight_type min_snr_5 --clip_norm -1 \
+    --in_chans 3 --drop_label_prob 0.15 --save_interval 1000 \
+    --schedule_sampler min_max_bin_sampler --time_bins "$TIME_BINS"
+    # --resume_checkpoint exp/guided_diffusion/debug_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs3x64/model000440.pt
diff --git a/configs/in256/vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs8x32.sh b/configs/in256/vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs8x32.sh
@@ -1,13 +1,18 @@
 # set -ex
 
-pip install -r requirements.txt
-pip install -e .
+# pip install -r requirements.txt
+# pip install -e .
 
-DATA_DIR="../datasets/ILSVRC2012_ldm_256_diffuser/train/"
+# DATA_DIR="../datasets/ILSVRC2012_ldm_256_diffuser/train/"
+DATA_DIR="/data4/share/imagenet/train"
 
-GPUS=$1
-BATCH_PER_GPU=$2
-EXP_NAME=vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs${GPUS}x${BATCH_PER_GPU}
+export CUDA_VISIBLE_DEVICES=0,1,4,6,7
+
+# Positional arguments (as used in README.md) override the debug defaults:
+#   $1 = number of GPUs (default 2), $2 = batch size per GPU (default 32).
+GPUS=${1:-2}
+BATCH_PER_GPU=${2:-32}
+EXP_NAME=debug_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs${GPUS}x${BATCH_PER_GPU}
 
 MODEL_BLOB="/mnt/external"
 if [ ! -d $MODEL_BLOB ]; then
@@ -16,8 +21,10 @@ fi
 
 OPENAI_LOGDIR="${MODEL_BLOB}/exp/guided_diffusion/$EXP_NAME"
 # if permission denied
-sudo mkdir -p $OPENAI_LOGDIR && sudo chmod 777 $OPENAI_LOGDIR
+# sudo mkdir -p $OPENAI_LOGDIR && sudo chmod 777 $OPENAI_LOGDIR
+mkdir -p $OPENAI_LOGDIR && chmod 777 $OPENAI_LOGDIR
 OPENAI_LOGDIR=$OPENAI_LOGDIR \
+    # torchrun --nproc_per_node=${GPUS} --master_port=23456 scripts_vit/image_train_vit_back.py \
     torchrun --nproc_per_node=${GPUS} --master_port=23456 scripts_vit/image_train_vit.py \
     --data_dir $DATA_DIR --image_size 32 --class_cond True --diffusion_steps 1000 \
     --noise_schedule cosine --rescale_learned_sigmas False \
@@ -26,4 +33,5 @@ OPENAI_LOGDIR=$OPENAI_LOGDIR \
     --use_wandb False --model_name vit_base_patch2_32 --depth 12 \
     --predict_xstart True --warmup_steps 0 --lr_anneal_steps 0 \
     --mse_loss_weight_type min_snr_5 --clip_norm -1 \
-    --in_chans 4 --drop_label_prob 0.15
+    --in_chans 3 --drop_label_prob 0.15 --save_interval 1000 \
+    # --resume_checkpoint exp/guided_diffusion/debug_vit-b_layer12_lr1e-4_099_099_pred_x0__min_snr_5__fp16_bs3x64/model000440.pt \