- Train DPO on Qwen3-0.6B-Base or Llama-3.2-1B to obtain the policy model $\pi_\theta$
# --- Train the DPO policy model (pi_theta) ---
# NUM_GPUS / MASTER_PORT may be preset in the environment; fall back to
# single-node defaults so the deepspeed launch command is always well-formed.
export WANDB_PROJECT=${WANDB_PROJECT:-"TIF_Analysis"}
export ACCELERATE_LOG_LEVEL=info
NUM_GPUS=${NUM_GPUS:-1}
MASTER_PORT=${MASTER_PORT:-29500}
CONFIG="dpo_configs/ultra_prob/Qwen3-0.6B-Base/train.yaml"
# CONFIG="dpo_configs/ultra_prob/Llama-3.2-1B/train.yaml"
echo "[INFO] Using $NUM_GPUS GPUs, config: $CONFIG"
deepspeed --num_gpus "$NUM_GPUS" \
  --master_port "$MASTER_PORT" \
  training/dpo.py --config "$CONFIG"
- Compute Influence Function (IF) based on DPO model checkpoints
# Each epoch-wise IF script selects between Qwen3-0.6B-Base and Llama-3.2-1B
# via commented-out lines inside the script itself; edit it there to switch.
for epoch in 0 1 2 3 4 5; do
  bash "analysis/analyze_IF_epoch${epoch}_ckpt.sh"
done
- Train DPO on Qwen3-0.6B-Base or Llama-3.2-1B to obtain the validation model $\pi_{\theta_\text{val}}$
# --- Train the DPO validation model (pi_theta_val) ---
# NUM_GPUS / MASTER_PORT may be preset in the environment; fall back to
# single-node defaults so the deepspeed launch command is always well-formed.
export WANDB_PROJECT=${WANDB_PROJECT:-"TIF_Analysis"}
export ACCELERATE_LOG_LEVEL=info
NUM_GPUS=${NUM_GPUS:-1}
MASTER_PORT=${MASTER_PORT:-29500}
CONFIG="dpo_configs/ultra_prob/Qwen3-0.6B-Base/val.yaml"
# CONFIG="dpo_configs/ultra_prob/Llama-3.2-1B/val.yaml"
echo "[INFO] Using $NUM_GPUS GPUs, config: $CONFIG"
deepspeed --num_gpus "$NUM_GPUS" \
  --master_port "$MASTER_PORT" \
  training/dpo.py --config "$CONFIG"
- Compute Loss Difference (LossDiff)
# Computes LossDiff for both Qwen3-0.6B-Base and Llama-3.2-1B.
bash analysis/lossdiff.sh
- Compute Implicit Reward Margin (IRM)
# Computes IRM for both Qwen3-0.6B-Base and Llama-3.2-1B.
bash analysis/reward_margin.sh
- Analyze Correlation between IF and LossDiff
model=Qwen3-0.6B-Base # or Llama-3.2-1B
step=1
# BUG FIX: the --out path previously used {model}/{step} without the shell "$",
# so the figure was written to a literal "{model}_IF_lossdiff_step{step}.png".
OUT="analysis/correlation/fig/${model}_IF_lossdiff_step${step}.png"
python analysis/correlation_IF_lossdiff.py \
  --file1 "storage/influence/${model}_ultra_prob/step${step}/influence_step-${step}.jsonl" \
  --file2 "analysis/lossdiff/${model}/step${step}/loss_diff.jsonl" \
  --out "$OUT" \
  --model "${model}"
- Analyze Correlation between IF and IRM
model=Qwen3-0.6B-Base # or Llama-3.2-1B
step=1
# BUG FIX: the --out path previously used {model}/{step} without the shell "$",
# so the figure was written to a literal "{model}_IF_IRM_step{step}.png".
OUT="analysis/correlation/fig/${model}_IF_IRM_step${step}.png"
python analysis/correlation_IF_IRM.py \
  --file1 "storage/influence/${model}_ultra_prob/step${step}/influence_step-${step}.jsonl" \
  --file2 "analysis/reward_margin/${model}/step${step}/reward_margin.jsonl" \
  --out "$OUT" \
  --model "${model}"
- Select Data by IF, LossDiff, IRM or LossDiff-IRM
# Single-criterion selection (the script name suggests it keeps a middle-k band).
bash analysis/data_select_mid_k.sh
# Two-criterion (LossDiff-IRM) selection — presumably; confirm inside the script.
bash analysis/data_select_by2_mid_k.sh
- Compute Overlap Coefficient between two .jsonl files
# Computes the overlap coefficient between two .jsonl files.
bash analysis/overlap_metrics.sh
- Data preprocess to extract validation set
# Build the UltraFeedback splits with a fixed seed: "train" also carves out
# the validation set, while "test" produces the held-out test set.
for split in train test; do
  python preprocess/build_ultrafeedback.py \
    --split "$split" \
    --output_path storage/dataset/ultrafeedback \
    --cache_dir cache_dir \
    --seed 0
done
- Train DPO / SLiC, taking Llama-3.1-8B as an example
# --- Train DPO and SLiC (policy + validation models) on Llama-3.1-8B ---
export WANDB_PROJECT=${WANDB_PROJECT:-"TIF_Analysis"}
export ACCELERATE_LOG_LEVEL=info
# NUM_GPUS / MASTER_PORT may be preset in the environment; fall back to
# single-node defaults so the deepspeed launch command is always well-formed.
NUM_GPUS=${NUM_GPUS:-1}
MASTER_PORT=${MASTER_PORT:-29500}

# Launch one DPO/SLiC training run for the given YAML config.
launch_dpo() {
  local config=$1
  echo "[INFO] Using $NUM_GPUS GPUs, config: $config"
  deepspeed --num_gpus "$NUM_GPUS" \
    --master_port "$MASTER_PORT" \
    training/dpo.py --config "$config"
}

# DPO
launch_dpo "dpo_configs/ultra/Llama-3.1-8B/dpo_Llama-3.1-8B.yaml"
launch_dpo "dpo_configs/ultra/Llama-3.1-8B/dpo_val_Llama-3.1-8B.yaml"
# SLiC
launch_dpo "dpo_configs/ultra/Llama-3.1-8B/SLiC_Llama-3.1-8B.yaml"
# NOTE(review): "SliC" casing below differs from "SLiC" above — confirm this
# matches the actual filename on disk before renaming either.
launch_dpo "dpo_configs/ultra/Llama-3.1-8B/SliC_val_Llama-3.1-8B.yaml"
- Compute LossDiff and IRM
# Paths shared by the DPO and SLiC LossDiff analyses.
train_jsonl=storage/dataset/ultrafeedback/train.jsonl
ref_model=storage/sft_outputs/sft_Llama-3.1-8B_ultra200k_merged
# LossDiff for the DPO checkpoints.
python influence/analyze_loss_diff.py \
  --input_file "$train_jsonl" \
  --output_file storage/lossdiff/Llama-3.1-8B/lossDiff.jsonl \
  --train_model_path storage/dpo_ckpts/Llama-3.1-8B_ultra_fulldata_lr2e-4 \
  --val_model_path storage/dpo_ckpts/Llama-3.1-8B_ultra_val \
  --ref_model_path "$ref_model" \
  --beta 0.1 \
  --max_prompt_length 200 \
  --max_completion_length 400 \
  --batch_size 1 \
  --plot --use_lora
# LossDiff for the SLiC checkpoints.
python influence/analyze_slic_loss_diff.py \
  --input_file "$train_jsonl" \
  --output_file storage/lossdiff/Llama-3.1-8B/slic_lossDiff.jsonl \
  --train_model_path storage/slic_ckpts/slic_Llama-3.1-8B_ultra_fulldata_lr2e-4 \
  --val_model_path storage/slic_ckpts/slic_Llama-3.1-8B_ultra_val \
  --ref_model_path "$ref_model" \
  --beta 0.1 \
  --max_prompt_length 200 \
  --max_completion_length 400 \
  --batch_size 1 \
  --plot --use_lora
- Make data selection based on LossDiff-IRM
# Run the LossDiff-vs-IRM filter for both the DPO ("") and SLiC ("slic_")
# LossDiff files; selected/dropped outputs go to matching prefixed folders.
for prefix in "" "slic_"; do
  python influence/filter_lossdiff_vs_rm_v1.py \
    --input_path "storage/lossdiff/Llama-3.1-8B/${prefix}lossDiff.jsonl" \
    --output_path "storage/${prefix}lossdiff_select/ultrafeedback/Llama-3.1-8B/select" \
    --save_dropped_path "storage/${prefix}lossdiff_select/ultrafeedback/Llama-3.1-8B/drop" \
    --drop_top_delta_percent 10 \
    --drop_bottom_delta_percent 10 \
    --drop_top_reward_percent 10 \
    --drop_bottom_reward_percent 10 --plot
done
- Train DPO / SLiC based on Selected Data by LossDiff-IRM
# --- Retrain DPO / SLiC on the LossDiff-IRM-selected data ---
export WANDB_PROJECT=${WANDB_PROJECT:-"TIF_Analysis"}
export ACCELERATE_LOG_LEVEL=info
# NUM_GPUS / MASTER_PORT may be preset in the environment; fall back to
# single-node defaults so the deepspeed launch command is always well-formed.
NUM_GPUS=${NUM_GPUS:-1}
MASTER_PORT=${MASTER_PORT:-29500}
# DPO
CONFIG="dpo_configs/ultra/Llama-3.1-8B/dpo_Llama-3.1-8B_lossdiff-bottom10-top10_rm-bottom10-top10.yaml"
echo "[INFO] Using $NUM_GPUS GPUs, config: $CONFIG"
deepspeed --num_gpus "$NUM_GPUS" \
  --master_port "$MASTER_PORT" \
  training/dpo.py --config "$CONFIG"
# SLiC
CONFIG="dpo_configs/ultra/Llama-3.1-8B/SLiC_Llama-3.1-8B_lossdiff-bottom10-top10_rm-bottom10-top10.yaml"
echo "[INFO] Using $NUM_GPUS GPUs, config: $CONFIG"
deepspeed --num_gpus "$NUM_GPUS" \
  --master_port "$MASTER_PORT" \
  training/dpo.py --config "$CONFIG"
- vLLM Inference to Obtain Generations: given a checkpoint and a benchmark .jsonl file
# These snippets expect model_path / tokenizer_path / lora_path / input_path /
# output_path to be set by the caller beforehand.
# Full-parameter model
full_args=(
  --model_path "$model_path"
  --tokenizer_path "$tokenizer_path"
  --input_path "$input_path"
  --output_path "$output_path"
  --batch_size 128 --tensor_parallel_size 2
)
python vllm_generate/vllm_infer.py "${full_args[@]}"
# LoRA model
lora_args=(
  --model_path "$model_path"
  --lora_path "$lora_path"
  --tokenizer_path "$tokenizer_path"
  --input_path "$input_path"
  --output_path "$output_path"
  --batch_size 256
  --tensor_parallel_size 2
)
python vllm_generate/vllm_infer_lora.py "${lora_args[@]}"
- Evaluate Single Score or WinRate vs SFT
# --- Evaluate single-answer score or head-to-head WinRate vs the SFT model ---
# Expects policy_file / sft_file / output_file (and EVAL_SCRIPT for the local
# winrate judge) to be set by the caller.
# FIX: all variable expansions are now quoted (ShellCheck SC2086) so paths
# containing spaces do not word-split into multiple arguments.
# Evaluate using GLM-4-Plus
python winrate_eval/single_score.py \
  --policy_file "$policy_file" \
  --output_path "$output_file" \
  --num_threads 10
python winrate_eval/winrate.py \
  --num_threads 10 \
  --policy_file "$policy_file" \
  --ref_file "$sft_file" \
  --output_path "$output_file"
# Evaluate using local Qwen3-32B
python winrate_eval/single_score_local.py \
  --policy_file "$policy_file" \
  --output_path "$output_file" \
  --batch_size 512 \
  --seed 47 \
  --tensor_parallel_size 2 \
  --temperature 0.0
python "$EVAL_SCRIPT" \
  --policy_file "$policy_file" \
  --ref_file "$sft_file" \
  --output_path "$output_file" \
  --batch_size 512 \
  --seed 47 \
  --tensor_parallel_size 2 \
  --temperature 0.0
If you use our code or are interested in our work, please cite our paper!
@inproceedings{zhang2026towards,
title={Towards Understanding Valuable Preference Data for Large Language Model Alignment},
author={Zhang, Zizhuo and Wang, Qizhou and Ye, Shanshan and Zhu, Jianing and Yao, Jiangchao and Han, Bo and Sugiyama, Masashi},
booktitle={The Fourteenth International Conference on Learning Representations},
year={2026}
}
