-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexample.sh
More file actions
60 lines (52 loc) · 2.25 KB
/
example.sh
File metadata and controls
60 lines (52 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/bin/bash
# Strict mode: abort on a failing command, on use of an unset variable, and
# on a failure in any stage of a pipeline.
set -euo pipefail

# Root directory under which every stage of this script writes its dataset.
OUTPUT_DIR="./datasets"

# Models for which completions are generated and then annotated.
MODEL_POOL=("Qwen/Qwen3-0.6B" "Qwen/Qwen3-1.7B" "Qwen/Qwen3-4B")

# Model passed as --model_name to the annotation step.
JUDGE_MODEL="Qwen/Qwen3-8B"
# ===== DATASET PRE-PROCESSING =====
# Download the dataset and keep only the first 1024 rows for example purposes.
# Note: allenai/ultrafeedback_binarized_cleaned is supported without further pre-processing
#
# The destination is handed to Python as a command-line argument and the
# heredoc delimiter is quoted, so the shell never rewrites the Python source:
# a path containing quotes, backslashes or `$` cannot break or alter the
# embedded program.
python - "${OUTPUT_DIR}/0_pre_processed" <<'PY'
import sys

from datasets import load_dataset

# First (and only) argument: directory the truncated dataset is saved to.
output_path = sys.argv[1]

dataset = load_dataset("allenai/ultrafeedback_binarized_cleaned", split="train_prefs")
dataset = dataset.select(range(1024))
dataset.save_to_disk(output_path)
PY
# ===== RESPONSE GENERATION =====
# Generate the completions for each model individually (can be run in parallel)
for MODEL in "${MODEL_POOL[@]}"; do
  # Strip everything up to the last "/", e.g. Qwen/Qwen3-0.6B -> Qwen3-0.6B
  MODEL_NAME="${MODEL##*/}"
  # All expansions are quoted so that paths or model ids containing spaces or
  # glob characters are passed through as single, literal arguments.
  python -m activeuf.completions.generate_completions \
    --dataset_path "${OUTPUT_DIR}/0_pre_processed" \
    --model_name "${MODEL}" \
    --model_class vllm \
    --output_path "${OUTPUT_DIR}/1_individual_completions/${MODEL_NAME}"
done

# Merge the individual completions into a single dataset
python -m activeuf.completions.merge_completions \
  --datasets_path "${OUTPUT_DIR}/1_individual_completions" \
  --output_path "${OUTPUT_DIR}/2_merged_completions"
# ===== RESPONSE ANNOTATION =====
# Pre-compute the judge scores for all responses (can be run in parallel)
# NOTE(review): every iteration passes the same --output_path; presumably
# get_raw_annotations keeps results apart per --model_to_annotate — confirm.
for MODEL in "${MODEL_POOL[@]}"; do
  # All expansions are quoted so that paths or model ids containing spaces or
  # glob characters are passed through as single, literal arguments.
  python -m activeuf.oracle.get_raw_annotations \
    --model_name "${JUDGE_MODEL}" \
    --model_to_annotate "${MODEL}" \
    --dataset_path "${OUTPUT_DIR}/2_merged_completions" \
    --model_class vllm \
    --output_path "${OUTPUT_DIR}/3_annotated_completions/"
done

# Merge the annotated completions into a single dataset
python -m activeuf.oracle.combine_annotated_completions \
  --annotations_folder "${OUTPUT_DIR}/3_annotated_completions" \
  --completions_folder "${OUTPUT_DIR}/1_individual_completions" \
  --output_folder "${OUTPUT_DIR}/4_merged_annotations"
# ===== MAIN LOOP =====
# Run the activeuf loop module with the example configuration; the config
# path is relative to the directory this script is launched from.
python -m activeuf.loop.run \
  --config_path configs/example_loop.yaml