Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions slurmjobs/config_smollm3.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# AutoTrain config for SmolLM2 Sign Language fine-tuning (Round 2 - Extended)
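# NOTE: autotrain saves the model to a folder named after 'project_name' inside the current working directory.
# The SLURM script (train_smollm.slurm) changes into the parent of its CHECKPOINT_DIR (the checkpoints directory)
# before running autotrain, so the model is saved to: {checkpoints_dir}/{project_name}/
# For the script's post-run check to find the model, 'project_name' must match the last path component of CHECKPOINT_DIR.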

task: llm-sft
base_model: HuggingFaceTB/SmolLM2-135M-Instruct
Expand Down
25 changes: 18 additions & 7 deletions slurmjobs/train_smollm.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -23,22 +23,33 @@ date
# Section purpose: point autotrain at its YAML config, run it from the parent of
# CHECKPOINT_DIR, then look for saved model weights and report what was found.
# CONFIG_PATH is the file passed to `autotrain --config`; CHECKPOINT_DIR is the
# directory that is later listed and searched for model files.
CONFIG_PATH=~/thesis/text-to-tokenized-video/slurmjobs/config_smollm3.yaml
CHECKPOINT_DIR=~/thesis/text-to-tokenized-video/checkpoints/smollm2-signlang-v3

# === FORCE MODEL SAVE PATH ===
# Exports CHECKPOINT_DIR to child processes as AUTOTRAIN_SAVE_PATH.
# NOTE(review): nothing in this section shows that autotrain reads this variable - confirm.
export AUTOTRAIN_SAVE_PATH="$CHECKPOINT_DIR"
# === CREATE OUTPUT DIR AND CHANGE TO PARENT ===
# autotrain creates {project_name} folder in the current working directory
# We cd to the parent of CHECKPOINT_DIR so that the project_name folder is created there
# Only the parent directory is created here; CHECKPOINT_DIR itself is not.
mkdir -p "$(dirname "$CHECKPOINT_DIR")"
# NOTE(review): the result of cd is not checked in this section; if it fails, the
# commands below run in whatever directory was current before.
cd "$(dirname "$CHECKPOINT_DIR")"
echo "Working directory: $(pwd)"

# === RUN TRAINING ===
# autotrain will create a folder named after project_name (smollm2-signlang-v3) in current dir
# NOTE(review): no exit-status check on autotrain is visible in this section.
autotrain --config "$CONFIG_PATH"

# === FORCE SAVE IF NOT DONE ===
echo "=== Checking for saved model ==="
# NOTE(review): this value is overwritten by the second SAVED_MODEL assignment
# below before anything reads it.
SAVED_MODEL=$(find . -type f -name "pytorch_model.bin" | head -n 1)

# List what was created
# ls errors are discarded; if ls fails, the fallback message is printed instead.
echo "Contents of checkpoint directory:"
ls -la "$CHECKPOINT_DIR" 2>/dev/null || echo "Checkpoint directory does not exist yet"

# autotrain may save different file formats (safetensors, pytorch_model.bin, etc.)
# Searches CHECKPOINT_DIR recursively for *.safetensors or pytorch_model.bin and keeps
# only the first path find prints; find errors (e.g. missing directory) are discarded,
# leaving SAVED_MODEL empty.
SAVED_MODEL=$(find "$CHECKPOINT_DIR" -type f \( -name "*.safetensors" -o -name "pytorch_model.bin" \) 2>/dev/null | head -n 1)

# Non-empty SAVED_MODEL means at least one weights file was found under CHECKPOINT_DIR.
if [ -n "$SAVED_MODEL" ]; then
echo "Found saved model at: $SAVED_MODEL"
echo "Copying model files to: $CHECKPOINT_DIR"
# Copies every non-hidden entry of the directory holding SAVED_MODEL into CHECKPOINT_DIR.
# NOTE(review): the expansions on this line are unquoted, so a path containing whitespace
# or glob characters would be split or expanded. Also, because SAVED_MODEL was found under
# CHECKPOINT_DIR, source and destination are the same directory whenever the file sits
# directly in CHECKPOINT_DIR.
cp $(dirname $SAVED_MODEL)/* "$CHECKPOINT_DIR"/
echo "✓ Found saved model at: $SAVED_MODEL"
else
# Nothing matched under CHECKPOINT_DIR: print both warnings, then show up to five
# matching files found anywhere below the current working directory.
echo "⚠️ WARNING: No saved model found! Training may not have saved anything."
echo "⚠️ WARNING: No saved model found in $CHECKPOINT_DIR"
echo "Checking current directory for model files..."
find . -type f \( -name "*.safetensors" -o -name "pytorch_model.bin" \) 2>/dev/null | head -5
fi

# Printed unconditionally: it is reached whether or not a saved model was found above.
echo "=== Training Complete ==="
Expand Down