diff --git a/slurmjobs/config_smollm3.yaml b/slurmjobs/config_smollm3.yaml
index c501f84..a2b28c4 100644
--- a/slurmjobs/config_smollm3.yaml
+++ b/slurmjobs/config_smollm3.yaml
@@ -1,4 +1,7 @@
 # AutoTrain config for SmolLM2 Sign Language fine-tuning (Round 2 - Extended)
+# NOTE: autotrain saves the model to a folder named after 'project_name' in the current working directory.
+# The SLURM script (train_smollm.slurm) changes to the checkpoints directory before running autotrain,
+# so the model will be saved to: {checkpoints_dir}/{project_name}/
 
 task: llm-sft
 base_model: HuggingFaceTB/SmolLM2-135M-Instruct
diff --git a/slurmjobs/train_smollm.slurm b/slurmjobs/train_smollm.slurm
index 3078880..0632f7e 100644
--- a/slurmjobs/train_smollm.slurm
+++ b/slurmjobs/train_smollm.slurm
@@ -23,22 +23,33 @@ date
 CONFIG_PATH=~/thesis/text-to-tokenized-video/slurmjobs/config_smollm3.yaml
 CHECKPOINT_DIR=~/thesis/text-to-tokenized-video/checkpoints/smollm2-signlang-v3
 
-# === FORCE MODEL SAVE PATH ===
-export AUTOTRAIN_SAVE_PATH="$CHECKPOINT_DIR"
+# === CREATE OUTPUT DIR AND CHANGE TO PARENT ===
+# autotrain creates {project_name} folder in the current working directory
+# We cd to the parent of CHECKPOINT_DIR so that the project_name folder is created there
+mkdir -p "$(dirname "$CHECKPOINT_DIR")"
+cd "$(dirname "$CHECKPOINT_DIR")"
+echo "Working directory: $(pwd)"
 
 # === RUN TRAINING ===
+# autotrain will create a folder named after project_name (smollm2-signlang-v3) in current dir
 autotrain --config "$CONFIG_PATH"
 
 # === FORCE SAVE IF NOT DONE ===
 echo "=== Checking for saved model ==="
-SAVED_MODEL=$(find . -type f -name "pytorch_model.bin" | head -n 1)
+
+# List what was created
+echo "Contents of checkpoint directory:"
+ls -la "$CHECKPOINT_DIR" 2>/dev/null || echo "Checkpoint directory does not exist yet"
+
+# autotrain may save different file formats (safetensors, pytorch_model.bin, etc.)
+SAVED_MODEL=$(find "$CHECKPOINT_DIR" -type f \( -name "*.safetensors" -o -name "pytorch_model.bin" \) 2>/dev/null | head -n 1)
 
 if [ -n "$SAVED_MODEL" ]; then
-  echo "Found saved model at: $SAVED_MODEL"
-  echo "Copying model files to: $CHECKPOINT_DIR"
-  cp $(dirname $SAVED_MODEL)/* "$CHECKPOINT_DIR"/
+  echo "✓ Found saved model at: $SAVED_MODEL"
 else
-  echo "⚠️ WARNING: No saved model found! Training may not have saved anything."
+  echo "⚠️ WARNING: No saved model found in $CHECKPOINT_DIR"
+  echo "Checking current directory for model files..."
+  find . -type f \( -name "*.safetensors" -o -name "pytorch_model.bin" \) 2>/dev/null | head -5
 fi
 
 echo "=== Training Complete ==="