run.sh

#!/bin/bash
#
# Benchmark runner: validates the job file argument, creates a timestamped
# run directory under ./runs/, and seeds the state and results YAML files
# that the functions further down update.
#
# Usage: run.sh <job-file>
set -euo pipefail

# Load environment variables from .env if it exists.
# The explicit "./" matters: `source` with a slash-less name searches $PATH
# before the current directory, so a stray ".env" on $PATH could win.
if [ -f .env ]; then
    # shellcheck disable=SC1091
    source ./.env
fi

# Diagnostics go to stderr, like every other error message in this script.
if [ $# -ne 1 ]; then
    echo "Usage: $0 <job-file>" >&2
    exit 1
fi

JOB_FILE="$1"
if [ ! -f "$JOB_FILE" ]; then
    echo "Error: Job file $JOB_FILE not found" >&2
    exit 1
fi

# Generate unique run ID using timestamp (one-second resolution)
RUN_ID=$(date +%Y%m%d-%H%M%S)
RUN_DIR="./runs/${RUN_ID}"
STATE_FILE="${RUN_DIR}/state.yaml"
RESULTS_FILE="${RUN_DIR}/results.yaml"

# Create run directory structure
mkdir -p "${RUN_DIR}"

# Copy job file for reference
cp "${JOB_FILE}" "${RUN_DIR}/job.yaml"

# Load job configuration
JOB_NAME=$(yq eval '.name' "${JOB_FILE}")
PROVIDER=$(yq eval '.provider' "${JOB_FILE}")

# Initialize state file; the instance list starts empty and is filled in
# as instances are created.
cat > "${STATE_FILE}" << EOF
run_id: ${RUN_ID}
job_name: ${JOB_NAME}
start_time: $(date -u +"%Y-%m-%dT%H:%M:%SZ")
status: running
provider: ${PROVIDER}
instances: []
EOF

# Initialize results file with an empty result list
cat > "${RESULTS_FILE}" << EOF
run_id: ${RUN_ID}
job_name: ${JOB_NAME}
results: []
EOF

# Record the status of one instance in the run's state file.
#
# Globals:   STATE_FILE (read; the YAML file it names is edited in place)
# Arguments: $1 - instance id
#            $2 - status string to store
#
# If no entry of .instances has this id, a new entry is appended with
# created_at and updated_at both set to the current UTC time. Otherwise the
# existing entry's status and updated_at fields are overwritten.
update_state() {
    local instance_id=$1
    local status=$2
    # Declared separately from the assignment so a failing `date` is not
    # hidden behind the (always successful) exit status of `local`.
    local timestamp
    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") || return 1
    # yq path expression selecting the entry for this instance
    local selector=".instances[] | select(.id == \"${instance_id}\")"

    # `yq -e` exits non-zero when the selector matches nothing
    if ! yq -e "${selector}" "${STATE_FILE}" > /dev/null; then
        # New instance
        yq -i ".instances += [{\"id\": \"${instance_id}\", \"status\": \"${status}\", \"created_at\": \"${timestamp}\", \"updated_at\": \"${timestamp}\"}]" "${STATE_FILE}"
    else
        # Update existing instance
        yq -i "(${selector}).status = \"${status}\" | (${selector}).updated_at = \"${timestamp}\"" "${STATE_FILE}"
    fi
}

# Append one benchmark result to the run's results file.
#
# Globals:   RESULTS_FILE (read; the YAML file it names is edited in place)
# Arguments: $1 - instance id
#            $2 - read IOPS  (number)
#            $3 - write IOPS (number)
# Returns:   0 on success; 1 (message on stderr) if either IOPS value is not
#            a plain number; otherwise the exit status of yq.
save_results() {
    local instance_id=$1
    local read_iops=$2
    local write_iops=$3
    # Declared separately from the assignment so a failing `date` is not
    # hidden behind the exit status of `local`.
    local timestamp
    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") || return 1

    # The IOPS values are spliced into the yq expression unquoted so they are
    # stored as numbers. Reject anything that is not a number up front;
    # otherwise the expression would be malformed (or mean something else).
    local number_re='^-?[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?$'
    if [[ ! "${read_iops}" =~ ${number_re} ]] || [[ ! "${write_iops}" =~ ${number_re} ]]; then
        echo "Error: Non-numeric IOPS values for ${instance_id}: read='${read_iops}' write='${write_iops}'" >&2
        return 1
    fi

    yq -i ".results += [{\"instance_id\": \"${instance_id}\", \"timestamp\": \"${timestamp}\", \"metrics\": {\"read_iops\": ${read_iops}, \"write_iops\": ${write_iops}}}]" "${RESULTS_FILE}"
}

# Verify that the doctl CLI can be used.
# Exits the script with status 1 (message on stderr) when the doctl command
# cannot be found or when `doctl account get` fails.
check_doctl() {
    command -v doctl > /dev/null 2>&1 || {
        echo "Error: doctl not found. Please install it first." >&2
        exit 1
    }

    doctl account get > /dev/null 2>&1 || {
        echo "Error: doctl not authenticated. Run 'doctl auth init' first." >&2
        exit 1
    }
}

# Check that every instance type in the job has enough disk for the benchmark.
#
# Arguments: $1 - benchmark size in whole gigabytes, with an optional
#                 trailing G/g (e.g. "10G" or "10")
#            $2 - path of the YAML file whose .instances[].type are checked
# Outputs:   progress lines on stdout, errors on stderr
# Exits the script with status 1 when the size cannot be parsed, the size
# list cannot be fetched, a type is unknown, or a type's disk is too small.
check_disk_sizes() {
    local benchmark_size=$1
    local instances_yaml=$2
    local num_instances required_gb size_table idx type disk_gb

    num_instances=$(yq eval '.instances | length' "${instances_yaml}")

    # Only whole-gigabyte sizes are understood. Anything else (e.g. "512M",
    # "1.5G") is rejected with a clear message instead of failing later
    # inside an arithmetic expansion.
    if [[ ! "${benchmark_size}" =~ ^[0-9]+[Gg]?$ ]]; then
        echo "Error: Unsupported benchmark size '${benchmark_size}' (expected whole gigabytes, e.g. 10G)" >&2
        exit 1
    fi

    # Strip the trailing G/g; 10# forces base 10 so "08G" is not read as
    # octal. Add a 5GB buffer for OS and overhead.
    required_gb=$(( 10#${benchmark_size%[Gg]} + 5 ))

    echo "Checking if instances have enough disk space (need ${required_gb}GB)..."

    # Fetch the size table once instead of once per instance. Capturing it
    # first also keeps a doctl failure distinguishable from "type not found".
    if ! size_table=$(doctl compute size ls --format Disk,Slug --no-header); then
        echo "Error: Could not list instance sizes with doctl" >&2
        exit 1
    fi

    for ((idx = 0; idx < num_instances; idx++)); do
        type=$(yq eval ".instances[$idx].type" "${instances_yaml}")

        # Exact match on the slug column. A regex/suffix match could return
        # several rows (e.g. a slug that merely ends with ${type}).
        disk_gb=$(awk -v slug="${type}" '$2 == slug { print $1; exit }' <<< "${size_table}")

        if [ -z "$disk_gb" ]; then
            echo "Error: Could not find disk size for instance type ${type}" >&2
            exit 1
        fi

        if [ "$disk_gb" -lt "$required_gb" ]; then
            echo "Error: Instance type ${type} has ${disk_gb}GB disk, but benchmark requires ${required_gb}GB" >&2
            exit 1
        fi

        echo "Instance type ${type} has ${disk_gb}GB disk space (sufficient)"
    done
}

# Look up the fingerprint of the DigitalOcean SSH key named DO_SSH_KEY_NAME.
#
# Globals:   DO_SSH_KEY_NAME (read), KEY_FINGERPRINT (written)
# Exits the script with status 1 (message on stderr) when DO_SSH_KEY_NAME is
# unset or empty, the key list cannot be fetched, or no key has that name.
get_ssh_key() {
    if [ -z "${DO_SSH_KEY_NAME:-}" ]; then
        echo "Error: DO_SSH_KEY_NAME environment variable not set" >&2
        echo "Please set DO_SSH_KEY_NAME to your DigitalOcean SSH key name" >&2
        exit 1
    fi

    # Capture the list first so a doctl failure is reported as such, and so
    # the lookup below does not depend on the caller having pipefail set.
    local key_list
    if ! key_list=$(doctl compute ssh-key list --format FingerPrint,Name --no-header); then
        echo "Error: Could not list SSH keys with doctl" >&2
        exit 1
    fi

    # Column 1 is the fingerprint; the rest of the line is the key name
    # (which may contain spaces). Compare the name exactly: a substring or
    # regex match would let "laptop" also select "laptop-old" and return
    # more than one fingerprint.
    KEY_FINGERPRINT=$(awk -v name="${DO_SSH_KEY_NAME}" '
        {
            fp = $1
            rest = $0
            sub(/^[ \t]*[^ \t]+[ \t]+/, "", rest)
            sub(/[ \t]+$/, "", rest)
            # Appending "" forces a string (not numeric) comparison
            if ((rest "") == (name "")) { print fp; exit }
        }' <<< "${key_list}")

    if [ -z "${KEY_FINGERPRINT}" ]; then
        echo "Error: SSH key '${DO_SSH_KEY_NAME}' not found in DigitalOcean" >&2
        exit 1
    fi
}

# Poll a droplet until doctl reports its status as "active".
#
# Arguments: $1 - droplet id
#            $2 - maximum number of polls (optional, default 30)
#            $3 - seconds to sleep between polls (optional, default 10)
# Outputs:   one progress line per unsuccessful poll on stdout
# Returns:   0 once the droplet is active, 1 (message on stderr) if it is
#            still not active after the last poll.
wait_for_droplet() {
    local droplet_id=$1
    local max_attempts=${2:-30}
    local interval=${3:-10}
    # Both are local so the function does not clobber globals of the same name
    local attempt status

    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
        status=$(doctl compute droplet get "${droplet_id}" --format Status --no-header)
        if [ "$status" = "active" ]; then
            return 0
        fi
        echo "Waiting for droplet to be ready... (attempt $attempt/$max_attempts)"
        # No point in sleeping after the final poll; we are about to give up
        if ((attempt < max_attempts)); then
            sleep "${interval}"
        fi
    done

    echo "Error: Droplet failed to become active in time" >&2
    return 1
}

# Poll a host until a non-interactive SSH login as root succeeds.
#
# Arguments: $1 - IP address (or host name) to connect to
#            $2 - maximum number of connection attempts (optional, default 30)
#            $3 - seconds to sleep between attempts (optional, default 10)
# Outputs:   progress lines on stdout; ssh's own stderr is merged into stdout
# Returns:   0 once `echo ready` runs over SSH, 1 (message on stderr) if
#            every attempt failed.
wait_for_ssh() {
    local ip=$1
    local max_attempts=${2:-30}
    local interval=${3:-10}
    local attempt

    echo "Waiting for SSH on ${ip}..."
    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
        # BatchMode makes ssh fail instead of prompting; ConnectTimeout bounds
        # each attempt.
        if ssh -o StrictHostKeyChecking=no \
                -o ConnectTimeout=5 \
                -o BatchMode=yes \
                -o LogLevel=ERROR \
                "root@${ip}" "echo ready" 2>&1; then
            echo "SSH ready on ${ip}"
            return 0
        fi
        echo "Attempt ${attempt}/${max_attempts}: SSH not ready yet..."
        # No point in sleeping after the final attempt; we are about to give up
        if ((attempt < max_attempts)); then
            sleep "${interval}"
        fi
    done

    echo "Error: SSH failed to become ready in time" >&2
    return 1
}

# Install the benchmarking tool (fio) on a freshly created instance.
#
# Arguments: $1 - IP address of the instance
# Outputs:   a progress line, followed by whatever the remote apt-get prints
# Returns:   the exit status of the ssh command
#
# The function deliberately does not touch shell options: errexit is already
# enabled at the top of the script, and a `set -e` here would switch it on
# for the caller as a side effect.
setup_instance() {
    local ip=$1

    echo "Installing required packages..."
    # Quoted so the address always reaches ssh as exactly one argument
    ssh -o StrictHostKeyChecking=no "root@${ip}" "apt-get update && apt-get install -y fio"
}

# Create one benchmark instance with the configured provider.
#
# Globals:   PROVIDER (read), KEY_FINGERPRINT (read)
# Arguments: $1 - instance type (provider size slug)
#            $2 - region
# Outputs:   on success exactly two lines on stdout, "ID=<id>" and "IP=<ip>",
#            meant to be captured and parsed by the caller
# Exits with status 1 (message on stderr) when the provider is unsupported,
# the droplet cannot be created, or no id / public IPv4 address can be read
# from the provider's reply.
create_instance() {
    local instance_type=$1
    local region=$2
    # Droplet names are built from the type; keep only letters, digits and '-'
    local safe_type
    safe_type=$(printf '%s' "${instance_type}" | tr -cd 'a-zA-Z0-9-')

    case "${PROVIDER}" in
        "digitalocean")
            local droplet_json err_file droplet_id droplet_ip

            # stdout must carry only the JSON reply, so doctl's stderr is
            # parked in a temp file and shown only if the call fails.
            err_file=$(mktemp) || exit 1
            # The explicit check is required: this function is normally run
            # inside $(...), where errexit is not in effect, so a failed
            # create would otherwise be reported as "ID=null".
            if ! droplet_json=$(DOCTL_OUTPUT=json doctl compute droplet create \
                    "bench-${safe_type}" \
                    --size "${instance_type}" \
                    --image ubuntu-22-04-x64 \
                    --region "${region}" \
                    --ssh-keys "${KEY_FINGERPRINT}" \
                    --wait \
                    --output=json 2> "${err_file}"); then
                echo "Error: Failed to create droplet of type ${instance_type} in region ${region}" >&2
                cat "${err_file}" >&2
                rm -f -- "${err_file}"
                exit 1
            fi
            rm -f -- "${err_file}"

            droplet_id=$(yq e '.[0].id' - <<< "${droplet_json}")
            droplet_ip=$(yq e '.[0].networks.v4[] | select(.type == "public") | .ip_address' - <<< "${droplet_json}")

            # yq prints the literal string "null" for a missing field
            if [ -z "${droplet_id}" ] || [ "${droplet_id}" = "null" ]; then
                echo "Error: Could not read droplet id from doctl output" >&2
                exit 1
            fi
            if [ -z "${droplet_ip}" ] || [ "${droplet_ip}" = "null" ]; then
                echo "Error: Droplet ${droplet_id} has no public IPv4 address" >&2
                exit 1
            fi

            # Return just the ID and IP in a parseable format
            echo "ID=${droplet_id}"
            echo "IP=${droplet_ip}"
            ;;
        *)
            echo "Unsupported provider: ${PROVIDER}" >&2
            exit 1
            ;;
    esac
}

# Delete the cloud resource behind an instance id.
#
# Arguments: $1 - instance id; ids of the form "do-<droplet id>" are deleted
#                 with doctl, any other id is ignored
# Returns:   the exit status of doctl for "do-" ids, 0 otherwise
cleanup_instance() {
    local instance_id=$1

    case "${instance_id}" in
        do-*)
            # Everything after the "do-" prefix is the droplet id
            local numeric_id="${instance_id#do-}"
            echo "Deleting droplet ${numeric_id}..."
            doctl compute droplet delete -f "${numeric_id}"
            ;;
    esac
}

# Run the fio benchmark on one instance, record the result, and delete the
# instance afterwards.
#
# Globals:   RUN_DIR, JOB_FILE, STATE_FILE (read)
# Arguments: $1 - instance id; its IP address is looked up in STATE_FILE
# Outputs:   progress on stdout, errors on stderr; the raw fio output is
#            written to ${RUN_DIR}/<sanitized id>/fio.json
#
# Every failure path deletes the instance via cleanup_instance before
# exiting with status 1, so an aborted benchmark does not leave a droplet
# running.
run_benchmark() {
    local instance_id=$1
    # Sanitize instance_id for directory name
    local safe_dir instance_dir
    safe_dir=$(printf '%s' "${instance_id}" | tr -cd 'a-zA-Z0-9-')
    instance_dir="${RUN_DIR}/${safe_dir}"

    echo "Creating instance directory ${instance_dir}"
    # Create instance directory first
    mkdir -p "${instance_dir}" || {
        echo "Error: Failed to create directory ${instance_dir}" >&2
        cleanup_instance "${instance_id}"
        exit 1
    }

    # Update state before proceeding
    update_state "${instance_id}" "benchmarking" || {
        echo "Error: Failed to update state for ${instance_id}" >&2
        cleanup_instance "${instance_id}"
        exit 1
    }

    # Benchmark parameters from the job file
    local size bs iodepth rwmixread
    if ! { size=$(yq eval '.benchmark.size' "${JOB_FILE}") \
            && bs=$(yq eval '.benchmark.bs' "${JOB_FILE}") \
            && iodepth=$(yq eval '.benchmark.iodepth' "${JOB_FILE}") \
            && rwmixread=$(yq eval '.benchmark.rwmixread' "${JOB_FILE}"); }; then
        echo "Error: Failed to read benchmark settings from ${JOB_FILE}" >&2
        cleanup_instance "${instance_id}"
        exit 1
    fi

    # Get instance IP from state file. yq prints the literal string "null"
    # when the entry has no ip field, so that is treated as missing too.
    local ip
    ip=$(yq eval ".instances[] | select(.id == \"${instance_id}\") | .ip" "${STATE_FILE}") || ip=""
    if [ -z "${ip}" ] || [ "${ip}" = "null" ]; then
        echo "Error: Could not find IP for instance ${instance_id} in state file" >&2
        # Diagnostic dump only; it must not prevent the cleanup below
        cat "${STATE_FILE}" >&2 || true
        cleanup_instance "${instance_id}"
        exit 1
    fi

    # Run fio remotely and capture output
    echo "Running fio benchmark on ${instance_id}..."
    ssh -o StrictHostKeyChecking=no "root@${ip}" "fio --randrepeat=1 \
        --ioengine=libaio \
        --direct=1 \
        --gtod_reduce=1 \
        --name=test \
        --filename=/root/test \
        --bs=${bs} \
        --iodepth=${iodepth} \
        --size=${size} \
        --readwrite=randrw \
        --rwmixread=${rwmixread} \
        --output-format=json" > "${instance_dir}/fio.json" || {
            echo "Error: fio benchmark failed on ${instance_id}" >&2
            cleanup_instance "${instance_id}"
            exit 1
        }

    # Extract and save results. A yq failure is folded into the same "error"
    # marker the expressions themselves produce.
    # NOTE(review): the expressions compare `type` against "number"; confirm
    # that the installed yq reports numeric scalars that way.
    local read_iops write_iops
    read_iops=$(yq e '.jobs[0].read.iops | select(type == "number") // "error"' "${instance_dir}/fio.json") || read_iops="error"
    write_iops=$(yq e '.jobs[0].write.iops | select(type == "number") // "error"' "${instance_dir}/fio.json") || write_iops="error"

    if [ "$read_iops" = "error" ] || [ "$write_iops" = "error" ]; then
        echo "Error: Failed to get valid IOPS values from fio output" >&2
        cat "${instance_dir}/fio.json" >&2 || true
        cleanup_instance "${instance_id}"
        exit 1
    fi

    save_results "${instance_id}" "${read_iops}" "${write_iops}" || {
        echo "Error: Failed to save results for ${instance_id}" >&2
        cleanup_instance "${instance_id}"
        exit 1
    }
    update_state "${instance_id}" "completed" || {
        echo "Error: Failed to update state for ${instance_id}" >&2
        cleanup_instance "${instance_id}"
        exit 1
    }

    # Cleanup remote test file. Best effort: the instance is deleted right
    # after, and a failure here must not abort the script before that.
    ssh -o StrictHostKeyChecking=no "root@${ip}" "rm -f /root/test" \
        || echo "Warning: Could not remove test file on ${instance_id}" >&2

    # Cleanup the instance
    cleanup_instance "${instance_id}"
}

# Main execution: verify prerequisites, then for every entry of .instances
# in the job file create the requested number of droplets one at a time,
# benchmark each, and finally mark the run as completed.
echo "Starting benchmark run ${RUN_ID} for job ${JOB_NAME}"

check_doctl
get_ssh_key

# Sanity check disk sizes
benchmark_size=$(yq eval '.benchmark.size' "${JOB_FILE}")
check_disk_sizes "${benchmark_size}" "${JOB_FILE}"

# Iterate over the instance entries by index
num_instances=$(yq eval '.instances | length' "${JOB_FILE}")

for ((idx=0; idx<num_instances; idx++)); do
    type=$(yq eval ".instances[$idx].type" "${JOB_FILE}")
    region=$(yq eval ".instances[$idx].region" "${JOB_FILE}")
    count=$(yq eval ".instances[$idx].count" "${JOB_FILE}")

    # yq prints the literal string "null" for a missing key, so an empty
    # check alone is not enough; count must also be a plain integer because
    # it drives the arithmetic loop below.
    if [ -z "$type" ] || [ "$type" = "null" ] \
        || [ -z "$region" ] || [ "$region" = "null" ] \
        || [[ ! "$count" =~ ^[0-9]+$ ]]; then
        echo "Error: Invalid instance configuration:" >&2
        yq eval ".instances[$idx]" "${JOB_FILE}" >&2
        exit 1
    fi

    echo "Creating ${count} instances of type ${type} in region ${region}"
    for ((i=1; i<=count; i++)); do
        echo "Creating instance ${i} of type ${type} in region ${region}"

        # Create instance and parse its "ID=..." / "IP=..." output
        instance_info=$(create_instance "${type}" "${region}")
        droplet_id=$(awk -F= '/^ID=/ { print $2; exit }' <<< "${instance_info}")
        droplet_ip=$(awk -F= '/^IP=/ { print $2; exit }' <<< "${instance_info}")

        if [[ ! "${droplet_id}" =~ ^[0-9]+$ ]]; then
            echo "Error: Droplet creation did not return a valid ID (got '${droplet_id}')" >&2
            exit 1
        fi
        instance_id="do-${droplet_id}"

        if [ -z "${droplet_ip}" ] || [ "${droplet_ip}" = "null" ]; then
            echo "Error: Instance ${instance_id} has no IP address" >&2
            cleanup_instance "${instance_id}"
            exit 1
        fi

        # Wait for the instance to be ready and set it up. From here on the
        # droplet exists and is billed, so any failure deletes it before the
        # script exits instead of letting errexit abort and leave it running.
        if ! { wait_for_droplet "${droplet_id}" \
                && wait_for_ssh "${droplet_ip}" \
                && setup_instance "${droplet_ip}"; }; then
            echo "Error: Failed to prepare instance ${instance_id}" >&2
            cleanup_instance "${instance_id}"
            exit 1
        fi

        # Store instance info in state file
        if ! yq -i ".instances += [{\"id\": \"${instance_id}\", \"ip\": \"${droplet_ip}\", \"type\": \"${type}\", \"region\": \"${region}\"}]" "${STATE_FILE}"; then
            echo "Error: Failed to record instance ${instance_id} in ${STATE_FILE}" >&2
            cleanup_instance "${instance_id}"
            exit 1
        fi

        echo "Instance ${instance_id} created, running benchmark..."
        run_benchmark "${instance_id}"
    done
done

# Update final state. Done with yq, like every other edit of the state file:
# it targets the top-level key only and avoids `sed -i`, whose syntax differs
# between GNU and BSD sed.
yq -i '.status = "completed"' "${STATE_FILE}"
echo "Benchmark run completed. Results in ${RUN_DIR}"

# Run results.py
python3 results.py "${RUN_DIR}"