#!/bin/bash
###############################################################################
# proxmox-stagger-boot.sh
# 
# Sets staggered startup order for all QEMU VMs on this Proxmox VE node.
# - Respects existing settings (does NOT overwrite).
# - Running VMs get onboot=1 and lower order numbers.
# - Stopped VMs get higher order numbers but onboot is NOT touched/set.
# - "up" delay is calculated based on VM size (memory * cores).
#
# Usage: bash proxmox-stagger-boot.sh [--dry-run]
###############################################################################

set -euo pipefail

#######################################
# Parse the script's command-line arguments.
# Unknown arguments are rejected so that a mistyped flag (e.g. "--dryrun")
# cannot silently turn an intended dry run into a live run.
# Globals:   DRY_RUN (written) - 1 when --dry-run was given, otherwise 0
# Arguments: the script's positional parameters
# Outputs:   dry-run banner / help text on stdout, errors on stderr
# Returns:   0 on success; exits 0 after --help, exits 2 on an unknown argument
#######################################
parse_args() {
    local arg
    DRY_RUN=0
    for arg in "$@"; do
        case "$arg" in
            --dry-run)
                DRY_RUN=1
                ;;
            -h|--help)
                echo "Usage: bash proxmox-stagger-boot.sh [--dry-run]"
                exit 0
                ;;
            *)
                echo "ERROR: unknown argument: $arg" >&2
                echo "Usage: bash proxmox-stagger-boot.sh [--dry-run]" >&2
                exit 2
                ;;
        esac
    done
    if [[ $DRY_RUN -eq 1 ]]; then
        echo "=== DRY RUN MODE — no changes will be made ==="
    fi
}

parse_args "$@"

# --- Configuration knobs (tune as needed) ---
# Each knob keeps the default below unless a variable of the same name is
# already set in the environment, e.g.
#   INTRA_TIER_DELAY=20 bash proxmox-stagger-boot.sh --dry-run

# Delay between VMs within the same size tier.
# This is the core stagger — just enough to avoid simultaneous disk reads.
INTRA_TIER_DELAY="${INTRA_TIER_DELAY:-15}"       # seconds between VMs in the same group

# Extra pause inserted between different tiers, giving the previous
# batch time to finish heavy I/O before bigger VMs join in.
INTER_TIER_DELAY="${INTER_TIER_DELAY:-45}"       # seconds of breathing room between tiers

# Tier boundaries (by VM weight = memory_MB * cores).
# VMs are sorted into buckets. Tune these thresholds to your fleet.
# A threshold is exclusive: a weight equal to it belongs to the NEXT tier.
#   small:  weight <  8192   (e.g. 2GB/2c, 1GB/4c)
#   medium: weight < 32768   (e.g. 4GB/2c, 4GB/4c)
#   large:  weight < 131072  (e.g. 8GB/4c, 16GB/4c)
#   huge:   everything else  (e.g. 32GB/4c, 64GB/2c and up)
TIER_SMALL="${TIER_SMALL:-8192}"
TIER_MEDIUM="${TIER_MEDIUM:-32768}"
TIER_LARGE="${TIER_LARGE:-131072}"

DOWN_DELAY="${DOWN_DELAY:-30}"                             # Shutdown delay (uniform)
NODE_BOOT_DELAY="${NODE_BOOT_DELAY:-30}"                   # startall-onboot-delay (seconds)
ORDER_START_RUNNING="${ORDER_START_RUNNING:-1}"            # Starting order number for running VMs
ORDER_START_STOPPED="${ORDER_START_STOPPED:-1000}"         # Starting order number for stopped VMs

# Every knob is used in shell arithmetic later on, so refuse anything that is
# not a plain non-negative decimal (a leading zero would be read as octal).
for _knob in INTRA_TIER_DELAY INTER_TIER_DELAY TIER_SMALL TIER_MEDIUM TIER_LARGE \
             DOWN_DELAY NODE_BOOT_DELAY ORDER_START_RUNNING ORDER_START_STOPPED; do
    if [[ ! "${!_knob}" =~ ^(0|[1-9][0-9]*)$ ]]; then
        echo "ERROR: ${_knob} must be a non-negative integer (got '${!_knob}')" >&2
        exit 1
    fi
done
unset _knob

# The tier classifier tests the thresholds in ascending order.
if (( TIER_SMALL >= TIER_MEDIUM || TIER_MEDIUM >= TIER_LARGE )); then
    echo "ERROR: tier thresholds must satisfy TIER_SMALL < TIER_MEDIUM < TIER_LARGE" >&2
    exit 1
fi

# --- Node-level boot delay: configure it only when the node has none ---
# CURRENT_NODE_DELAY ends up empty when the key is not found in the output of
# `pvenode config get` (stderr of that command is discarded).
CURRENT_NODE_DELAY=$(
    pvenode config get 2>/dev/null \
        | grep -oP 'startall-onboot-delay:\s*\K\d+' \
        || echo ""
)

if [[ -n "$CURRENT_NODE_DELAY" ]]; then
    # A value is already present — leave it alone.
    echo "[NODE] startall-onboot-delay already set to ${CURRENT_NODE_DELAY}s — not changing."
else
    echo "[NODE] Setting startall-onboot-delay: ${NODE_BOOT_DELAY}s"
    # Only write when this is not a dry run.
    if [[ "$DRY_RUN" -eq 0 ]]; then
        pvenode config set --startall-onboot-delay "$NODE_BOOT_DELAY"
    fi
fi

echo ""

# --- Collect per-VM facts into associative arrays keyed by VMID ---
declare -A VM_STATUS VM_MEM VM_CORES VM_STARTUP VM_ONBOOT VM_NAME

# VMIDs are the first column of `qm list`, skipping its header row.
for VMID in $(qm list | awk 'NR>1 {print $1}'); do
    # The last field of the `qm status` output is recorded as the state.
    VM_STATUS[$VMID]=$(qm status "$VMID" | awk '{print $NF}')

    # One `qm config` call per VM; every value below is extracted from it.
    CONFIG=$(qm config "$VMID")

    # Memory (MiB): accepts "memory: 4096" as well as "memory: current=4096".
    # Falls back to 512 when the line is missing or has any other shape.
    MEM_RAW=$(grep -oP '^memory:\s*\K.*' <<<"$CONFIG" || echo "512")
    MEM_MIB=512
    if [[ "$MEM_RAW" =~ current=([0-9]+) ]]; then
        MEM_MIB="${BASH_REMATCH[1]}"
    elif [[ "$MEM_RAW" =~ ^[0-9]+$ ]]; then
        MEM_MIB="$MEM_RAW"
    fi
    VM_MEM[$VMID]="$MEM_MIB"

    # Core count, defaulting to 1 when there is no "cores:" line.
    VM_CORES[$VMID]=$(grep -oP '^cores:\s*\K\d+' <<<"$CONFIG" || echo "1")

    # Current startup / onboot values; empty string when not configured.
    VM_STARTUP[$VMID]=$(grep -oP '^startup:\s*\K.*' <<<"$CONFIG" || echo "")
    VM_ONBOOT[$VMID]=$(grep -oP '^onboot:\s*\K\d+' <<<"$CONFIG" || echo "")

    # Display name, with a synthetic "VM-<id>" when the config has none.
    VM_NAME[$VMID]=$(grep -oP '^name:\s*\K.*' <<<"$CONFIG" || echo "VM-$VMID")
done

# --- Sort VMs by "weight" (memory_MB * cores) within each group ---
# Running VMs first (sorted small → large so small VMs boot first and free resources)
# Stopped VMs second

#######################################
# Print the weight of one VM: its memory multiplied by its core count.
# Globals:   VM_MEM, VM_CORES (read)
# Arguments: $1 - VMID used as key into both arrays
# Outputs:   the product, on stdout
#######################################
calc_weight() {
    local id="$1"
    local ram cpus product
    ram="${VM_MEM[$id]}"
    cpus="${VM_CORES[$id]}"
    product=$(( ram * cpus ))
    echo "$product"
}

#######################################
# Map a weight to its tier label.
# Thresholds are tested in ascending order and each is exclusive, so a weight
# equal to a threshold lands in the next tier up.
# Globals:   TIER_SMALL, TIER_MEDIUM, TIER_LARGE (read)
# Arguments: $1 - weight (integer)
# Outputs:   one of "1_small", "2_medium", "3_large", "4_huge" on stdout
#######################################
get_tier() {
    local score=$1
    # Start from the catch-all bucket and narrow down.
    local label="4_huge"

    if (( score < TIER_SMALL )); then
        label="1_small"
    elif (( score < TIER_MEDIUM )); then
        label="2_medium"
    elif (( score < TIER_LARGE )); then
        label="3_large"
    fi

    echo "$label"
}

#######################################
# Assign an "up" delay to every VM of a list with tiered staggering.
#   - a VM followed by another VM of the same tier gets INTRA_TIER_DELAY
#   - the last VM of a tier gets INTER_TIER_DELAY (lets the tier settle
#     before the next tier starts)
#   - the last VM of the whole list gets 0 (nothing comes after it)
# Tier changes are detected between neighbours only, so the list is expected
# to be sorted by weight already.
# Globals:   INTRA_TIER_DELAY, INTER_TIER_DELAY (read)
# Arguments: $1 - NAME of an indexed array holding the VMIDs
#            $2 - NAME of an associative array to fill (VMID -> seconds)
#            Neither name may be one of this function's own locals
#            (_src_list, _dst_map, vm_arr, count, i, vmid, tier, next_tier).
# Returns:   0
#######################################
assign_up_delays() {
    local -n _src_list=$1
    local -n _dst_map=$2

    # Copy into a plain local array; the "+" form keeps an empty source from
    # tripping `set -u` on older bash versions.
    local -a vm_arr=("${_src_list[@]+"${_src_list[@]}"}")
    local count=${#vm_arr[@]}

    (( count > 0 )) || return 0

    local i vmid tier next_tier

    # Each VM's tier is computed once: the "next" tier of one iteration is
    # carried over as the "current" tier of the following one.
    tier=$(get_tier "$(calc_weight "${vm_arr[0]}")")

    for (( i = 0; i < count; i++ )); do
        vmid=${vm_arr[i]}

        # Peek ahead to see which tier the following VM belongs to.
        if (( i + 1 < count )); then
            next_tier=$(get_tier "$(calc_weight "${vm_arr[i + 1]}")")
        else
            next_tier=""
        fi

        if (( i + 1 == count )); then
            # Last VM in the entire list.
            _dst_map[$vmid]=0
        elif [[ "$next_tier" != "$tier" ]]; then
            # Last VM in this tier.
            _dst_map[$vmid]=$INTER_TIER_DELAY
        else
            # Regular VM within a tier.
            _dst_map[$vmid]=$INTRA_TIER_DELAY
        fi

        tier=$next_tier
    done
}

# Partition the VMIDs by recorded state: exactly "running" goes to
# RUNNING_VMS, every other state goes to STOPPED_VMS.
RUNNING_VMS=()
STOPPED_VMS=()

for VMID in "${!VM_STATUS[@]}"; do
    case "${VM_STATUS[$VMID]}" in
        running) RUNNING_VMS+=("$VMID") ;;
        *)       STOPPED_VMS+=("$VMID") ;;
    esac
done

#######################################
# Sort an array of VMIDs in place by ascending weight (small VMs first).
# VMs of equal weight are ordered by ascending numeric VMID, so the result
# does not depend on the input order or on string collation.
# Arguments: $1 - NAME of the indexed array to sort (must not be "arr",
#                 "sorted" or "_sorted_lines", which are locals here)
# Returns:   0 on success; a failure of calc_weight / sort / awk propagates
#            through the command substitution.
#######################################
sort_by_weight() {
    local -n arr=$1
    if (( ${#arr[@]} == 0 )); then return 0; fi

    local -a sorted
    local _sorted_lines

    # Emit "weight vmid" pairs, sort numerically on both columns, keep the id.
    _sorted_lines=$(
        for vmid in "${arr[@]}"; do
            printf '%s %s\n' "$(calc_weight "$vmid")" "$vmid"
        done | sort -k1,1n -k2,2n | awk '{print $2}'
    )

    # One array element per output line, with no word-splitting or globbing.
    mapfile -t sorted <<<"$_sorted_lines"
    arr=("${sorted[@]}")
}

# Order both groups by ascending weight before anything is displayed.
sort_by_weight RUNNING_VMS
sort_by_weight STOPPED_VMS

#######################################
# Print one banner plus a table of VMs with their current settings.
# Globals:   VM_NAME, VM_MEM, VM_CORES, VM_STARTUP (read)
# Arguments: $1 - banner text; remaining arguments - VMIDs to list
# Outputs:   the table on stdout
#######################################
print_vm_table() {
    local banner="$1"
    shift
    local id

    echo "======================================================================"
    echo "$banner"
    echo "======================================================================"
    printf "%-8s %-30s %8s %6s %10s %s\n" "VMID" "NAME" "MEM(MB)" "CORES" "WEIGHT" "EXISTING STARTUP"
    for id in "$@"; do
        printf "%-8s %-30s %8s %6s %10s %s\n" \
            "$id" "${VM_NAME[$id]}" "${VM_MEM[$id]}" "${VM_CORES[$id]}" \
            "$(calc_weight "$id")" "${VM_STARTUP[$id]:-<none>}"
    done
}

print_vm_table " RUNNING VMs (will get onboot=1, order starting at $ORDER_START_RUNNING)" "${RUNNING_VMS[@]}"

echo ""
print_vm_table " STOPPED VMs (will NOT get onboot, order starting at $ORDER_START_STOPPED)" "${STOPPED_VMS[@]}"

echo ""
echo "======================================================================"
echo " APPLYING CHANGES"
echo "======================================================================"

#######################################
# Split a "startup" config value into its order / up / down numbers.
# Accepts "order=N,up=N,down=N", any subset of those keys, or a value that
# begins with a bare number (legacy form, taken as the order).
# Arguments: $1 - raw startup string (may be empty)
#            $2 - NAME of the variable receiving the order
#            $3 - NAME of the variable receiving the up delay
#            $4 - NAME of the variable receiving the down delay
#            Each output variable is set to "" when its part is absent.
# Returns:   0
#######################################
parse_startup() {
    local spec="$1"
    local -n _order=$2
    local -n _up=$3
    local -n _down=$4

    # Always reset the outputs so stale values from a previous call vanish.
    _order=""
    _up=""
    _down=""

    [[ -n "$spec" ]] || return 0

    # Explicit "order=N" wins; otherwise a leading bare number is the order.
    if [[ "$spec" =~ order=([0-9]+) || "$spec" =~ ^([0-9]+) ]]; then
        _order="${BASH_REMATCH[1]}"
    fi

    if [[ "$spec" =~ up=([0-9]+) ]]; then
        _up="${BASH_REMATCH[1]}"
    fi

    if [[ "$spec" =~ down=([0-9]+) ]]; then
        _down="${BASH_REMATCH[1]}"
    fi

    return 0
}

#######################################
# Assemble a startup string from its parts.
# Empty parts are left out; the rest are joined with commas in the fixed
# sequence order, up, down.
# Arguments: $1 - order, $2 - up delay, $3 - down delay (any may be empty)
# Outputs:   e.g. "order=3,up=15,down=30" on stdout (empty line if no parts)
#######################################
build_startup() {
    local ord="$1"
    local up_secs="$2"
    local down_secs="$3"
    local joined=""

    if [[ -n "$ord" ]]; then
        joined="order=$ord"
    fi
    # "${joined:+$joined,}" adds the separator only when something precedes.
    if [[ -n "$up_secs" ]]; then
        joined="${joined:+$joined,}up=$up_secs"
    fi
    if [[ -n "$down_secs" ]]; then
        joined="${joined:+$joined,}down=$down_secs"
    fi

    echo "$joined"
}

# --- Work out every VM's "up" delay before anything is applied ---
# One VMID -> seconds map per group, filled in by assign_up_delays.
declare -A RUNNING_DELAYS STOPPED_DELAYS
assign_up_delays RUNNING_VMS RUNNING_DELAYS
assign_up_delays STOPPED_VMS STOPPED_DELAYS

# --- Apply settings to RUNNING VMs ---
# Order numbers are handed out in list order starting at ORDER_START_RUNNING.
# The counter advances for every VM, including ones that keep an order they
# already had.
ORDER_COUNTER=$ORDER_START_RUNNING

for VMID in "${RUNNING_VMS[@]}"; do
    echo ""
    echo "--- VM $VMID (${VM_NAME[$VMID]}) [RUNNING] ---"

    # onboot: switch it on unless the config already has onboot=1.
    if [[ "${VM_ONBOOT[$VMID]}" != "1" ]]; then
        echo "  onboot: setting to 1"
        if [[ "$DRY_RUN" -eq 0 ]]; then
            qm set "$VMID" --onboot 1
        fi
    else
        echo "  onboot: already set to 1 — not changing."
    fi

    # startup: split the current value, then fill only the missing parts.
    EXISTING_STARTUP="${VM_STARTUP[$VMID]}"
    parse_startup "$EXISTING_STARTUP" EXISTING_ORDER EXISTING_UP EXISTING_DOWN

    # Existing values always win over the ones computed by this script.
    NEW_STARTUP=$(build_startup \
        "${EXISTING_ORDER:-$ORDER_COUNTER}" \
        "${EXISTING_UP:-${RUNNING_DELAYS[$VMID]}}" \
        "${EXISTING_DOWN:-$DOWN_DELAY}")

    # Write only when the string would change AND at least one part was missing.
    if [[ "$EXISTING_STARTUP" != "$NEW_STARTUP" ]] && \
       [[ -z "$EXISTING_ORDER" || -z "$EXISTING_UP" || -z "$EXISTING_DOWN" ]]; then
        echo "  startup: setting to [$NEW_STARTUP] (was: [${EXISTING_STARTUP:-<empty>}])"
        if [[ "$DRY_RUN" -eq 0 ]]; then
            qm set "$VMID" --startup "$NEW_STARTUP"
        fi
    else
        echo "  startup: already fully configured [$EXISTING_STARTUP] — not changing."
    fi

    ORDER_COUNTER=$((ORDER_COUNTER + 1))
done

# --- Apply settings to STOPPED VMs ---
# Same fill-in-the-gaps logic as for running VMs, with two differences:
# order numbers start at ORDER_START_STOPPED and onboot is only reported,
# never written.
ORDER_COUNTER=$ORDER_START_STOPPED

for VMID in "${STOPPED_VMS[@]}"; do
    echo ""
    echo "--- VM $VMID (${VM_NAME[$VMID]}) [STOPPED] ---"

    # onboot: informational only for stopped VMs.
    if [[ "${VM_ONBOOT[$VMID]}" != "1" ]]; then
        echo "  onboot: NOT setting (VM is stopped)."
    else
        echo "  onboot: already set to 1 (manually/by Ansible?) — not changing."
    fi

    # startup: split the current value, then fill only the missing parts.
    EXISTING_STARTUP="${VM_STARTUP[$VMID]}"
    parse_startup "$EXISTING_STARTUP" EXISTING_ORDER EXISTING_UP EXISTING_DOWN

    # Existing values always win over the ones computed by this script.
    NEW_STARTUP=$(build_startup \
        "${EXISTING_ORDER:-$ORDER_COUNTER}" \
        "${EXISTING_UP:-${STOPPED_DELAYS[$VMID]}}" \
        "${EXISTING_DOWN:-$DOWN_DELAY}")

    # Write only when the string would change AND at least one part was missing.
    if [[ "$EXISTING_STARTUP" != "$NEW_STARTUP" ]] && \
       [[ -z "$EXISTING_ORDER" || -z "$EXISTING_UP" || -z "$EXISTING_DOWN" ]]; then
        echo "  startup: setting to [$NEW_STARTUP] (was: [${EXISTING_STARTUP:-<empty>}])"
        if [[ "$DRY_RUN" -eq 0 ]]; then
            qm set "$VMID" --startup "$NEW_STARTUP"
        fi
    else
        echo "  startup: already fully configured [$EXISTING_STARTUP] — not changing."
    fi

    ORDER_COUNTER=$((ORDER_COUNTER + 1))
done

# ======================================================================
#  SUMMARY REPORT
# ======================================================================
# One row per VM (running VMs first, then stopped ones).
#   - Live run: the config is re-read so the table shows the actual state.
#   - Dry run:  nothing was written, so the table shows the PROJECTED state,
#               derived with the same rules the apply loops use (existing
#               order/up/down parts are kept, missing parts are filled in).
echo ""
echo "======================================================================"
echo " FINAL SUMMARY"
echo "======================================================================"
echo ""
printf "%-8s %-30s %-9s %-8s %-35s\n" "VMID" "NAME" "STATUS" "ONBOOT" "STARTUP"
printf "%-8s %-30s %-9s %-8s %-35s\n" "------" "----------------------------" "-------" "------" "---------------------------------"

ALL_VMS=("${RUNNING_VMS[@]}" "${STOPPED_VMS[@]}")
SUMMARY_RUNNING_COUNT=${#RUNNING_VMS[@]}
SUMMARY_IDX=0   # position in ALL_VMS; entries below SUMMARY_RUNNING_COUNT are running VMs

for VMID in "${ALL_VMS[@]}"; do
    STATUS="${VM_STATUS[$VMID]}"

    if [[ $DRY_RUN -eq 1 ]]; then
        if (( SUMMARY_IDX < SUMMARY_RUNNING_COUNT )); then
            # Running VMs end up with onboot=1 and an order counted from
            # ORDER_START_RUNNING by list position.
            ONBOOT=1
            PLANNED_ORDER=$(( ORDER_START_RUNNING + SUMMARY_IDX ))
            PLANNED_UP="${RUNNING_DELAYS[$VMID]}"
        else
            # Stopped VMs keep whatever onboot they have (shown as 0 when
            # unset, matching the live-run column).
            ONBOOT="${VM_ONBOOT[$VMID]:-0}"
            PLANNED_ORDER=$(( ORDER_START_STOPPED + SUMMARY_IDX - SUMMARY_RUNNING_COUNT ))
            PLANNED_UP="${STOPPED_DELAYS[$VMID]}"
        fi

        parse_startup "${VM_STARTUP[$VMID]}" S_ORDER S_UP S_DOWN
        if [[ -n "$S_ORDER" && -n "$S_UP" && -n "$S_DOWN" ]]; then
            # Fully configured already: the apply loop leaves it untouched.
            STARTUP="${VM_STARTUP[$VMID]}"
        else
            STARTUP=$(build_startup \
                "${S_ORDER:-$PLANNED_ORDER}" \
                "${S_UP:-$PLANNED_UP}" \
                "${S_DOWN:-$DOWN_DELAY}")
        fi
    else
        # Re-read the config to show the actual state after changes.
        CONFIG=$(qm config "$VMID")
        ONBOOT=$(echo "$CONFIG" | grep -oP '^onboot:\s*\K\d+' || echo "0")
        STARTUP=$(echo "$CONFIG" | grep -oP '^startup:\s*\K.*' || echo "<none>")
    fi

    printf "%-8s %-30s %-9s %-8s %-35s\n" \
        "$VMID" "${VM_NAME[$VMID]}" "$STATUS" "$ONBOOT" "$STARTUP"

    SUMMARY_IDX=$((SUMMARY_IDX + 1))
done

# --- Report the node-level delay ---
# Dry run: show the value found at startup, or the one that would be written.
# Live run: query the node again so the line reflects what is stored now.
echo ""
echo "======================================================================"
echo " NODE-LEVEL SETTINGS"
echo "======================================================================"
if [[ $DRY_RUN -ne 0 ]]; then
    echo "  startall-onboot-delay: ${CURRENT_NODE_DELAY:-$NODE_BOOT_DELAY}s (dry-run)"
else
    NODE_DELAY_NOW=$(pvenode config get 2>/dev/null | grep -oP 'startall-onboot-delay:\s*\K\d+' || echo '<not set>')
    echo "  startall-onboot-delay: ${NODE_DELAY_NOW}s"
fi

# --- Boot timeline estimate ---
# Lists every VM that will have onboot=1 (all running VMs, plus stopped VMs
# whose config already says onboot=1) in the sequence given by its effective
# order value, and adds up the "up" delays.
# Effective values follow the apply loops: an existing order/up part is kept,
# otherwise the value this script assigns is used.
# NOTE(review): rows with the same order are sorted by ascending VMID, which
# is presumably how Proxmox breaks ties — confirm against the PVE docs.
echo ""
echo "======================================================================"
echo " BOOT TIMELINE ESTIMATE"
echo "======================================================================"
echo ""
CUMULATIVE=0

# Show node-level delay first
EFFECTIVE_NODE_DELAY="${CURRENT_NODE_DELAY:-$NODE_BOOT_DELAY}"
echo "  T+0s        : Host finishes booting"
echo "  T+${EFFECTIVE_NODE_DELAY}s      : startall-onboot-delay expires, VM startup begins"
CUMULATIVE=$EFFECTIVE_NODE_DELAY

# Build "order vmid up" rows.
TIMELINE_ROWS=()

TIMELINE_SLOT=$ORDER_START_RUNNING
for VMID in "${RUNNING_VMS[@]}"; do
    parse_startup "${VM_STARTUP[$VMID]}" E_ORD E_UP E_DOWN
    TIMELINE_ROWS+=("${E_ORD:-$TIMELINE_SLOT} $VMID ${E_UP:-${RUNNING_DELAYS[$VMID]}}")
    TIMELINE_SLOT=$((TIMELINE_SLOT + 1))
done

TIMELINE_SLOT=$ORDER_START_STOPPED
for VMID in "${STOPPED_VMS[@]}"; do
    # Stopped VMs only take part in the boot cascade when onboot is already 1.
    if [[ "${VM_ONBOOT[$VMID]}" == "1" ]]; then
        parse_startup "${VM_STARTUP[$VMID]}" E_ORD E_UP E_DOWN
        TIMELINE_ROWS+=("${E_ORD:-$TIMELINE_SLOT} $VMID ${E_UP:-${STOPPED_DELAYS[$VMID]}}")
    fi
    # The slot advances for every stopped VM, exactly like the apply loop.
    TIMELINE_SLOT=$((TIMELINE_SLOT + 1))
done

if (( ${#TIMELINE_ROWS[@]} > 0 )); then
    # Numeric sort by order, then by VMID; the order column itself is not printed.
    while read -r _ VMID UP_DELAY; do
        printf "  T+%-8ss : VM %-6s %-25s starts (then wait %ss)\n" \
            "$CUMULATIVE" "$VMID" "(${VM_NAME[$VMID]})" "$UP_DELAY"
        CUMULATIVE=$((CUMULATIVE + UP_DELAY))
    done < <(printf '%s\n' "${TIMELINE_ROWS[@]}" | sort -k1,1n -k2,2n)
fi

echo ""
echo "  Total estimated boot cascade: ~${CUMULATIVE}s (~$((CUMULATIVE / 60))min)"
echo ""

# Closing banner: tell the operator whether anything was actually written.
case "$DRY_RUN" in
    1) echo "*** DRY RUN — no changes were made. Remove --dry-run to apply. ***" ;;
    *) echo "*** All changes applied successfully. ***" ;;
esac

echo ""
echo "Done."
