From 73bd51406326a7d65607d7cae11e1c8fe210fb53 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 17:24:49 -0400 Subject: [PATCH 01/20] misc: add Phoenix runner management scripts, remove outdated doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a toolkit for managing GitHub Actions self-hosted runners on Phoenix login nodes: check-runners.sh — quick per-node health check list-runners.sh — detailed runner table (name, node, status, slurm, RSS) restart-runner.sh — restart one runner with proper login shell PATH restart-all.sh — restart all runners in place rebalance-runners.sh — auto-compute optimal distribution and move runners create-runner.sh — register and start a new runner rerun-failed.sh — find and rerun failed GHA workflows on open PRs config.sh — shared config (nodes, cgroup limit, discovery helpers) Remove the outdated starting-phoenix-runners.md which referenced the old SOCKS5 proxy setup that is no longer needed. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/phoenix/check-runners.sh | 33 +++++ misc/phoenix/config.sh | 153 ++++++++++++++++++++ misc/phoenix/create-runner.sh | 110 ++++++++++++++ misc/phoenix/list-runners.sh | 68 +++++++++ misc/phoenix/rebalance-runners.sh | 230 ++++++++++++++++++++++++++++++ misc/phoenix/rerun-failed.sh | 77 ++++++++++ misc/phoenix/restart-all.sh | 85 +++++++++++ misc/phoenix/restart-runner.sh | 45 ++++++ misc/starting-phoenix-runners.md | 110 -------------- 9 files changed, 801 insertions(+), 110 deletions(-) create mode 100755 misc/phoenix/check-runners.sh create mode 100755 misc/phoenix/config.sh create mode 100755 misc/phoenix/create-runner.sh create mode 100755 misc/phoenix/list-runners.sh create mode 100755 misc/phoenix/rebalance-runners.sh create mode 100755 misc/phoenix/rerun-failed.sh create mode 100755 misc/phoenix/restart-all.sh create mode 100755 misc/phoenix/restart-runner.sh delete mode 100644 misc/starting-phoenix-runners.md diff --git a/misc/phoenix/check-runners.sh b/misc/phoenix/check-runners.sh new file mode 100755 index 0000000000..2e00f33cd4 --- /dev/null +++ b/misc/phoenix/check-runners.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Quick health check for GitHub Actions runners across Phoenix login nodes. +# +# Lighter than list-runners.sh — doesn't query each runner individually, +# just shows per-node counts and memory. Use list-runners.sh for details. 
+# +# Usage: bash check-runners.sh + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" + +for node in "${NODES[@]}"; do + echo "=== $node ===" + ssh -o ConnectTimeout=5 "$node" ' + for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do + cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || echo "???") + has_slurm=$(cat /proc/$p/environ 2>/dev/null | tr "\0" "\n" | grep -c /opt/slurm || echo 0) + worker=$(ps aux | grep "Runner.Worker" | grep "$cwd" | grep -v grep | awk "{print \$2}" | head -1) + [ -n "$worker" ] && status="BUSY" || status="idle" + rss=$(ps -p $p -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" || echo "?") + name=$(basename "$cwd") + parent=$(basename $(dirname "$cwd")) + slurm_ok="ok" + [ "$has_slurm" -eq 0 ] && slurm_ok="MISSING" + printf " %-40s %5s slurm=%-7s %s MB\n" "$parent/$name" "$status" "$slurm_ok" "$rss" + done + ' 2>/dev/null || echo " (unreachable)" + + rss=$(ssh -o ConnectTimeout=5 "$node" "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" 2>/dev/null || echo "?") + echo " --- Total: ${rss} MB / ${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free) ---" + echo "" +done diff --git a/misc/phoenix/config.sh b/misc/phoenix/config.sh new file mode 100755 index 0000000000..da494d8b7e --- /dev/null +++ b/misc/phoenix/config.sh @@ -0,0 +1,153 @@ +#!/bin/bash +# Shared configuration for Phoenix GitHub Actions runner management scripts. +# +# Sources this file to get: +# NODES — array of physical login node hostnames +# CGROUP_LIMIT — per-user memory limit in MB +# discover_runners() — populates RUNNER_DIRS, RUNNER_NAMES, RUNNER_POOLS arrays +# find_runner_node() — returns which node a runner is currently running on + +# Physical login nodes (gnr-1 = login-1 = login-4, etc.) 
NODES=(login-phoenix-gnr-1 login-phoenix-gnr-2 login-phoenix-gnr-3)

# Per-user cgroup memory limit on Phoenix login nodes (MB)
CGROUP_LIMIT=4096

# Parent directories that contain runner installations.
# Each may have actions-runner-N/ subdirectories with a .runner config file.
RUNNER_PARENT_DIRS=(
    /storage/scratch1/6/sbryngelson3/mfc-runners
    /storage/project/r-sbryngelson3-0/sbryngelson3/mfc-runners-2
    /storage/scratch1/6/sbryngelson3/cfdnn-runners
)

# Parallel arrays filled by discover_runners(): index i of every array
# describes the same runner.
declare -a RUNNER_DIRS=()
declare -a RUNNER_NAMES=()
declare -a RUNNER_POOLS=()
declare -a RUNNER_ORGS=()

# Discover all registered runners on the shared filesystem.
# Globals:
#   RUNNER_PARENT_DIRS (read)
#   RUNNER_DIRS, RUNNER_NAMES, RUNNER_POOLS, RUNNER_ORGS (reset, then filled)
# Arguments: none
# Each <parent>/actions-runner-*/.runner file is read as JSON (a leading
# UTF-8 BOM is tolerated). agentName, poolName and gitHubUrl become the
# runner's name, pool and org. A config that is unreadable or lacks a field
# falls back to the directory basename (name) or "unknown" (pool, org), so
# one bad file never aborts a caller running under `set -e`.
discover_runners() {
    RUNNER_DIRS=()
    RUNNER_NAMES=()
    RUNNER_POOLS=()
    RUNNER_ORGS=()

    local parent conf dir
    local -a fields
    for parent in "${RUNNER_PARENT_DIRS[@]}"; do
        for conf in "$parent"/actions-runner-*/.runner; do
            # An unmatched glob stays literal; skip it.
            [ -f "$conf" ] || continue
            dir=$(dirname "$conf")

            # The path is passed as argv rather than spliced into the Python
            # source, so quotes or backslashes in it cannot break the parse.
            # A parser failure simply leaves `fields` empty.
            fields=()
            mapfile -t fields < <(python3 -c '
import json, sys
with open(sys.argv[1], encoding="utf-8-sig") as fh:
    d = json.load(fh)
for key in ("agentName", "poolName", "gitHubUrl"):
    print(d.get(key, ""))
' "$conf" 2>/dev/null)

            RUNNER_DIRS+=("$dir")
            RUNNER_NAMES+=("${fields[0]:-$(basename "$dir")}")
            RUNNER_POOLS+=("${fields[1]:-unknown}")
            RUNNER_ORGS+=("${fields[2]:-unknown}")
        done
    done
}

# Find which physical node a runner is currently running on.
# Args: $1 = runner directory
# Prints: node hostname, or "offline" if not found on any node.
# Note: uses CWD matching (readlink /proc/PID/cwd) because the runner
# command line is just "Runner.Listener run" without the full path.
find_runner_node() {
    local dir="$1"
    local node pids
    for node in "${NODES[@]}"; do
        # Delegate to the shared CWD-based lookup so "is it running here?"
        # is answered the same way everywhere. Unreachable nodes yield an
        # empty result and are simply skipped.
        pids=$(_find_runner_pids "$node" "$dir")
        if [ -n "$pids" ]; then
            echo "$node"
            return
        fi
    done
    echo "offline"
}

# Find PIDs of a runner on a node by matching CWD.
# Args: $1 = node, $2 = runner directory
# Prints: space-separated PIDs (each followed by a space), or empty if the
#         runner is not found or the node cannot be reached.
# Returns: always 0; ssh and lookup failures are deliberately swallowed.
_find_runner_pids() {
    local node="$1" dir="$2"
    local quoted_dir
    # Shell-quote the directory so spaces or metacharacters survive the trip
    # through the remote shell.
    printf -v quoted_dir '%q' "$dir"
    # The [.] in the pattern keeps pgrep from matching the remote shell that
    # carries this very script text on its command line.
    ssh -o ConnectTimeout=5 "$node" "target=$quoted_dir"'
        for p in $(pgrep -f "Runner[.]Listener"); do
            cwd=$(readlink -f "/proc/$p/cwd" 2>/dev/null || true)
            [ "$cwd" = "$target" ] && printf "%s " "$p"
        done
    ' 2>/dev/null || true
}

# Start a runner on a specific node with proper PATH and detachment.
# Args: $1 = node, $2 = runner directory
# Returns 0 on success, 1 on failure.
# The runner is launched via `setsid bash -lc` (login shell, new session) and
# appends its output to runner-nohup.log in the runner directory. Success
# means a Runner.Listener whose CWD is the runner directory is visible on
# the node a few seconds after launch.
start_runner() {
    local node="$1" dir="$2"
    local inner_cmd remote_cmd
    # Two shells parse this command (the remote login shell, then bash -lc),
    # so it is quoted once per layer.
    printf -v inner_cmd 'cd %q && nohup ./run.sh >> runner-nohup.log 2>&1 &' "$dir"
    printf -v remote_cmd 'setsid bash -lc %q' "$inner_cmd"
    # stdin comes from /dev/null so ssh never waits on a terminal.
    ssh -o ConnectTimeout=5 "$node" "$remote_cmd" </dev/null &
    local ssh_pid=$!
    # Wait up to 10s for SSH to exit
    local attempt
    for ((attempt = 0; attempt < 10; attempt++)); do
        kill -0 "$ssh_pid" 2>/dev/null || break
        sleep 1
    done
    # A lingering ssh client is no longer needed once the runner is detached.
    kill "$ssh_pid" 2>/dev/null || true
    wait "$ssh_pid" 2>/dev/null || true
    # Give the listener a moment to come up before verifying.
    sleep 3
    # Verify
    local new_pid
    new_pid=$(_find_runner_pids "$node" "$dir")
    [ -n "$new_pid" ]
}

# Stop a runner on a specific node.
+# Args: $1 = node, $2 = runner directory +stop_runner() { + local node="$1" dir="$2" + local pids + pids=$(_find_runner_pids "$node" "$dir") + if [ -z "$pids" ]; then + return 0 + fi + for pid in $pids; do + ssh -o ConnectTimeout=5 "$node" "kill $pid" 2>/dev/null || true + done + sleep 3 + # Force kill survivors + pids=$(_find_runner_pids "$node" "$dir") + for pid in $pids; do + ssh -o ConnectTimeout=5 "$node" "kill -9 $pid" 2>/dev/null || true + done + sleep 1 +} + +# Check if a runner has slurm in its PATH. +# Args: $1 = node, $2 = PID (whitespace trimmed) +# Returns 0 if slurm is in PATH, 1 otherwise. +check_slurm_path() { + local node="$1" pid="${2// /}" + local count + count=$(ssh -o ConnectTimeout=5 "$node" \ + "cat /proc/$pid/environ 2>/dev/null | tr '\0' '\n' | grep -c /opt/slurm" \ + 2>/dev/null || echo 0) + [ "$count" -gt 0 ] +} diff --git a/misc/phoenix/create-runner.sh b/misc/phoenix/create-runner.sh new file mode 100755 index 0000000000..d68cad361e --- /dev/null +++ b/misc/phoenix/create-runner.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Create and register a new GitHub Actions runner for Phoenix. +# +# Downloads the runner binary, configures it with a registration token, +# and starts it on the specified login node with proper PATH. +# +# Prerequisites: +# - gh CLI authenticated with admin access to the target org +# - The parent directory must exist and be on shared storage +# +# Usage: bash create-runner.sh [org] [runner-group] +# +# Examples: +# bash create-runner.sh phoenix-11 login-phoenix-gnr-2 /storage/scratch1/6/sbryngelson3/mfc-runners +# bash create-runner.sh phoenix-12 login-phoenix-gnr-3 /storage/project/.../mfc-runners-2 MFlowCode phoenix + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" + +if [ $# -lt 3 ]; then + echo "Usage: $0 [org] [runner-group]" + echo "" + echo " runner-name Name for the runner (e.g. phoenix-11)" + echo " node Login node to run on (e.g. 
login-phoenix-gnr-2)" + echo " parent-dir Parent directory for the runner installation" + echo " org GitHub org (default: MFlowCode)" + echo " runner-group Runner group/pool (default: phoenix)" + exit 1 +fi + +runner_name="$1" +node="$2" +parent_dir="$3" +org="${4:-MFlowCode}" +runner_group="${5:-phoenix}" + +# Determine next available runner directory +existing=$(ls -d "$parent_dir"/actions-runner-* 2>/dev/null | sed 's/.*actions-runner-//' | sort -n | tail -1) +next_num=$(( ${existing:-0} + 1 )) +runner_dir="$parent_dir/actions-runner-$next_num" + +echo "=== Creating runner ===" +echo " Name: $runner_name" +echo " Node: $node" +echo " Directory: $runner_dir" +echo " Org: $org" +echo " Group: $runner_group" +echo "" + +if [ -d "$runner_dir" ]; then + echo "ERROR: Directory already exists: $runner_dir" + exit 1 +fi + +# Get registration token +echo "Getting registration token from GitHub..." +token=$(gh api "orgs/$org/actions/runners/registration-token" --jq .token 2>/dev/null) +if [ -z "$token" ]; then + echo "ERROR: Failed to get registration token. Check 'gh auth status' and org admin permissions." + exit 1 +fi +echo " Token acquired." + +# Get latest runner version +echo "Downloading runner..." +latest_version=$(gh api repos/actions/runner/releases/latest --jq .tag_name 2>/dev/null | sed 's/^v//') +if [ -z "$latest_version" ]; then + echo "ERROR: Failed to determine latest runner version." + exit 1 +fi +runner_url="https://github.com/actions/runner/releases/download/v${latest_version}/actions-runner-linux-x64-${latest_version}.tar.gz" +echo " Version: $latest_version" + +mkdir -p "$runner_dir" +cd "$runner_dir" + +curl -sL "$runner_url" | tar xz +echo " Downloaded and extracted." + +# Configure +echo "Configuring runner..." +./config.sh \ + --url "https://github.com/$org" \ + --token "$token" \ + --name "$runner_name" \ + --runnergroup "$runner_group" \ + --labels "gt" \ + --work "_work" \ + --unattended \ + --replace +echo " Configured." 
+ +# Start on the target node +echo "Starting runner on $node..." +if start_runner "$node" "$runner_dir"; then + pid=$(ssh -o ConnectTimeout=5 "$node" "pgrep -f 'Runner.Listener.*$runner_dir'" 2>/dev/null || true) + if check_slurm_path "$node" "$pid"; then + echo " OK: Running as PID $pid on $node, slurm in PATH" + else + echo " WARNING: Running as PID $pid but slurm NOT in PATH" + fi +else + echo " ERROR: Failed to start. Try manually:" + echo " ssh $node 'cd $runner_dir && setsid bash -lc \"nohup ./run.sh >> runner-nohup.log 2>&1 &\"'" +fi + +echo "" +echo "Runner $runner_name created at $runner_dir" +echo "Verify with: bash $SCRIPT_DIR/list-runners.sh" diff --git a/misc/phoenix/list-runners.sh b/misc/phoenix/list-runners.sh new file mode 100755 index 0000000000..48e5a2574e --- /dev/null +++ b/misc/phoenix/list-runners.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# List all registered GitHub Actions runners, showing which node each is on, +# whether it's busy, its memory usage, and whether slurm is in PATH. 
+# +# Output columns: +# Name — GitHub runner name (from .runner config) +# Node — login node it's running on, or "offline" +# Status — idle, BUSY (has Worker process), or OFFLINE +# Slurm — ok or MISSING (can't submit SLURM jobs) +# RSS — memory usage in MB +# Pool — GitHub runner group/pool +# Directory — filesystem path +# +# Usage: bash list-runners.sh + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" + +discover_runners + +# Header +printf "%-25s %-22s %-8s %-8s %5s %-10s %s\n" \ + "NAME" "NODE" "STATUS" "SLURM" "RSS" "POOL" "DIRECTORY" +printf "%s\n" "$(printf '%.0s-' {1..120})" + +for i in "${!RUNNER_DIRS[@]}"; do + dir="${RUNNER_DIRS[$i]}" + name="${RUNNER_NAMES[$i]}" + pool="${RUNNER_POOLS[$i]}" + + node=$(find_runner_node "$dir") + + if [ "$node" = "offline" ]; then + printf "%-25s %-22s %-8s %-8s %5s %-10s %s\n" \ + "$name" "—" "OFFLINE" "—" "—" "$pool" "$dir" + continue + fi + + # Get all info in one SSH call to reduce latency + info=$(ssh -o ConnectTimeout=5 "$node" ' + pid="" + for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do + cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true) + [ "$cwd" = "'"$dir"'" ] && pid=$p && break + done + [ -z "$pid" ] && echo "? ? ? ?" && exit + worker=$(ps aux | grep Runner.Worker | grep "'"$dir"'" | grep -v grep | head -1 || true) + [ -n "$worker" ] && status="BUSY" || status="idle" + has_slurm=$(cat /proc/$pid/environ 2>/dev/null | tr "\0" "\n" | grep -c /opt/slurm || echo 0) + [ "$has_slurm" -gt 0 ] && slurm="ok" || slurm="MISSING" + rss=$(ps -p $pid -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" || echo "?") + echo "$pid $status $slurm $rss" + ' 2>/dev/null || echo "? ? ? 
?") + read -r pid status slurm rss <<< "$info" + + printf "%-25s %-22s %-8s %-8s %5s %-10s %s\n" \ + "$name" "$node" "$status" "$slurm" "${rss}MB" "$pool" "$dir" +done + +# Per-node summary +echo "" +echo "=== Per-node memory ===" +for node in "${NODES[@]}"; do + count=$(ssh -o ConnectTimeout=5 "$node" "ps aux | grep Runner.Listener | grep -v grep | wc -l" 2>/dev/null || echo 0) + rss=$(ssh -o ConnectTimeout=5 "$node" "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" 2>/dev/null || echo "?") + echo " $node: $count runners, ${rss} MB / ${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free)" +done diff --git a/misc/phoenix/rebalance-runners.sh b/misc/phoenix/rebalance-runners.sh new file mode 100755 index 0000000000..42e207c74b --- /dev/null +++ b/misc/phoenix/rebalance-runners.sh @@ -0,0 +1,230 @@ +#!/bin/bash +# Automatically rebalance GitHub Actions runners across Phoenix login nodes. +# +# Computes the optimal distribution (equal runners per node), determines +# which runners need to move, and executes the moves. Prefers to move +# idle runners over busy ones. +# +# Each Phoenix login node has a 4 GB per-user cgroup memory limit. +# Target: ~3-4 runners per node to leave headroom for CI work. 
+# +# Usage: bash rebalance-runners.sh # dry run +# APPLY=1 bash rebalance-runners.sh # execute moves +# APPLY=1 FORCE=1 bash rebalance-runners.sh # move busy runners too + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" + +discover_runners +num_nodes=${#NODES[@]} +num_runners=${#RUNNER_DIRS[@]} +target_per_node=$(( num_runners / num_nodes )) +remainder=$(( num_runners % num_nodes )) + +echo "=== Current state ===" +echo "Runners: $num_runners across $num_nodes nodes" +echo "Target: $target_per_node per node (+1 on first $remainder nodes)" +echo "" + +# Build current assignment: node -> list of runner indices +declare -A node_runners # node -> space-separated indices +declare -A runner_node # index -> node +declare -A runner_busy # index -> 1 if busy + +for node in "${NODES[@]}"; do + node_runners[$node]="" +done + +for i in "${!RUNNER_DIRS[@]}"; do + dir="${RUNNER_DIRS[$i]}" + node=$(find_runner_node "$dir") + runner_node[$i]="$node" + + if [ "$node" != "offline" ]; then + node_runners[$node]="${node_runners[$node]:-} $i" + worker=$(ssh -o ConnectTimeout=5 "$node" "ps aux | grep Runner.Worker | grep '$dir' | grep -v grep" 2>/dev/null || true) + [ -n "$worker" ] && runner_busy[$i]=1 || runner_busy[$i]=0 + else + runner_busy[$i]=0 + fi +done + +# Show current distribution +for node in "${NODES[@]}"; do + indices=(${node_runners[$node]:-}) + echo "$node: ${#indices[@]} runners" + for i in "${indices[@]}"; do + busy_marker="" + [ "${runner_busy[$i]:-0}" = "1" ] && busy_marker=" (BUSY)" + echo " ${RUNNER_NAMES[$i]}$busy_marker" + done +done + +# Find offline runners +offline=() +for i in "${!RUNNER_DIRS[@]}"; do + [ "${runner_node[$i]}" = "offline" ] && offline+=("$i") +done +if [ ${#offline[@]} -gt 0 ]; then + echo "" + echo "OFFLINE runners:" + for i in "${offline[@]}"; do + echo " ${RUNNER_NAMES[$i]} (${RUNNER_DIRS[$i]})" + done +fi + +echo "" + +# Compute target per node +declare -A node_target +n=0 
+for node in "${NODES[@]}"; do + node_target[$node]=$target_per_node + if [ $n -lt $remainder ]; then + node_target[$node]=$(( target_per_node + 1 )) + fi + n=$((n + 1)) +done + +# Determine moves needed +# Phase 1: identify overloaded nodes and runners to move away +moves=() # "source_node dest_node runner_index" + +# Collect runners to move from overloaded nodes (prefer idle runners) +to_place=() # indices of runners that need a new home +for node in "${NODES[@]}"; do + indices=(${node_runners[$node]:-}) + excess=$(( ${#indices[@]} - ${node_target[$node]} )) + if [ $excess -le 0 ]; then + continue + fi + # Sort: move idle runners first, then busy + idle_here=() + busy_here=() + for i in "${indices[@]}"; do + if [ "${runner_busy[$i]:-0}" = "1" ]; then + busy_here+=("$i") + else + idle_here+=("$i") + fi + done + # Pick from idle first + moved=0 + for i in "${idle_here[@]}"; do + [ $moved -ge $excess ] && break + to_place+=("$node $i") + moved=$((moved + 1)) + done + for i in "${busy_here[@]}"; do + [ $moved -ge $excess ] && break + to_place+=("$node $i") + moved=$((moved + 1)) + done +done + +# Phase 2: assign runners to underloaded nodes +# Also place offline runners +for i in "${offline[@]}"; do + to_place+=("offline $i") +done + +for entry in "${to_place[@]}"; do + read -r src_node runner_idx <<< "$entry" + # Find the most underloaded node + best_node="" + best_deficit=-999 + for node in "${NODES[@]}"; do + current=(${node_runners[$node]:-}) + deficit=$(( ${node_target[$node]} - ${#current[@]} )) + if [ $deficit -gt $best_deficit ]; then + best_deficit=$deficit + best_node=$node + fi + done + + if [ -z "$best_node" ] || [ "$best_deficit" -le 0 ]; then + echo "WARNING: No underloaded node for ${RUNNER_NAMES[$runner_idx]}, skipping" + continue + fi + + moves+=("$src_node $best_node $runner_idx") + # Update bookkeeping + if [ "$src_node" != "offline" ]; then + # Remove from source + new_list="" + for idx in ${node_runners[$src_node]}; do + [ "$idx" != "$runner_idx" 
] && new_list="$new_list $idx" + done + node_runners[$src_node]="$new_list" + fi + node_runners[$best_node]="${node_runners[$best_node]:-} $runner_idx" +done + +if [ ${#moves[@]} -eq 0 ]; then + echo "Already balanced — no moves needed." + exit 0 +fi + +# Show plan +echo "=== Planned moves ===" +has_busy=false +for move in "${moves[@]}"; do + read -r src dst idx <<< "$move" + busy_marker="" + if [ "${runner_busy[$idx]:-0}" = "1" ]; then + busy_marker=" (BUSY!)" + has_busy=true + fi + echo " ${RUNNER_NAMES[$idx]}: $src -> $dst$busy_marker" +done + +echo "" +echo "=== Target distribution ===" +for node in "${NODES[@]}"; do + indices=(${node_runners[$node]:-}) + echo " $node: ${#indices[@]} runners" +done + +if [ "$has_busy" = true ] && [ "${FORCE:-0}" != "1" ]; then + echo "" + echo "Some runners to move have active jobs. Set FORCE=1 to move them." + exit 1 +fi + +if [ "${APPLY:-0}" != "1" ]; then + echo "" + echo "Dry run — set APPLY=1 to execute. Add FORCE=1 to move busy runners." + exit 0 +fi + +# Execute +echo "" +echo "=== Executing moves ===" +for move in "${moves[@]}"; do + read -r src dst idx <<< "$move" + dir="${RUNNER_DIRS[$idx]}" + name="${RUNNER_NAMES[$idx]}" + echo "Moving $name: $src -> $dst" + + if [ "$src" != "offline" ]; then + stop_runner "$src" "$dir" + fi + + if start_runner "$dst" "$dir"; then + pid=$(ssh -o ConnectTimeout=5 "$dst" "pgrep -f 'Runner.Listener.*$dir' | head -1" 2>/dev/null || true) + if [ -n "$pid" ] && check_slurm_path "$dst" "$pid"; then + echo " OK: PID $pid on $dst, slurm in PATH" + elif [ -n "$pid" ]; then + echo " WARNING: PID $pid on $dst but slurm MISSING from PATH" + else + echo " ERROR: Process not found after start" + fi + else + echo " ERROR: Failed to start on $dst" + fi +done + +echo "" +echo "=== Final state ===" +bash "$SCRIPT_DIR/list-runners.sh" diff --git a/misc/phoenix/rerun-failed.sh b/misc/phoenix/rerun-failed.sh new file mode 100755 index 0000000000..1744e480cb --- /dev/null +++ b/misc/phoenix/rerun-failed.sh 
@@ -0,0 +1,77 @@ +#!/bin/bash +# Rerun failed GitHub Actions workflows on open non-draft MFC PRs and master. +# +# Checks the 5 most recent workflow runs per branch. Only the failed jobs +# within each run are rerun (via `gh run rerun --failed`), not the entire +# workflow. Runs that are already in progress or queued are skipped by `gh`. +# +# Requires: gh CLI authenticated with access to MFlowCode/MFC +# +# Usage: bash rerun-failed.sh # dry run (show what would be rerun) +# APPLY=1 bash rerun-failed.sh # actually rerun failed workflows + +set -euo pipefail + +REPO="MFlowCode/MFC" + +echo "Checking open non-draft PRs on $REPO..." +prs=$(gh pr list --repo "$REPO" --state open --json number,title,isDraft --jq '.[] | select(.isDraft == false) | .number') + +if [ -z "$prs" ]; then + echo "No open non-draft PRs found." + exit 0 +fi + +rerun_count=0 +for pr in $prs; do + title=$(gh pr view --repo "$REPO" "$pr" --json title --jq .title) + branch=$(gh pr view --repo "$REPO" "$pr" --json headRefName --jq .headRefName) + + # Find failed workflow runs on this PR's branch + failed_runs=$(gh run list --repo "$REPO" --branch "$branch" --limit 5 \ + --json databaseId,status,conclusion,name \ + --jq '.[] | select(.conclusion == "failure") | "\(.databaseId) \(.name)"') + + if [ -n "$failed_runs" ]; then + echo "" + echo "=== PR #$pr: $title ===" + echo "$failed_runs" | while read -r run_id run_name; do + # Check which jobs failed + failed_jobs=$(gh run view --repo "$REPO" "$run_id" \ + --json jobs --jq '.jobs[] | select(.conclusion == "failure" or .conclusion == "cancelled") | .name') + echo " Run $run_id ($run_name):" + echo "$failed_jobs" | while read -r job; do + echo " - $job" + done + + if [ "${APPLY:-0}" = "1" ]; then + echo " Rerunning failed jobs..." 
+ gh run rerun --repo "$REPO" "$run_id" --failed || echo " WARNING: rerun failed (may already be rerunning)" + rerun_count=$((rerun_count + 1)) + fi + done + fi +done + +# Also check master branch +echo "" +echo "=== master branch ===" +master_failed=$(gh run list --repo "$REPO" --branch master --limit 5 \ + --json databaseId,status,conclusion,name \ + --jq '.[] | select(.conclusion == "failure") | "\(.databaseId) \(.name)"') +if [ -n "$master_failed" ]; then + echo "$master_failed" | while read -r run_id run_name; do + echo " Run $run_id ($run_name)" + if [ "${APPLY:-0}" = "1" ]; then + echo " Rerunning failed jobs..." + gh run rerun --repo "$REPO" "$run_id" --failed || echo " WARNING: rerun failed" + fi + done +else + echo " All passing" +fi + +if [ "${APPLY:-0}" != "1" ]; then + echo "" + echo "Dry run — set APPLY=1 to actually rerun failed workflows." +fi diff --git a/misc/phoenix/restart-all.sh b/misc/phoenix/restart-all.sh new file mode 100755 index 0000000000..0c94b7e744 --- /dev/null +++ b/misc/phoenix/restart-all.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# Restart all GitHub Actions runners on their current nodes. +# +# Useful after a login node reboot or when runners need a fresh start +# (e.g. to pick up a new PATH or clear stale state). Each runner is +# restarted in place — no rebalancing is done. 
+# +# Usage: bash restart-all.sh # dry run (show what would restart) +# APPLY=1 bash restart-all.sh # restart all runners +# APPLY=1 FORCE=1 bash restart-all.sh # restart even busy runners + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" + +discover_runners + +echo "=== Discovering runners ===" +declare -a restart_list=() # "node dir name" + +for i in "${!RUNNER_DIRS[@]}"; do + dir="${RUNNER_DIRS[$i]}" + name="${RUNNER_NAMES[$i]}" + node=$(find_runner_node "$dir") + + if [ "$node" = "offline" ]; then + echo " $name: OFFLINE (skipping — use rebalance-runners.sh to place it)" + continue + fi + + worker=$(ssh -o ConnectTimeout=5 "$node" "ps aux | grep Runner.Worker | grep '$dir' | grep -v grep" 2>/dev/null || true) + if [ -n "$worker" ]; then + echo " $name: BUSY on $node" + if [ "${FORCE:-0}" != "1" ]; then + echo " Skipping busy runner. Set FORCE=1 to restart anyway." + continue + fi + else + echo " $name: idle on $node" + fi + + restart_list+=("$node $dir $name") +done + +if [ ${#restart_list[@]} -eq 0 ]; then + echo "" + echo "Nothing to restart." + exit 0 +fi + +echo "" +echo "${#restart_list[@]} runners will be restarted." + +if [ "${APPLY:-0}" != "1" ]; then + echo "Dry run — set APPLY=1 to execute." 
+ exit 0 +fi + +echo "" +echo "=== Restarting ===" +success=0 +fail=0 +for entry in "${restart_list[@]}"; do + read -r node dir name <<< "$entry" + echo "--- $name on $node ---" + stop_runner "$node" "$dir" + if start_runner "$node" "$dir"; then + pid=$(ssh -o ConnectTimeout=5 "$node" "pgrep -f 'Runner.Listener.*$dir' | head -1" 2>/dev/null || true) + if [ -n "$pid" ] && check_slurm_path "$node" "$pid"; then + echo " OK: PID $pid, slurm in PATH" + success=$((success + 1)) + elif [ -n "$pid" ]; then + echo " WARNING: PID $pid but slurm MISSING from PATH" + fail=$((fail + 1)) + fi + else + echo " ERROR: Failed to start" + fail=$((fail + 1)) + fi +done + +echo "" +echo "=== Summary: $success succeeded, $fail failed ===" +echo "" +bash "$SCRIPT_DIR/list-runners.sh" diff --git a/misc/phoenix/restart-runner.sh b/misc/phoenix/restart-runner.sh new file mode 100755 index 0000000000..da8e44a0da --- /dev/null +++ b/misc/phoenix/restart-runner.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Restart a GitHub Actions runner on a specific Phoenix login node. +# +# Kills any existing instance of the runner, then starts a new one. +# Uses 'bash -l' so the runner inherits the full login PATH (which +# includes /opt/slurm/current/bin — required for sbatch/squeue/sacct). +# Uses 'setsid' + stdin close for full terminal detachment so the SSH +# session exits cleanly without waiting for the runner process. +# +# The runner binary lives on shared storage, so no files need to be +# copied — only the process needs to run on the target node. 
+# +# Usage: bash restart-runner.sh +# Example: bash restart-runner.sh login-phoenix-gnr-2 /storage/scratch1/6/sbryngelson3/mfc-runners/actions-runner-3 + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "Example: $0 login-phoenix-gnr-2 /storage/scratch1/6/sbryngelson3/mfc-runners/actions-runner-3" + exit 1 +fi + +node="$1" +dir="$2" +name=$(basename "$dir") + +echo "Restarting $name on $node..." + +stop_runner "$node" "$dir" + +if start_runner "$node" "$dir"; then + pid=$(ssh -o ConnectTimeout=5 "$node" "pgrep -f 'Runner.Listener.*$dir' | head -1" 2>/dev/null || true) + if [ -n "$pid" ] && check_slurm_path "$node" "$pid"; then + echo " OK: PID $pid, slurm in PATH" + elif [ -n "$pid" ]; then + echo " WARNING: PID $pid started but slurm NOT in PATH!" + echo " The runner may not be able to submit SLURM jobs." + fi +else + echo " ERROR: Runner failed to start on $node" + echo " Try manually: ssh $node 'cd $dir && setsid bash -lc \"nohup ./run.sh >> runner-nohup.log 2>&1 &\"'" +fi diff --git a/misc/starting-phoenix-runners.md b/misc/starting-phoenix-runners.md deleted file mode 100644 index 5e77fbd189..0000000000 --- a/misc/starting-phoenix-runners.md +++ /dev/null @@ -1,110 +0,0 @@ -# Launching Phoenix Runners - -The Phoenix runners were repeatedly failing due to a network error. -Spencer managed to fix it via [this PR](https://github.com/MFlowCode/MFC/pull/933) and by running things through a socks5 proxy on each login node that holds a runner. -These are documented for Spencer or his next of kin. - -__The runners are started via the following process__ - -1. Log in to the login node via `ssh login-phoenix-rh9-.pace.gatech.edu`. `` can be `1` through `6` on Phoenix. - * Detour: Make sure no stray `ssh` daemons are sitting around: `pkill -9 sshd`. 
- * You can probably keep your terminal alive via `fuser -k -9 ~/nohup.out`, which kills (signal 9) whatever process is writing to that no-hangup file (the daemon we care about) -2. Log back into the same login node because you may have just nuked your session - * Detour: Make sure stray runners on that login node are dead (one liner): `pkill -9 -f -E 'run.sh|Runner.listener|Runner.helper'` - * If cautious, check that no runner processes are left over. `top` followed by `u` and `` and return. -3. Execute from your home directory: `nohup ssh -N -D 1080 -vvv login-phoenix-rh9-.pace.gatech.edu &`, replacing `` with the login node number - * This starts a proxy to tunnel a new ssh session through -4. Navigate to your runner's directory (or create a runner directory if you need). - * Right now they are in Spencer's `scratch/mfc-runners/action-runner-` -5. Run the alias `start_runner`, which dumps output `~/runner.out` - * If one doesn't have this alias yet, create and source it in your `.bashrc` or similar: -```bash -alias start_runner=' \ - http_proxy="socks5://localhost:1080" \ - https_proxy="socks5://localhost:1080" \ - no_proxy="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org" \ - NO_PROXY="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org" \ - RUNNER_DEBUG=1 \ - ACTIONS_STEP_DEBUG=1 \ - GITHUB_ACTIONS_RUNNER_PREFER_IP_FAMILY=ipv4 \ - DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_TIME=00:01:00 \ - DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_INTERVAL=00:00:20 \ - DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_RETRYCOUNT=5 \ - nohup ./run.sh > ~/runner.out 2>&1 &' -``` -6. You're done - - -### For inquisitive minds - -__Why the `start_runner` alias?__ - -1. `alias start_runner='…'` - Defines a new shell alias named `start_runner`. 
Whenever you run `start_runner`, the shell will execute everything between the single quotes as if you’d typed it at the prompt. - -2. `http_proxy="socks5://localhost:1080"` - Sets the `http_proxy` environment variable so that any HTTP traffic from the runner is sent through a SOCKS5 proxy listening on `localhost:1080`. - -3. `https_proxy="socks5://localhost:1080"` - Tells HTTPS-aware tools to use that same local SOCKS5 proxy for HTTPS requests. - -4. `no_proxy="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org"` - Lists hosts and domains that should bypass the proxy entirely. Commonly used for internal or high-volume endpoints where you don’t want proxy overhead. - -5. `NO_PROXY="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org"` - Same list as `no_proxy`—some programs only check the uppercase `NO_PROXY` variable. - -6. `RUNNER_DEBUG=1` - Enables debug-level logging in the GitHub Actions runner itself, so you’ll see more verbose internal messages in its logs. - -7. `ACTIONS_STEP_DEBUG=1` - Turns on step-level debug logging for actions you invoke—handy if you need to trace exactly what each action is doing under the hood. - -8. `GITHUB_ACTIONS_RUNNER_PREFER_IP_FAMILY=ipv4` - Forces the runner to resolve DNS names to IPv4 addresses only. Useful if your proxy or network has spotty IPv6 support. - -9. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_TIME=00:01:00` - For .NET–based tasks: sets the initial TCP keepalive timeout to 1 minute (after 1 minute of idle, a keepalive probe is sent). - -10. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_INTERVAL=00:00:20` - If the first keepalive probe gets no response, wait 20 seconds between subsequent probes. - -11. 
`DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_RETRYCOUNT=5` - If probes continue to go unanswered, retry up to 5 times before declaring the connection dead. - -12. `nohup ./run.sh > ~/runner.out 2>&1 &` - - `nohup … &` runs `./run.sh` in the background and makes it immune to hangups (so it keeps running if you log out). - - `> ~/runner.out` redirects **stdout** to the file `runner.out` in your home directory. - - `2>&1` redirects **stderr** into the same file, so you get a combined log of everything the script prints. - -__Why the extra ssh command?__ - -1. `http_proxy="socks5://localhost:1080"` - Routes all HTTP traffic through a local SOCKS5 proxy on port 1080. - -2. `https_proxy="socks5://localhost:1080"` - Routes all HTTPS traffic through the same proxy. - -3. `no_proxy="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org"` - Specifies hosts and domains that bypass the proxy entirely. Includes specific things that MFC's CMake will try to `wget` (e.g., `fftw`) or some other non `git` command. Allows `git clone` to work. - -4. `NO_PROXY="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org"` - Same bypass list for applications that only check the uppercase variable. - -5. `RUNNER_DEBUG=1` - Enables verbose internal logging in the GitHub Actions runner. - -6. `GITHUB_ACTIONS_RUNNER_PREFER_IP_FAMILY=ipv4` - Forces DNS resolution to IPv4 to avoid IPv6 issues. - -7. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_TIME=00:01:00` - (For .NET tasks) sends the first TCP keepalive probe after 1 minute of idle. - -8. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_INTERVAL=00:00:20` - Waits 20 seconds between subsequent TCP keepalive probes. - -9. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_RETRYCOUNT=5` - Retries keepalive probes up to 5 times before closing the connection. - -10. 
`nohup ./run.sh > ~/runner.out 2>&1 &` - Runs `run.sh` in the background, immune to hangups, redirecting both stdout and stderr to `~/runner.out`. From 13fad55e2e8a4c4f13379fcac37dce6e98ee3fcf Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 17:26:15 -0400 Subject: [PATCH 02/20] misc: add README for Phoenix runner management scripts Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/phoenix/README.md | 109 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 misc/phoenix/README.md diff --git a/misc/phoenix/README.md b/misc/phoenix/README.md new file mode 100644 index 0000000000..9ec5ab426f --- /dev/null +++ b/misc/phoenix/README.md @@ -0,0 +1,109 @@ +# Phoenix Runner Management Scripts + +Scripts for managing GitHub Actions self-hosted runners on Georgia Tech Phoenix +login nodes. The runners submit SLURM jobs to Phoenix compute nodes for MFC's +CI/CD pipeline. + +## Background + +Phoenix has 3 physical login nodes (`login-phoenix-gnr-{1,2,3}`), each with a +**4 GB per-user cgroup memory limit**. Each runner process uses ~60–130 MB, so +distributing runners evenly (~3–4 per node) is important to avoid OOM kills. + +Runner binaries live on shared storage (`/storage/scratch1/` or +`/storage/project/`), so moving a runner between nodes only requires stopping +the process on one node and starting it on another — no files are copied. + +Runners must be started with a **login shell** (`bash -l`) so they inherit the +full PATH, which includes `/opt/slurm/current/bin` (required for `sbatch`, +`squeue`, `sacct`). Starting without a login shell causes "sbatch: command not +found" errors. + +## Scripts + +All scripts live in this directory and share configuration via `config.sh`. 
+ +### Quick Reference + +```bash +# Check health of all runners +bash check-runners.sh + +# Detailed table of all registered runners +bash list-runners.sh + +# Restart a single runner +bash restart-runner.sh login-phoenix-gnr-2 /path/to/actions-runner-3 + +# Restart all runners in place (e.g. after a node reboot) +APPLY=1 bash restart-all.sh + +# Auto-rebalance runners across nodes +bash rebalance-runners.sh # dry run +APPLY=1 bash rebalance-runners.sh # execute + +# Create a new runner +bash create-runner.sh phoenix-11 login-phoenix-gnr-2 /path/to/parent-dir + +# Rerun failed CI workflows on open PRs +bash rerun-failed.sh # dry run +APPLY=1 bash rerun-failed.sh # execute +``` + +### Script Details + +| Script | Purpose | +|---|---| +| `config.sh` | Shared configuration sourced by all other scripts. Defines login node list, cgroup limit, runner parent directories, and helper functions (`discover_runners`, `find_runner_node`, `start_runner`, `stop_runner`, `check_slurm_path`). | +| `check-runners.sh` | Quick per-node health check. Shows each runner's name, idle/BUSY status, whether slurm is in PATH, RSS memory, and total user memory per node. Lightweight (one SSH call per node). | +| `list-runners.sh` | Detailed table of all registered runners discovered from `.runner` config files on shared storage. Shows GitHub runner name, which node it's on (or OFFLINE), status, slurm PATH, RSS, pool, and directory. Slower than `check-runners.sh` (one SSH per runner). | +| `restart-runner.sh` | Stop and restart a single runner on a given node. Uses `setsid bash -lc` for proper PATH and ` +``` + +**OOM kills during CI jobs** +Check memory distribution with `bash check-runners.sh`. If one node is +overloaded, rebalance: `APPLY=1 bash rebalance-runners.sh`. + +**Runner shows OFFLINE in `list-runners.sh`** +The runner process is not running on any node. Either the node rebooted or the +process was killed. 
Restart it on an appropriate node: +```bash +bash restart-runner.sh login-phoenix-gnr-2 /path/to/runner-dir +``` +Or let `rebalance-runners.sh` place it automatically: +```bash +APPLY=1 bash rebalance-runners.sh +``` + +**All runners down after node maintenance** +```bash +APPLY=1 bash restart-all.sh +``` From 91ef65e8fe723d63818d26aa664aa752b926782e Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 17:45:39 -0400 Subject: [PATCH 03/20] misc: streamline Phoenix runner scripts with GitHub API and hardcoded config Simplify scripts to be Phoenix-aware: org, runner group, label, nodes, and parent dirs are all in config.sh rather than passed as arguments. Add GitHub API helpers (gh_list_runners, gh_registration_token) so list-runners.sh shows both API status and process state. create-runner.sh now only needs a name and node. Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/phoenix/README.md | 105 ++++++--------- misc/phoenix/config.sh | 176 ++++++++++++------------- misc/phoenix/create-runner.sh | 97 ++++++-------- misc/phoenix/list-runners.sh | 78 +++++------ misc/phoenix/rebalance-runners.sh | 209 +++++++++++------------------- misc/phoenix/restart-all.sh | 44 +++---- misc/phoenix/restart-runner.sh | 28 ++-- 7 files changed, 309 insertions(+), 428 deletions(-) diff --git a/misc/phoenix/README.md b/misc/phoenix/README.md index 9ec5ab426f..b00c56724c 100644 --- a/misc/phoenix/README.md +++ b/misc/phoenix/README.md @@ -7,103 +7,82 @@ CI/CD pipeline. ## Background Phoenix has 3 physical login nodes (`login-phoenix-gnr-{1,2,3}`), each with a -**4 GB per-user cgroup memory limit**. Each runner process uses ~60–130 MB, so -distributing runners evenly (~3–4 per node) is important to avoid OOM kills. +**4 GB per-user cgroup memory limit**. Each runner uses ~60-130 MB, so +distributing them evenly (~3-4 per node) is important to avoid OOM kills. 
-Runner binaries live on shared storage (`/storage/scratch1/` or -`/storage/project/`), so moving a runner between nodes only requires stopping -the process on one node and starting it on another — no files are copied. +Runner binaries live on shared storage, so moving a runner between nodes only +requires stopping the process on one node and starting it on another. -Runners must be started with a **login shell** (`bash -l`) so they inherit the -full PATH, which includes `/opt/slurm/current/bin` (required for `sbatch`, -`squeue`, `sacct`). Starting without a login shell causes "sbatch: command not -found" errors. +Runners must be started with a **login shell** (`bash -l`) so they inherit +`/opt/slurm/current/bin` in PATH (required for `sbatch`, `squeue`, `sacct`). -## Scripts - -All scripts live in this directory and share configuration via `config.sh`. - -### Quick Reference +## Quick Reference ```bash -# Check health of all runners +# Check health (quick, one SSH per node) bash check-runners.sh -# Detailed table of all registered runners +# Detailed table with GitHub API status bash list-runners.sh -# Restart a single runner +# Restart one runner bash restart-runner.sh login-phoenix-gnr-2 /path/to/actions-runner-3 -# Restart all runners in place (e.g. after a node reboot) +# Restart all runners in place APPLY=1 bash restart-all.sh -# Auto-rebalance runners across nodes +# Auto-rebalance across nodes bash rebalance-runners.sh # dry run APPLY=1 bash rebalance-runners.sh # execute -# Create a new runner -bash create-runner.sh phoenix-11 login-phoenix-gnr-2 /path/to/parent-dir +# Create a new runner (needs gh CLI with admin:org scope) +bash create-runner.sh phoenix-11 login-phoenix-gnr-2 -# Rerun failed CI workflows on open PRs +# Rerun failed CI on open PRs bash rerun-failed.sh # dry run APPLY=1 bash rerun-failed.sh # execute ``` -### Script Details +## Scripts | Script | Purpose | |---|---| -| `config.sh` | Shared configuration sourced by all other scripts. 
Defines login node list, cgroup limit, runner parent directories, and helper functions (`discover_runners`, `find_runner_node`, `start_runner`, `stop_runner`, `check_slurm_path`). | -| `check-runners.sh` | Quick per-node health check. Shows each runner's name, idle/BUSY status, whether slurm is in PATH, RSS memory, and total user memory per node. Lightweight (one SSH call per node). | -| `list-runners.sh` | Detailed table of all registered runners discovered from `.runner` config files on shared storage. Shows GitHub runner name, which node it's on (or OFFLINE), status, slurm PATH, RSS, pool, and directory. Slower than `check-runners.sh` (one SSH per runner). | -| `restart-runner.sh` | Stop and restart a single runner on a given node. Uses `setsid bash -lc` for proper PATH and ` -``` +**"sbatch: command not found"** — Runner started without login shell. +Fix: `bash restart-runner.sh <node> <runner-dir>` -**OOM kills during CI jobs** -Check memory distribution with `bash check-runners.sh`. If one node is -overloaded, rebalance: `APPLY=1 bash rebalance-runners.sh`. +**OOM kills** — Too many runners on one node. +Fix: `bash check-runners.sh` then `APPLY=1 bash rebalance-runners.sh` -**Runner shows OFFLINE in `list-runners.sh`** -The runner process is not running on any node. Either the node rebooted or the -process was killed. Restart it on an appropriate node: -```bash -bash restart-runner.sh login-phoenix-gnr-2 /path/to/runner-dir -``` -Or let `rebalance-runners.sh` place it automatically: -```bash -APPLY=1 bash rebalance-runners.sh -``` +**Runner OFFLINE** — Process died or node rebooted. +Fix: `APPLY=1 bash rebalance-runners.sh` (auto-places on least-loaded node) -**All runners down after node maintenance** -```bash -APPLY=1 bash restart-all.sh -``` +**All runners down** — Node maintenance.
+Fix: `APPLY=1 bash restart-all.sh` diff --git a/misc/phoenix/config.sh b/misc/phoenix/config.sh index da494d8b7e..ee850faf81 100755 --- a/misc/phoenix/config.sh +++ b/misc/phoenix/config.sh @@ -1,153 +1,137 @@ #!/bin/bash -# Shared configuration for Phoenix GitHub Actions runner management scripts. +# Shared configuration for Phoenix GitHub Actions runner management. # -# Sources this file to get: -# NODES — array of physical login node hostnames -# CGROUP_LIMIT — per-user memory limit in MB -# discover_runners() — populates RUNNER_DIRS, RUNNER_NAMES, RUNNER_POOLS arrays -# find_runner_node() — returns which node a runner is currently running on +# Sourced by all other scripts. Provides constants, GitHub API helpers, +# and login-node process management functions. -# Physical login nodes (gnr-1 = login-1 = login-4, etc.) +# --- Phoenix constants --- +ORG="MFlowCode" +RUNNER_GROUP="phoenix" +RUNNER_LABEL="gt" NODES=(login-phoenix-gnr-1 login-phoenix-gnr-2 login-phoenix-gnr-3) +CGROUP_LIMIT=4096 # per-user memory limit in MB on login nodes -# Per-user cgroup memory limit on Phoenix login nodes (MB) -CGROUP_LIMIT=4096 - -# Parent directories that contain runner installations. -# Each may have actions-runner-N/ subdirectories with a .runner config file. +# Parent directories containing actions-runner-*/ installations on shared storage. RUNNER_PARENT_DIRS=( /storage/scratch1/6/sbryngelson3/mfc-runners /storage/project/r-sbryngelson3-0/sbryngelson3/mfc-runners-2 - /storage/scratch1/6/sbryngelson3/cfdnn-runners ) -# Discover all registered runners on the shared filesystem. -# Populates parallel arrays: RUNNER_DIRS, RUNNER_NAMES, RUNNER_POOLS, RUNNER_ORGS -declare -a RUNNER_DIRS=() -declare -a RUNNER_NAMES=() -declare -a RUNNER_POOLS=() -declare -a RUNNER_ORGS=() +# --- GitHub API --- + +# List Phoenix runners from the GitHub API. 
+# Prints: id name status busy (one runner per line) +gh_list_runners() { + gh api "orgs/$ORG/actions/runners" --paginate \ + --jq ".runners[] + | select(.labels | map(.name) | index(\"$RUNNER_LABEL\")) + | \"\(.id) \(.name) \(.status) \(.busy)\"" +} + +# Get a registration token for new runners. +gh_registration_token() { + gh api "orgs/$ORG/actions/runners/registration-token" --jq .token +} + +# Get the latest runner binary version. +gh_latest_runner_version() { + gh api repos/actions/runner/releases/latest --jq '.tag_name | ltrimstr("v")' +} + +# Remove a runner registration from GitHub. +# Args: $1 = runner ID (numeric, from API) +gh_remove_runner() { + gh api "orgs/$ORG/actions/runners/$1" -X DELETE +} -discover_runners() { - RUNNER_DIRS=() - RUNNER_NAMES=() - RUNNER_POOLS=() - RUNNER_ORGS=() +# --- Local filesystem --- +# Find all runner directories on shared storage. +# Prints: one directory path per line. +find_runner_dirs() { for parent in "${RUNNER_PARENT_DIRS[@]}"; do for conf in "$parent"/actions-runner-*/.runner; do - [ -f "$conf" ] || continue - local dir=$(dirname "$conf") - local info - info=$(python3 -c " -import json, sys -d = json.loads(open('$conf').read().lstrip('\ufeff')) -print(d.get('agentName', '')) -print(d.get('poolName', '')) -print(d.get('gitHubUrl', '')) -" 2>/dev/null) - local name pool org - name=$(echo "$info" | sed -n '1p') - pool=$(echo "$info" | sed -n '2p') - org=$(echo "$info" | sed -n '3p') - - RUNNER_DIRS+=("$dir") - RUNNER_NAMES+=("${name:-$(basename "$dir")}") - RUNNER_POOLS+=("${pool:-unknown}") - RUNNER_ORGS+=("${org:-unknown}") + [ -f "$conf" ] && dirname "$conf" done done } -# Find which physical node a runner is currently running on. +# Get the GitHub runner name from a .runner config file. # Args: $1 = runner directory -# Prints: node hostname, or "offline" if not found on any node. 
-# Note: uses CWD matching (readlink /proc/PID/cwd) because the runner -# command line is just "Runner.Listener run" without the full path. -find_runner_node() { - local dir="$1" - for node in "${NODES[@]}"; do - local found - found=$(ssh -o ConnectTimeout=5 "$node" ' - for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do - cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true) - [ "$cwd" = "'"$dir"'" ] && echo "$p" && break - done - ' 2>/dev/null || true) - if [ -n "$found" ]; then - echo "$node" - return - fi - done - echo "offline" +get_runner_name() { + python3 -c " +import json +d = json.loads(open('$1/.runner').read().lstrip('\ufeff')) +print(d.get('agentName', '')) +" 2>/dev/null } -# Find PIDs of a runner on a node by matching CWD. +# --- Login-node process management --- + +# Find PIDs of a runner on a node by matching its CWD. +# (Runner.Listener's command line is just "Runner.Listener run" — no path.) # Args: $1 = node, $2 = runner directory -# Prints: space-separated PIDs, or empty if not found. -_find_runner_pids() { - local node="$1" dir="$2" - ssh -o ConnectTimeout=5 "$node" ' +# Prints: space-separated PIDs, or empty. +find_pids() { + ssh -o ConnectTimeout=5 "$1" ' for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true) - [ "$cwd" = "'"$dir"'" ] && echo -n "$p " + [ "$cwd" = "'"$2"'" ] && echo -n "$p " done ' 2>/dev/null || true } -# Start a runner on a specific node with proper PATH and detachment. +# Find which login node a runner is on. +# Args: $1 = runner directory +# Prints: node hostname, or "offline". +find_node() { + for node in "${NODES[@]}"; do + [ -n "$(find_pids "$node" "$1")" ] && echo "$node" && return + done + echo "offline" +} + +# Start a runner on a node with login shell (for /opt/slurm PATH). # Args: $1 = node, $2 = runner directory -# Returns 0 on success, 1 on failure. +# Returns: 0 if running after start, 1 otherwise. 
start_runner() { local node="$1" dir="$2" ssh -o ConnectTimeout=5 "$node" \ "setsid bash -lc 'cd $dir && nohup ./run.sh >> runner-nohup.log 2>&1 &'" \ /dev/null & local ssh_pid=$! - # Wait up to 10s for SSH to exit - local i - for i in $(seq 1 10); do - kill -0 $ssh_pid 2>/dev/null || break - sleep 1 + local i; for i in $(seq 1 10); do + kill -0 $ssh_pid 2>/dev/null || break; sleep 1 done kill $ssh_pid 2>/dev/null || true wait $ssh_pid 2>/dev/null || true sleep 3 - # Verify - local new_pid - new_pid=$(_find_runner_pids "$node" "$dir") - [ -n "$new_pid" ] + [ -n "$(find_pids "$node" "$dir")" ] } -# Stop a runner on a specific node. +# Stop a runner on a node (SIGTERM then SIGKILL). # Args: $1 = node, $2 = runner directory stop_runner() { - local node="$1" dir="$2" - local pids - pids=$(_find_runner_pids "$node" "$dir") - if [ -z "$pids" ]; then - return 0 - fi + local node="$1" dir="$2" pids + pids=$(find_pids "$node" "$dir") + [ -z "$pids" ] && return 0 for pid in $pids; do ssh -o ConnectTimeout=5 "$node" "kill $pid" 2>/dev/null || true done sleep 3 - # Force kill survivors - pids=$(_find_runner_pids "$node" "$dir") + pids=$(find_pids "$node" "$dir") for pid in $pids; do ssh -o ConnectTimeout=5 "$node" "kill -9 $pid" 2>/dev/null || true done sleep 1 } -# Check if a runner has slurm in its PATH. -# Args: $1 = node, $2 = PID (whitespace trimmed) -# Returns 0 if slurm is in PATH, 1 otherwise. -check_slurm_path() { - local node="$1" pid="${2// /}" +# Check if a runner process has /opt/slurm in PATH. 
+# Args: $1 = node, $2 = PID +has_slurm() { local count - count=$(ssh -o ConnectTimeout=5 "$node" \ - "cat /proc/$pid/environ 2>/dev/null | tr '\0' '\n' | grep -c /opt/slurm" \ + count=$(ssh -o ConnectTimeout=5 "$1" \ + "cat /proc/${2%% *}/environ 2>/dev/null | tr '\0' '\n' | grep -c /opt/slurm" \ 2>/dev/null || echo 0) [ "$count" -gt 0 ] } diff --git a/misc/phoenix/create-runner.sh b/misc/phoenix/create-runner.sh index d68cad361e..e255766a6d 100755 --- a/misc/phoenix/create-runner.sh +++ b/misc/phoenix/create-runner.sh @@ -1,51 +1,46 @@ #!/bin/bash -# Create and register a new GitHub Actions runner for Phoenix. +# Create and register a new GitHub Actions runner on Phoenix. # -# Downloads the runner binary, configures it with a registration token, -# and starts it on the specified login node with proper PATH. +# Downloads the runner binary, registers with MFlowCode org, and starts +# on the specified login node. Uses config.sh for org/group/label defaults. # -# Prerequisites: -# - gh CLI authenticated with admin access to the target org -# - The parent directory must exist and be on shared storage +# Prerequisites: gh CLI with admin:org scope (gh auth refresh -s admin:org) # -# Usage: bash create-runner.sh [org] [runner-group] +# Usage: bash create-runner.sh [parent-dir] # # Examples: -# bash create-runner.sh phoenix-11 login-phoenix-gnr-2 /storage/scratch1/6/sbryngelson3/mfc-runners -# bash create-runner.sh phoenix-12 login-phoenix-gnr-3 /storage/project/.../mfc-runners-2 MFlowCode phoenix +# bash create-runner.sh phoenix-11 login-phoenix-gnr-2 +# bash create-runner.sh phoenix-12 login-phoenix-gnr-3 /storage/project/.../mfc-runners-2 set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" -if [ $# -lt 3 ]; then - echo "Usage: $0 [org] [runner-group]" +if [ $# -lt 2 ]; then + echo "Usage: $0 [parent-dir]" echo "" - echo " runner-name Name for the runner (e.g. phoenix-11)" - echo " node Login node to run on (e.g. 
login-phoenix-gnr-2)" - echo " parent-dir Parent directory for the runner installation" - echo " org GitHub org (default: MFlowCode)" - echo " runner-group Runner group/pool (default: phoenix)" + echo " runner-name Name for the runner (e.g. phoenix-11)" + echo " node Login node (${NODES[*]})" + echo " parent-dir Parent directory (default: ${RUNNER_PARENT_DIRS[0]})" exit 1 fi runner_name="$1" node="$2" -parent_dir="$3" -org="${4:-MFlowCode}" -runner_group="${5:-phoenix}" +parent_dir="${3:-${RUNNER_PARENT_DIRS[0]}}" -# Determine next available runner directory +# Determine next available runner directory number existing=$(ls -d "$parent_dir"/actions-runner-* 2>/dev/null | sed 's/.*actions-runner-//' | sort -n | tail -1) next_num=$(( ${existing:-0} + 1 )) runner_dir="$parent_dir/actions-runner-$next_num" -echo "=== Creating runner ===" +echo "=== Creating Phoenix runner ===" echo " Name: $runner_name" echo " Node: $node" echo " Directory: $runner_dir" -echo " Org: $org" -echo " Group: $runner_group" +echo " Org: $ORG" +echo " Group: $RUNNER_GROUP" +echo " Label: $RUNNER_LABEL" echo "" if [ -d "$runner_dir" ]; then @@ -53,58 +48,52 @@ if [ -d "$runner_dir" ]; then exit 1 fi -# Get registration token -echo "Getting registration token from GitHub..." -token=$(gh api "orgs/$org/actions/runners/registration-token" --jq .token 2>/dev/null) +# Registration token +echo "Getting registration token..." +token=$(gh_registration_token) if [ -z "$token" ]; then - echo "ERROR: Failed to get registration token. Check 'gh auth status' and org admin permissions." + echo "ERROR: Failed to get token. Run: gh auth refresh -h github.com -s admin:org" exit 1 fi -echo " Token acquired." -# Get latest runner version -echo "Downloading runner..." -latest_version=$(gh api repos/actions/runner/releases/latest --jq .tag_name 2>/dev/null | sed 's/^v//') -if [ -z "$latest_version" ]; then - echo "ERROR: Failed to determine latest runner version." 
- exit 1 -fi -runner_url="https://github.com/actions/runner/releases/download/v${latest_version}/actions-runner-linux-x64-${latest_version}.tar.gz" -echo " Version: $latest_version" +# Download runner +echo "Downloading latest runner binary..." +version=$(gh_latest_runner_version) +url="https://github.com/actions/runner/releases/download/v${version}/actions-runner-linux-x64-${version}.tar.gz" +echo " Version: $version" mkdir -p "$runner_dir" cd "$runner_dir" - -curl -sL "$runner_url" | tar xz -echo " Downloaded and extracted." +curl -sL "$url" | tar xz +echo " Extracted." # Configure -echo "Configuring runner..." +echo "Configuring..." ./config.sh \ - --url "https://github.com/$org" \ + --url "https://github.com/$ORG" \ --token "$token" \ --name "$runner_name" \ - --runnergroup "$runner_group" \ - --labels "gt" \ + --runnergroup "$RUNNER_GROUP" \ + --labels "$RUNNER_LABEL" \ --work "_work" \ --unattended \ --replace echo " Configured." -# Start on the target node -echo "Starting runner on $node..." +# Start +echo "Starting on $node..." if start_runner "$node" "$runner_dir"; then - pid=$(ssh -o ConnectTimeout=5 "$node" "pgrep -f 'Runner.Listener.*$runner_dir'" 2>/dev/null || true) - if check_slurm_path "$node" "$pid"; then - echo " OK: Running as PID $pid on $node, slurm in PATH" + pids=$(find_pids "$node" "$runner_dir") + pid=${pids%% *} + if has_slurm "$node" "$pid"; then + echo " OK: PID $pid, slurm in PATH" else - echo " WARNING: Running as PID $pid but slurm NOT in PATH" + echo " WARNING: PID $pid but slurm MISSING from PATH" fi else - echo " ERROR: Failed to start. Try manually:" - echo " ssh $node 'cd $runner_dir && setsid bash -lc \"nohup ./run.sh >> runner-nohup.log 2>&1 &\"'" + echo " ERROR: Failed to start." 
+ echo " Try: ssh $node 'cd $runner_dir && setsid bash -lc \"nohup ./run.sh >> runner-nohup.log 2>&1 &\"'" fi echo "" -echo "Runner $runner_name created at $runner_dir" -echo "Verify with: bash $SCRIPT_DIR/list-runners.sh" +echo "Created $runner_name at $runner_dir" diff --git a/misc/phoenix/list-runners.sh b/misc/phoenix/list-runners.sh index 48e5a2574e..c62ef52235 100755 --- a/misc/phoenix/list-runners.sh +++ b/misc/phoenix/list-runners.sh @@ -1,15 +1,8 @@ #!/bin/bash -# List all registered GitHub Actions runners, showing which node each is on, -# whether it's busy, its memory usage, and whether slurm is in PATH. +# List all Phoenix runners, combining GitHub API status with login-node process info. # -# Output columns: -# Name — GitHub runner name (from .runner config) -# Node — login node it's running on, or "offline" -# Status — idle, BUSY (has Worker process), or OFFLINE -# Slurm — ok or MISSING (can't submit SLURM jobs) -# RSS — memory usage in MB -# Pool — GitHub runner group/pool -# Directory — filesystem path +# Shows both what GitHub thinks (online/offline/busy) and the actual process +# state on the login nodes (which node, slurm PATH, memory). 
# # Usage: bash list-runners.sh @@ -17,48 +10,59 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" -discover_runners +printf "%-25s %-8s %-22s %-8s %6s %s\n" \ + "NAME" "GITHUB" "NODE" "SLURM" "RSS" "DIRECTORY" +printf "%s\n" "$(printf '%.0s-' {1..100})" -# Header -printf "%-25s %-22s %-8s %-8s %5s %-10s %s\n" \ - "NAME" "NODE" "STATUS" "SLURM" "RSS" "POOL" "DIRECTORY" -printf "%s\n" "$(printf '%.0s-' {1..120})" +# Get GitHub API status for all Phoenix runners +declare -A gh_status gh_busy +while read -r id name status busy; do + gh_status[$name]="$status" + gh_busy[$name]="$busy" +done <<< "$(gh_list_runners)" -for i in "${!RUNNER_DIRS[@]}"; do - dir="${RUNNER_DIRS[$i]}" - name="${RUNNER_NAMES[$i]}" - pool="${RUNNER_POOLS[$i]}" +# Walk local runner directories and cross-reference +for dir in $(find_runner_dirs); do + name=$(get_runner_name "$dir") + [ -z "$name" ] && continue - node=$(find_runner_node "$dir") + # GitHub status + api_status="${gh_status[$name]:-unknown}" + api_busy="${gh_busy[$name]:-false}" + if [ "$api_busy" = "true" ]; then + gh_col="BUSY" + else + gh_col="$api_status" + fi + # Node status + node=$(find_node "$dir") if [ "$node" = "offline" ]; then - printf "%-25s %-22s %-8s %-8s %5s %-10s %s\n" \ - "$name" "—" "OFFLINE" "—" "—" "$pool" "$dir" + printf "%-25s %-8s %-22s %-8s %6s %s\n" \ + "$name" "$gh_col" "—" "—" "—" "$dir" continue fi - # Get all info in one SSH call to reduce latency + # Process details (one SSH call) info=$(ssh -o ConnectTimeout=5 "$node" ' - pid="" for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true) - [ "$cwd" = "'"$dir"'" ] && pid=$p && break + if [ "$cwd" = "'"$dir"'" ]; then + slurm=$(cat /proc/$p/environ 2>/dev/null | tr "\0" "\n" | grep -c /opt/slurm || echo 0) + [ "$slurm" -gt 0 ] && s="ok" || s="MISSING" + rss=$(ps -p $p -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" 
|| echo "?") + echo "$s $rss" + exit + fi done - [ -z "$pid" ] && echo "? ? ? ?" && exit - worker=$(ps aux | grep Runner.Worker | grep "'"$dir"'" | grep -v grep | head -1 || true) - [ -n "$worker" ] && status="BUSY" || status="idle" - has_slurm=$(cat /proc/$pid/environ 2>/dev/null | tr "\0" "\n" | grep -c /opt/slurm || echo 0) - [ "$has_slurm" -gt 0 ] && slurm="ok" || slurm="MISSING" - rss=$(ps -p $pid -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" || echo "?") - echo "$pid $status $slurm $rss" - ' 2>/dev/null || echo "? ? ? ?") - read -r pid status slurm rss <<< "$info" + echo "? ?" + ' 2>/dev/null || echo "? ?") + read -r slurm rss <<< "$info" - printf "%-25s %-22s %-8s %-8s %5s %-10s %s\n" \ - "$name" "$node" "$status" "$slurm" "${rss}MB" "$pool" "$dir" + printf "%-25s %-8s %-22s %-8s %5sMB %s\n" \ + "$name" "$gh_col" "$node" "$slurm" "$rss" "$dir" done -# Per-node summary echo "" echo "=== Per-node memory ===" for node in "${NODES[@]}"; do diff --git a/misc/phoenix/rebalance-runners.sh b/misc/phoenix/rebalance-runners.sh index 42e207c74b..ad506ff326 100755 --- a/misc/phoenix/rebalance-runners.sh +++ b/misc/phoenix/rebalance-runners.sh @@ -1,49 +1,49 @@ #!/bin/bash -# Automatically rebalance GitHub Actions runners across Phoenix login nodes. +# Automatically rebalance Phoenix runners across login nodes. # -# Computes the optimal distribution (equal runners per node), determines -# which runners need to move, and executes the moves. Prefers to move -# idle runners over busy ones. -# -# Each Phoenix login node has a 4 GB per-user cgroup memory limit. -# Target: ~3-4 runners per node to leave headroom for CI work. +# Discovers all runner directories, checks which node each is on, +# computes the optimal distribution, and moves runners to balance. +# Prefers moving idle runners over busy ones. Also places offline runners. 
# # Usage: bash rebalance-runners.sh # dry run -# APPLY=1 bash rebalance-runners.sh # execute moves +# APPLY=1 bash rebalance-runners.sh # execute # APPLY=1 FORCE=1 bash rebalance-runners.sh # move busy runners too set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" -discover_runners +# Discover runners +declare -a dirs=() names=() +for dir in $(find_runner_dirs); do + name=$(get_runner_name "$dir") + [ -z "$name" ] && continue + dirs+=("$dir") + names+=("$name") +done + num_nodes=${#NODES[@]} -num_runners=${#RUNNER_DIRS[@]} -target_per_node=$(( num_runners / num_nodes )) +num_runners=${#dirs[@]} +target=$(( num_runners / num_nodes )) remainder=$(( num_runners % num_nodes )) echo "=== Current state ===" echo "Runners: $num_runners across $num_nodes nodes" -echo "Target: $target_per_node per node (+1 on first $remainder nodes)" +echo "Target: $target per node (+1 on first $remainder nodes)" echo "" -# Build current assignment: node -> list of runner indices +# Map runners to nodes declare -A node_runners # node -> space-separated indices -declare -A runner_node # index -> node -declare -A runner_busy # index -> 1 if busy +declare -A runner_node runner_busy -for node in "${NODES[@]}"; do - node_runners[$node]="" -done +for node in "${NODES[@]}"; do node_runners[$node]=""; done -for i in "${!RUNNER_DIRS[@]}"; do - dir="${RUNNER_DIRS[$i]}" - node=$(find_runner_node "$dir") +for i in "${!dirs[@]}"; do + node=$(find_node "${dirs[$i]}") runner_node[$i]="$node" - if [ "$node" != "offline" ]; then node_runners[$node]="${node_runners[$node]:-} $i" - worker=$(ssh -o ConnectTimeout=5 "$node" "ps aux | grep Runner.Worker | grep '$dir' | grep -v grep" 2>/dev/null || true) + worker=$(ssh -o ConnectTimeout=5 "$node" "ps aux | grep Runner.Worker | grep '${dirs[$i]}' | grep -v grep" 2>/dev/null || true) [ -n "$worker" ] && runner_busy[$i]=1 || runner_busy[$i]=0 else runner_busy[$i]=0 @@ -55,176 +55,115 @@ for node in 
"${NODES[@]}"; do indices=(${node_runners[$node]:-}) echo "$node: ${#indices[@]} runners" for i in "${indices[@]}"; do - busy_marker="" - [ "${runner_busy[$i]:-0}" = "1" ] && busy_marker=" (BUSY)" - echo " ${RUNNER_NAMES[$i]}$busy_marker" + busy="" + [ "${runner_busy[$i]:-0}" = "1" ] && busy=" (BUSY)" + echo " ${names[$i]}$busy" done done -# Find offline runners offline=() -for i in "${!RUNNER_DIRS[@]}"; do +for i in "${!dirs[@]}"; do [ "${runner_node[$i]}" = "offline" ] && offline+=("$i") done if [ ${#offline[@]} -gt 0 ]; then echo "" - echo "OFFLINE runners:" - for i in "${offline[@]}"; do - echo " ${RUNNER_NAMES[$i]} (${RUNNER_DIRS[$i]})" - done + echo "OFFLINE:" + for i in "${offline[@]}"; do echo " ${names[$i]}"; done fi - echo "" -# Compute target per node +# Compute targets declare -A node_target n=0 for node in "${NODES[@]}"; do - node_target[$node]=$target_per_node - if [ $n -lt $remainder ]; then - node_target[$node]=$(( target_per_node + 1 )) - fi + node_target[$node]=$target + [ $n -lt $remainder ] && node_target[$node]=$(( target + 1 )) n=$((n + 1)) done -# Determine moves needed -# Phase 1: identify overloaded nodes and runners to move away -moves=() # "source_node dest_node runner_index" - -# Collect runners to move from overloaded nodes (prefer idle runners) -to_place=() # indices of runners that need a new home +# Plan moves: collect runners from overloaded nodes (idle first) +to_place=() for node in "${NODES[@]}"; do indices=(${node_runners[$node]:-}) excess=$(( ${#indices[@]} - ${node_target[$node]} )) - if [ $excess -le 0 ]; then - continue - fi - # Sort: move idle runners first, then busy - idle_here=() - busy_here=() + [ $excess -le 0 ] && continue + idle=() busy=() for i in "${indices[@]}"; do - if [ "${runner_busy[$i]:-0}" = "1" ]; then - busy_here+=("$i") - else - idle_here+=("$i") - fi + [ "${runner_busy[$i]:-0}" = "1" ] && busy+=("$i") || idle+=("$i") done - # Pick from idle first moved=0 - for i in "${idle_here[@]}"; do - [ $moved -ge 
$excess ] && break - to_place+=("$node $i") - moved=$((moved + 1)) - done - for i in "${busy_here[@]}"; do + for i in "${idle[@]}" "${busy[@]}"; do [ $moved -ge $excess ] && break to_place+=("$node $i") moved=$((moved + 1)) done done -# Phase 2: assign runners to underloaded nodes -# Also place offline runners -for i in "${offline[@]}"; do - to_place+=("offline $i") -done +# Add offline runners +for i in "${offline[@]}"; do to_place+=("offline $i"); done +# Assign to underloaded nodes +moves=() for entry in "${to_place[@]}"; do - read -r src_node runner_idx <<< "$entry" - # Find the most underloaded node - best_node="" - best_deficit=-999 + read -r src idx <<< "$entry" + best="" best_deficit=-999 for node in "${NODES[@]}"; do - current=(${node_runners[$node]:-}) - deficit=$(( ${node_target[$node]} - ${#current[@]} )) - if [ $deficit -gt $best_deficit ]; then - best_deficit=$deficit - best_node=$node - fi + cur=(${node_runners[$node]:-}) + deficit=$(( ${node_target[$node]} - ${#cur[@]} )) + [ $deficit -gt $best_deficit ] && best_deficit=$deficit && best=$node done - - if [ -z "$best_node" ] || [ "$best_deficit" -le 0 ]; then - echo "WARNING: No underloaded node for ${RUNNER_NAMES[$runner_idx]}, skipping" - continue - fi - - moves+=("$src_node $best_node $runner_idx") + [ -z "$best" ] || [ "$best_deficit" -le 0 ] && continue + moves+=("$src $best $idx") # Update bookkeeping - if [ "$src_node" != "offline" ]; then - # Remove from source - new_list="" - for idx in ${node_runners[$src_node]}; do - [ "$idx" != "$runner_idx" ] && new_list="$new_list $idx" - done - node_runners[$src_node]="$new_list" + if [ "$src" != "offline" ]; then + new="" + for j in ${node_runners[$src]}; do [ "$j" != "$idx" ] && new="$new $j"; done + node_runners[$src]="$new" fi - node_runners[$best_node]="${node_runners[$best_node]:-} $runner_idx" + node_runners[$best]="${node_runners[$best]:-} $idx" done if [ ${#moves[@]} -eq 0 ]; then - echo "Already balanced — no moves needed." 
+ echo "Already balanced." exit 0 fi -# Show plan echo "=== Planned moves ===" has_busy=false for move in "${moves[@]}"; do read -r src dst idx <<< "$move" - busy_marker="" - if [ "${runner_busy[$idx]:-0}" = "1" ]; then - busy_marker=" (BUSY!)" - has_busy=true - fi - echo " ${RUNNER_NAMES[$idx]}: $src -> $dst$busy_marker" + busy="" + [ "${runner_busy[$idx]:-0}" = "1" ] && busy=" (BUSY!)" && has_busy=true + echo " ${names[$idx]}: $src -> $dst$busy" done - echo "" -echo "=== Target distribution ===" +echo "=== Target ===" for node in "${NODES[@]}"; do - indices=(${node_runners[$node]:-}) - echo " $node: ${#indices[@]} runners" + cur=(${node_runners[$node]:-}) + echo " $node: ${#cur[@]} runners" done -if [ "$has_busy" = true ] && [ "${FORCE:-0}" != "1" ]; then - echo "" - echo "Some runners to move have active jobs. Set FORCE=1 to move them." - exit 1 -fi - -if [ "${APPLY:-0}" != "1" ]; then - echo "" - echo "Dry run — set APPLY=1 to execute. Add FORCE=1 to move busy runners." - exit 0 -fi +[ "$has_busy" = true ] && [ "${FORCE:-0}" != "1" ] && echo "" && echo "Set FORCE=1 to move busy runners." && exit 1 +[ "${APPLY:-0}" != "1" ] && echo "" && echo "Dry run — set APPLY=1 to execute." 
&& exit 0 -# Execute echo "" -echo "=== Executing moves ===" +echo "=== Executing ===" for move in "${moves[@]}"; do read -r src dst idx <<< "$move" - dir="${RUNNER_DIRS[$idx]}" - name="${RUNNER_NAMES[$idx]}" - echo "Moving $name: $src -> $dst" - - if [ "$src" != "offline" ]; then - stop_runner "$src" "$dir" - fi - - if start_runner "$dst" "$dir"; then - pid=$(ssh -o ConnectTimeout=5 "$dst" "pgrep -f 'Runner.Listener.*$dir' | head -1" 2>/dev/null || true) - if [ -n "$pid" ] && check_slurm_path "$dst" "$pid"; then - echo " OK: PID $pid on $dst, slurm in PATH" - elif [ -n "$pid" ]; then - echo " WARNING: PID $pid on $dst but slurm MISSING from PATH" + echo "Moving ${names[$idx]}: $src -> $dst" + [ "$src" != "offline" ] && stop_runner "$src" "${dirs[$idx]}" + if start_runner "$dst" "${dirs[$idx]}"; then + pids=$(find_pids "$dst" "${dirs[$idx]}") + pid=${pids%% *} + if has_slurm "$dst" "$pid"; then + echo " OK: PID $pid, slurm in PATH" else - echo " ERROR: Process not found after start" + echo " WARNING: slurm MISSING" fi else - echo " ERROR: Failed to start on $dst" + echo " ERROR: Failed to start" fi done echo "" -echo "=== Final state ===" -bash "$SCRIPT_DIR/list-runners.sh" +bash "$SCRIPT_DIR/check-runners.sh" diff --git a/misc/phoenix/restart-all.sh b/misc/phoenix/restart-all.sh index 0c94b7e744..26e143d636 100755 --- a/misc/phoenix/restart-all.sh +++ b/misc/phoenix/restart-all.sh @@ -1,30 +1,27 @@ #!/bin/bash -# Restart all GitHub Actions runners on their current nodes. +# Restart all Phoenix runners on their current nodes. # -# Useful after a login node reboot or when runners need a fresh start -# (e.g. to pick up a new PATH or clear stale state). Each runner is -# restarted in place — no rebalancing is done. +# Useful after a login node reboot or to pick up environment changes. +# Restarts in place — no rebalancing. Skips busy runners unless FORCE=1. 
# -# Usage: bash restart-all.sh # dry run (show what would restart) -# APPLY=1 bash restart-all.sh # restart all runners -# APPLY=1 FORCE=1 bash restart-all.sh # restart even busy runners +# Usage: bash restart-all.sh # dry run +# APPLY=1 bash restart-all.sh # execute +# APPLY=1 FORCE=1 bash restart-all.sh # restart busy runners too set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" -discover_runners - echo "=== Discovering runners ===" -declare -a restart_list=() # "node dir name" +declare -a restart_list=() -for i in "${!RUNNER_DIRS[@]}"; do - dir="${RUNNER_DIRS[$i]}" - name="${RUNNER_NAMES[$i]}" - node=$(find_runner_node "$dir") +for dir in $(find_runner_dirs); do + name=$(get_runner_name "$dir") + [ -z "$name" ] && continue + node=$(find_node "$dir") if [ "$node" = "offline" ]; then - echo " $name: OFFLINE (skipping — use rebalance-runners.sh to place it)" + echo " $name: OFFLINE (use rebalance-runners.sh to place)" continue fi @@ -32,7 +29,7 @@ for i in "${!RUNNER_DIRS[@]}"; do if [ -n "$worker" ]; then echo " $name: BUSY on $node" if [ "${FORCE:-0}" != "1" ]; then - echo " Skipping busy runner. Set FORCE=1 to restart anyway." + echo " Skipping. Set FORCE=1 to restart anyway." continue fi else @@ -43,7 +40,6 @@ for i in "${!RUNNER_DIRS[@]}"; do done if [ ${#restart_list[@]} -eq 0 ]; then - echo "" echo "Nothing to restart." 
exit 0 fi @@ -58,19 +54,19 @@ fi echo "" echo "=== Restarting ===" -success=0 -fail=0 +success=0; fail=0 for entry in "${restart_list[@]}"; do read -r node dir name <<< "$entry" echo "--- $name on $node ---" stop_runner "$node" "$dir" if start_runner "$node" "$dir"; then - pid=$(ssh -o ConnectTimeout=5 "$node" "pgrep -f 'Runner.Listener.*$dir' | head -1" 2>/dev/null || true) - if [ -n "$pid" ] && check_slurm_path "$node" "$pid"; then + pids=$(find_pids "$node" "$dir") + pid=${pids%% *} + if has_slurm "$node" "$pid"; then echo " OK: PID $pid, slurm in PATH" success=$((success + 1)) - elif [ -n "$pid" ]; then - echo " WARNING: PID $pid but slurm MISSING from PATH" + else + echo " WARNING: PID $pid but slurm MISSING" fail=$((fail + 1)) fi else @@ -81,5 +77,3 @@ done echo "" echo "=== Summary: $success succeeded, $fail failed ===" -echo "" -bash "$SCRIPT_DIR/list-runners.sh" diff --git a/misc/phoenix/restart-runner.sh b/misc/phoenix/restart-runner.sh index da8e44a0da..f3bc8d72b7 100755 --- a/misc/phoenix/restart-runner.sh +++ b/misc/phoenix/restart-runner.sh @@ -1,14 +1,8 @@ #!/bin/bash # Restart a GitHub Actions runner on a specific Phoenix login node. # -# Kills any existing instance of the runner, then starts a new one. -# Uses 'bash -l' so the runner inherits the full login PATH (which -# includes /opt/slurm/current/bin — required for sbatch/squeue/sacct). -# Uses 'setsid' + stdin close for full terminal detachment so the SSH -# session exits cleanly without waiting for the runner process. -# -# The runner binary lives on shared storage, so no files need to be -# copied — only the process needs to run on the target node. +# Kills any existing instance, then starts a new one with a login shell +# (for /opt/slurm PATH) and full terminal detachment. 
# # Usage: bash restart-runner.sh # Example: bash restart-runner.sh login-phoenix-gnr-2 /storage/scratch1/6/sbryngelson3/mfc-runners/actions-runner-3 @@ -19,27 +13,25 @@ source "$SCRIPT_DIR/config.sh" if [ $# -ne 2 ]; then echo "Usage: $0 " - echo "Example: $0 login-phoenix-gnr-2 /storage/scratch1/6/sbryngelson3/mfc-runners/actions-runner-3" + echo "Nodes: ${NODES[*]}" exit 1 fi node="$1" dir="$2" -name=$(basename "$dir") +name=$(get_runner_name "$dir" 2>/dev/null || basename "$dir") echo "Restarting $name on $node..." - stop_runner "$node" "$dir" if start_runner "$node" "$dir"; then - pid=$(ssh -o ConnectTimeout=5 "$node" "pgrep -f 'Runner.Listener.*$dir' | head -1" 2>/dev/null || true) - if [ -n "$pid" ] && check_slurm_path "$node" "$pid"; then + pids=$(find_pids "$node" "$dir") + pid=${pids%% *} + if has_slurm "$node" "$pid"; then echo " OK: PID $pid, slurm in PATH" - elif [ -n "$pid" ]; then - echo " WARNING: PID $pid started but slurm NOT in PATH!" - echo " The runner may not be able to submit SLURM jobs." + else + echo " WARNING: PID $pid but slurm MISSING from PATH" fi else - echo " ERROR: Runner failed to start on $node" - echo " Try manually: ssh $node 'cd $dir && setsid bash -lc \"nohup ./run.sh >> runner-nohup.log 2>&1 &\"'" + echo " ERROR: Failed to start on $node" fi From 84d6ec5d58023f7ac414e4e18890b0ce09de3d22 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 17:49:00 -0400 Subject: [PATCH 04/20] Add Frontier GitHub Actions runner management scripts Scripts for deploying, restarting, and stopping self-hosted runners on OLCF Frontier login nodes, modeled after the existing Phoenix runner scripts. 
Co-Authored-By: Claude Sonnet 4.6 --- misc/frontier/README.md | 65 +++++++++++++++++++ misc/frontier/deploy-runners.sh | 28 +++++++++ misc/frontier/make-runner.sh | 80 ++++++++++++++++++++++++ misc/frontier/restart-offline-runners.sh | 58 +++++++++++++++++ misc/frontier/stop-runner.sh | 47 ++++++++++++++ 5 files changed, 278 insertions(+) create mode 100644 misc/frontier/README.md create mode 100755 misc/frontier/deploy-runners.sh create mode 100755 misc/frontier/make-runner.sh create mode 100755 misc/frontier/restart-offline-runners.sh create mode 100755 misc/frontier/stop-runner.sh diff --git a/misc/frontier/README.md b/misc/frontier/README.md new file mode 100644 index 0000000000..d2c30f95a3 --- /dev/null +++ b/misc/frontier/README.md @@ -0,0 +1,65 @@ +# Frontier Runner Management Scripts + +Scripts for managing GitHub Actions self-hosted runners on OLCF Frontier login +nodes. The runners submit SLURM jobs to Frontier compute nodes for MFC's CI/CD +pipeline using the `service` partition with the `develop` QOS. + +## Background + +Frontier has 11 login nodes (`login01`–`login11`). Runner binaries live on +shared Lustre storage (`/lustre/orion/cfd154/proj-shared/runners/`), so moving +a runner between nodes only requires stopping the process on one node and +starting it on another — no binary copying needed. + +Each runner directory contains a `runner.node` file recording which login node +it runs on, and a `runner.pid` file with its process ID. + +Runners occasionally die due to OLCF's firewall/proxy dropping long-lived TCP +connections to GitHub's broker. The `restart-offline-runners.sh` script handles +recovery. Login nodes vary in stability — if a runner keeps dying on a +particular node, move it to a quieter one (login01 tends to have low load). + +## Quick Reference + +```bash +# Deploy a new runner on a specific node +bash make-runner.sh 23 login01 + +# Deploy multiple runners across nodes (e.g. 
runners 23, 24, 25) +bash deploy-runners.sh 23 login01 login02 login03 + +# Restart all offline runners in place +bash restart-offline-runners.sh + +# Stop and deregister a runner +bash stop-runner.sh frontier-12 +``` + +## Scripts + +| Script | Purpose | +|---|---| +| `make-runner.sh` | Download runner binary, register with GitHub via API, start on target node. Usage: `make-runner.sh [login-node]` | +| `deploy-runners.sh` | Deploy multiple runners across login nodes in parallel. Usage: `deploy-runners.sh [node2 ...]` | +| `restart-offline-runners.sh` | Query GitHub for offline frontier runners, SSH to their recorded node, and restart in parallel. Prints final online/offline status. | +| `stop-runner.sh` | Kill the runner process and deregister it from GitHub. Usage: `stop-runner.sh ` | + +## Troubleshooting + +**Runner goes OFFLINE repeatedly on the same node** — That login node may have +process culling or high memory pressure. Move it: +```bash +# Kill on current node, start on a quieter one +ssh login10 "kill $(cat /lustre/orion/cfd154/proj-shared/runners/frontier-1/runner.pid)" +echo "login01" > /lustre/orion/cfd154/proj-shared/runners/frontier-1/runner.node +ssh login01 "nohup /lustre/orion/cfd154/proj-shared/runners/frontier-1/run.sh \ + >> /lustre/orion/cfd154/proj-shared/runners/frontier-1/runner.log 2>&1 < /dev/null &" +``` + +**Multiple runners OFFLINE at once** — Usually a transient OLCF network blip +to GitHub. Run `restart-offline-runners.sh` to recover all at once. + +**runner.pid is wrong after a restart** — This was a known bug (now fixed in +`restart-offline-runners.sh`) where the PID was written before the SSH +command completed. The fix wraps the SSH call in a subshell so the PID is +captured synchronously. 
diff --git a/misc/frontier/deploy-runners.sh b/misc/frontier/deploy-runners.sh new file mode 100755 index 0000000000..9d9daf9591 --- /dev/null +++ b/misc/frontier/deploy-runners.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Deploy one runner per login node. +# Usage: ./deploy-runners.sh [node2 ...] +# Example: ./deploy-runners.sh 17 login08 login09 login10 +# Deploys frontier-17 on login08, frontier-18 on login09, frontier-19 on login10. +set -euo pipefail + +SHARED_DIR="/lustre/orion/cfd154/proj-shared/runners" + +START_NUM="${1:?Usage: $0 [node2 ...]}" +shift +NODES=("$@") + +if [ ${#NODES[@]} -eq 0 ]; then + echo "Error: no login nodes specified." >&2 + echo "Usage: $0 [node2 ...]" >&2 + exit 1 +fi + +for i in "${!NODES[@]}"; do + NODE="${NODES[$i]}" + NUM=$((START_NUM + i)) + echo "==> Deploying frontier-${NUM} on ${NODE}..." + bash "${SHARED_DIR}/make-runner.sh" "${NUM}" "${NODE}" & +done + +wait +echo "==> All runners deployed." diff --git a/misc/frontier/make-runner.sh b/misc/frontier/make-runner.sh new file mode 100755 index 0000000000..1866475f3f --- /dev/null +++ b/misc/frontier/make-runner.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# Create, configure, and start a single GitHub Actions runner. +# Usage: ./make-runner.sh [login-node] +# runner-number Sequential number for this runner (e.g. 12) +# login-node Node to run on (default: current host). Runner will be +# started there via SSH (or locally if it matches current host). 
+# Example: ./make-runner.sh 12 login03 +set -euo pipefail + +SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=30 -o ServerAliveInterval=10 -o ServerAliveCountMax=3" + +RUNNER_NUM="${1:?Usage: $0 [login-node]}" +TARGET_NODE="${2:-$(hostname -s)}" +RUNNER_VERSION="2.332.0" +TARBALL="actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" +SHARED_DIR="/lustre/orion/cfd154/proj-shared/runners" +ORG="MFlowCode" +RUNNER_GROUP="phoenix" +EXTRA_LABELS="frontier" +RUNNER_NAME="frontier-${RUNNER_NUM}" +RUNNER_DIR="${SHARED_DIR}/${RUNNER_NAME}" + +echo "==> Setting up runner: ${RUNNER_NAME} on ${TARGET_NODE}" + +# --- Download tarball once to shared dir --- +if [ ! -f "${SHARED_DIR}/${TARBALL}" ]; then + echo "==> Downloading runner v${RUNNER_VERSION}..." + curl -fsSL \ + "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/${TARBALL}" \ + -o "${SHARED_DIR}/${TARBALL}" +fi + +# --- Extract (always run locally — filesystem is shared across all nodes) --- +mkdir -p "${RUNNER_DIR}" +echo "==> Extracting runner into ${RUNNER_DIR}..." +tar xzf "${SHARED_DIR}/${TARBALL}" -C "${RUNNER_DIR}" + +# Verify extraction succeeded +if [ ! -f "${RUNNER_DIR}/run.sh" ]; then + echo "ERROR: Extraction failed — run.sh not found in ${RUNNER_DIR}" >&2 + exit 1 +fi + +# --- Configure (always run locally — filesystem is shared across all nodes) --- +echo "==> Fetching registration token..." +REG_TOKEN=$(gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + "/orgs/${ORG}/actions/runners/registration-token" \ + --jq '.token') + +echo "==> Configuring runner..." 
+"${RUNNER_DIR}/config.sh" \ + --url "https://github.com/${ORG}" \ + --token "${REG_TOKEN}" \ + --name "${RUNNER_NAME}" \ + --labels "${EXTRA_LABELS}" \ + --runnergroup "${RUNNER_GROUP}" \ + --work "_work" \ + --unattended \ + --replace + +# --- Store which node this runner lives on --- +echo "${TARGET_NODE}" > "${RUNNER_DIR}/runner.node" + +# --- Start run.sh on the target node --- +echo "==> Starting runner on ${TARGET_NODE}..." +CURRENT_NODE=$(hostname -s) +if [ "${TARGET_NODE}" = "${CURRENT_NODE}" ]; then + nohup "${RUNNER_DIR}/run.sh" >> "${RUNNER_DIR}/runner.log" 2>&1 < /dev/null & + echo $! > "${RUNNER_DIR}/runner.pid" +else + PID=$(ssh ${SSH_OPTS} "${TARGET_NODE}" \ + "nohup ${RUNNER_DIR}/run.sh >> ${RUNNER_DIR}/runner.log 2>&1 < /dev/null & echo \$!") + echo "${PID}" > "${RUNNER_DIR}/runner.pid" +fi + +echo "==> Runner PID: $(cat ${RUNNER_DIR}/runner.pid)" +echo "==> Log: ${RUNNER_DIR}/runner.log" +echo "==> Done. Runner '${RUNNER_NAME}' is starting on ${TARGET_NODE}." diff --git a/misc/frontier/restart-offline-runners.sh b/misc/frontier/restart-offline-runners.sh new file mode 100755 index 0000000000..074a39f44a --- /dev/null +++ b/misc/frontier/restart-offline-runners.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# Restart all offline frontier runners. +# Queries GitHub for offline runners, then SSHes to each runner's node +# and restarts run.sh in the background. All restarts happen in parallel. +set -euo pipefail + +SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=30 -o ServerAliveInterval=10 -o ServerAliveCountMax=3" +SHARED_DIR="/lustre/orion/cfd154/proj-shared/runners" +ORG="MFlowCode" + +echo "==> Checking for offline frontier runners..." +OFFLINE=$(gh api orgs/${ORG}/actions/runners \ + --jq '.runners[] | select(.name | startswith("frontier")) | select(.status == "offline") | .name') + +if [ -z "${OFFLINE}" ]; then + echo "==> All frontier runners are online. Nothing to do." 
+ exit 0 +fi + +echo "==> Offline runners: $(echo ${OFFLINE} | tr '\n' ' ')" + +for RUNNER_NAME in ${OFFLINE}; do + RUNNER_DIR="${SHARED_DIR}/${RUNNER_NAME}" + NODE_FILE="${RUNNER_DIR}/runner.node" + + if [ ! -d "${RUNNER_DIR}" ]; then + echo "WARN: No directory for ${RUNNER_NAME}, skipping." + continue + fi + + if [ ! -f "${NODE_FILE}" ]; then + echo "WARN: No runner.node file for ${RUNNER_NAME}, skipping." + continue + fi + + TARGET_NODE=$(cat "${NODE_FILE}") + CURRENT_NODE=$(hostname -s) + + echo "==> Restarting ${RUNNER_NAME} on ${TARGET_NODE}..." + if [ "${TARGET_NODE}" = "${CURRENT_NODE}" ]; then + nohup "${RUNNER_DIR}/run.sh" >> "${RUNNER_DIR}/runner.log" 2>&1 < /dev/null & + echo $! > "${RUNNER_DIR}/runner.pid" + echo " PID: $(cat ${RUNNER_DIR}/runner.pid)" + else + ( + PID=$(ssh ${SSH_OPTS} "${TARGET_NODE}" \ + "nohup ${RUNNER_DIR}/run.sh >> ${RUNNER_DIR}/runner.log 2>&1 < /dev/null & echo \$!") + echo "${PID}" > "${RUNNER_DIR}/runner.pid" + echo " ${RUNNER_NAME} PID: ${PID}" + ) & + fi +done + +wait +echo "==> Done. Waiting 5s for runners to register..." +sleep 5 +gh api orgs/${ORG}/actions/runners \ + --jq '.runners[] | select(.name | startswith("frontier")) | {name, status}' diff --git a/misc/frontier/stop-runner.sh b/misc/frontier/stop-runner.sh new file mode 100755 index 0000000000..b8941bec60 --- /dev/null +++ b/misc/frontier/stop-runner.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# Stop and deregister a GitHub Actions runner. +# Usage: ./stop-runner.sh (e.g. frontier-12) +set -euo pipefail + +SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=30 -o ServerAliveInterval=10 -o ServerAliveCountMax=3" + +RUNNER_NAME="${1:?Usage: $0 }" +SHARED_DIR="/lustre/orion/cfd154/proj-shared/runners" +RUNNER_DIR="${SHARED_DIR}/${RUNNER_NAME}" +ORG="MFlowCode" + +if [ ! 
-d "${RUNNER_DIR}" ]; then + echo "Runner dir not found: ${RUNNER_DIR}" + exit 1 +fi + +# --- Kill the process (SSH to correct node if needed) --- +PID_FILE="${RUNNER_DIR}/runner.pid" +NODE_FILE="${RUNNER_DIR}/runner.node" +CURRENT_NODE=$(hostname -s) + +if [ -f "${PID_FILE}" ]; then + PID=$(cat "${PID_FILE}") + TARGET_NODE="${CURRENT_NODE}" + [ -f "${NODE_FILE}" ] && TARGET_NODE=$(cat "${NODE_FILE}") + + echo "==> Killing PID ${PID} on ${TARGET_NODE}..." + if [ "${TARGET_NODE}" = "${CURRENT_NODE}" ]; then + kill "${PID}" 2>/dev/null || true + else + ssh ${SSH_OPTS} "${TARGET_NODE}" "kill ${PID} 2>/dev/null || true" + fi + rm -f "${PID_FILE}" +fi + +# --- Deregister from GitHub --- +echo "==> Fetching removal token..." +REMOVE_TOKEN=$(gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + "/orgs/${ORG}/actions/runners/remove-token" \ + --jq '.token') + +echo "==> Deregistering runner..." +"${RUNNER_DIR}/config.sh" remove --token "${REMOVE_TOKEN}" || true +echo "==> Done." 
From e704284163e36e2b9f393a31958928170cedf3c8 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 18:02:56 -0400 Subject: [PATCH 05/20] misc: refactor Frontier runner scripts with shared config and CWD-based discovery - Add misc/common/rerun-failed.sh (site-agnostic, moved from phoenix/) - Replace misc/phoenix/rerun-failed.sh with a thin wrapper to common/ - Add misc/frontier/config.sh modeled after phoenix/config.sh: - Frontier constants, SSH_OPTS with BatchMode for MOTD suppression - find_pids() filters stdout through grep -E '^[0-9]+$' to strip MOTD noise - start_runner() uses timeout+setsid+nohup (no login shell needed) - stop_runner(), find_node(), gh_list_runners(), gh_registration_token() - Rewrite make-runner.sh: sources config.sh, no runner.pid, uses start_runner() - Rewrite deploy-runners.sh: sources config.sh, calls $SCRIPT_DIR/make-runner.sh - Rewrite restart-offline-runners.sh: GitHub API query, CWD-based discovery, stop stale processes before restart, parallel restarts, final status report - Rewrite stop-runner.sh: find_node() + stop_runner() + gh_remove_runner() - Add check-runners.sh: SSH per-node health check (name/status/RSS) - Update README.md: new scripts table, no runner.pid references, updated troubleshooting guide Co-Authored-By: Claude Sonnet 4.6 --- misc/common/rerun-failed.sh | 77 +++++++++++++++ misc/frontier/README.md | 35 ++++--- misc/frontier/check-runners.sh | 29 ++++++ misc/frontier/config.sh | 116 +++++++++++++++++++++++ misc/frontier/deploy-runners.sh | 19 ++-- misc/frontier/make-runner.sh | 48 ++++------ misc/frontier/restart-offline-runners.sh | 88 ++++++++++------- misc/frontier/stop-runner.sh | 58 +++++------- misc/phoenix/rerun-failed.sh | 77 +-------------- 9 files changed, 353 insertions(+), 194 deletions(-) create mode 100755 misc/common/rerun-failed.sh create mode 100755 misc/frontier/check-runners.sh create mode 100755 misc/frontier/config.sh diff --git a/misc/common/rerun-failed.sh 
b/misc/common/rerun-failed.sh new file mode 100755 index 0000000000..ed29924451 --- /dev/null +++ b/misc/common/rerun-failed.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# Rerun failed GitHub Actions workflows on open non-draft MFC PRs and master. +# +# Checks the 5 most recent workflow runs per branch. Only the failed jobs +# within each run are rerun (via `gh run rerun --failed`), not the entire +# workflow. Runs that are already in progress or queued are skipped by `gh`. +# +# Requires: gh CLI authenticated with access to MFlowCode/MFC +# +# Usage: bash rerun-failed.sh # dry run (show what would be rerun) +# APPLY=1 bash rerun-failed.sh # actually rerun failed workflows + +set -euo pipefail + +REPO="MFlowCode/MFC" + +echo "Checking open non-draft PRs on $REPO..." +prs=$(gh pr list --repo "$REPO" --state open --json number,title,isDraft --jq '.[] | select(.isDraft == false) | .number') + +if [ -z "$prs" ]; then + echo "No open non-draft PRs found." + exit 0 +fi + +rerun_count=0 +for pr in $prs; do + title=$(gh pr view --repo "$REPO" "$pr" --json title --jq .title) + branch=$(gh pr view --repo "$REPO" "$pr" --json headRefName --jq .headRefName) + + # Find failed workflow runs on this PR's branch + failed_runs=$(gh run list --repo "$REPO" --branch "$branch" --limit 5 \ + --json databaseId,status,conclusion,name \ + --jq '.[] | select(.conclusion == "failure") | "\(.databaseId) \(.name)"') + + if [ -n "$failed_runs" ]; then + echo "" + echo "=== PR #$pr: $title ===" + echo "$failed_runs" | while read -r run_id run_name; do + # Check which jobs failed + failed_jobs=$(gh run view --repo "$REPO" "$run_id" \ + --json jobs --jq '.jobs[] | select(.conclusion == "failure" or .conclusion == "cancelled") | .name') + echo " Run $run_id ($run_name):" + echo "$failed_jobs" | while read -r job; do + echo " - $job" + done + + if [ "${APPLY:-0}" = "1" ]; then + echo " Rerunning failed jobs..." 
+ gh run rerun --repo "$REPO" "$run_id" --failed || echo " WARNING: rerun failed (may already be rerunning)" + rerun_count=$((rerun_count + 1)) + fi + done + fi +done + +# Also check master branch +echo "" +echo "=== master branch ===" +master_failed=$(gh run list --repo "$REPO" --branch master --limit 5 \ + --json databaseId,status,conclusion,name \ + --jq '.[] | select(.conclusion == "failure") | "\(.databaseId) \(.name)"') +if [ -n "$master_failed" ]; then + echo "$master_failed" | while read -r run_id run_name; do + echo " Run $run_id ($run_name)" + if [ "${APPLY:-0}" = "1" ]; then + echo " Rerunning failed jobs..." + gh run rerun --repo "$REPO" "$run_id" --failed || echo " WARNING: rerun failed" + fi + done +else + echo " All passing" +fi + +if [ "${APPLY:-0}" != "1" ]; then + echo "" + echo "Dry run — set APPLY=1 to actually rerun failed workflows." +fi diff --git a/misc/frontier/README.md b/misc/frontier/README.md index d2c30f95a3..bce2188f4e 100644 --- a/misc/frontier/README.md +++ b/misc/frontier/README.md @@ -12,7 +12,9 @@ a runner between nodes only requires stopping the process on one node and starting it on another — no binary copying needed. Each runner directory contains a `runner.node` file recording which login node -it runs on, and a `runner.pid` file with its process ID. +it was last started on. This is used as a fallback hint when restarting offline +runners. The authoritative source of truth for whether a runner is running (and +on which node) is CWD-based process discovery — not any PID file. Runners occasionally die due to OLCF's firewall/proxy dropping long-lived TCP connections to GitHub's broker. The `restart-offline-runners.sh` script handles @@ -22,6 +24,9 @@ particular node, move it to a quieter one (login01 tends to have low load). 
## Quick Reference ```bash +# Check runner health across all login nodes +bash check-runners.sh + # Deploy a new runner on a specific node bash make-runner.sh 23 login01 @@ -33,33 +38,39 @@ bash restart-offline-runners.sh # Stop and deregister a runner bash stop-runner.sh frontier-12 + +# Rerun failed CI workflows (site-agnostic, also available at misc/common/) +bash ../common/rerun-failed.sh +APPLY=1 bash ../common/rerun-failed.sh ``` ## Scripts | Script | Purpose | |---|---| +| `config.sh` | Shared configuration, constants, GitHub API helpers, and CWD-based process management functions. Sourced by all other scripts. | +| `check-runners.sh` | SSH to each login node, show Runner.Listener processes with name, status (idle/BUSY), and RSS memory. | | `make-runner.sh` | Download runner binary, register with GitHub via API, start on target node. Usage: `make-runner.sh [login-node]` | | `deploy-runners.sh` | Deploy multiple runners across login nodes in parallel. Usage: `deploy-runners.sh [node2 ...]` | -| `restart-offline-runners.sh` | Query GitHub for offline frontier runners, SSH to their recorded node, and restart in parallel. Prints final online/offline status. | -| `stop-runner.sh` | Kill the runner process and deregister it from GitHub. Usage: `stop-runner.sh ` | +| `restart-offline-runners.sh` | Query GitHub for offline frontier runners, locate via CWD-based discovery, stop stale processes, then restart in parallel. Prints final status. | +| `stop-runner.sh` | Locate runner via CWD-based discovery, stop the process, and deregister from GitHub. Usage: `stop-runner.sh ` | +| `../common/rerun-failed.sh` | Rerun failed GitHub Actions workflows on open PRs and master. No site-specific code. | ## Troubleshooting **Runner goes OFFLINE repeatedly on the same node** — That login node may have -process culling or high memory pressure. Move it: +process culling or high memory pressure. 
Move it by stopping the runner and +restarting it on a different node: ```bash -# Kill on current node, start on a quieter one -ssh login10 "kill $(cat /lustre/orion/cfd154/proj-shared/runners/frontier-1/runner.pid)" +bash stop-runner.sh frontier-1 echo "login01" > /lustre/orion/cfd154/proj-shared/runners/frontier-1/runner.node -ssh login01 "nohup /lustre/orion/cfd154/proj-shared/runners/frontier-1/run.sh \ - >> /lustre/orion/cfd154/proj-shared/runners/frontier-1/runner.log 2>&1 < /dev/null &" +bash restart-offline-runners.sh ``` **Multiple runners OFFLINE at once** — Usually a transient OLCF network blip to GitHub. Run `restart-offline-runners.sh` to recover all at once. -**runner.pid is wrong after a restart** — This was a known bug (now fixed in -`restart-offline-runners.sh`) where the PID was written before the SSH -command completed. The fix wraps the SSH call in a subshell so the PID is -captured synchronously. +**Runner appears offline on GitHub but process is running** — GitHub status can +lag. `restart-offline-runners.sh` uses CWD-based process discovery first: if a +process is found running, it will stop it before restarting, preventing +duplicate runner processes. diff --git a/misc/frontier/check-runners.sh b/misc/frontier/check-runners.sh new file mode 100755 index 0000000000..0973d692e0 --- /dev/null +++ b/misc/frontier/check-runners.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Quick health check for GitHub Actions runners across Frontier login nodes. +# +# SSHes to each login node, finds Runner.Listener processes, and shows +# runner name, status (idle/BUSY), and memory usage. 
+# +# Usage: bash check-runners.sh +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" + +for node in "${NODES[@]}"; do + echo "=== $node ===" + ssh $SSH_OPTS "$node" ' + found=0 + for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do + found=1 + cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || echo "???") + worker=$(ps aux | grep "Runner.Worker" | grep "$cwd" | grep -v grep | awk "{print \$2}" | head -1) + [ -n "$worker" ] && status="BUSY" || status="idle" + rss=$(ps -p $p -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" || echo "?") + name=$(basename "$cwd") + printf " %-30s %5s %s MB\n" "$name" "$status" "$rss" + done + [ "$found" -eq 0 ] && echo " (no runners)" + ' 2>/dev/null || echo " (unreachable)" + echo "" +done diff --git a/misc/frontier/config.sh b/misc/frontier/config.sh new file mode 100755 index 0000000000..0cd54423c0 --- /dev/null +++ b/misc/frontier/config.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +# Shared configuration for Frontier GitHub Actions runner management. +# +# Sourced by all other scripts. Provides constants, GitHub API helpers, +# and login-node process management functions. + +# --- Frontier constants --- +ORG="MFlowCode" +RUNNER_GROUP="phoenix" +RUNNER_LABEL="frontier" +NODES=(login01 login02 login03 login04 login05 login06 login07 login08 login09 login10 login11) +SHARED_DIR="/lustre/orion/cfd154/proj-shared/runners" + +SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=10 -o BatchMode=yes -o ServerAliveInterval=10 -o ServerAliveCountMax=3" + +# --- GitHub API --- + +# List Frontier runners from the GitHub API. +# Prints: id name status busy (one runner per line) +gh_list_runners() { + gh api "orgs/$ORG/actions/runners" --paginate \ + --jq ".runners[] + | select(.labels | map(.name) | index(\"$RUNNER_LABEL\")) + | \"\(.id) \(.name) \(.status) \(.busy)\"" +} + +# Get a registration token for new runners. 
+gh_registration_token() { + gh api "orgs/$ORG/actions/runners/registration-token" --jq .token +} + +# Get the latest runner binary version. +gh_latest_runner_version() { + gh api repos/actions/runner/releases/latest --jq '.tag_name | ltrimstr("v")' +} + +# Remove a runner registration from GitHub. +# Args: $1 = runner ID (numeric, from API) +gh_remove_runner() { + gh api "orgs/$ORG/actions/runners/$1" -X DELETE +} + +# --- Local filesystem --- + +# Find all runner directories on shared storage. +# Prints: one directory path per line. +find_runner_dirs() { + for conf in "$SHARED_DIR"/frontier-*/.runner; do + [ -f "$conf" ] && dirname "$conf" + done +} + +# Get the GitHub runner name from a .runner config file. +# Args: $1 = runner directory +get_runner_name() { + python3 -c " +import json +d = json.loads(open('$1/.runner').read().lstrip('\ufeff')) +print(d.get('agentName', '')) +" 2>/dev/null +} + +# --- Login-node process management --- + +# Find PIDs of a runner on a node by matching its CWD. +# (Runner.Listener's command line is just "Runner.Listener run" — no path.) +# Frontier SSH prints MOTD to stdout, so output is filtered to numeric lines only. +# Args: $1 = node, $2 = runner directory +# Prints: space-separated PIDs, or empty. +find_pids() { + ssh $SSH_OPTS "$1" ' + for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do + cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true) + [ "$cwd" = "'"$2"'" ] && echo "$p" + done + ' 2>/dev/null | grep -E '^[0-9]+$' | tr '\n' ' ' || true +} + +# Find which login node a runner is on. +# Args: $1 = runner directory +# Prints: node hostname, or "offline". +find_node() { + for node in "${NODES[@]}"; do + [ -n "$(find_pids "$node" "$1")" ] && echo "$node" && return + done + echo "offline" +} + +# Start a runner on a node. +# Args: $1 = node, $2 = runner directory +# Returns: 0 if running after start, 1 otherwise. 
+start_runner() { + local node="$1" dir="$2" + timeout 15 ssh $SSH_OPTS "$node" \ + "setsid nohup $dir/run.sh >> $dir/runner.log 2>&1 < /dev/null &" \ + /dev/null || true + sleep 3 + [ -n "$(find_pids "$node" "$dir")" ] +} + +# Stop a runner on a node (SIGTERM then SIGKILL). +# Args: $1 = node, $2 = runner directory +stop_runner() { + local node="$1" dir="$2" pids + pids=$(find_pids "$node" "$dir") + [ -z "$pids" ] && return 0 + for pid in $pids; do + ssh $SSH_OPTS "$node" "kill $pid" 2>/dev/null || true + done + sleep 3 + pids=$(find_pids "$node" "$dir") + for pid in $pids; do + ssh $SSH_OPTS "$node" "kill -9 $pid" 2>/dev/null || true + done + sleep 1 +} diff --git a/misc/frontier/deploy-runners.sh b/misc/frontier/deploy-runners.sh index 9d9daf9591..99efa2f2c6 100755 --- a/misc/frontier/deploy-runners.sh +++ b/misc/frontier/deploy-runners.sh @@ -1,27 +1,28 @@ #!/usr/bin/env bash -# Deploy one runner per login node. -# Usage: ./deploy-runners.sh [node2 ...] -# Example: ./deploy-runners.sh 17 login08 login09 login10 +# Deploy one runner per login node in parallel. +# Usage: deploy-runners.sh [node2 ...] +# Example: deploy-runners.sh 17 login08 login09 login10 # Deploys frontier-17 on login08, frontier-18 on login09, frontier-19 on login10. set -euo pipefail -SHARED_DIR="/lustre/orion/cfd154/proj-shared/runners" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" START_NUM="${1:?Usage: $0 [node2 ...]}" shift -NODES=("$@") +TARGET_NODES=("$@") -if [ ${#NODES[@]} -eq 0 ]; then +if [ ${#TARGET_NODES[@]} -eq 0 ]; then echo "Error: no login nodes specified." >&2 echo "Usage: $0 [node2 ...]" >&2 exit 1 fi -for i in "${!NODES[@]}"; do - NODE="${NODES[$i]}" +for i in "${!TARGET_NODES[@]}"; do + NODE="${TARGET_NODES[$i]}" NUM=$((START_NUM + i)) echo "==> Deploying frontier-${NUM} on ${NODE}..." 
- bash "${SHARED_DIR}/make-runner.sh" "${NUM}" "${NODE}" & + "$SCRIPT_DIR/make-runner.sh" "${NUM}" "${NODE}" & done wait diff --git a/misc/frontier/make-runner.sh b/misc/frontier/make-runner.sh index 1866475f3f..05b63d23f3 100755 --- a/misc/frontier/make-runner.sh +++ b/misc/frontier/make-runner.sh @@ -1,22 +1,20 @@ #!/usr/bin/env bash -# Create, configure, and start a single GitHub Actions runner. -# Usage: ./make-runner.sh [login-node] +# Create, configure, and start a single GitHub Actions runner on Frontier. +# Usage: make-runner.sh [login-node] # runner-number Sequential number for this runner (e.g. 12) -# login-node Node to run on (default: current host). Runner will be -# started there via SSH (or locally if it matches current host). -# Example: ./make-runner.sh 12 login03 +# login-node Node to run on (default: current host) +# Example: make-runner.sh 12 login03 set -euo pipefail -SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=30 -o ServerAliveInterval=10 -o ServerAliveCountMax=3" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" + +RUNNER_VERSION="2.332.0" RUNNER_NUM="${1:?Usage: $0 [login-node]}" TARGET_NODE="${2:-$(hostname -s)}" -RUNNER_VERSION="2.332.0" + TARBALL="actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" -SHARED_DIR="/lustre/orion/cfd154/proj-shared/runners" -ORG="MFlowCode" -RUNNER_GROUP="phoenix" -EXTRA_LABELS="frontier" RUNNER_NAME="frontier-${RUNNER_NUM}" RUNNER_DIR="${SHARED_DIR}/${RUNNER_NAME}" @@ -30,31 +28,26 @@ if [ ! -f "${SHARED_DIR}/${TARBALL}" ]; then -o "${SHARED_DIR}/${TARBALL}" fi -# --- Extract (always run locally — filesystem is shared across all nodes) --- +# --- Extract (filesystem is shared across all nodes) --- mkdir -p "${RUNNER_DIR}" echo "==> Extracting runner into ${RUNNER_DIR}..." tar xzf "${SHARED_DIR}/${TARBALL}" -C "${RUNNER_DIR}" -# Verify extraction succeeded if [ ! 
-f "${RUNNER_DIR}/run.sh" ]; then echo "ERROR: Extraction failed — run.sh not found in ${RUNNER_DIR}" >&2 exit 1 fi -# --- Configure (always run locally — filesystem is shared across all nodes) --- +# --- Configure --- echo "==> Fetching registration token..." -REG_TOKEN=$(gh api \ - --method POST \ - -H "Accept: application/vnd.github+json" \ - "/orgs/${ORG}/actions/runners/registration-token" \ - --jq '.token') +REG_TOKEN=$(gh_registration_token) echo "==> Configuring runner..." "${RUNNER_DIR}/config.sh" \ --url "https://github.com/${ORG}" \ --token "${REG_TOKEN}" \ --name "${RUNNER_NAME}" \ - --labels "${EXTRA_LABELS}" \ + --labels "${RUNNER_LABEL}" \ --runnergroup "${RUNNER_GROUP}" \ --work "_work" \ --unattended \ @@ -63,18 +56,13 @@ echo "==> Configuring runner..." # --- Store which node this runner lives on --- echo "${TARGET_NODE}" > "${RUNNER_DIR}/runner.node" -# --- Start run.sh on the target node --- +# --- Start runner on target node --- echo "==> Starting runner on ${TARGET_NODE}..." -CURRENT_NODE=$(hostname -s) -if [ "${TARGET_NODE}" = "${CURRENT_NODE}" ]; then - nohup "${RUNNER_DIR}/run.sh" >> "${RUNNER_DIR}/runner.log" 2>&1 < /dev/null & - echo $! > "${RUNNER_DIR}/runner.pid" +if start_runner "${TARGET_NODE}" "${RUNNER_DIR}"; then + echo "==> Runner '${RUNNER_NAME}' is running on ${TARGET_NODE}." else - PID=$(ssh ${SSH_OPTS} "${TARGET_NODE}" \ - "nohup ${RUNNER_DIR}/run.sh >> ${RUNNER_DIR}/runner.log 2>&1 < /dev/null & echo \$!") - echo "${PID}" > "${RUNNER_DIR}/runner.pid" + echo "ERROR: Runner '${RUNNER_NAME}' did not start on ${TARGET_NODE}." >&2 + exit 1 fi -echo "==> Runner PID: $(cat ${RUNNER_DIR}/runner.pid)" echo "==> Log: ${RUNNER_DIR}/runner.log" -echo "==> Done. Runner '${RUNNER_NAME}' is starting on ${TARGET_NODE}." 
diff --git a/misc/frontier/restart-offline-runners.sh b/misc/frontier/restart-offline-runners.sh index 074a39f44a..7384961fb4 100755 --- a/misc/frontier/restart-offline-runners.sh +++ b/misc/frontier/restart-offline-runners.sh @@ -1,58 +1,76 @@ #!/usr/bin/env bash # Restart all offline frontier runners. -# Queries GitHub for offline runners, then SSHes to each runner's node -# and restarts run.sh in the background. All restarts happen in parallel. +# +# Queries GitHub for offline frontier runners, locates each via CWD-based +# process discovery, stops any stale processes, then restarts in parallel. +# Falls back to runner.node for the target node if the runner is truly offline. +# +# Usage: bash restart-offline-runners.sh set -euo pipefail -SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=30 -o ServerAliveInterval=10 -o ServerAliveCountMax=3" -SHARED_DIR="/lustre/orion/cfd154/proj-shared/runners" -ORG="MFlowCode" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" echo "==> Checking for offline frontier runners..." -OFFLINE=$(gh api orgs/${ORG}/actions/runners \ - --jq '.runners[] | select(.name | startswith("frontier")) | select(.status == "offline") | .name') -if [ -z "${OFFLINE}" ]; then +# Collect offline runner names from GitHub API +mapfile -t OFFLINE_NAMES < <( + gh_list_runners | while read -r id name status busy; do + [ "$status" = "offline" ] && echo "$name" + done +) + +if [ ${#OFFLINE_NAMES[@]} -eq 0 ]; then echo "==> All frontier runners are online. Nothing to do." exit 0 fi -echo "==> Offline runners: $(echo ${OFFLINE} | tr '\n' ' ')" +echo "==> Offline runners: ${OFFLINE_NAMES[*]}" -for RUNNER_NAME in ${OFFLINE}; do - RUNNER_DIR="${SHARED_DIR}/${RUNNER_NAME}" - NODE_FILE="${RUNNER_DIR}/runner.node" +restart_one() { + local runner_name="$1" + local dir="${SHARED_DIR}/${runner_name}" - if [ ! -d "${RUNNER_DIR}" ]; then - echo "WARN: No directory for ${RUNNER_NAME}, skipping." - continue + if [ ! 
-d "$dir" ]; then + echo "WARN: No directory for ${runner_name}, skipping." + return fi - if [ ! -f "${NODE_FILE}" ]; then - echo "WARN: No runner.node file for ${RUNNER_NAME}, skipping." - continue + # Check if it's actually already running somewhere (GitHub may lag) + local actual_node + actual_node=$(find_node "$dir") + + if [ "$actual_node" != "offline" ]; then + echo "==> ${runner_name} appears running on ${actual_node} (GitHub may lag) — stopping first..." + stop_runner "$actual_node" "$dir" fi - TARGET_NODE=$(cat "${NODE_FILE}") - CURRENT_NODE=$(hostname -s) + # Determine target node from runner.node fallback + local target_node + if [ -f "${dir}/runner.node" ]; then + target_node=$(cat "${dir}/runner.node") + else + echo "WARN: No runner.node for ${runner_name}, skipping." + return + fi - echo "==> Restarting ${RUNNER_NAME} on ${TARGET_NODE}..." - if [ "${TARGET_NODE}" = "${CURRENT_NODE}" ]; then - nohup "${RUNNER_DIR}/run.sh" >> "${RUNNER_DIR}/runner.log" 2>&1 < /dev/null & - echo $! > "${RUNNER_DIR}/runner.pid" - echo " PID: $(cat ${RUNNER_DIR}/runner.pid)" + echo "==> Starting ${runner_name} on ${target_node}..." + if start_runner "$target_node" "$dir"; then + echo " ${runner_name}: started on ${target_node}." else - ( - PID=$(ssh ${SSH_OPTS} "${TARGET_NODE}" \ - "nohup ${RUNNER_DIR}/run.sh >> ${RUNNER_DIR}/runner.log 2>&1 < /dev/null & echo \$!") - echo "${PID}" > "${RUNNER_DIR}/runner.pid" - echo " ${RUNNER_NAME} PID: ${PID}" - ) & + echo " ${runner_name}: ERROR — failed to start on ${target_node}." >&2 fi +} + +# Restart all offline runners in parallel +for name in "${OFFLINE_NAMES[@]}"; do + restart_one "$name" & done wait -echo "==> Done. Waiting 5s for runners to register..." 
-sleep 5 -gh api orgs/${ORG}/actions/runners \ - --jq '.runners[] | select(.name | startswith("frontier")) | {name, status}' + +echo "" +echo "==> Final status:" +gh_list_runners | while read -r id name status busy; do + printf " %-30s %s\n" "$name" "$status" +done diff --git a/misc/frontier/stop-runner.sh b/misc/frontier/stop-runner.sh index b8941bec60..e7c1185fef 100755 --- a/misc/frontier/stop-runner.sh +++ b/misc/frontier/stop-runner.sh @@ -1,47 +1,41 @@ #!/usr/bin/env bash -# Stop and deregister a GitHub Actions runner. -# Usage: ./stop-runner.sh (e.g. frontier-12) +# Stop and deregister a GitHub Actions runner on Frontier. +# Usage: stop-runner.sh (e.g. frontier-12) set -euo pipefail -SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=30 -o ServerAliveInterval=10 -o ServerAliveCountMax=3" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" RUNNER_NAME="${1:?Usage: $0 }" -SHARED_DIR="/lustre/orion/cfd154/proj-shared/runners" RUNNER_DIR="${SHARED_DIR}/${RUNNER_NAME}" -ORG="MFlowCode" if [ ! -d "${RUNNER_DIR}" ]; then - echo "Runner dir not found: ${RUNNER_DIR}" + echo "Runner dir not found: ${RUNNER_DIR}" >&2 exit 1 fi -# --- Kill the process (SSH to correct node if needed) --- -PID_FILE="${RUNNER_DIR}/runner.pid" -NODE_FILE="${RUNNER_DIR}/runner.node" -CURRENT_NODE=$(hostname -s) +# --- Locate and kill the runner process --- +echo "==> Locating ${RUNNER_NAME}..." +node=$(find_node "$RUNNER_DIR") -if [ -f "${PID_FILE}" ]; then - PID=$(cat "${PID_FILE}") - TARGET_NODE="${CURRENT_NODE}" - [ -f "${NODE_FILE}" ] && TARGET_NODE=$(cat "${NODE_FILE}") - - echo "==> Killing PID ${PID} on ${TARGET_NODE}..." - if [ "${TARGET_NODE}" = "${CURRENT_NODE}" ]; then - kill "${PID}" 2>/dev/null || true - else - ssh ${SSH_OPTS} "${TARGET_NODE}" "kill ${PID} 2>/dev/null || true" - fi - rm -f "${PID_FILE}" +if [ "$node" != "offline" ]; then + echo "==> Stopping ${RUNNER_NAME} on ${node}..." 
+ stop_runner "$node" "$RUNNER_DIR" + echo "==> Process stopped." +else + echo "==> ${RUNNER_NAME} is not running (already offline)." fi # --- Deregister from GitHub --- -echo "==> Fetching removal token..." -REMOVE_TOKEN=$(gh api \ - --method POST \ - -H "Accept: application/vnd.github+json" \ - "/orgs/${ORG}/actions/runners/remove-token" \ - --jq '.token') - -echo "==> Deregistering runner..." -"${RUNNER_DIR}/config.sh" remove --token "${REMOVE_TOKEN}" || true -echo "==> Done." +echo "==> Fetching runner ID from GitHub..." +runner_id=$(gh_list_runners | while read -r id name status busy; do + [ "$name" = "$RUNNER_NAME" ] && echo "$id" && break +done) + +if [ -n "$runner_id" ]; then + echo "==> Deregistering runner (ID ${runner_id})..." + gh_remove_runner "$runner_id" + echo "==> Done." +else + echo "==> Runner not found in GitHub API (may already be deregistered)." +fi diff --git a/misc/phoenix/rerun-failed.sh b/misc/phoenix/rerun-failed.sh index 1744e480cb..15ccc7fc46 100755 --- a/misc/phoenix/rerun-failed.sh +++ b/misc/phoenix/rerun-failed.sh @@ -1,77 +1,2 @@ #!/bin/bash -# Rerun failed GitHub Actions workflows on open non-draft MFC PRs and master. -# -# Checks the 5 most recent workflow runs per branch. Only the failed jobs -# within each run are rerun (via `gh run rerun --failed`), not the entire -# workflow. Runs that are already in progress or queued are skipped by `gh`. -# -# Requires: gh CLI authenticated with access to MFlowCode/MFC -# -# Usage: bash rerun-failed.sh # dry run (show what would be rerun) -# APPLY=1 bash rerun-failed.sh # actually rerun failed workflows - -set -euo pipefail - -REPO="MFlowCode/MFC" - -echo "Checking open non-draft PRs on $REPO..." -prs=$(gh pr list --repo "$REPO" --state open --json number,title,isDraft --jq '.[] | select(.isDraft == false) | .number') - -if [ -z "$prs" ]; then - echo "No open non-draft PRs found." 
- exit 0 -fi - -rerun_count=0 -for pr in $prs; do - title=$(gh pr view --repo "$REPO" "$pr" --json title --jq .title) - branch=$(gh pr view --repo "$REPO" "$pr" --json headRefName --jq .headRefName) - - # Find failed workflow runs on this PR's branch - failed_runs=$(gh run list --repo "$REPO" --branch "$branch" --limit 5 \ - --json databaseId,status,conclusion,name \ - --jq '.[] | select(.conclusion == "failure") | "\(.databaseId) \(.name)"') - - if [ -n "$failed_runs" ]; then - echo "" - echo "=== PR #$pr: $title ===" - echo "$failed_runs" | while read -r run_id run_name; do - # Check which jobs failed - failed_jobs=$(gh run view --repo "$REPO" "$run_id" \ - --json jobs --jq '.jobs[] | select(.conclusion == "failure" or .conclusion == "cancelled") | .name') - echo " Run $run_id ($run_name):" - echo "$failed_jobs" | while read -r job; do - echo " - $job" - done - - if [ "${APPLY:-0}" = "1" ]; then - echo " Rerunning failed jobs..." - gh run rerun --repo "$REPO" "$run_id" --failed || echo " WARNING: rerun failed (may already be rerunning)" - rerun_count=$((rerun_count + 1)) - fi - done - fi -done - -# Also check master branch -echo "" -echo "=== master branch ===" -master_failed=$(gh run list --repo "$REPO" --branch master --limit 5 \ - --json databaseId,status,conclusion,name \ - --jq '.[] | select(.conclusion == "failure") | "\(.databaseId) \(.name)"') -if [ -n "$master_failed" ]; then - echo "$master_failed" | while read -r run_id run_name; do - echo " Run $run_id ($run_name)" - if [ "${APPLY:-0}" = "1" ]; then - echo " Rerunning failed jobs..." - gh run rerun --repo "$REPO" "$run_id" --failed || echo " WARNING: rerun failed" - fi - done -else - echo " All passing" -fi - -if [ "${APPLY:-0}" != "1" ]; then - echo "" - echo "Dry run — set APPLY=1 to actually rerun failed workflows." 
-fi +exec "$(dirname "${BASH_SOURCE[0]}")/../common/rerun-failed.sh" "$@" From c9569612653dc1feb7b1e8a67591b14be12c8da5 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 18:20:25 -0400 Subject: [PATCH 06/20] misc/frontier: add list-runners, move-runner; improve restart and make-runner - list-runners.sh: new script using parallel SSH sweep across all 11 nodes simultaneously instead of serial per-runner discovery; flags stale runner.node entries with a warning in the node column - move-runner.sh: new script to relocate a runner between login nodes with one retry on start failure - restart-offline-runners.sh: add retry logic (sleep 5 + second attempt) and runner.node self-healing (detects and corrects stale node entries when a runner is found on a different node than recorded) - make-runner.sh: replace hardcoded RUNNER_VERSION with dynamic GitHub API lookup falling back to pinned version; print selected version at startup - misc/frontier/README.md: document new scripts, update quick reference and troubleshooting sections, note runner.node self-healing behavior - misc/common/README.md: new file documenting site-agnostic shared scripts Co-Authored-By: Claude Sonnet 4.6 --- misc/common/README.md | 27 ++++++++ misc/frontier/README.md | 21 ++++-- misc/frontier/list-runners.sh | 86 ++++++++++++++++++++++++ misc/frontier/make-runner.sh | 3 +- misc/frontier/move-runner.sh | 68 +++++++++++++++++++ misc/frontier/restart-offline-runners.sh | 40 ++++++++--- 6 files changed, 228 insertions(+), 17 deletions(-) create mode 100644 misc/common/README.md create mode 100755 misc/frontier/list-runners.sh create mode 100755 misc/frontier/move-runner.sh diff --git a/misc/common/README.md b/misc/common/README.md new file mode 100644 index 0000000000..ad2156b7b4 --- /dev/null +++ b/misc/common/README.md @@ -0,0 +1,27 @@ +# Common Runner Management Scripts + +This directory contains site-agnostic scripts shared between the Phoenix and +Frontier runner management setups. 
Scripts here have no site-specific logic and +can be invoked directly or via thin site-specific wrappers. + +## Scripts + +| Script | Purpose | +|---|---| +| `rerun-failed.sh` | Rerun failed GitHub Actions workflows on open non-draft MFC PRs and master. Dry-run by default; set `APPLY=1` to actually trigger reruns. | + +## Usage + +```bash +# Dry run — show which failed workflows would be rerun +bash misc/common/rerun-failed.sh + +# Actually rerun failed workflows +APPLY=1 bash misc/common/rerun-failed.sh +``` + +## Site wrappers + +`misc/phoenix/rerun-failed.sh` is a thin wrapper that delegates to this +script, so both `bash misc/phoenix/rerun-failed.sh` and +`bash misc/common/rerun-failed.sh` invoke the same logic. diff --git a/misc/frontier/README.md b/misc/frontier/README.md index bce2188f4e..f6a2175350 100644 --- a/misc/frontier/README.md +++ b/misc/frontier/README.md @@ -16,6 +16,10 @@ it was last started on. This is used as a fallback hint when restarting offline runners. The authoritative source of truth for whether a runner is running (and on which node) is CWD-based process discovery — not any PID file. +`runner.node` is self-healing: `restart-offline-runners.sh` detects when a +runner is actually running on a different node than `runner.node` records (e.g. +after a manual restart) and corrects the file automatically. + Runners occasionally die due to OLCF's firewall/proxy dropping long-lived TCP connections to GitHub's broker. The `restart-offline-runners.sh` script handles recovery. Login nodes vary in stability — if a runner keeps dying on a @@ -24,6 +28,9 @@ particular node, move it to a quieter one (login01 tends to have low load). 
## Quick Reference ```bash +# List all runners with GitHub status, node, and memory usage +bash list-runners.sh + # Check runner health across all login nodes bash check-runners.sh @@ -36,6 +43,9 @@ bash deploy-runners.sh 23 login01 login02 login03 # Restart all offline runners in place bash restart-offline-runners.sh +# Move a runner to a different login node +bash move-runner.sh frontier-1 login01 + # Stop and deregister a runner bash stop-runner.sh frontier-12 @@ -50,21 +60,20 @@ APPLY=1 bash ../common/rerun-failed.sh |---|---| | `config.sh` | Shared configuration, constants, GitHub API helpers, and CWD-based process management functions. Sourced by all other scripts. | | `check-runners.sh` | SSH to each login node, show Runner.Listener processes with name, status (idle/BUSY), and RSS memory. | +| `list-runners.sh` | List all runners with GitHub API status, actual node (from parallel SSH sweep), and RSS memory. Flags stale `runner.node` entries. | | `make-runner.sh` | Download runner binary, register with GitHub via API, start on target node. Usage: `make-runner.sh <runner-number> [login-node]` | +| `move-runner.sh` | Move a runner to a different login node: stop on current node, update `runner.node`, start on target. Usage: `move-runner.sh <runner-name> <target-node>` | | `deploy-runners.sh` | Deploy multiple runners across login nodes in parallel. Usage: `deploy-runners.sh <start-num> <node1> [node2 ...]` | -| `restart-offline-runners.sh` | Query GitHub for offline frontier runners, locate via CWD-based discovery, stop stale processes, then restart in parallel. Prints final status. | +| `restart-offline-runners.sh` | Query GitHub for offline frontier runners, locate via CWD-based discovery, stop stale processes, then restart in parallel. Self-heals stale `runner.node` files. Prints final status. | | `stop-runner.sh` | Locate runner via CWD-based discovery, stop the process, and deregister from GitHub. Usage: `stop-runner.sh <runner-name>` | | `../common/rerun-failed.sh` | Rerun failed GitHub Actions workflows on open PRs and master.
No site-specific code. | ## Troubleshooting **Runner goes OFFLINE repeatedly on the same node** — That login node may have -process culling or high memory pressure. Move it by stopping the runner and -restarting it on a different node: +process culling or high memory pressure. Move it to a different node: ```bash -bash stop-runner.sh frontier-1 -echo "login01" > /lustre/orion/cfd154/proj-shared/runners/frontier-1/runner.node -bash restart-offline-runners.sh +bash move-runner.sh frontier-1 login01 ``` **Multiple runners OFFLINE at once** — Usually a transient OLCF network blip diff --git a/misc/frontier/list-runners.sh b/misc/frontier/list-runners.sh new file mode 100755 index 0000000000..21f7a75a4c --- /dev/null +++ b/misc/frontier/list-runners.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash +# List all Frontier runners, combining GitHub API status with login-node process info. +# +# Uses a parallel SSH sweep across all 11 login nodes simultaneously to avoid +# the overhead of serial per-runner node discovery. Each node is queried once; +# results are correlated with GitHub API status and the local runner directories. +# +# Usage: bash list-runners.sh +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" + +tmpdir=$(mktemp -d) +trap 'rm -rf "$tmpdir"' EXIT + +# --- Parallel SSH sweep across all login nodes --- +# Each node prints lines in the format: RUNNER <node> <cwd> <rss_mb> +# The RUNNER sentinel prefix allows stripping MOTD noise with grep.
+for node in "${NODES[@]}"; do + ssh $SSH_OPTS "$node" ' + for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do + cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true) + rss=$(ps -p $p -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" || echo 0) + [ -n "$cwd" ] && echo "RUNNER '"$node"' $cwd $rss" + done + ' 2>/dev/null > "$tmpdir/$node.out" & +done + +wait + +# --- Build associative arrays from sweep results --- +declare -A runner_node runner_rss +for node in "${NODES[@]}"; do + while IFS= read -r line; do + # Each line: RUNNER + read -r _sentinel sweep_node dir rss <<< "$line" + runner_node["$dir"]="$sweep_node" + runner_rss["$dir"]="$rss" + done < <(grep '^RUNNER ' "$tmpdir/$node.out" 2>/dev/null || true) +done + +# --- Fetch GitHub API status --- +declare -A gh_status gh_busy +while read -r _id name status busy; do + gh_status["$name"]="$status" + gh_busy["$name"]="$busy" +done < <(gh_list_runners) + +# --- Print table --- +printf "%-25s %-8s %-14s %s\n" "NAME" "GITHUB" "NODE" "RSS" +printf "%s\n" "$(printf '%.0s-' {1..60})" + +for dir in $(find_runner_dirs); do + name=$(get_runner_name "$dir") + [ -z "$name" ] && continue + + # GitHub status column + api_status="${gh_status[$name]:-unknown}" + api_busy="${gh_busy[$name]:-false}" + if [ "$api_busy" = "true" ]; then + gh_col="BUSY" + else + gh_col="$api_status" + fi + + # Node and RSS from parallel sweep + actual_node="${runner_node[$dir]:-}" + rss="${runner_rss[$dir]:-}" + + if [ -z "$actual_node" ]; then + printf "%-25s %-8s %-14s %s\n" "$name" "$gh_col" "offline" "—" + continue + fi + + # Compare sweep result to recorded runner.node; flag stale entries + node_col="$actual_node" + if [ -f "$dir/runner.node" ]; then + recorded=$(cat "$dir/runner.node") + if [ "$actual_node" != "$recorded" ]; then + node_col="${actual_node} *(stale: ${recorded})" + fi + fi + + printf "%-25s %-8s %-14s %sMB\n" "$name" "$gh_col" "$node_col" "$rss" +done diff --git a/misc/frontier/make-runner.sh 
b/misc/frontier/make-runner.sh index 05b63d23f3..ae87cd4e55 100755 --- a/misc/frontier/make-runner.sh +++ b/misc/frontier/make-runner.sh @@ -9,7 +9,7 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" -RUNNER_VERSION="2.332.0" +RUNNER_VERSION="${RUNNER_VERSION:-$(gh_latest_runner_version 2>/dev/null || echo "2.332.0")}" RUNNER_NUM="${1:?Usage: $0 [login-node]}" TARGET_NODE="${2:-$(hostname -s)}" @@ -18,6 +18,7 @@ TARBALL="actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" RUNNER_NAME="frontier-${RUNNER_NUM}" RUNNER_DIR="${SHARED_DIR}/${RUNNER_NAME}" +echo "==> Using runner version ${RUNNER_VERSION}" echo "==> Setting up runner: ${RUNNER_NAME} on ${TARGET_NODE}" # --- Download tarball once to shared dir --- diff --git a/misc/frontier/move-runner.sh b/misc/frontier/move-runner.sh new file mode 100755 index 0000000000..dee275b325 --- /dev/null +++ b/misc/frontier/move-runner.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# Move a Frontier runner to a different login node. +# +# Stops the runner on its current node, updates runner.node, and starts it on +# the target node. Retries the start once after 5 seconds if the first attempt +# fails. +# +# Usage: move-runner.sh +# Example: move-runner.sh frontier-1 login01 +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" + +RUNNER_NAME="${1:?Usage: $0 }" +TARGET_NODE="${2:?Usage: $0 }" + +RUNNER_DIR="${SHARED_DIR}/${RUNNER_NAME}" + +# --- Validate runner directory --- +if [ ! -d "$RUNNER_DIR" ]; then + echo "ERROR: Runner directory not found: ${RUNNER_DIR}" >&2 + exit 1 +fi + +# --- Validate target node is in the known node list --- +valid=0 +for node in "${NODES[@]}"; do + [ "$node" = "$TARGET_NODE" ] && valid=1 && break +done +if [ "$valid" -eq 0 ]; then + echo "ERROR: '${TARGET_NODE}' is not a valid Frontier login node." 
>&2 + echo " Valid nodes: ${NODES[*]}" >&2 + exit 1 +fi + +# --- Find current node --- +echo "==> Locating ${RUNNER_NAME}..." +current_node=$(find_node "$RUNNER_DIR") + +if [ "$current_node" = "$TARGET_NODE" ]; then + echo "==> ${RUNNER_NAME} is already running on ${TARGET_NODE}. Nothing to do." + exit 0 +fi + +# --- Stop runner on current node (if running) --- +if [ "$current_node" != "offline" ]; then + echo "==> Stopping ${RUNNER_NAME} on ${current_node}..." + stop_runner "$current_node" "$RUNNER_DIR" +fi + +# --- Update runner.node --- +echo "$TARGET_NODE" > "${RUNNER_DIR}/runner.node" + +# --- Start runner on target node (with one retry) --- +echo "==> Starting ${RUNNER_NAME} on ${TARGET_NODE}..." +if start_runner "$TARGET_NODE" "$RUNNER_DIR"; then + echo "==> ${RUNNER_NAME} is now running on ${TARGET_NODE}." +else + echo " First start attempt failed. Retrying in 5 seconds..." + sleep 5 + if start_runner "$TARGET_NODE" "$RUNNER_DIR"; then + echo "==> ${RUNNER_NAME} is now running on ${TARGET_NODE}." + else + echo "ERROR: ${RUNNER_NAME} failed to start on ${TARGET_NODE} after retry." >&2 + exit 1 + fi +fi diff --git a/misc/frontier/restart-offline-runners.sh b/misc/frontier/restart-offline-runners.sh index 7384961fb4..3dd1356a50 100755 --- a/misc/frontier/restart-offline-runners.sh +++ b/misc/frontier/restart-offline-runners.sh @@ -36,29 +36,49 @@ restart_one() { return fi - # Check if it's actually already running somewhere (GitHub may lag) + # Determine the recorded node from runner.node + local recorded_node target_node + if [ -f "${dir}/runner.node" ]; then + recorded_node=$(cat "${dir}/runner.node") + else + echo "WARN: No runner.node for ${runner_name}, skipping." 
+ return + fi + + # Check if the runner is actually already running somewhere (GitHub may lag) local actual_node actual_node=$(find_node "$dir") if [ "$actual_node" != "offline" ]; then + # Self-healing: if the runner is on a different node than runner.node records, + # update runner.node to reflect reality before stopping and restarting. + if [ "$actual_node" != "$recorded_node" ]; then + echo "==> ${runner_name}: found on ${actual_node}, runner.node says ${recorded_node} — updating runner.node." + echo "$actual_node" > "${dir}/runner.node" + recorded_node="$actual_node" + fi echo "==> ${runner_name} appears running on ${actual_node} (GitHub may lag) — stopping first..." stop_runner "$actual_node" "$dir" - fi - - # Determine target node from runner.node fallback - local target_node - if [ -f "${dir}/runner.node" ]; then - target_node=$(cat "${dir}/runner.node") + # Restart where it was actually running + target_node="$actual_node" else - echo "WARN: No runner.node for ${runner_name}, skipping." - return + # Runner is truly offline; fall back to the last known node + target_node="$recorded_node" fi echo "==> Starting ${runner_name} on ${target_node}..." if start_runner "$target_node" "$dir"; then + echo "$target_node" > "${dir}/runner.node" echo " ${runner_name}: started on ${target_node}." else - echo " ${runner_name}: ERROR — failed to start on ${target_node}." >&2 + echo " First start attempt failed. Retrying in 5 seconds..." + sleep 5 + if start_runner "$target_node" "$dir"; then + echo "$target_node" > "${dir}/runner.node" + echo " ${runner_name}: started on ${target_node}." + else + echo " ${runner_name}: ERROR — failed to start on ${target_node} after retry." 
>&2 + fi fi } From a557e7e7c020b513c2673f8dd479d77c709f0de5 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 18:30:31 -0400 Subject: [PATCH 07/20] misc/frontier: add sync_runner_nodes, list-runners, move-runner, retry logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - config.sh: add sync_runner_nodes() — parallel sweep of all login nodes that updates runner.node files before any action, ensuring accuracy even when runners are manually restarted on different nodes - make-runner.sh: use gh_latest_runner_version() with pinned fallback - restart-offline-runners.sh: call sync_runner_nodes first; self-heal stale runner.node on restart; retry start_runner once on failure - check-runners.sh: call sync_runner_nodes first - list-runners.sh: new — parallel SSH sweep + GitHub API combined view - move-runner.sh: new — move a runner to a different login node - misc/common/README.md: new — document shared scripts - misc/frontier/README.md: update for new scripts and sync behavior Co-Authored-By: Claude Sonnet 4.6 --- misc/frontier/check-runners.sh | 3 +++ misc/frontier/config.sh | 32 ++++++++++++++++++++++++ misc/frontier/list-runners.sh | 3 +++ misc/frontier/move-runner.sh | 4 +++ misc/frontier/restart-offline-runners.sh | 3 +++ 5 files changed, 45 insertions(+) diff --git a/misc/frontier/check-runners.sh b/misc/frontier/check-runners.sh index 0973d692e0..74c84eaa2e 100755 --- a/misc/frontier/check-runners.sh +++ b/misc/frontier/check-runners.sh @@ -10,6 +10,9 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" +echo "==> Syncing runner node locations..." 
# Sweep all nodes in parallel and update runner.node for any runner
# found running on a different node than recorded. Called at the top of
# every primary script to ensure runner.node always reflects reality,
# even if a runner was manually restarted on a different node.
#
# Globals:   NODES (read), SSH_OPTS (read, intentionally word-split)
# Arguments: none
# Outputs:   one "==> <runner>: runner.node updated <old> -> <new>" line on
#            stdout for every runner.node file that was corrected
# Notes:     - Runner directories without an existing runner.node are skipped.
#            - Nodes that cannot be reached contribute no lines and are
#              silently ignored.
#            - Only lines of the form "<node> /abs/path" survive the MOTD
#              filter; the node pattern is [a-z0-9]+, so hostnames containing
#              '-' or '.' would be dropped (NOTE(review): confirm NODES at
#              every site that uses this function match that pattern).
sync_runner_nodes() {
  # All loop variables are local so callers' $node / $dir are not clobbered.
  local tmpdir node dir recorded seen_node report
  tmpdir=$(mktemp -d)

  # Fan out: one background SSH per node, each writing to its own file so
  # concurrent writers never interleave.
  for node in "${NODES[@]}"; do
    (
      ssh $SSH_OPTS "$node" '
        for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do
          cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true)
          [ -n "$cwd" ] && echo "'"$node"' $cwd"
        done
      ' 2>/dev/null | grep -E '^[a-z0-9]+ /'
    ) > "$tmpdir/$node" &
  done
  wait

  # Fold the per-node reports back into the runner.node files. The glob is
  # iterated directly (same order as `cat "$tmpdir"/*`) so an empty NODES
  # list produces no error output.
  for report in "$tmpdir"/*; do
    [ -f "$report" ] || continue
    while IFS=' ' read -r seen_node dir; do
      [ -f "$dir/runner.node" ] || continue
      recorded=$(cat "$dir/runner.node" 2>/dev/null || echo "")
      if [ "$seen_node" != "$recorded" ]; then
        echo "==> $(basename "$dir"): runner.node updated $recorded -> $seen_node"
        echo "$seen_node" > "$dir/runner.node"
      fi
    done < "$report"
  done

  # Explicit cleanup instead of `trap … RETURN`: a RETURN trap set inside a
  # function stays installed after the function returns and would later run
  # `rm -rf "$tmpdir"` against whatever $tmpdir means in the caller.
  rm -rf -- "$tmpdir"
}
+sync_runner_nodes + tmpdir=$(mktemp -d) trap 'rm -rf "$tmpdir"' EXIT diff --git a/misc/frontier/move-runner.sh b/misc/frontier/move-runner.sh index dee275b325..29f300d1d0 100755 --- a/misc/frontier/move-runner.sh +++ b/misc/frontier/move-runner.sh @@ -34,6 +34,10 @@ if [ "$valid" -eq 0 ]; then exit 1 fi +# --- Sync runner.node files before acting --- +echo "==> Syncing runner node locations..." +sync_runner_nodes + # --- Find current node --- echo "==> Locating ${RUNNER_NAME}..." current_node=$(find_node "$RUNNER_DIR") diff --git a/misc/frontier/restart-offline-runners.sh b/misc/frontier/restart-offline-runners.sh index 3dd1356a50..5c1f984a22 100755 --- a/misc/frontier/restart-offline-runners.sh +++ b/misc/frontier/restart-offline-runners.sh @@ -11,6 +11,9 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" +echo "==> Syncing runner node locations..." +sync_runner_nodes + echo "==> Checking for offline frontier runners..." # Collect offline runner names from GitHub API From c1906464dc37cf60b0ff588b8ad0db532dc464cd Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 18:44:49 -0400 Subject: [PATCH 08/20] fix: cd into runner dir before start so CWD-based discovery works start_runner() was launching run.sh via absolute path without cd-ing into the runner directory first. The Runner.Listener process inherited the SSH login shell CWD (user home dir), so find_pids() CWD matching always failed for runners started this way. Fix: cd into $dir before running ./run.sh so the process CWD matches what find_pids() expects. Relative log path follows automatically. 
Co-Authored-By: Claude Sonnet 4.6 --- misc/frontier/config.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/frontier/config.sh b/misc/frontier/config.sh index 3f3e239d8e..fa56e7e359 100755 --- a/misc/frontier/config.sh +++ b/misc/frontier/config.sh @@ -92,7 +92,7 @@ find_node() { start_runner() { local node="$1" dir="$2" timeout 15 ssh $SSH_OPTS "$node" \ - "setsid nohup $dir/run.sh >> $dir/runner.log 2>&1 < /dev/null &" \ + "cd $dir && setsid nohup ./run.sh >> runner.log 2>&1 < /dev/null &" \ /dev/null || true sleep 3 [ -n "$(find_pids "$node" "$dir")" ] From 5733ee3dff73388faf2b9d74702685ec67138501 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 19:31:33 -0400 Subject: [PATCH 09/20] fix: address all PR review findings in runner scripts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - frontier/deploy-runners.sh: pre-download tarball once before spawning parallel make-runner.sh instances; use atomic tmp+mv to prevent concurrent curl writes corrupting the tarball - frontier/make-runner.sh: same atomic tmp+mv for solo-invocation safety - common/rerun-failed.sh: guard gh run view with || continue so an expired/deleted run skips rather than exits the whole script; switch pipe loops to process substitution so continue works correctly and remove the dead rerun_count variable (incremented in a subshell, never read) - frontier/list-runners.sh, phoenix/list-runners.sh, restart-all.sh, rebalance-runners.sh: replace for dir in $(find_runner_dirs) with while IFS= read -r dir; do ... 
done < <(find_runner_dirs) to eliminate word-splitting and glob expansion on runner paths - phoenix/check-runners.sh, phoenix/list-runners.sh: guard rss value before arithmetic expansion — if SSH fails and rss="?", the expression $(( CGROUP_LIMIT - rss )) is a syntax error that exits under set -euo pipefail; default to 0 instead Co-Authored-By: Claude Sonnet 4.6 --- misc/common/rerun-failed.sh | 19 +++++++++---------- misc/frontier/deploy-runners.sh | 15 +++++++++++++++ misc/frontier/list-runners.sh | 4 ++-- misc/frontier/make-runner.sh | 4 +++- misc/phoenix/check-runners.sh | 1 + misc/phoenix/list-runners.sh | 5 +++-- misc/phoenix/rebalance-runners.sh | 4 ++-- misc/phoenix/restart-all.sh | 4 ++-- 8 files changed, 37 insertions(+), 19 deletions(-) diff --git a/misc/common/rerun-failed.sh b/misc/common/rerun-failed.sh index ed29924451..ed56609777 100755 --- a/misc/common/rerun-failed.sh +++ b/misc/common/rerun-failed.sh @@ -22,7 +22,6 @@ if [ -z "$prs" ]; then exit 0 fi -rerun_count=0 for pr in $prs; do title=$(gh pr view --repo "$REPO" "$pr" --json title --jq .title) branch=$(gh pr view --repo "$REPO" "$pr" --json headRefName --jq .headRefName) @@ -35,21 +34,21 @@ for pr in $prs; do if [ -n "$failed_runs" ]; then echo "" echo "=== PR #$pr: $title ===" - echo "$failed_runs" | while read -r run_id run_name; do - # Check which jobs failed + while read -r run_id run_name; do + # Check which jobs failed; skip if run has expired or been deleted failed_jobs=$(gh run view --repo "$REPO" "$run_id" \ - --json jobs --jq '.jobs[] | select(.conclusion == "failure" or .conclusion == "cancelled") | .name') + --json jobs --jq '.jobs[] | select(.conclusion == "failure" or .conclusion == "cancelled") | .name' \ + 2>/dev/null) || { echo " WARNING: could not fetch jobs for run $run_id, skipping"; continue; } echo " Run $run_id ($run_name):" - echo "$failed_jobs" | while read -r job; do + while read -r job; do echo " - $job" - done + done <<< "$failed_jobs" if [ "${APPLY:-0}" = "1" ]; 
then echo " Rerunning failed jobs..." gh run rerun --repo "$REPO" "$run_id" --failed || echo " WARNING: rerun failed (may already be rerunning)" - rerun_count=$((rerun_count + 1)) fi - done + done < <(echo "$failed_runs") fi done @@ -60,13 +59,13 @@ master_failed=$(gh run list --repo "$REPO" --branch master --limit 5 \ --json databaseId,status,conclusion,name \ --jq '.[] | select(.conclusion == "failure") | "\(.databaseId) \(.name)"') if [ -n "$master_failed" ]; then - echo "$master_failed" | while read -r run_id run_name; do + while read -r run_id run_name; do echo " Run $run_id ($run_name)" if [ "${APPLY:-0}" = "1" ]; then echo " Rerunning failed jobs..." gh run rerun --repo "$REPO" "$run_id" --failed || echo " WARNING: rerun failed" fi - done + done < <(echo "$master_failed") else echo " All passing" fi diff --git a/misc/frontier/deploy-runners.sh b/misc/frontier/deploy-runners.sh index 99efa2f2c6..a1cee47f29 100755 --- a/misc/frontier/deploy-runners.sh +++ b/misc/frontier/deploy-runners.sh @@ -18,6 +18,21 @@ if [ ${#TARGET_NODES[@]} -eq 0 ]; then exit 1 fi +# Pre-download the runner tarball once before spawning parallel make-runner.sh +# instances. Without this, all instances race to download the same file +# concurrently and corrupt it. The tmp+mv ensures an atomic final placement. +RUNNER_VERSION="${RUNNER_VERSION:-$(gh_latest_runner_version 2>/dev/null || echo "2.332.0")}" +TARBALL="actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" +if [ ! -f "${SHARED_DIR}/${TARBALL}" ]; then + echo "==> Downloading runner v${RUNNER_VERSION}..." 
+ tmp="${SHARED_DIR}/${TARBALL}.tmp.$$" + curl -fsSL \ + "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/${TARBALL}" \ + -o "$tmp" + mv "$tmp" "${SHARED_DIR}/${TARBALL}" +fi +export RUNNER_VERSION + for i in "${!TARGET_NODES[@]}"; do NODE="${TARGET_NODES[$i]}" NUM=$((START_NUM + i)) diff --git a/misc/frontier/list-runners.sh b/misc/frontier/list-runners.sh index 13ce9bbf59..859fb5cd19 100755 --- a/misc/frontier/list-runners.sh +++ b/misc/frontier/list-runners.sh @@ -54,7 +54,7 @@ done < <(gh_list_runners) printf "%-25s %-8s %-14s %s\n" "NAME" "GITHUB" "NODE" "RSS" printf "%s\n" "$(printf '%.0s-' {1..60})" -for dir in $(find_runner_dirs); do +while IFS= read -r dir; do name=$(get_runner_name "$dir") [ -z "$name" ] && continue @@ -86,4 +86,4 @@ for dir in $(find_runner_dirs); do fi printf "%-25s %-8s %-14s %sMB\n" "$name" "$gh_col" "$node_col" "$rss" -done +done < <(find_runner_dirs) diff --git a/misc/frontier/make-runner.sh b/misc/frontier/make-runner.sh index ae87cd4e55..53ca8cdcdb 100755 --- a/misc/frontier/make-runner.sh +++ b/misc/frontier/make-runner.sh @@ -24,9 +24,11 @@ echo "==> Setting up runner: ${RUNNER_NAME} on ${TARGET_NODE}" # --- Download tarball once to shared dir --- if [ ! -f "${SHARED_DIR}/${TARBALL}" ]; then echo "==> Downloading runner v${RUNNER_VERSION}..." 
+ tmp="${SHARED_DIR}/${TARBALL}.tmp.$$" curl -fsSL \ "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/${TARBALL}" \ - -o "${SHARED_DIR}/${TARBALL}" + -o "$tmp" + mv "$tmp" "${SHARED_DIR}/${TARBALL}" fi # --- Extract (filesystem is shared across all nodes) --- diff --git a/misc/phoenix/check-runners.sh b/misc/phoenix/check-runners.sh index 2e00f33cd4..29f53af12d 100755 --- a/misc/phoenix/check-runners.sh +++ b/misc/phoenix/check-runners.sh @@ -28,6 +28,7 @@ for node in "${NODES[@]}"; do ' 2>/dev/null || echo " (unreachable)" rss=$(ssh -o ConnectTimeout=5 "$node" "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" 2>/dev/null || echo "?") + [[ "$rss" =~ ^[0-9]+$ ]] || rss=0 echo " --- Total: ${rss} MB / ${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free) ---" echo "" done diff --git a/misc/phoenix/list-runners.sh b/misc/phoenix/list-runners.sh index c62ef52235..35e9e7cc46 100755 --- a/misc/phoenix/list-runners.sh +++ b/misc/phoenix/list-runners.sh @@ -22,7 +22,7 @@ while read -r id name status busy; do done <<< "$(gh_list_runners)" # Walk local runner directories and cross-reference -for dir in $(find_runner_dirs); do +while IFS= read -r dir; do name=$(get_runner_name "$dir") [ -z "$name" ] && continue @@ -61,12 +61,13 @@ for dir in $(find_runner_dirs); do printf "%-25s %-8s %-22s %-8s %5sMB %s\n" \ "$name" "$gh_col" "$node" "$slurm" "$rss" "$dir" -done +done < <(find_runner_dirs) echo "" echo "=== Per-node memory ===" for node in "${NODES[@]}"; do count=$(ssh -o ConnectTimeout=5 "$node" "ps aux | grep Runner.Listener | grep -v grep | wc -l" 2>/dev/null || echo 0) rss=$(ssh -o ConnectTimeout=5 "$node" "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" 2>/dev/null || echo "?") + [[ "$rss" =~ ^[0-9]+$ ]] || rss=0 echo " $node: $count runners, ${rss} MB / ${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free)" done diff --git a/misc/phoenix/rebalance-runners.sh 
b/misc/phoenix/rebalance-runners.sh index ad506ff326..e251d31741 100755 --- a/misc/phoenix/rebalance-runners.sh +++ b/misc/phoenix/rebalance-runners.sh @@ -15,12 +15,12 @@ source "$SCRIPT_DIR/config.sh" # Discover runners declare -a dirs=() names=() -for dir in $(find_runner_dirs); do +while IFS= read -r dir; do name=$(get_runner_name "$dir") [ -z "$name" ] && continue dirs+=("$dir") names+=("$name") -done +done < <(find_runner_dirs) num_nodes=${#NODES[@]} num_runners=${#dirs[@]} diff --git a/misc/phoenix/restart-all.sh b/misc/phoenix/restart-all.sh index 26e143d636..d2450bf2aa 100755 --- a/misc/phoenix/restart-all.sh +++ b/misc/phoenix/restart-all.sh @@ -15,7 +15,7 @@ source "$SCRIPT_DIR/config.sh" echo "=== Discovering runners ===" declare -a restart_list=() -for dir in $(find_runner_dirs); do +while IFS= read -r dir; do name=$(get_runner_name "$dir") [ -z "$name" ] && continue node=$(find_node "$dir") @@ -37,7 +37,7 @@ for dir in $(find_runner_dirs); do fi restart_list+=("$node $dir $name") -done +done < <(find_runner_dirs) if [ ${#restart_list[@]} -eq 0 ]; then echo "Nothing to restart." 
From 536f43374c8101296194ba1b18842f4fdeff2391 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 19:36:48 -0400 Subject: [PATCH 10/20] refactor: extract shared runner functions into misc/common/runner-lib.sh Move functions identical across both sites into a common library: gh_registration_token, gh_latest_runner_version, gh_remove_runner, get_runner_name, find_pids, find_node, start_runner, stop_runner Both config.sh files now source runner-lib.sh after defining their site constants (ORG, NODES, SSH_OPTS), keeping only site-specific logic locally: - frontier/config.sh: gh_list_runners, find_runner_dirs, sync_runner_nodes - phoenix/config.sh: gh_list_runners, find_runner_dirs, has_slurm Harmonize start_runner() across both sites: - Use bash -lc on both (was frontier-only) for login shell PATH - Use timeout 15 + synchronous SSH (was phoenix's background SSH + poll loop) - cd into runner dir before run.sh so CWD-based discovery works - Standardize log file to runner.log (was runner-nohup.log on phoenix) - Use $SSH_OPTS variable (added to phoenix config) throughout all phoenix scripts instead of hardcoded -o ConnectTimeout=5 Co-Authored-By: Claude Sonnet 4.6 --- misc/common/runner-lib.sh | 92 +++++++++++++++++++++++++++++ misc/frontier/config.sh | 85 ++------------------------- misc/phoenix/check-runners.sh | 4 +- misc/phoenix/config.sh | 96 +++---------------------------- misc/phoenix/create-runner.sh | 2 +- misc/phoenix/list-runners.sh | 6 +- misc/phoenix/rebalance-runners.sh | 2 +- misc/phoenix/restart-all.sh | 2 +- 8 files changed, 112 insertions(+), 177 deletions(-) create mode 100755 misc/common/runner-lib.sh diff --git a/misc/common/runner-lib.sh b/misc/common/runner-lib.sh new file mode 100755 index 0000000000..d38a401149 --- /dev/null +++ b/misc/common/runner-lib.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +# Shared GitHub Actions runner management library. 
#
# Sourced by site-specific config.sh files (misc/frontier/config.sh,
# misc/phoenix/config.sh). Callers must define ORG, NODES, and SSH_OPTS
# before sourcing this file.

# --- GitHub API ---

# Get a registration token for new runners.
# Globals: ORG (read)
# Outputs: the token on stdout
# The registration-token endpoint is documented as POST-only and `gh api`
# defaults to GET when no fields are passed, so the method is set explicitly.
gh_registration_token() {
  gh api -X POST "orgs/$ORG/actions/runners/registration-token" --jq .token
}

# Get the latest runner binary version.
# Outputs: the latest actions/runner release tag with its leading "v" removed
gh_latest_runner_version() {
  gh api repos/actions/runner/releases/latest --jq '.tag_name | ltrimstr("v")'
}

# Remove a runner registration from GitHub.
# Globals: ORG (read)
# Args: $1 = runner ID (numeric, from API)
gh_remove_runner() {
  gh api "orgs/$ORG/actions/runners/$1" -X DELETE
}

# --- Local filesystem ---

# Get the GitHub runner name from a .runner config file.
# Args: $1 = runner directory
# Outputs: the "agentName" field (empty if the key is absent)
# Returns: python3's exit status (non-zero if the file is missing or is not
#          valid JSON); its stderr is suppressed.
# The path is handed to Python through argv rather than spliced into the
# source text, so directories containing quotes or backslashes work. The
# utf-8-sig codec drops the BOM that may precede the JSON.
get_runner_name() {
  python3 -c '
import json, sys
with open(sys.argv[1], encoding="utf-8-sig") as fh:
    print(json.load(fh).get("agentName", ""))
' "$1/.runner" 2>/dev/null
}

# --- Login-node process management ---

# Find PIDs of a runner on a node by matching its CWD.
# (Runner.Listener's command line is just "Runner.Listener run" — no path.)
# Output is filtered to numeric lines only to strip SSH MOTD noise.
# Globals: SSH_OPTS (read, intentionally word-split)
# Args: $1 = node, $2 = runner directory
# Prints: space-separated PIDs, or empty.
# ssh's stdin is tied to /dev/null: callers invoke this (via find_node) inside
# `while read … done < <(find_runner_dirs)` loops, and an ssh that inherits
# the loop's stdin would swallow the remaining directory list.
# NOTE(review): $2 is spliced into the remote script inside double quotes, so
# a directory containing '"', '$' or '`' would not match correctly.
find_pids() {
  ssh $SSH_OPTS "$1" '
    for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do
      cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true)
      [ "$cwd" = "'"$2"'" ] && echo "$p"
    done
  ' </dev/null 2>/dev/null | grep -E '^[0-9]+$' | tr '\n' ' ' || true
}

# Find which login node a runner is on.
# Globals: NODES (read)
# Args: $1 = runner directory
# Prints: node hostname, or "offline".
find_node() {
  local node   # keep the loop variable out of the caller's scope
  for node in "${NODES[@]}"; do
    if [ -n "$(find_pids "$node" "$1")" ]; then
      echo "$node"
      return 0
    fi
  done
  echo "offline"
}

# Start a runner on a node.
# Uses a login shell (bash -lc) so site PATH (e.g. SLURM) is available.
# Args: $1 = node, $2 = runner directory
# Returns: 0 if running after start, 1 otherwise.
# Globals: SSH_OPTS (read, intentionally word-split)
start_runner() {
  # Launches ./run.sh detached (setsid + nohup) from inside the runner
  # directory, appending output to runner.log there, then checks with
  # find_pids that a listener for that directory exists.
  local node="$1" dir="$2" remote_dir
  # Shell-quote the directory for the remote command line; for paths without
  # special characters this yields the path unchanged.
  printf -v remote_dir '%q' "$dir"
  # stdin/stderr are redirected on the local ssh (not passed as an extra
  # remote argument) so ssh neither consumes a caller's loop input nor
  # prints connection noise. `timeout` bounds a hung connection and a
  # failure here is not fatal: the find_pids check below is the real verdict.
  timeout 15 ssh $SSH_OPTS "$node" \
    "cd $remote_dir && setsid bash -lc 'nohup ./run.sh >> runner.log 2>&1 < /dev/null &'" \
    </dev/null 2>/dev/null || true
  sleep 3
  [ -n "$(find_pids "$node" "$dir")" ]
}

# Stop a runner on a node (SIGTERM then SIGKILL).
# Args: $1 = node, $2 = runner directory
# Globals: SSH_OPTS (read, intentionally word-split)
# Returns: 0; kill/ssh failures are ignored (best effort).
stop_runner() {
  local node="$1" dir="$2" pids
  pids=$(find_pids "$node" "$dir")
  [ -z "$pids" ] && return 0
  # Polite request first; all PIDs go in a single ssh round trip.
  ssh $SSH_OPTS "$node" "kill $pids" </dev/null 2>/dev/null || true
  sleep 3
  # Anything still alive after the grace period is killed outright.
  pids=$(find_pids "$node" "$dir")
  if [ -n "$pids" ]; then
    ssh $SSH_OPTS "$node" "kill -9 $pids" </dev/null 2>/dev/null || true
  fi
  sleep 1
}
-gh_latest_runner_version() { - gh api repos/actions/runner/releases/latest --jq '.tag_name | ltrimstr("v")' -} - -# Remove a runner registration from GitHub. -# Args: $1 = runner ID (numeric, from API) -gh_remove_runner() { - gh api "orgs/$ORG/actions/runners/$1" -X DELETE -} - # --- Local filesystem --- # Find all runner directories on shared storage. @@ -50,71 +36,8 @@ find_runner_dirs() { done } -# Get the GitHub runner name from a .runner config file. -# Args: $1 = runner directory -get_runner_name() { - python3 -c " -import json -d = json.loads(open('$1/.runner').read().lstrip('\ufeff')) -print(d.get('agentName', '')) -" 2>/dev/null -} - # --- Login-node process management --- -# Find PIDs of a runner on a node by matching its CWD. -# (Runner.Listener's command line is just "Runner.Listener run" — no path.) -# Frontier SSH prints MOTD to stdout, so output is filtered to numeric lines only. -# Args: $1 = node, $2 = runner directory -# Prints: space-separated PIDs, or empty. -find_pids() { - ssh $SSH_OPTS "$1" ' - for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do - cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true) - [ "$cwd" = "'"$2"'" ] && echo "$p" - done - ' 2>/dev/null | grep -E '^[0-9]+$' | tr '\n' ' ' || true -} - -# Find which login node a runner is on. -# Args: $1 = runner directory -# Prints: node hostname, or "offline". -find_node() { - for node in "${NODES[@]}"; do - [ -n "$(find_pids "$node" "$1")" ] && echo "$node" && return - done - echo "offline" -} - -# Start a runner on a node. -# Args: $1 = node, $2 = runner directory -# Returns: 0 if running after start, 1 otherwise. -start_runner() { - local node="$1" dir="$2" - timeout 15 ssh $SSH_OPTS "$node" \ - "cd $dir && setsid nohup ./run.sh >> runner.log 2>&1 < /dev/null &" \ - /dev/null || true - sleep 3 - [ -n "$(find_pids "$node" "$dir")" ] -} - -# Stop a runner on a node (SIGTERM then SIGKILL). 
-# Args: $1 = node, $2 = runner directory -stop_runner() { - local node="$1" dir="$2" pids - pids=$(find_pids "$node" "$dir") - [ -z "$pids" ] && return 0 - for pid in $pids; do - ssh $SSH_OPTS "$node" "kill $pid" 2>/dev/null || true - done - sleep 3 - pids=$(find_pids "$node" "$dir") - for pid in $pids; do - ssh $SSH_OPTS "$node" "kill -9 $pid" 2>/dev/null || true - done - sleep 1 -} - # Sweep all nodes in parallel and update runner.node for any runner # found running on a different node than recorded. Called at the top of # every primary script to ensure runner.node always reflects reality, diff --git a/misc/phoenix/check-runners.sh b/misc/phoenix/check-runners.sh index 29f53af12d..b7ce2c044e 100755 --- a/misc/phoenix/check-runners.sh +++ b/misc/phoenix/check-runners.sh @@ -12,7 +12,7 @@ source "$SCRIPT_DIR/config.sh" for node in "${NODES[@]}"; do echo "=== $node ===" - ssh -o ConnectTimeout=5 "$node" ' + ssh $SSH_OPTS "$node" ' for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || echo "???") has_slurm=$(cat /proc/$p/environ 2>/dev/null | tr "\0" "\n" | grep -c /opt/slurm || echo 0) @@ -27,7 +27,7 @@ for node in "${NODES[@]}"; do done ' 2>/dev/null || echo " (unreachable)" - rss=$(ssh -o ConnectTimeout=5 "$node" "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" 2>/dev/null || echo "?") + rss=$(ssh $SSH_OPTS "$node" "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" 2>/dev/null || echo "?") [[ "$rss" =~ ^[0-9]+$ ]] || rss=0 echo " --- Total: ${rss} MB / ${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free) ---" echo "" diff --git a/misc/phoenix/config.sh b/misc/phoenix/config.sh index ee850faf81..9874b38196 100755 --- a/misc/phoenix/config.sh +++ b/misc/phoenix/config.sh @@ -1,8 +1,8 @@ #!/bin/bash # Shared configuration for Phoenix GitHub Actions runner management. # -# Sourced by all other scripts. 
Provides constants, GitHub API helpers, -# and login-node process management functions. +# Sourced by all other scripts. Provides Phoenix constants, GitHub API +# helpers, and login-node process management functions. # --- Phoenix constants --- ORG="MFlowCode" @@ -11,12 +11,16 @@ RUNNER_LABEL="gt" NODES=(login-phoenix-gnr-1 login-phoenix-gnr-2 login-phoenix-gnr-3) CGROUP_LIMIT=4096 # per-user memory limit in MB on login nodes +SSH_OPTS="-o ConnectTimeout=5" + # Parent directories containing actions-runner-*/ installations on shared storage. RUNNER_PARENT_DIRS=( /storage/scratch1/6/sbryngelson3/mfc-runners /storage/project/r-sbryngelson3-0/sbryngelson3/mfc-runners-2 ) +source "$(dirname "${BASH_SOURCE[0]}")/../common/runner-lib.sh" + # --- GitHub API --- # List Phoenix runners from the GitHub API. @@ -28,22 +32,6 @@ gh_list_runners() { | \"\(.id) \(.name) \(.status) \(.busy)\"" } -# Get a registration token for new runners. -gh_registration_token() { - gh api "orgs/$ORG/actions/runners/registration-token" --jq .token -} - -# Get the latest runner binary version. -gh_latest_runner_version() { - gh api repos/actions/runner/releases/latest --jq '.tag_name | ltrimstr("v")' -} - -# Remove a runner registration from GitHub. -# Args: $1 = runner ID (numeric, from API) -gh_remove_runner() { - gh api "orgs/$ORG/actions/runners/$1" -X DELETE -} - # --- Local filesystem --- # Find all runner directories on shared storage. @@ -56,81 +44,13 @@ find_runner_dirs() { done } -# Get the GitHub runner name from a .runner config file. -# Args: $1 = runner directory -get_runner_name() { - python3 -c " -import json -d = json.loads(open('$1/.runner').read().lstrip('\ufeff')) -print(d.get('agentName', '')) -" 2>/dev/null -} - -# --- Login-node process management --- - -# Find PIDs of a runner on a node by matching its CWD. -# (Runner.Listener's command line is just "Runner.Listener run" — no path.) -# Args: $1 = node, $2 = runner directory -# Prints: space-separated PIDs, or empty. 
-find_pids() { - ssh -o ConnectTimeout=5 "$1" ' - for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do - cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true) - [ "$cwd" = "'"$2"'" ] && echo -n "$p " - done - ' 2>/dev/null || true -} - -# Find which login node a runner is on. -# Args: $1 = runner directory -# Prints: node hostname, or "offline". -find_node() { - for node in "${NODES[@]}"; do - [ -n "$(find_pids "$node" "$1")" ] && echo "$node" && return - done - echo "offline" -} - -# Start a runner on a node with login shell (for /opt/slurm PATH). -# Args: $1 = node, $2 = runner directory -# Returns: 0 if running after start, 1 otherwise. -start_runner() { - local node="$1" dir="$2" - ssh -o ConnectTimeout=5 "$node" \ - "setsid bash -lc 'cd $dir && nohup ./run.sh >> runner-nohup.log 2>&1 &'" \ - /dev/null & - local ssh_pid=$! - local i; for i in $(seq 1 10); do - kill -0 $ssh_pid 2>/dev/null || break; sleep 1 - done - kill $ssh_pid 2>/dev/null || true - wait $ssh_pid 2>/dev/null || true - sleep 3 - [ -n "$(find_pids "$node" "$dir")" ] -} - -# Stop a runner on a node (SIGTERM then SIGKILL). -# Args: $1 = node, $2 = runner directory -stop_runner() { - local node="$1" dir="$2" pids - pids=$(find_pids "$node" "$dir") - [ -z "$pids" ] && return 0 - for pid in $pids; do - ssh -o ConnectTimeout=5 "$node" "kill $pid" 2>/dev/null || true - done - sleep 3 - pids=$(find_pids "$node" "$dir") - for pid in $pids; do - ssh -o ConnectTimeout=5 "$node" "kill -9 $pid" 2>/dev/null || true - done - sleep 1 -} +# --- Login-node process management (Phoenix-specific) --- # Check if a runner process has /opt/slurm in PATH. 
# Args: $1 = node, $2 = PID has_slurm() { local count - count=$(ssh -o ConnectTimeout=5 "$1" \ + count=$(ssh $SSH_OPTS "$1" \ "cat /proc/${2%% *}/environ 2>/dev/null | tr '\0' '\n' | grep -c /opt/slurm" \ 2>/dev/null || echo 0) [ "$count" -gt 0 ] diff --git a/misc/phoenix/create-runner.sh b/misc/phoenix/create-runner.sh index e255766a6d..c0a7899f85 100755 --- a/misc/phoenix/create-runner.sh +++ b/misc/phoenix/create-runner.sh @@ -92,7 +92,7 @@ if start_runner "$node" "$runner_dir"; then fi else echo " ERROR: Failed to start." - echo " Try: ssh $node 'cd $runner_dir && setsid bash -lc \"nohup ./run.sh >> runner-nohup.log 2>&1 &\"'" + echo " Try: ssh $node 'cd $runner_dir && setsid bash -lc \"nohup ./run.sh >> runner.log 2>&1 < /dev/null &\"'" fi echo "" diff --git a/misc/phoenix/list-runners.sh b/misc/phoenix/list-runners.sh index 35e9e7cc46..76ac25a9bf 100755 --- a/misc/phoenix/list-runners.sh +++ b/misc/phoenix/list-runners.sh @@ -44,7 +44,7 @@ while IFS= read -r dir; do fi # Process details (one SSH call) - info=$(ssh -o ConnectTimeout=5 "$node" ' + info=$(ssh $SSH_OPTS "$node" ' for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true) if [ "$cwd" = "'"$dir"'" ]; then @@ -66,8 +66,8 @@ done < <(find_runner_dirs) echo "" echo "=== Per-node memory ===" for node in "${NODES[@]}"; do - count=$(ssh -o ConnectTimeout=5 "$node" "ps aux | grep Runner.Listener | grep -v grep | wc -l" 2>/dev/null || echo 0) - rss=$(ssh -o ConnectTimeout=5 "$node" "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" 2>/dev/null || echo "?") + count=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Listener | grep -v grep | wc -l" 2>/dev/null || echo 0) + rss=$(ssh $SSH_OPTS "$node" "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" 2>/dev/null || echo "?") [[ "$rss" =~ ^[0-9]+$ ]] || rss=0 echo " $node: $count runners, ${rss} MB / 
${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free)" done diff --git a/misc/phoenix/rebalance-runners.sh b/misc/phoenix/rebalance-runners.sh index e251d31741..ca6d65a8b2 100755 --- a/misc/phoenix/rebalance-runners.sh +++ b/misc/phoenix/rebalance-runners.sh @@ -43,7 +43,7 @@ for i in "${!dirs[@]}"; do runner_node[$i]="$node" if [ "$node" != "offline" ]; then node_runners[$node]="${node_runners[$node]:-} $i" - worker=$(ssh -o ConnectTimeout=5 "$node" "ps aux | grep Runner.Worker | grep '${dirs[$i]}' | grep -v grep" 2>/dev/null || true) + worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '${dirs[$i]}' | grep -v grep" 2>/dev/null || true) [ -n "$worker" ] && runner_busy[$i]=1 || runner_busy[$i]=0 else runner_busy[$i]=0 diff --git a/misc/phoenix/restart-all.sh b/misc/phoenix/restart-all.sh index d2450bf2aa..93e926c6fb 100755 --- a/misc/phoenix/restart-all.sh +++ b/misc/phoenix/restart-all.sh @@ -25,7 +25,7 @@ while IFS= read -r dir; do continue fi - worker=$(ssh -o ConnectTimeout=5 "$node" "ps aux | grep Runner.Worker | grep '$dir' | grep -v grep" 2>/dev/null || true) + worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '$dir' | grep -v grep" 2>/dev/null || true) if [ -n "$worker" ]; then echo " $name: BUSY on $node" if [ "${FORCE:-0}" != "1" ]; then From 33a2ca4bf4e38786875bb411869279abc79d064c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 19:49:23 -0400 Subject: [PATCH 11/20] fix: use exe path instead of CWD for runner process discovery /proc/$p/exe is intrinsic to the binary and cannot change after exec, making it more reliable than CWD which depends on how the process was launched. Also correctly excludes Runner.Worker processes that share the same directory, since their exe is bin/Runner.Worker not bin/Runner.Listener. 
Co-Authored-By: Claude Sonnet 4.6 --- misc/common/runner-lib.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/misc/common/runner-lib.sh b/misc/common/runner-lib.sh index d38a401149..4390611310 100755 --- a/misc/common/runner-lib.sh +++ b/misc/common/runner-lib.sh @@ -37,16 +37,17 @@ print(d.get('agentName', '')) # --- Login-node process management --- -# Find PIDs of a runner on a node by matching its CWD. -# (Runner.Listener's command line is just "Runner.Listener run" — no path.) +# Find PIDs of a runner on a node by matching its executable path. +# Matches /proc/$p/exe against $dir/bin/Runner.Listener — intrinsic to +# the binary, independent of CWD or how the process was launched. # Output is filtered to numeric lines only to strip SSH MOTD noise. # Args: $1 = node, $2 = runner directory # Prints: space-separated PIDs, or empty. find_pids() { ssh $SSH_OPTS "$1" ' for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do - cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true) - [ "$cwd" = "'"$2"'" ] && echo "$p" + exe=$(readlink -f /proc/$p/exe 2>/dev/null || true) + [ "$exe" = "'"$2"'/bin/Runner.Listener" ] && echo "$p" done ' 2>/dev/null | grep -E '^[0-9]+$' | tr '\n' ' ' || true } From 958cce7d0f00e8a0cb98b1d0a1cb6f1c6a9f6986 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 19:56:10 -0400 Subject: [PATCH 12/20] feat: add rebalance-runners.sh for frontier Mirrors phoenix/rebalance-runners.sh with frontier-specific changes: - calls sync_runner_nodes first to correct any stale runner.node files - updates runner.node after each successful move - no has_slurm check (not applicable on frontier) With 22 runners across 11 nodes the target is 2 per node. 
Co-Authored-By: Claude Sonnet 4.6 --- misc/frontier/rebalance-runners.sh | 167 +++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100755 misc/frontier/rebalance-runners.sh diff --git a/misc/frontier/rebalance-runners.sh b/misc/frontier/rebalance-runners.sh new file mode 100755 index 0000000000..640f24de17 --- /dev/null +++ b/misc/frontier/rebalance-runners.sh @@ -0,0 +1,167 @@ +#!/usr/bin/env bash +# Automatically rebalance Frontier runners across login nodes. +# +# Discovers all runner directories, checks which node each is on, +# computes the optimal distribution, and moves runners to balance. +# Prefers moving idle runners over busy ones. Also places offline runners. +# +# Usage: bash rebalance-runners.sh # dry run +# APPLY=1 bash rebalance-runners.sh # execute +# APPLY=1 FORCE=1 bash rebalance-runners.sh # move busy runners too +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" + +echo "==> Syncing runner node locations..." 
+sync_runner_nodes + +# Discover runners +declare -a dirs=() names=() +while IFS= read -r dir; do + name=$(get_runner_name "$dir") + [ -z "$name" ] && continue + dirs+=("$dir") + names+=("$name") +done < <(find_runner_dirs) + +num_nodes=${#NODES[@]} +num_runners=${#dirs[@]} +target=$(( num_runners / num_nodes )) +remainder=$(( num_runners % num_nodes )) + +echo "=== Current state ===" +echo "Runners: $num_runners across $num_nodes nodes" +echo "Target: $target per node (+1 on first $remainder nodes)" +echo "" + +# Map runners to nodes and check busy status +declare -A node_runners +declare -A runner_node runner_busy + +for node in "${NODES[@]}"; do node_runners[$node]=""; done + +for i in "${!dirs[@]}"; do + node=$(find_node "${dirs[$i]}") + runner_node[$i]="$node" + if [ "$node" != "offline" ]; then + node_runners[$node]="${node_runners[$node]:-} $i" + worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '${dirs[$i]}' | grep -v grep" 2>/dev/null || true) + [ -n "$worker" ] && runner_busy[$i]=1 || runner_busy[$i]=0 + else + runner_busy[$i]=0 + fi +done + +# Show current distribution +for node in "${NODES[@]}"; do + indices=(${node_runners[$node]:-}) + echo "$node: ${#indices[@]} runners" + for i in "${indices[@]}"; do + busy="" + [ "${runner_busy[$i]:-0}" = "1" ] && busy=" (BUSY)" + echo " ${names[$i]}$busy" + done +done + +offline=() +for i in "${!dirs[@]}"; do + [ "${runner_node[$i]}" = "offline" ] && offline+=("$i") +done +if [ ${#offline[@]} -gt 0 ]; then + echo "" + echo "OFFLINE:" + for i in "${offline[@]}"; do echo " ${names[$i]}"; done +fi +echo "" + +# Compute per-node targets +declare -A node_target +n=0 +for node in "${NODES[@]}"; do + node_target[$node]=$target + [ $n -lt $remainder ] && node_target[$node]=$(( target + 1 )) + n=$((n + 1)) +done + +# Plan moves: pull runners from overloaded nodes (idle first) +to_place=() +for node in "${NODES[@]}"; do + indices=(${node_runners[$node]:-}) + excess=$(( ${#indices[@]} - ${node_target[$node]} 
)) + [ $excess -le 0 ] && continue + idle=() busy=() + for i in "${indices[@]}"; do + [ "${runner_busy[$i]:-0}" = "1" ] && busy+=("$i") || idle+=("$i") + done + moved=0 + for i in "${idle[@]}" "${busy[@]}"; do + [ $moved -ge $excess ] && break + to_place+=("$node $i") + moved=$((moved + 1)) + done +done + +# Add offline runners to be placed +for i in "${offline[@]}"; do to_place+=("offline $i"); done + +# Assign to underloaded nodes +moves=() +for entry in "${to_place[@]}"; do + read -r src idx <<< "$entry" + best="" best_deficit=-999 + for node in "${NODES[@]}"; do + cur=(${node_runners[$node]:-}) + deficit=$(( ${node_target[$node]} - ${#cur[@]} )) + [ $deficit -gt $best_deficit ] && best_deficit=$deficit && best=$node + done + [ -z "$best" ] || [ "$best_deficit" -le 0 ] && continue + moves+=("$src $best $idx") + # Update bookkeeping so subsequent assignments reflect this move + if [ "$src" != "offline" ]; then + new="" + for j in ${node_runners[$src]}; do [ "$j" != "$idx" ] && new="$new $j"; done + node_runners[$src]="$new" + fi + node_runners[$best]="${node_runners[$best]:-} $idx" +done + +if [ ${#moves[@]} -eq 0 ]; then + echo "Already balanced." + exit 0 +fi + +echo "=== Planned moves ===" +has_busy=false +for move in "${moves[@]}"; do + read -r src dst idx <<< "$move" + busy="" + [ "${runner_busy[$idx]:-0}" = "1" ] && busy=" (BUSY!)" && has_busy=true + echo " ${names[$idx]}: $src -> $dst$busy" +done +echo "" +echo "=== Target ===" +for node in "${NODES[@]}"; do + cur=(${node_runners[$node]:-}) + echo " $node: ${#cur[@]} runners" +done + +[ "$has_busy" = true ] && [ "${FORCE:-0}" != "1" ] && echo "" && echo "Set FORCE=1 to move busy runners." && exit 1 +[ "${APPLY:-0}" != "1" ] && echo "" && echo "Dry run — set APPLY=1 to execute." 
&& exit 0 + +echo "" +echo "=== Executing ===" +for move in "${moves[@]}"; do + read -r src dst idx <<< "$move" + echo "Moving ${names[$idx]}: $src -> $dst" + [ "$src" != "offline" ] && stop_runner "$src" "${dirs[$idx]}" + if start_runner "$dst" "${dirs[$idx]}"; then + echo "$dst" > "${dirs[$idx]}/runner.node" + echo " OK: ${names[$idx]} started on $dst" + else + echo " ERROR: Failed to start ${names[$idx]} on $dst" + fi +done + +echo "" +bash "$SCRIPT_DIR/check-runners.sh" From 0992785e458c20200d7301bee4ece71eed8ab835 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 20:17:10 -0400 Subject: [PATCH 13/20] refactor: deduplicate runner scripts into misc/common/ Extract shared logic from frontier/phoenix into common scripts: - runner-lib.sh: add gh_list_runners(), has_slurm() (portable, grep PATH for slurm keyword), sweep_all_nodes() (exe-based, parallel SSH), CGROUP_LIMIT default - check-runners.sh: new common script (exe-based discovery, slurm column, conditional cgroup footer); both sites now show slurm status - list-runners.sh: new common script (parallel sweep, slurm column, stale runner.node detection, conditional cgroup footer) - rebalance-runners.sh: new common script (optional sync_runner_nodes hook, writes runner.node after start, checks slurm after start) All site scripts (frontier/, phoenix/) reduced to thin wrappers that source config.sh then the common implementation. phoenix/create-runner.sh fixed to write runner.node after successful start. 
Co-Authored-By: Claude Sonnet 4.6 --- misc/common/check-runners.sh | 44 ++++++++ misc/common/list-runners.sh | 83 ++++++++++++++ misc/common/rebalance-runners.sh | 174 +++++++++++++++++++++++++++++ misc/common/runner-lib.sh | 46 ++++++++ misc/frontier/check-runners.sh | 30 +---- misc/frontier/config.sh | 49 +++----- misc/frontier/list-runners.sh | 87 +-------------- misc/frontier/rebalance-runners.sh | 162 +-------------------------- misc/phoenix/check-runners.sh | 34 +----- misc/phoenix/config.sh | 27 +---- misc/phoenix/create-runner.sh | 1 + misc/phoenix/list-runners.sh | 73 +----------- misc/phoenix/rebalance-runners.sh | 166 +-------------------------- 13 files changed, 387 insertions(+), 589 deletions(-) create mode 100644 misc/common/check-runners.sh create mode 100644 misc/common/list-runners.sh create mode 100644 misc/common/rebalance-runners.sh diff --git a/misc/common/check-runners.sh b/misc/common/check-runners.sh new file mode 100644 index 0000000000..365201e571 --- /dev/null +++ b/misc/common/check-runners.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# Check runner health across all login nodes. +# +# Sourced by site wrappers (frontier/check-runners.sh, phoenix/check-runners.sh) +# after config.sh is loaded. Shows Runner.Listener processes per node with +# name, busy/idle status, slurm availability, and RSS memory. +# If CGROUP_LIMIT > 0, also shows per-node total memory vs the cgroup limit. +# +# Usage: bash check-runners.sh +set -euo pipefail + +declare -f sync_runner_nodes > /dev/null 2>&1 && { + echo "==> Syncing runner node locations..." 
+ sync_runner_nodes +} + +for node in "${NODES[@]}"; do + echo "=== $node ===" + ssh $SSH_OPTS "$node" ' + found=0 + for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do + found=1 + exe=$(readlink -f /proc/$p/exe 2>/dev/null || echo "???") + dir=$(dirname "$(dirname "$exe")" 2>/dev/null || echo "???") + name=$(basename "$dir") + worker=$(ps aux | grep "Runner.Worker" | grep "$dir" | grep -v grep | awk "{print \$2}" | head -1) + [ -n "$worker" ] && status="BUSY" || status="idle" + rss=$(ps -p $p -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" || echo "?") + slurm=$(tr "\0" "\n" < /proc/$p/environ 2>/dev/null | grep -c "^PATH=.*slurm" || echo 0) + [ "$slurm" -gt 0 ] && slurm_ok="ok" || slurm_ok="MISSING" + printf " %-30s %5s slurm=%-7s %s MB\n" "$name" "$status" "$slurm_ok" "$rss" + done + [ "$found" -eq 0 ] && echo " (no runners)" + ' 2>/dev/null || echo " (unreachable)" + + if [ "${CGROUP_LIMIT:-0}" -gt 0 ]; then + rss=$(ssh $SSH_OPTS "$node" \ + "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" \ + 2>/dev/null || echo "?") + [[ "$rss" =~ ^[0-9]+$ ]] || rss=0 + echo " --- Total: ${rss} MB / ${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free) ---" + fi + echo "" +done diff --git a/misc/common/list-runners.sh b/misc/common/list-runners.sh new file mode 100644 index 0000000000..6077dfe47e --- /dev/null +++ b/misc/common/list-runners.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# List all runners combining GitHub API status with live node process info. +# +# Sourced by site wrappers (frontier/list-runners.sh, phoenix/list-runners.sh) +# after config.sh is loaded. Uses a parallel SSH sweep across all nodes +# simultaneously (one SSH per node regardless of runner count). +# Shows name, GitHub status, node, slurm availability, and RSS. +# If CGROUP_LIMIT > 0, also shows a per-node memory summary. 
+# +# Usage: bash list-runners.sh +set -euo pipefail + +declare -f sync_runner_nodes > /dev/null 2>&1 && { + echo "==> Syncing runner node locations..." + sync_runner_nodes +} + +tmpdir=$(mktemp -d) +trap 'rm -rf "$tmpdir"' EXIT + +sweep_all_nodes "$tmpdir" + +# Parse sweep results into associative arrays +declare -A runner_node runner_rss runner_slurm +for node in "${NODES[@]}"; do + while IFS= read -r line; do + read -r _s sweep_node dir rss slurm_ok <<< "$line" + runner_node["$dir"]="$sweep_node" + runner_rss["$dir"]="$rss" + runner_slurm["$dir"]="$slurm_ok" + done < <(grep '^RUNNER ' "$tmpdir/$node.out" 2>/dev/null || true) +done + +# Fetch GitHub API status +declare -A gh_status gh_busy +while read -r _id name status busy; do + gh_status["$name"]="$status" + gh_busy["$name"]="$busy" +done < <(gh_list_runners) + +# Print table +printf "%-25s %-8s %-20s %-8s %s\n" "NAME" "GITHUB" "NODE" "SLURM" "RSS" +printf "%s\n" "$(printf '%.0s-' {1..70})" + +while IFS= read -r dir; do + name=$(get_runner_name "$dir") + [ -z "$name" ] && continue + + [ "${gh_busy[$name]:-false}" = "true" ] && gh_col="BUSY" || gh_col="${gh_status[$name]:-unknown}" + + actual_node="${runner_node[$dir]:-}" + rss="${runner_rss[$dir]:-—}" + slurm="${runner_slurm[$dir]:-—}" + + if [ -z "$actual_node" ]; then + printf "%-25s %-8s %-20s %-8s %s\n" "$name" "$gh_col" "offline" "—" "—" + continue + fi + + # Flag stale runner.node entries + node_col="$actual_node" + if [ -f "$dir/runner.node" ]; then + recorded=$(cat "$dir/runner.node") + [ "$actual_node" != "$recorded" ] && node_col="${actual_node} *(stale: ${recorded})" + fi + + printf "%-25s %-8s %-20s %-8s %sMB\n" "$name" "$gh_col" "$node_col" "$slurm" "$rss" +done < <(find_runner_dirs) + +# Per-node memory summary (only when site has a cgroup limit) +if [ "${CGROUP_LIMIT:-0}" -gt 0 ]; then + echo "" + echo "=== Per-node memory ===" + for node in "${NODES[@]}"; do + count=$(ssh $SSH_OPTS "$node" \ + "ps aux | grep Runner.Listener | grep -v grep | wc 
-l" 2>/dev/null || echo 0) + rss=$(ssh $SSH_OPTS "$node" \ + "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" \ + 2>/dev/null || echo "?") + [[ "$rss" =~ ^[0-9]+$ ]] || rss=0 + echo " $node: $count runners, ${rss} MB / ${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free)" + done +fi diff --git a/misc/common/rebalance-runners.sh b/misc/common/rebalance-runners.sh new file mode 100644 index 0000000000..98546387fc --- /dev/null +++ b/misc/common/rebalance-runners.sh @@ -0,0 +1,174 @@ +#!/usr/bin/env bash +# Core rebalance algorithm for GitHub Actions runners. +# +# Sourced by site wrappers (frontier/rebalance-runners.sh, +# phoenix/rebalance-runners.sh) after config.sh is loaded. +# Discovers all runner directories, checks which node each is on, +# computes the optimal distribution, and moves runners to balance. +# Prefers moving idle runners over busy ones. Also places offline runners. +# +# Usage: bash rebalance-runners.sh # dry run +# APPLY=1 bash rebalance-runners.sh # execute +# APPLY=1 FORCE=1 bash rebalance-runners.sh # move busy runners too +set -euo pipefail + +declare -f sync_runner_nodes > /dev/null 2>&1 && { + echo "==> Syncing runner node locations..." 
+ sync_runner_nodes +} + +# Discover runners +declare -a dirs=() names=() +while IFS= read -r dir; do + name=$(get_runner_name "$dir") + [ -z "$name" ] && continue + dirs+=("$dir") + names+=("$name") +done < <(find_runner_dirs) + +num_nodes=${#NODES[@]} +num_runners=${#dirs[@]} +target=$(( num_runners / num_nodes )) +remainder=$(( num_runners % num_nodes )) + +echo "=== Current state ===" +echo "Runners: $num_runners across $num_nodes nodes" +echo "Target: $target per node (+1 on first $remainder nodes)" +echo "" + +# Map runners to nodes and check busy status +declare -A node_runners +declare -A runner_node runner_busy + +for node in "${NODES[@]}"; do node_runners[$node]=""; done + +for i in "${!dirs[@]}"; do + node=$(find_node "${dirs[$i]}") + runner_node[$i]="$node" + if [ "$node" != "offline" ]; then + node_runners[$node]="${node_runners[$node]:-} $i" + worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '${dirs[$i]}' | grep -v grep" 2>/dev/null || true) + [ -n "$worker" ] && runner_busy[$i]=1 || runner_busy[$i]=0 + else + runner_busy[$i]=0 + fi +done + +# Show current distribution +for node in "${NODES[@]}"; do + indices=(${node_runners[$node]:-}) + echo "$node: ${#indices[@]} runners" + for i in "${indices[@]}"; do + busy="" + [ "${runner_busy[$i]:-0}" = "1" ] && busy=" (BUSY)" + echo " ${names[$i]}$busy" + done +done + +offline=() +for i in "${!dirs[@]}"; do + [ "${runner_node[$i]}" = "offline" ] && offline+=("$i") +done +if [ ${#offline[@]} -gt 0 ]; then + echo "" + echo "OFFLINE:" + for i in "${offline[@]}"; do echo " ${names[$i]}"; done +fi +echo "" + +# Compute per-node targets +declare -A node_target +n=0 +for node in "${NODES[@]}"; do + node_target[$node]=$target + [ $n -lt $remainder ] && node_target[$node]=$(( target + 1 )) + n=$((n + 1)) +done + +# Plan moves: pull runners from overloaded nodes (idle first) +to_place=() +for node in "${NODES[@]}"; do + indices=(${node_runners[$node]:-}) + excess=$(( ${#indices[@]} - 
${node_target[$node]} )) + [ $excess -le 0 ] && continue + idle=() busy=() + for i in "${indices[@]}"; do + [ "${runner_busy[$i]:-0}" = "1" ] && busy+=("$i") || idle+=("$i") + done + moved=0 + for i in "${idle[@]}" "${busy[@]}"; do + [ $moved -ge $excess ] && break + to_place+=("$node $i") + moved=$((moved + 1)) + done +done + +# Add offline runners to be placed +for i in "${offline[@]}"; do to_place+=("offline $i"); done + +# Assign to underloaded nodes +moves=() +for entry in "${to_place[@]}"; do + read -r src idx <<< "$entry" + best="" best_deficit=-999 + for node in "${NODES[@]}"; do + cur=(${node_runners[$node]:-}) + deficit=$(( ${node_target[$node]} - ${#cur[@]} )) + [ $deficit -gt $best_deficit ] && best_deficit=$deficit && best=$node + done + [ -z "$best" ] || [ "$best_deficit" -le 0 ] && continue + moves+=("$src $best $idx") + # Update bookkeeping so subsequent assignments reflect this move + if [ "$src" != "offline" ]; then + new="" + for j in ${node_runners[$src]}; do [ "$j" != "$idx" ] && new="$new $j"; done + node_runners[$src]="$new" + fi + node_runners[$best]="${node_runners[$best]:-} $idx" +done + +if [ ${#moves[@]} -eq 0 ]; then + echo "Already balanced." + exit 0 +fi + +echo "=== Planned moves ===" +has_busy=false +for move in "${moves[@]}"; do + read -r src dst idx <<< "$move" + busy="" + [ "${runner_busy[$idx]:-0}" = "1" ] && busy=" (BUSY!)" && has_busy=true + echo " ${names[$idx]}: $src -> $dst$busy" +done +echo "" +echo "=== Target ===" +for node in "${NODES[@]}"; do + cur=(${node_runners[$node]:-}) + echo " $node: ${#cur[@]} runners" +done + +[ "$has_busy" = true ] && [ "${FORCE:-0}" != "1" ] && echo "" && echo "Set FORCE=1 to move busy runners." && exit 1 +[ "${APPLY:-0}" != "1" ] && echo "" && echo "Dry run — set APPLY=1 to execute." 
&& exit 0 + +echo "" +echo "=== Executing ===" +for move in "${moves[@]}"; do + read -r src dst idx <<< "$move" + echo "Moving ${names[$idx]}: $src -> $dst" + [ "$src" != "offline" ] && stop_runner "$src" "${dirs[$idx]}" + if start_runner "$dst" "${dirs[$idx]}"; then + echo "$dst" > "${dirs[$idx]}/runner.node" + pids=$(find_pids "$dst" "${dirs[$idx]}") + pid=${pids%% *} + if has_slurm "$dst" "$pid"; then + echo " OK: ${names[$idx]} started on $dst (slurm ok)" + else + echo " WARNING: ${names[$idx]} started on $dst but slurm MISSING from PATH" + fi + else + echo " ERROR: Failed to start ${names[$idx]} on $dst" + fi +done + +echo "" +bash "${SITE_SCRIPT_DIR}/check-runners.sh" diff --git a/misc/common/runner-lib.sh b/misc/common/runner-lib.sh index 4390611310..c0560e6bc5 100755 --- a/misc/common/runner-lib.sh +++ b/misc/common/runner-lib.sh @@ -5,8 +5,20 @@ # misc/phoenix/config.sh). Callers must define ORG, NODES, and SSH_OPTS # before sourcing this file. +# Default: no cgroup memory limit displayed. Override in site config (e.g. CGROUP_LIMIT=4096). +CGROUP_LIMIT=${CGROUP_LIMIT:-0} + # --- GitHub API --- +# List runners from the GitHub API, filtered to this site's label. +# Prints: id name status busy (one runner per line) +gh_list_runners() { + gh api "orgs/$ORG/actions/runners" --paginate \ + --jq ".runners[] + | select(.labels | map(.name) | index(\"$RUNNER_LABEL\")) + | \"\(.id) \(.name) \(.status) \(.busy)\"" +} + # Get a registration token for new runners. gh_registration_token() { gh api "orgs/$ORG/actions/runners/registration-token" --jq .token @@ -62,6 +74,40 @@ find_node() { echo "offline" } +# Check if a runner process has a slurm directory in its PATH. +# Works across sites regardless of the specific slurm installation path. +# Args: $1 = node, $2 = PID (or "PID rest..." 
— uses first token only) +has_slurm() { + local node="$1" pid="${2%% *}" + ssh $SSH_OPTS "$node" \ + "tr '\0' '\n' < /proc/$pid/environ 2>/dev/null | grep -q '^PATH=.*slurm'" \ + 2>/dev/null +} + +# Sweep all nodes in parallel, writing per-node result files to tmpdir. +# Each output line: RUNNER +# dir = runner directory derived from the Runner.Listener exe path +# slurm_ok = "ok" if slurm appears in the process PATH, "MISSING" otherwise +# Caller must create tmpdir and parse the output files. +# Args: $1 = tmpdir +sweep_all_nodes() { + local tmpdir="$1" node + for node in "${NODES[@]}"; do + ssh $SSH_OPTS "$node" ' + for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do + exe=$(readlink -f /proc/$p/exe 2>/dev/null || true) + [ -z "$exe" ] && continue + dir=$(dirname "$(dirname "$exe")") + rss=$(ps -p $p -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" || echo 0) + slurm=$(tr "\0" "\n" < /proc/$p/environ 2>/dev/null | grep -c "^PATH=.*slurm" || echo 0) + [ "$slurm" -gt 0 ] && slurm_ok="ok" || slurm_ok="MISSING" + echo "RUNNER '"$node"' $dir $rss $slurm_ok" + done + ' 2>/dev/null > "$tmpdir/$node.out" & + done + wait +} + # Start a runner on a node. # Uses a login shell (bash -lc) so site PATH (e.g. SLURM) is available. # Args: $1 = node, $2 = runner directory diff --git a/misc/frontier/check-runners.sh b/misc/frontier/check-runners.sh index 74c84eaa2e..3004508cc8 100755 --- a/misc/frontier/check-runners.sh +++ b/misc/frontier/check-runners.sh @@ -1,32 +1,8 @@ #!/usr/bin/env bash -# Quick health check for GitHub Actions runners across Frontier login nodes. -# -# SSHes to each login node, finds Runner.Listener processes, and shows -# runner name, status (idle/BUSY), and memory usage. +# Check runner health across all Frontier login nodes. +# Thin wrapper — see misc/common/check-runners.sh for the implementation. 
# # Usage: bash check-runners.sh -set -euo pipefail - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" - -echo "==> Syncing runner node locations..." -sync_runner_nodes - -for node in "${NODES[@]}"; do - echo "=== $node ===" - ssh $SSH_OPTS "$node" ' - found=0 - for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do - found=1 - cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || echo "???") - worker=$(ps aux | grep "Runner.Worker" | grep "$cwd" | grep -v grep | awk "{print \$2}" | head -1) - [ -n "$worker" ] && status="BUSY" || status="idle" - rss=$(ps -p $p -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" || echo "?") - name=$(basename "$cwd") - printf " %-30s %5s %s MB\n" "$name" "$status" "$rss" - done - [ "$found" -eq 0 ] && echo " (no runners)" - ' 2>/dev/null || echo " (unreachable)" - echo "" -done +source "$SCRIPT_DIR/../common/check-runners.sh" diff --git a/misc/frontier/config.sh b/misc/frontier/config.sh index 48089e1aac..d5c7f4398e 100755 --- a/misc/frontier/config.sh +++ b/misc/frontier/config.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash # Shared configuration for Frontier GitHub Actions runner management. # -# Sourced by all other scripts. Provides Frontier constants, GitHub API -# helpers, and login-node process management functions. +# Sourced by all other scripts. Provides Frontier constants and +# site-specific functions. Common functions live in ../common/runner-lib.sh. # --- Frontier constants --- ORG="MFlowCode" @@ -15,17 +15,6 @@ SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=10 -o BatchMode=yes -o S source "$(dirname "${BASH_SOURCE[0]}")/../common/runner-lib.sh" -# --- GitHub API --- - -# List Frontier runners from the GitHub API. 
-# Prints: id name status busy (one runner per line) -gh_list_runners() { - gh api "orgs/$ORG/actions/runners" --paginate \ - --jq ".runners[] - | select(.labels | map(.name) | index(\"$RUNNER_LABEL\")) - | \"\(.id) \(.name) \(.status) \(.busy)\"" -} - # --- Local filesystem --- # Find all runner directories on shared storage. @@ -43,29 +32,23 @@ find_runner_dirs() { # every primary script to ensure runner.node always reflects reality, # even if a runner was manually restarted on a different node. sync_runner_nodes() { - local tmpdir + local tmpdir node tmpdir=$(mktemp -d) trap 'rm -rf "$tmpdir"' RETURN + sweep_all_nodes "$tmpdir" + for node in "${NODES[@]}"; do - ( - ssh $SSH_OPTS "$node" ' - for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do - cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true) - [ -n "$cwd" ] && echo "'"$node"' $cwd" - done - ' 2>/dev/null | grep -E '^[a-z0-9]+ /' - ) > "$tmpdir/$node" & + while IFS= read -r line; do + local dir sweep_node + read -r _s sweep_node dir _rss _slurm <<< "$line" + [ -f "$dir/runner.node" ] || continue + local recorded + recorded=$(cat "$dir/runner.node" 2>/dev/null || echo "") + if [ "$sweep_node" != "$recorded" ]; then + echo "==> $(basename "$dir"): runner.node updated $recorded -> $sweep_node" + echo "$sweep_node" > "$dir/runner.node" + fi + done < <(grep '^RUNNER ' "$tmpdir/$node.out" 2>/dev/null || true) done - wait - - while IFS=' ' read -r node dir; do - [ -f "$dir/runner.node" ] || continue - local recorded - recorded=$(cat "$dir/runner.node" 2>/dev/null || echo "") - if [ "$node" != "$recorded" ]; then - echo "==> $(basename "$dir"): runner.node updated $recorded -> $node" - echo "$node" > "$dir/runner.node" - fi - done < <(cat "$tmpdir"/*) } diff --git a/misc/frontier/list-runners.sh b/misc/frontier/list-runners.sh index 859fb5cd19..1ef2e541e5 100755 --- a/misc/frontier/list-runners.sh +++ b/misc/frontier/list-runners.sh @@ -1,89 +1,8 @@ #!/usr/bin/env bash -# List all Frontier 
runners, combining GitHub API status with login-node process info. -# -# Uses a parallel SSH sweep across all 11 login nodes simultaneously to avoid -# the overhead of serial per-runner node discovery. Each node is queried once; -# results are correlated with GitHub API status and the local runner directories. +# List all Frontier runners combining GitHub API status with live node process info. +# Thin wrapper — see misc/common/list-runners.sh for the implementation. # # Usage: bash list-runners.sh -set -euo pipefail - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" - -echo "==> Syncing runner node locations..." -sync_runner_nodes - -tmpdir=$(mktemp -d) -trap 'rm -rf "$tmpdir"' EXIT - -# --- Parallel SSH sweep across all login nodes --- -# Each node prints lines in the format: RUNNER -# The RUNNER sentinel prefix allows stripping MOTD noise with grep. -for node in "${NODES[@]}"; do - ssh $SSH_OPTS "$node" ' - for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do - cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true) - rss=$(ps -p $p -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" || echo 0) - [ -n "$cwd" ] && echo "RUNNER '"$node"' $cwd $rss" - done - ' 2>/dev/null > "$tmpdir/$node.out" & -done - -wait - -# --- Build associative arrays from sweep results --- -declare -A runner_node runner_rss -for node in "${NODES[@]}"; do - while IFS= read -r line; do - # Each line: RUNNER - read -r _sentinel sweep_node dir rss <<< "$line" - runner_node["$dir"]="$sweep_node" - runner_rss["$dir"]="$rss" - done < <(grep '^RUNNER ' "$tmpdir/$node.out" 2>/dev/null || true) -done - -# --- Fetch GitHub API status --- -declare -A gh_status gh_busy -while read -r _id name status busy; do - gh_status["$name"]="$status" - gh_busy["$name"]="$busy" -done < <(gh_list_runners) - -# --- Print table --- -printf "%-25s %-8s %-14s %s\n" "NAME" "GITHUB" "NODE" "RSS" -printf "%s\n" "$(printf '%.0s-' {1..60})" - -while IFS= 
read -r dir; do - name=$(get_runner_name "$dir") - [ -z "$name" ] && continue - - # GitHub status column - api_status="${gh_status[$name]:-unknown}" - api_busy="${gh_busy[$name]:-false}" - if [ "$api_busy" = "true" ]; then - gh_col="BUSY" - else - gh_col="$api_status" - fi - - # Node and RSS from parallel sweep - actual_node="${runner_node[$dir]:-}" - rss="${runner_rss[$dir]:-}" - - if [ -z "$actual_node" ]; then - printf "%-25s %-8s %-14s %s\n" "$name" "$gh_col" "offline" "—" - continue - fi - - # Compare sweep result to recorded runner.node; flag stale entries - node_col="$actual_node" - if [ -f "$dir/runner.node" ]; then - recorded=$(cat "$dir/runner.node") - if [ "$actual_node" != "$recorded" ]; then - node_col="${actual_node} *(stale: ${recorded})" - fi - fi - - printf "%-25s %-8s %-14s %sMB\n" "$name" "$gh_col" "$node_col" "$rss" -done < <(find_runner_dirs) +source "$SCRIPT_DIR/../common/list-runners.sh" diff --git a/misc/frontier/rebalance-runners.sh b/misc/frontier/rebalance-runners.sh index 640f24de17..e567e10e5c 100755 --- a/misc/frontier/rebalance-runners.sh +++ b/misc/frontier/rebalance-runners.sh @@ -1,167 +1,11 @@ #!/usr/bin/env bash # Automatically rebalance Frontier runners across login nodes. -# -# Discovers all runner directories, checks which node each is on, -# computes the optimal distribution, and moves runners to balance. -# Prefers moving idle runners over busy ones. Also places offline runners. +# Thin wrapper — see misc/common/rebalance-runners.sh for the implementation. # # Usage: bash rebalance-runners.sh # dry run # APPLY=1 bash rebalance-runners.sh # execute # APPLY=1 FORCE=1 bash rebalance-runners.sh # move busy runners too -set -euo pipefail - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SITE_SCRIPT_DIR="$SCRIPT_DIR" source "$SCRIPT_DIR/config.sh" - -echo "==> Syncing runner node locations..." 
-sync_runner_nodes - -# Discover runners -declare -a dirs=() names=() -while IFS= read -r dir; do - name=$(get_runner_name "$dir") - [ -z "$name" ] && continue - dirs+=("$dir") - names+=("$name") -done < <(find_runner_dirs) - -num_nodes=${#NODES[@]} -num_runners=${#dirs[@]} -target=$(( num_runners / num_nodes )) -remainder=$(( num_runners % num_nodes )) - -echo "=== Current state ===" -echo "Runners: $num_runners across $num_nodes nodes" -echo "Target: $target per node (+1 on first $remainder nodes)" -echo "" - -# Map runners to nodes and check busy status -declare -A node_runners -declare -A runner_node runner_busy - -for node in "${NODES[@]}"; do node_runners[$node]=""; done - -for i in "${!dirs[@]}"; do - node=$(find_node "${dirs[$i]}") - runner_node[$i]="$node" - if [ "$node" != "offline" ]; then - node_runners[$node]="${node_runners[$node]:-} $i" - worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '${dirs[$i]}' | grep -v grep" 2>/dev/null || true) - [ -n "$worker" ] && runner_busy[$i]=1 || runner_busy[$i]=0 - else - runner_busy[$i]=0 - fi -done - -# Show current distribution -for node in "${NODES[@]}"; do - indices=(${node_runners[$node]:-}) - echo "$node: ${#indices[@]} runners" - for i in "${indices[@]}"; do - busy="" - [ "${runner_busy[$i]:-0}" = "1" ] && busy=" (BUSY)" - echo " ${names[$i]}$busy" - done -done - -offline=() -for i in "${!dirs[@]}"; do - [ "${runner_node[$i]}" = "offline" ] && offline+=("$i") -done -if [ ${#offline[@]} -gt 0 ]; then - echo "" - echo "OFFLINE:" - for i in "${offline[@]}"; do echo " ${names[$i]}"; done -fi -echo "" - -# Compute per-node targets -declare -A node_target -n=0 -for node in "${NODES[@]}"; do - node_target[$node]=$target - [ $n -lt $remainder ] && node_target[$node]=$(( target + 1 )) - n=$((n + 1)) -done - -# Plan moves: pull runners from overloaded nodes (idle first) -to_place=() -for node in "${NODES[@]}"; do - indices=(${node_runners[$node]:-}) - excess=$(( ${#indices[@]} - ${node_target[$node]} 
)) - [ $excess -le 0 ] && continue - idle=() busy=() - for i in "${indices[@]}"; do - [ "${runner_busy[$i]:-0}" = "1" ] && busy+=("$i") || idle+=("$i") - done - moved=0 - for i in "${idle[@]}" "${busy[@]}"; do - [ $moved -ge $excess ] && break - to_place+=("$node $i") - moved=$((moved + 1)) - done -done - -# Add offline runners to be placed -for i in "${offline[@]}"; do to_place+=("offline $i"); done - -# Assign to underloaded nodes -moves=() -for entry in "${to_place[@]}"; do - read -r src idx <<< "$entry" - best="" best_deficit=-999 - for node in "${NODES[@]}"; do - cur=(${node_runners[$node]:-}) - deficit=$(( ${node_target[$node]} - ${#cur[@]} )) - [ $deficit -gt $best_deficit ] && best_deficit=$deficit && best=$node - done - [ -z "$best" ] || [ "$best_deficit" -le 0 ] && continue - moves+=("$src $best $idx") - # Update bookkeeping so subsequent assignments reflect this move - if [ "$src" != "offline" ]; then - new="" - for j in ${node_runners[$src]}; do [ "$j" != "$idx" ] && new="$new $j"; done - node_runners[$src]="$new" - fi - node_runners[$best]="${node_runners[$best]:-} $idx" -done - -if [ ${#moves[@]} -eq 0 ]; then - echo "Already balanced." - exit 0 -fi - -echo "=== Planned moves ===" -has_busy=false -for move in "${moves[@]}"; do - read -r src dst idx <<< "$move" - busy="" - [ "${runner_busy[$idx]:-0}" = "1" ] && busy=" (BUSY!)" && has_busy=true - echo " ${names[$idx]}: $src -> $dst$busy" -done -echo "" -echo "=== Target ===" -for node in "${NODES[@]}"; do - cur=(${node_runners[$node]:-}) - echo " $node: ${#cur[@]} runners" -done - -[ "$has_busy" = true ] && [ "${FORCE:-0}" != "1" ] && echo "" && echo "Set FORCE=1 to move busy runners." && exit 1 -[ "${APPLY:-0}" != "1" ] && echo "" && echo "Dry run — set APPLY=1 to execute." 
&& exit 0 - -echo "" -echo "=== Executing ===" -for move in "${moves[@]}"; do - read -r src dst idx <<< "$move" - echo "Moving ${names[$idx]}: $src -> $dst" - [ "$src" != "offline" ] && stop_runner "$src" "${dirs[$idx]}" - if start_runner "$dst" "${dirs[$idx]}"; then - echo "$dst" > "${dirs[$idx]}/runner.node" - echo " OK: ${names[$idx]} started on $dst" - else - echo " ERROR: Failed to start ${names[$idx]} on $dst" - fi -done - -echo "" -bash "$SCRIPT_DIR/check-runners.sh" +source "$SCRIPT_DIR/../common/rebalance-runners.sh" diff --git a/misc/phoenix/check-runners.sh b/misc/phoenix/check-runners.sh index b7ce2c044e..a5e4dbd8a2 100755 --- a/misc/phoenix/check-runners.sh +++ b/misc/phoenix/check-runners.sh @@ -1,34 +1,8 @@ -#!/bin/bash -# Quick health check for GitHub Actions runners across Phoenix login nodes. -# -# Lighter than list-runners.sh — doesn't query each runner individually, -# just shows per-node counts and memory. Use list-runners.sh for details. +#!/usr/bin/env bash +# Check runner health across all Phoenix login nodes. +# Thin wrapper — see misc/common/check-runners.sh for the implementation. 
# # Usage: bash check-runners.sh - -set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" - -for node in "${NODES[@]}"; do - echo "=== $node ===" - ssh $SSH_OPTS "$node" ' - for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do - cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || echo "???") - has_slurm=$(cat /proc/$p/environ 2>/dev/null | tr "\0" "\n" | grep -c /opt/slurm || echo 0) - worker=$(ps aux | grep "Runner.Worker" | grep "$cwd" | grep -v grep | awk "{print \$2}" | head -1) - [ -n "$worker" ] && status="BUSY" || status="idle" - rss=$(ps -p $p -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" || echo "?") - name=$(basename "$cwd") - parent=$(basename $(dirname "$cwd")) - slurm_ok="ok" - [ "$has_slurm" -eq 0 ] && slurm_ok="MISSING" - printf " %-40s %5s slurm=%-7s %s MB\n" "$parent/$name" "$status" "$slurm_ok" "$rss" - done - ' 2>/dev/null || echo " (unreachable)" - - rss=$(ssh $SSH_OPTS "$node" "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" 2>/dev/null || echo "?") - [[ "$rss" =~ ^[0-9]+$ ]] || rss=0 - echo " --- Total: ${rss} MB / ${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free) ---" - echo "" -done +source "$SCRIPT_DIR/../common/check-runners.sh" diff --git a/misc/phoenix/config.sh b/misc/phoenix/config.sh index 9874b38196..e4fadfb225 100755 --- a/misc/phoenix/config.sh +++ b/misc/phoenix/config.sh @@ -1,8 +1,8 @@ #!/bin/bash # Shared configuration for Phoenix GitHub Actions runner management. # -# Sourced by all other scripts. Provides Phoenix constants, GitHub API -# helpers, and login-node process management functions. +# Sourced by all other scripts. Provides Phoenix constants and +# site-specific functions. Common functions live in ../common/runner-lib.sh. 
# --- Phoenix constants --- ORG="MFlowCode" @@ -21,17 +21,6 @@ RUNNER_PARENT_DIRS=( source "$(dirname "${BASH_SOURCE[0]}")/../common/runner-lib.sh" -# --- GitHub API --- - -# List Phoenix runners from the GitHub API. -# Prints: id name status busy (one runner per line) -gh_list_runners() { - gh api "orgs/$ORG/actions/runners" --paginate \ - --jq ".runners[] - | select(.labels | map(.name) | index(\"$RUNNER_LABEL\")) - | \"\(.id) \(.name) \(.status) \(.busy)\"" -} - # --- Local filesystem --- # Find all runner directories on shared storage. @@ -43,15 +32,3 @@ find_runner_dirs() { done done } - -# --- Login-node process management (Phoenix-specific) --- - -# Check if a runner process has /opt/slurm in PATH. -# Args: $1 = node, $2 = PID -has_slurm() { - local count - count=$(ssh $SSH_OPTS "$1" \ - "cat /proc/${2%% *}/environ 2>/dev/null | tr '\0' '\n' | grep -c /opt/slurm" \ - 2>/dev/null || echo 0) - [ "$count" -gt 0 ] -} diff --git a/misc/phoenix/create-runner.sh b/misc/phoenix/create-runner.sh index c0a7899f85..9991c43c4d 100755 --- a/misc/phoenix/create-runner.sh +++ b/misc/phoenix/create-runner.sh @@ -83,6 +83,7 @@ echo " Configured." # Start echo "Starting on $node..." if start_runner "$node" "$runner_dir"; then + echo "$node" > "$runner_dir/runner.node" pids=$(find_pids "$node" "$runner_dir") pid=${pids%% *} if has_slurm "$node" "$pid"; then diff --git a/misc/phoenix/list-runners.sh b/misc/phoenix/list-runners.sh index 76ac25a9bf..0d2571803f 100755 --- a/misc/phoenix/list-runners.sh +++ b/misc/phoenix/list-runners.sh @@ -1,73 +1,8 @@ -#!/bin/bash -# List all Phoenix runners, combining GitHub API status with login-node process info. -# -# Shows both what GitHub thinks (online/offline/busy) and the actual process -# state on the login nodes (which node, slurm PATH, memory). +#!/usr/bin/env bash +# List all Phoenix runners combining GitHub API status with live node process info. +# Thin wrapper — see misc/common/list-runners.sh for the implementation. 
# # Usage: bash list-runners.sh - -set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" - -printf "%-25s %-8s %-22s %-8s %6s %s\n" \ - "NAME" "GITHUB" "NODE" "SLURM" "RSS" "DIRECTORY" -printf "%s\n" "$(printf '%.0s-' {1..100})" - -# Get GitHub API status for all Phoenix runners -declare -A gh_status gh_busy -while read -r id name status busy; do - gh_status[$name]="$status" - gh_busy[$name]="$busy" -done <<< "$(gh_list_runners)" - -# Walk local runner directories and cross-reference -while IFS= read -r dir; do - name=$(get_runner_name "$dir") - [ -z "$name" ] && continue - - # GitHub status - api_status="${gh_status[$name]:-unknown}" - api_busy="${gh_busy[$name]:-false}" - if [ "$api_busy" = "true" ]; then - gh_col="BUSY" - else - gh_col="$api_status" - fi - - # Node status - node=$(find_node "$dir") - if [ "$node" = "offline" ]; then - printf "%-25s %-8s %-22s %-8s %6s %s\n" \ - "$name" "$gh_col" "—" "—" "—" "$dir" - continue - fi - - # Process details (one SSH call) - info=$(ssh $SSH_OPTS "$node" ' - for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do - cwd=$(readlink -f /proc/$p/cwd 2>/dev/null || true) - if [ "$cwd" = "'"$dir"'" ]; then - slurm=$(cat /proc/$p/environ 2>/dev/null | tr "\0" "\n" | grep -c /opt/slurm || echo 0) - [ "$slurm" -gt 0 ] && s="ok" || s="MISSING" - rss=$(ps -p $p -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" || echo "?") - echo "$s $rss" - exit - fi - done - echo "? ?" - ' 2>/dev/null || echo "? 
?") - read -r slurm rss <<< "$info" - - printf "%-25s %-8s %-22s %-8s %5sMB %s\n" \ - "$name" "$gh_col" "$node" "$slurm" "$rss" "$dir" -done < <(find_runner_dirs) - -echo "" -echo "=== Per-node memory ===" -for node in "${NODES[@]}"; do - count=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Listener | grep -v grep | wc -l" 2>/dev/null || echo 0) - rss=$(ssh $SSH_OPTS "$node" "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" 2>/dev/null || echo "?") - [[ "$rss" =~ ^[0-9]+$ ]] || rss=0 - echo " $node: $count runners, ${rss} MB / ${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free)" -done +source "$SCRIPT_DIR/../common/list-runners.sh" diff --git a/misc/phoenix/rebalance-runners.sh b/misc/phoenix/rebalance-runners.sh index ca6d65a8b2..3e961f622c 100755 --- a/misc/phoenix/rebalance-runners.sh +++ b/misc/phoenix/rebalance-runners.sh @@ -1,169 +1,11 @@ -#!/bin/bash +#!/usr/bin/env bash # Automatically rebalance Phoenix runners across login nodes. -# -# Discovers all runner directories, checks which node each is on, -# computes the optimal distribution, and moves runners to balance. -# Prefers moving idle runners over busy ones. Also places offline runners. +# Thin wrapper — see misc/common/rebalance-runners.sh for the implementation. 
# # Usage: bash rebalance-runners.sh # dry run # APPLY=1 bash rebalance-runners.sh # execute # APPLY=1 FORCE=1 bash rebalance-runners.sh # move busy runners too - -set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SITE_SCRIPT_DIR="$SCRIPT_DIR" source "$SCRIPT_DIR/config.sh" - -# Discover runners -declare -a dirs=() names=() -while IFS= read -r dir; do - name=$(get_runner_name "$dir") - [ -z "$name" ] && continue - dirs+=("$dir") - names+=("$name") -done < <(find_runner_dirs) - -num_nodes=${#NODES[@]} -num_runners=${#dirs[@]} -target=$(( num_runners / num_nodes )) -remainder=$(( num_runners % num_nodes )) - -echo "=== Current state ===" -echo "Runners: $num_runners across $num_nodes nodes" -echo "Target: $target per node (+1 on first $remainder nodes)" -echo "" - -# Map runners to nodes -declare -A node_runners # node -> space-separated indices -declare -A runner_node runner_busy - -for node in "${NODES[@]}"; do node_runners[$node]=""; done - -for i in "${!dirs[@]}"; do - node=$(find_node "${dirs[$i]}") - runner_node[$i]="$node" - if [ "$node" != "offline" ]; then - node_runners[$node]="${node_runners[$node]:-} $i" - worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '${dirs[$i]}' | grep -v grep" 2>/dev/null || true) - [ -n "$worker" ] && runner_busy[$i]=1 || runner_busy[$i]=0 - else - runner_busy[$i]=0 - fi -done - -# Show current distribution -for node in "${NODES[@]}"; do - indices=(${node_runners[$node]:-}) - echo "$node: ${#indices[@]} runners" - for i in "${indices[@]}"; do - busy="" - [ "${runner_busy[$i]:-0}" = "1" ] && busy=" (BUSY)" - echo " ${names[$i]}$busy" - done -done - -offline=() -for i in "${!dirs[@]}"; do - [ "${runner_node[$i]}" = "offline" ] && offline+=("$i") -done -if [ ${#offline[@]} -gt 0 ]; then - echo "" - echo "OFFLINE:" - for i in "${offline[@]}"; do echo " ${names[$i]}"; done -fi -echo "" - -# Compute targets -declare -A node_target -n=0 -for node in "${NODES[@]}"; do - 
node_target[$node]=$target - [ $n -lt $remainder ] && node_target[$node]=$(( target + 1 )) - n=$((n + 1)) -done - -# Plan moves: collect runners from overloaded nodes (idle first) -to_place=() -for node in "${NODES[@]}"; do - indices=(${node_runners[$node]:-}) - excess=$(( ${#indices[@]} - ${node_target[$node]} )) - [ $excess -le 0 ] && continue - idle=() busy=() - for i in "${indices[@]}"; do - [ "${runner_busy[$i]:-0}" = "1" ] && busy+=("$i") || idle+=("$i") - done - moved=0 - for i in "${idle[@]}" "${busy[@]}"; do - [ $moved -ge $excess ] && break - to_place+=("$node $i") - moved=$((moved + 1)) - done -done - -# Add offline runners -for i in "${offline[@]}"; do to_place+=("offline $i"); done - -# Assign to underloaded nodes -moves=() -for entry in "${to_place[@]}"; do - read -r src idx <<< "$entry" - best="" best_deficit=-999 - for node in "${NODES[@]}"; do - cur=(${node_runners[$node]:-}) - deficit=$(( ${node_target[$node]} - ${#cur[@]} )) - [ $deficit -gt $best_deficit ] && best_deficit=$deficit && best=$node - done - [ -z "$best" ] || [ "$best_deficit" -le 0 ] && continue - moves+=("$src $best $idx") - # Update bookkeeping - if [ "$src" != "offline" ]; then - new="" - for j in ${node_runners[$src]}; do [ "$j" != "$idx" ] && new="$new $j"; done - node_runners[$src]="$new" - fi - node_runners[$best]="${node_runners[$best]:-} $idx" -done - -if [ ${#moves[@]} -eq 0 ]; then - echo "Already balanced." - exit 0 -fi - -echo "=== Planned moves ===" -has_busy=false -for move in "${moves[@]}"; do - read -r src dst idx <<< "$move" - busy="" - [ "${runner_busy[$idx]:-0}" = "1" ] && busy=" (BUSY!)" && has_busy=true - echo " ${names[$idx]}: $src -> $dst$busy" -done -echo "" -echo "=== Target ===" -for node in "${NODES[@]}"; do - cur=(${node_runners[$node]:-}) - echo " $node: ${#cur[@]} runners" -done - -[ "$has_busy" = true ] && [ "${FORCE:-0}" != "1" ] && echo "" && echo "Set FORCE=1 to move busy runners." 
&& exit 1 -[ "${APPLY:-0}" != "1" ] && echo "" && echo "Dry run — set APPLY=1 to execute." && exit 0 - -echo "" -echo "=== Executing ===" -for move in "${moves[@]}"; do - read -r src dst idx <<< "$move" - echo "Moving ${names[$idx]}: $src -> $dst" - [ "$src" != "offline" ] && stop_runner "$src" "${dirs[$idx]}" - if start_runner "$dst" "${dirs[$idx]}"; then - pids=$(find_pids "$dst" "${dirs[$idx]}") - pid=${pids%% *} - if has_slurm "$dst" "$pid"; then - echo " OK: PID $pid, slurm in PATH" - else - echo " WARNING: slurm MISSING" - fi - else - echo " ERROR: Failed to start" - fi -done - -echo "" -bash "$SCRIPT_DIR/check-runners.sh" +source "$SCRIPT_DIR/../common/rebalance-runners.sh" From 03ed7777891f12a01b667596d021899fb3257ad2 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 20:25:55 -0400 Subject: [PATCH 14/20] refactor: remove superseded restart-offline-runners.sh, update READMEs restart-offline-runners.sh is superseded by rebalance-runners.sh, which handles offline runners as part of its distribution algorithm and also calls sync_runner_nodes to self-heal stale runner.node files. Update frontier/README.md: remove restart-offline-runners.sh references, replace CWD-based discovery language with EXE-based, add rebalance-runners.sh. Update phoenix/README.md: fix config.sh description (functions now in runner-lib.sh), fix list-runners.sh description (now parallel sweep), fix slurm PATH wording (portable grep, not /opt/slurm). Co-Authored-By: Claude Sonnet 4.6 --- misc/frontier/README.md | 37 +++++---- misc/frontier/restart-offline-runners.sh | 99 ------------------------ misc/phoenix/README.md | 8 +- 3 files changed, 24 insertions(+), 120 deletions(-) delete mode 100755 misc/frontier/restart-offline-runners.sh diff --git a/misc/frontier/README.md b/misc/frontier/README.md index f6a2175350..e370de2662 100644 --- a/misc/frontier/README.md +++ b/misc/frontier/README.md @@ -14,35 +14,38 @@ starting it on another — no binary copying needed. 
Each runner directory contains a `runner.node` file recording which login node it was last started on. This is used as a fallback hint when restarting offline runners. The authoritative source of truth for whether a runner is running (and -on which node) is CWD-based process discovery — not any PID file. +on which node) is EXE-based process discovery via `/proc/$pid/exe` — not any +PID file. -`runner.node` is self-healing: `restart-offline-runners.sh` detects when a -runner is actually running on a different node than `runner.node` records (e.g. -after a manual restart) and corrects the file automatically. +`runner.node` is self-healing: `rebalance-runners.sh` calls `sync_runner_nodes` +at startup, which sweeps all nodes in parallel and corrects any stale +`runner.node` files automatically. Runners occasionally die due to OLCF's firewall/proxy dropping long-lived TCP -connections to GitHub's broker. The `restart-offline-runners.sh` script handles -recovery. Login nodes vary in stability — if a runner keeps dying on a +connections to GitHub's broker. Run `rebalance-runners.sh` to restart and +redistribute them. Login nodes vary in stability — if a runner keeps dying on a particular node, move it to a quieter one (login01 tends to have low load). ## Quick Reference ```bash -# List all runners with GitHub status, node, and memory usage +# List all runners with GitHub status, node, slurm, and memory usage bash list-runners.sh # Check runner health across all login nodes bash check-runners.sh +# Restart offline runners and rebalance across all 11 nodes +bash rebalance-runners.sh # dry run +APPLY=1 bash rebalance-runners.sh # execute +APPLY=1 FORCE=1 bash rebalance-runners.sh # move busy runners too + # Deploy a new runner on a specific node bash make-runner.sh 23 login01 # Deploy multiple runners across nodes (e.g. 
runners 23, 24, 25) bash deploy-runners.sh 23 login01 login02 login03 -# Restart all offline runners in place -bash restart-offline-runners.sh - # Move a runner to a different login node bash move-runner.sh frontier-1 login01 @@ -58,14 +61,14 @@ APPLY=1 bash ../common/rerun-failed.sh | Script | Purpose | |---|---| -| `config.sh` | Shared configuration, constants, GitHub API helpers, and CWD-based process management functions. Sourced by all other scripts. | -| `check-runners.sh` | SSH to each login node, show Runner.Listener processes with name, status (idle/BUSY), and RSS memory. | -| `list-runners.sh` | List all runners with GitHub API status, actual node (from parallel SSH sweep), and RSS memory. Flags stale `runner.node` entries. | +| `config.sh` | Shared configuration: Frontier constants, `find_runner_dirs()`, and `sync_runner_nodes()`. Sources `../common/runner-lib.sh` for shared functions. | +| `check-runners.sh` | SSH to each login node, show Runner.Listener processes with name, status (idle/BUSY), slurm PATH, and RSS memory. | +| `list-runners.sh` | List all runners with GitHub API status, actual node (from parallel SSH sweep), slurm status, and RSS memory. Flags stale `runner.node` entries. | +| `rebalance-runners.sh` | Sync node locations, compute optimal distribution, and move runners across all 11 login nodes. Handles offline runners. Dry run by default. | | `make-runner.sh` | Download runner binary, register with GitHub via API, start on target node. Usage: `make-runner.sh [login-node]` | | `move-runner.sh` | Move a runner to a different login node: stop on current node, update `runner.node`, start on target. Usage: `move-runner.sh ` | | `deploy-runners.sh` | Deploy multiple runners across login nodes in parallel. Usage: `deploy-runners.sh [node2 ...]` | -| `restart-offline-runners.sh` | Query GitHub for offline frontier runners, locate via CWD-based discovery, stop stale processes, then restart in parallel. Self-heals stale `runner.node` files. 
Prints final status. | -| `stop-runner.sh` | Locate runner via CWD-based discovery, stop the process, and deregister from GitHub. Usage: `stop-runner.sh ` | +| `stop-runner.sh` | Locate runner via EXE-based discovery, stop the process, and deregister from GitHub. Usage: `stop-runner.sh ` | | `../common/rerun-failed.sh` | Rerun failed GitHub Actions workflows on open PRs and master. No site-specific code. | ## Troubleshooting @@ -77,9 +80,9 @@ bash move-runner.sh frontier-1 login01 ``` **Multiple runners OFFLINE at once** — Usually a transient OLCF network blip -to GitHub. Run `restart-offline-runners.sh` to recover all at once. +to GitHub. Run `rebalance-runners.sh` to recover and redistribute all at once. **Runner appears offline on GitHub but process is running** — GitHub status can -lag. `restart-offline-runners.sh` uses CWD-based process discovery first: if a +lag. `rebalance-runners.sh` uses EXE-based process discovery first: if a process is found running, it will stop it before restarting, preventing duplicate runner processes. diff --git a/misc/frontier/restart-offline-runners.sh b/misc/frontier/restart-offline-runners.sh deleted file mode 100755 index 5c1f984a22..0000000000 --- a/misc/frontier/restart-offline-runners.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env bash -# Restart all offline frontier runners. -# -# Queries GitHub for offline frontier runners, locates each via CWD-based -# process discovery, stops any stale processes, then restarts in parallel. -# Falls back to runner.node for the target node if the runner is truly offline. -# -# Usage: bash restart-offline-runners.sh -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" - -echo "==> Syncing runner node locations..." -sync_runner_nodes - -echo "==> Checking for offline frontier runners..." 
- -# Collect offline runner names from GitHub API -mapfile -t OFFLINE_NAMES < <( - gh_list_runners | while read -r id name status busy; do - [ "$status" = "offline" ] && echo "$name" - done -) - -if [ ${#OFFLINE_NAMES[@]} -eq 0 ]; then - echo "==> All frontier runners are online. Nothing to do." - exit 0 -fi - -echo "==> Offline runners: ${OFFLINE_NAMES[*]}" - -restart_one() { - local runner_name="$1" - local dir="${SHARED_DIR}/${runner_name}" - - if [ ! -d "$dir" ]; then - echo "WARN: No directory for ${runner_name}, skipping." - return - fi - - # Determine the recorded node from runner.node - local recorded_node target_node - if [ -f "${dir}/runner.node" ]; then - recorded_node=$(cat "${dir}/runner.node") - else - echo "WARN: No runner.node for ${runner_name}, skipping." - return - fi - - # Check if the runner is actually already running somewhere (GitHub may lag) - local actual_node - actual_node=$(find_node "$dir") - - if [ "$actual_node" != "offline" ]; then - # Self-healing: if the runner is on a different node than runner.node records, - # update runner.node to reflect reality before stopping and restarting. - if [ "$actual_node" != "$recorded_node" ]; then - echo "==> ${runner_name}: found on ${actual_node}, runner.node says ${recorded_node} — updating runner.node." - echo "$actual_node" > "${dir}/runner.node" - recorded_node="$actual_node" - fi - echo "==> ${runner_name} appears running on ${actual_node} (GitHub may lag) — stopping first..." - stop_runner "$actual_node" "$dir" - # Restart where it was actually running - target_node="$actual_node" - else - # Runner is truly offline; fall back to the last known node - target_node="$recorded_node" - fi - - echo "==> Starting ${runner_name} on ${target_node}..." - if start_runner "$target_node" "$dir"; then - echo "$target_node" > "${dir}/runner.node" - echo " ${runner_name}: started on ${target_node}." - else - echo " First start attempt failed. Retrying in 5 seconds..." 
- sleep 5 - if start_runner "$target_node" "$dir"; then - echo "$target_node" > "${dir}/runner.node" - echo " ${runner_name}: started on ${target_node}." - else - echo " ${runner_name}: ERROR — failed to start on ${target_node} after retry." >&2 - fi - fi -} - -# Restart all offline runners in parallel -for name in "${OFFLINE_NAMES[@]}"; do - restart_one "$name" & -done - -wait - -echo "" -echo "==> Final status:" -gh_list_runners | while read -r id name status busy; do - printf " %-30s %s\n" "$name" "$status" -done diff --git a/misc/phoenix/README.md b/misc/phoenix/README.md index b00c56724c..51488c89f9 100644 --- a/misc/phoenix/README.md +++ b/misc/phoenix/README.md @@ -47,9 +47,9 @@ APPLY=1 bash rerun-failed.sh # execute | Script | Purpose | |---|---| -| `config.sh` | Shared config: Phoenix constants (`ORG`, `RUNNER_GROUP`, `RUNNER_LABEL`, `NODES`), GitHub API helpers (`gh_list_runners`, `gh_registration_token`), and process management (`start_runner`, `stop_runner`, `find_node`, `find_pids`, `has_slurm`). | -| `check-runners.sh` | Quick per-node health check. One SSH per node. Shows runner names, idle/BUSY status, slurm PATH, RSS, and total memory. | -| `list-runners.sh` | Detailed table combining GitHub API status (online/offline/busy) with login-node process info. Slower (one SSH per runner + API call). | +| `config.sh` | Shared config: Phoenix constants (`ORG`, `RUNNER_GROUP`, `RUNNER_LABEL`, `NODES`, `CGROUP_LIMIT`, `RUNNER_PARENT_DIRS`) and `find_runner_dirs()`. Sources `../common/runner-lib.sh` for shared functions. | +| `check-runners.sh` | Quick per-node health check. One SSH per node. Shows runner names, idle/BUSY status, slurm PATH, RSS, and total cgroup memory. | +| `list-runners.sh` | Table combining GitHub API status with live node info from a parallel SSH sweep. Shows slurm status and flags stale `runner.node` entries. | | `restart-runner.sh` | Stop and restart one runner on a given node with proper login shell PATH and SSH detachment. 
| | `restart-all.sh` | Restart all runners in place. Skips BUSY runners unless `FORCE=1`. | | `rebalance-runners.sh` | Auto-compute optimal distribution and move runners. Prefers idle runners. Also places OFFLINE runners. | @@ -61,7 +61,7 @@ APPLY=1 bash rerun-failed.sh # execute - **Dry run by default**: `rebalance-runners.sh`, `restart-all.sh`, and `rerun-failed.sh` show what they would do unless `APPLY=1` is set. - **Busy runner protection**: Scripts skip BUSY runners unless `FORCE=1`. -- **Slurm PATH verification**: After starting, scripts verify `/opt/slurm` is +- **Slurm PATH verification**: After starting, scripts verify `slurm` appears in the runner's PATH and warn if missing. ## Configuration From e5934bc598434f4304fd745bfdd2218f4345d5aa Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 20:34:39 -0400 Subject: [PATCH 15/20] refactor: move remaining site-specific scripts to common/ stop-runner, move-runner, restart-runner, and restart-all were each only available on one cluster. Extract implementations to common/ and add thin wrappers for both sites so all operational scripts work on both Frontier and Phoenix. Improvements over the originals: - stop-runner: searches find_runner_dirs() by name instead of hardcoded SHARED_DIR path; fixes runner_id lookup to use process substitution - move-runner: optional sync_runner_nodes hook (runs on Frontier, no-op on Phoenix); searches find_runner_dirs() by name - restart-runner: writes runner.node after successful start (was missing) - restart-all: adds optional sync_runner_nodes hook; writes runner.node after each successful start (was missing) Update common/README.md and both site READMEs to reflect full script inventory. 
Co-Authored-By: Claude Sonnet 4.6 --- misc/common/README.md | 32 +++++-------- misc/common/move-runner.sh | 71 ++++++++++++++++++++++++++++ misc/common/restart-all.sh | 83 +++++++++++++++++++++++++++++++++ misc/common/restart-runner.sh | 35 ++++++++++++++ misc/common/stop-runner.sh | 52 +++++++++++++++++++++ misc/frontier/README.md | 34 ++++++++------ misc/frontier/move-runner.sh | 70 ++------------------------- misc/frontier/restart-all.sh | 10 ++++ misc/frontier/restart-runner.sh | 8 ++++ misc/frontier/stop-runner.sh | 43 ++--------------- misc/phoenix/README.md | 28 +++++++---- misc/phoenix/move-runner.sh | 8 ++++ misc/phoenix/restart-all.sh | 77 ++---------------------------- misc/phoenix/restart-runner.sh | 37 ++------------- misc/phoenix/stop-runner.sh | 8 ++++ 15 files changed, 342 insertions(+), 254 deletions(-) create mode 100644 misc/common/move-runner.sh create mode 100644 misc/common/restart-all.sh create mode 100644 misc/common/restart-runner.sh create mode 100644 misc/common/stop-runner.sh create mode 100644 misc/frontier/restart-all.sh create mode 100644 misc/frontier/restart-runner.sh create mode 100644 misc/phoenix/move-runner.sh create mode 100644 misc/phoenix/stop-runner.sh diff --git a/misc/common/README.md b/misc/common/README.md index ad2156b7b4..ac9448e6f2 100644 --- a/misc/common/README.md +++ b/misc/common/README.md @@ -1,27 +1,19 @@ # Common Runner Management Scripts -This directory contains site-agnostic scripts shared between the Phoenix and -Frontier runner management setups. Scripts here have no site-specific logic and -can be invoked directly or via thin site-specific wrappers. +Site-agnostic scripts shared between the Frontier and Phoenix runner setups. +All logic lives here; site directories contain only thin wrappers that source +`config.sh` then the relevant script from this directory. ## Scripts | Script | Purpose | |---|---| -| `rerun-failed.sh` | Rerun failed GitHub Actions workflows on open non-draft MFC PRs and master. 
Dry-run by default; set `APPLY=1` to actually trigger reruns. | - -## Usage - -```bash -# Dry run — show which failed workflows would be rerun -bash misc/common/rerun-failed.sh - -# Actually rerun failed workflows -APPLY=1 bash misc/common/rerun-failed.sh -``` - -## Site wrappers - -`misc/phoenix/rerun-failed.sh` is a thin wrapper that delegates to this -script, so both `bash misc/phoenix/rerun-failed.sh` and -`bash misc/common/rerun-failed.sh` invoke the same logic. +| `runner-lib.sh` | Shared library: GitHub API helpers, EXE-based process discovery, parallel node sweep, start/stop primitives. Sourced by site `config.sh` files. | +| `check-runners.sh` | Per-node health check: Runner.Listener processes with name, idle/BUSY, slurm PATH, RSS. Optional cgroup memory footer. | +| `list-runners.sh` | Full table: GitHub API status × parallel node sweep. Shows slurm status, flags stale `runner.node`. Optional cgroup memory footer. | +| `rebalance-runners.sh` | Compute optimal distribution and move runners across nodes. Handles offline runners. Writes `runner.node`. Dry run by default. | +| `restart-runner.sh` | Stop and restart one runner on a given node. Verifies slurm in PATH. Writes `runner.node`. | +| `restart-all.sh` | Restart all runners in place. Skips busy unless `FORCE=1`. Dry run by default. | +| `move-runner.sh` | Move a runner to a different login node by name. Stops on current node, starts on target. Writes `runner.node`. | +| `stop-runner.sh` | Stop a runner process and remove its GitHub registration. | +| `rerun-failed.sh` | Rerun failed GitHub Actions workflows on open non-draft PRs and master. Dry run by default. | diff --git a/misc/common/move-runner.sh b/misc/common/move-runner.sh new file mode 100644 index 0000000000..ffbadb269b --- /dev/null +++ b/misc/common/move-runner.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# Move a runner to a different login node. 
+# +# Sourced by site wrappers (frontier/move-runner.sh, phoenix/move-runner.sh) +# after config.sh is loaded. Finds the runner by name, stops it on its current +# node, and starts it on the target node. Retries start once after 5 seconds. +# +# Usage: bash move-runner.sh +set -euo pipefail + +RUNNER_NAME="${1:?Usage: $0 }" +TARGET_NODE="${2:?Usage: $0 }" + +# Validate target node +valid=0 +for node in "${NODES[@]}"; do + [ "$node" = "$TARGET_NODE" ] && valid=1 && break +done +if [ "$valid" -eq 0 ]; then + echo "ERROR: '$TARGET_NODE' is not a valid login node." >&2 + echo " Valid nodes: ${NODES[*]}" >&2 + exit 1 +fi + +# Find runner directory by name +runner_dir="" +while IFS= read -r dir; do + if [ "$(get_runner_name "$dir")" = "$RUNNER_NAME" ]; then + runner_dir="$dir" + break + fi +done < <(find_runner_dirs) + +if [ -z "$runner_dir" ]; then + echo "ERROR: Runner '$RUNNER_NAME' not found in known runner directories." >&2 + exit 1 +fi + +declare -f sync_runner_nodes > /dev/null 2>&1 && { + echo "==> Syncing runner node locations..." + sync_runner_nodes +} + +echo "==> Locating $RUNNER_NAME..." +current_node=$(find_node "$runner_dir") + +if [ "$current_node" = "$TARGET_NODE" ]; then + echo "==> $RUNNER_NAME is already running on $TARGET_NODE. Nothing to do." + exit 0 +fi + +if [ "$current_node" != "offline" ]; then + echo "==> Stopping $RUNNER_NAME on $current_node..." + stop_runner "$current_node" "$runner_dir" +fi + +echo "==> Starting $RUNNER_NAME on $TARGET_NODE..." +if start_runner "$TARGET_NODE" "$runner_dir"; then + echo "$TARGET_NODE" > "$runner_dir/runner.node" + echo "==> $RUNNER_NAME is now running on $TARGET_NODE." +else + echo " First start attempt failed. Retrying in 5 seconds..." + sleep 5 + if start_runner "$TARGET_NODE" "$runner_dir"; then + echo "$TARGET_NODE" > "$runner_dir/runner.node" + echo "==> $RUNNER_NAME is now running on $TARGET_NODE." + else + echo "ERROR: $RUNNER_NAME failed to start on $TARGET_NODE after retry." 
>&2 + exit 1 + fi +fi diff --git a/misc/common/restart-all.sh b/misc/common/restart-all.sh new file mode 100644 index 0000000000..ff3de91ac9 --- /dev/null +++ b/misc/common/restart-all.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# Restart all runners in place on their current nodes. +# +# Sourced by site wrappers (frontier/restart-all.sh, phoenix/restart-all.sh) +# after config.sh is loaded. Useful after a login node reboot or to pick up +# environment changes. Skips busy runners unless FORCE=1. Dry run by default. +# +# Usage: bash restart-all.sh # dry run +# APPLY=1 bash restart-all.sh # execute +# APPLY=1 FORCE=1 bash restart-all.sh # restart busy runners too +set -euo pipefail + +declare -f sync_runner_nodes > /dev/null 2>&1 && { + echo "==> Syncing runner node locations..." + sync_runner_nodes +} + +echo "=== Discovering runners ===" +declare -a restart_list=() + +while IFS= read -r dir; do + name=$(get_runner_name "$dir") + [ -z "$name" ] && continue + node=$(find_node "$dir") + + if [ "$node" = "offline" ]; then + echo " $name: OFFLINE (use rebalance-runners.sh to place)" + continue + fi + + worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '$dir' | grep -v grep" 2>/dev/null || true) + if [ -n "$worker" ]; then + echo " $name: BUSY on $node" + if [ "${FORCE:-0}" != "1" ]; then + echo " Skipping. Set FORCE=1 to restart anyway." + continue + fi + else + echo " $name: idle on $node" + fi + + restart_list+=("$node $dir $name") +done < <(find_runner_dirs) + +if [ ${#restart_list[@]} -eq 0 ]; then + echo "Nothing to restart." + exit 0 +fi + +echo "" +echo "${#restart_list[@]} runners will be restarted." + +if [ "${APPLY:-0}" != "1" ]; then + echo "Dry run — set APPLY=1 to execute." 
+ exit 0 +fi + +echo "" +echo "=== Restarting ===" +success=0; fail=0 +for entry in "${restart_list[@]}"; do + read -r node dir name <<< "$entry" + echo "--- $name on $node ---" + stop_runner "$node" "$dir" + if start_runner "$node" "$dir"; then + echo "$node" > "$dir/runner.node" + pids=$(find_pids "$node" "$dir") + pid=${pids%% *} + if has_slurm "$node" "$pid"; then + echo " OK: PID $pid, slurm in PATH" + success=$((success + 1)) + else + echo " WARNING: PID $pid but slurm MISSING" + fail=$((fail + 1)) + fi + else + echo " ERROR: Failed to start" + fail=$((fail + 1)) + fi +done + +echo "" +echo "=== Summary: $success succeeded, $fail failed ===" diff --git a/misc/common/restart-runner.sh b/misc/common/restart-runner.sh new file mode 100644 index 0000000000..6247fca40f --- /dev/null +++ b/misc/common/restart-runner.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Restart a single GitHub Actions runner on a given node. +# +# Sourced by site wrappers (frontier/restart-runner.sh, phoenix/restart-runner.sh) +# after config.sh is loaded. Stops any existing process, starts fresh, and +# verifies slurm is in PATH. +# +# Usage: bash restart-runner.sh +set -euo pipefail + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "Nodes: ${NODES[*]}" + exit 1 +fi + +node="$1" +dir="$2" +name=$(get_runner_name "$dir" 2>/dev/null || basename "$dir") + +echo "Restarting $name on $node..." +stop_runner "$node" "$dir" + +if start_runner "$node" "$dir"; then + echo "$node" > "$dir/runner.node" + pids=$(find_pids "$node" "$dir") + pid=${pids%% *} + if has_slurm "$node" "$pid"; then + echo " OK: PID $pid, slurm in PATH" + else + echo " WARNING: PID $pid but slurm MISSING from PATH" + fi +else + echo " ERROR: Failed to start on $node" +fi diff --git a/misc/common/stop-runner.sh b/misc/common/stop-runner.sh new file mode 100644 index 0000000000..b3f77b2a6b --- /dev/null +++ b/misc/common/stop-runner.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Stop and deregister a GitHub Actions runner. 
+# +# Sourced by site wrappers (frontier/stop-runner.sh, phoenix/stop-runner.sh) +# after config.sh is loaded. Finds the runner directory by name via +# find_runner_dirs(), stops the process, and removes the GitHub registration. +# +# Usage: bash stop-runner.sh +set -euo pipefail + +RUNNER_NAME="${1:?Usage: $0 }" + +# Find runner directory by name +runner_dir="" +while IFS= read -r dir; do + if [ "$(get_runner_name "$dir")" = "$RUNNER_NAME" ]; then + runner_dir="$dir" + break + fi +done < <(find_runner_dirs) + +if [ -z "$runner_dir" ]; then + echo "ERROR: Runner '$RUNNER_NAME' not found in known runner directories." >&2 + exit 1 +fi + +# Locate and stop the process +echo "==> Locating $RUNNER_NAME..." +node=$(find_node "$runner_dir") + +if [ "$node" != "offline" ]; then + echo "==> Stopping $RUNNER_NAME on $node..." + stop_runner "$node" "$runner_dir" + echo "==> Process stopped." +else + echo "==> $RUNNER_NAME is not running (already offline)." +fi + +# Deregister from GitHub +echo "==> Fetching runner ID from GitHub..." +runner_id="" +while read -r id name _status _busy; do + [ "$name" = "$RUNNER_NAME" ] && runner_id="$id" && break +done < <(gh_list_runners) + +if [ -n "$runner_id" ]; then + echo "==> Deregistering runner (ID $runner_id)..." + gh_remove_runner "$runner_id" + echo "==> Done." +else + echo "==> Runner not found in GitHub API (may already be deregistered)." 
+fi diff --git a/misc/frontier/README.md b/misc/frontier/README.md index e370de2662..5dd92877da 100644 --- a/misc/frontier/README.md +++ b/misc/frontier/README.md @@ -35,16 +35,16 @@ bash list-runners.sh # Check runner health across all login nodes bash check-runners.sh -# Restart offline runners and rebalance across all 11 nodes +# Rebalance runners across all 11 nodes (also restarts offline runners) bash rebalance-runners.sh # dry run APPLY=1 bash rebalance-runners.sh # execute APPLY=1 FORCE=1 bash rebalance-runners.sh # move busy runners too -# Deploy a new runner on a specific node -bash make-runner.sh 23 login01 +# Restart all runners in place (e.g. after a node reboot) +APPLY=1 bash restart-all.sh -# Deploy multiple runners across nodes (e.g. runners 23, 24, 25) -bash deploy-runners.sh 23 login01 login02 login03 +# Restart one specific runner +bash restart-runner.sh login01 /path/to/runner-dir # Move a runner to a different login node bash move-runner.sh frontier-1 login01 @@ -52,7 +52,13 @@ bash move-runner.sh frontier-1 login01 # Stop and deregister a runner bash stop-runner.sh frontier-12 -# Rerun failed CI workflows (site-agnostic, also available at misc/common/) +# Deploy a new runner on a specific node +bash make-runner.sh 23 login01 + +# Deploy multiple runners across nodes (e.g. runners 23, 24, 25) +bash deploy-runners.sh 23 login01 login02 login03 + +# Rerun failed CI workflows bash ../common/rerun-failed.sh APPLY=1 bash ../common/rerun-failed.sh ``` @@ -63,13 +69,15 @@ APPLY=1 bash ../common/rerun-failed.sh |---|---| | `config.sh` | Shared configuration: Frontier constants, `find_runner_dirs()`, and `sync_runner_nodes()`. Sources `../common/runner-lib.sh` for shared functions. | | `check-runners.sh` | SSH to each login node, show Runner.Listener processes with name, status (idle/BUSY), slurm PATH, and RSS memory. | -| `list-runners.sh` | List all runners with GitHub API status, actual node (from parallel SSH sweep), slurm status, and RSS memory. 
Flags stale `runner.node` entries. | -| `rebalance-runners.sh` | Sync node locations, compute optimal distribution, and move runners across all 11 login nodes. Handles offline runners. Dry run by default. | -| `make-runner.sh` | Download runner binary, register with GitHub via API, start on target node. Usage: `make-runner.sh [login-node]` | -| `move-runner.sh` | Move a runner to a different login node: stop on current node, update `runner.node`, start on target. Usage: `move-runner.sh ` | -| `deploy-runners.sh` | Deploy multiple runners across login nodes in parallel. Usage: `deploy-runners.sh [node2 ...]` | -| `stop-runner.sh` | Locate runner via EXE-based discovery, stop the process, and deregister from GitHub. Usage: `stop-runner.sh ` | -| `../common/rerun-failed.sh` | Rerun failed GitHub Actions workflows on open PRs and master. No site-specific code. | +| `list-runners.sh` | List all runners with GitHub API status, actual node (parallel SSH sweep), slurm status, and RSS. Flags stale `runner.node`. | +| `rebalance-runners.sh` | Sync node locations, compute optimal distribution, move runners across all 11 nodes. Handles offline runners. Dry run by default. | +| `restart-all.sh` | Restart all runners in place. Skips busy unless `FORCE=1`. Dry run by default. | +| `restart-runner.sh` | Stop and restart one runner on a given node. Usage: `restart-runner.sh ` | +| `move-runner.sh` | Move a runner to a different login node by name. Usage: `move-runner.sh ` | +| `stop-runner.sh` | Stop the runner process and deregister from GitHub. Usage: `stop-runner.sh ` | +| `make-runner.sh` | Download runner binary, register with GitHub, start on target node. Usage: `make-runner.sh [node]` | +| `deploy-runners.sh` | Deploy multiple runners across nodes in parallel. Usage: `deploy-runners.sh [node2 ...]` | +| `../common/rerun-failed.sh` | Rerun failed GitHub Actions workflows on open PRs and master. 
| ## Troubleshooting diff --git a/misc/frontier/move-runner.sh b/misc/frontier/move-runner.sh index 29f300d1d0..418c292fba 100755 --- a/misc/frontier/move-runner.sh +++ b/misc/frontier/move-runner.sh @@ -1,72 +1,8 @@ #!/usr/bin/env bash # Move a Frontier runner to a different login node. +# Thin wrapper — see misc/common/move-runner.sh for the implementation. # -# Stops the runner on its current node, updates runner.node, and starts it on -# the target node. Retries the start once after 5 seconds if the first attempt -# fails. -# -# Usage: move-runner.sh -# Example: move-runner.sh frontier-1 login01 -set -euo pipefail - +# Usage: bash move-runner.sh SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" - -RUNNER_NAME="${1:?Usage: $0 }" -TARGET_NODE="${2:?Usage: $0 }" - -RUNNER_DIR="${SHARED_DIR}/${RUNNER_NAME}" - -# --- Validate runner directory --- -if [ ! -d "$RUNNER_DIR" ]; then - echo "ERROR: Runner directory not found: ${RUNNER_DIR}" >&2 - exit 1 -fi - -# --- Validate target node is in the known node list --- -valid=0 -for node in "${NODES[@]}"; do - [ "$node" = "$TARGET_NODE" ] && valid=1 && break -done -if [ "$valid" -eq 0 ]; then - echo "ERROR: '${TARGET_NODE}' is not a valid Frontier login node." >&2 - echo " Valid nodes: ${NODES[*]}" >&2 - exit 1 -fi - -# --- Sync runner.node files before acting --- -echo "==> Syncing runner node locations..." -sync_runner_nodes - -# --- Find current node --- -echo "==> Locating ${RUNNER_NAME}..." -current_node=$(find_node "$RUNNER_DIR") - -if [ "$current_node" = "$TARGET_NODE" ]; then - echo "==> ${RUNNER_NAME} is already running on ${TARGET_NODE}. Nothing to do." - exit 0 -fi - -# --- Stop runner on current node (if running) --- -if [ "$current_node" != "offline" ]; then - echo "==> Stopping ${RUNNER_NAME} on ${current_node}..." 
- stop_runner "$current_node" "$RUNNER_DIR" -fi - -# --- Update runner.node --- -echo "$TARGET_NODE" > "${RUNNER_DIR}/runner.node" - -# --- Start runner on target node (with one retry) --- -echo "==> Starting ${RUNNER_NAME} on ${TARGET_NODE}..." -if start_runner "$TARGET_NODE" "$RUNNER_DIR"; then - echo "==> ${RUNNER_NAME} is now running on ${TARGET_NODE}." -else - echo " First start attempt failed. Retrying in 5 seconds..." - sleep 5 - if start_runner "$TARGET_NODE" "$RUNNER_DIR"; then - echo "==> ${RUNNER_NAME} is now running on ${TARGET_NODE}." - else - echo "ERROR: ${RUNNER_NAME} failed to start on ${TARGET_NODE} after retry." >&2 - exit 1 - fi -fi +source "$SCRIPT_DIR/../common/move-runner.sh" diff --git a/misc/frontier/restart-all.sh b/misc/frontier/restart-all.sh new file mode 100644 index 0000000000..dd81968bbe --- /dev/null +++ b/misc/frontier/restart-all.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +# Restart all Frontier runners in place on their current nodes. +# Thin wrapper — see misc/common/restart-all.sh for the implementation. +# +# Usage: bash restart-all.sh # dry run +# APPLY=1 bash restart-all.sh # execute +# APPLY=1 FORCE=1 bash restart-all.sh # restart busy runners too +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" +source "$SCRIPT_DIR/../common/restart-all.sh" diff --git a/misc/frontier/restart-runner.sh b/misc/frontier/restart-runner.sh new file mode 100644 index 0000000000..dc891a1b7e --- /dev/null +++ b/misc/frontier/restart-runner.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +# Restart a single Frontier runner on a given node. +# Thin wrapper — see misc/common/restart-runner.sh for the implementation. 
+# +# Usage: bash restart-runner.sh +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" +source "$SCRIPT_DIR/../common/restart-runner.sh" diff --git a/misc/frontier/stop-runner.sh b/misc/frontier/stop-runner.sh index e7c1185fef..7fc769c080 100755 --- a/misc/frontier/stop-runner.sh +++ b/misc/frontier/stop-runner.sh @@ -1,41 +1,8 @@ #!/usr/bin/env bash -# Stop and deregister a GitHub Actions runner on Frontier. -# Usage: stop-runner.sh (e.g. frontier-12) -set -euo pipefail - +# Stop and deregister a Frontier runner. +# Thin wrapper — see misc/common/stop-runner.sh for the implementation. +# +# Usage: bash stop-runner.sh SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" - -RUNNER_NAME="${1:?Usage: $0 }" -RUNNER_DIR="${SHARED_DIR}/${RUNNER_NAME}" - -if [ ! -d "${RUNNER_DIR}" ]; then - echo "Runner dir not found: ${RUNNER_DIR}" >&2 - exit 1 -fi - -# --- Locate and kill the runner process --- -echo "==> Locating ${RUNNER_NAME}..." -node=$(find_node "$RUNNER_DIR") - -if [ "$node" != "offline" ]; then - echo "==> Stopping ${RUNNER_NAME} on ${node}..." - stop_runner "$node" "$RUNNER_DIR" - echo "==> Process stopped." -else - echo "==> ${RUNNER_NAME} is not running (already offline)." -fi - -# --- Deregister from GitHub --- -echo "==> Fetching runner ID from GitHub..." -runner_id=$(gh_list_runners | while read -r id name status busy; do - [ "$name" = "$RUNNER_NAME" ] && echo "$id" && break -done) - -if [ -n "$runner_id" ]; then - echo "==> Deregistering runner (ID ${runner_id})..." - gh_remove_runner "$runner_id" - echo "==> Done." -else - echo "==> Runner not found in GitHub API (may already be deregistered)." 
-fi +source "$SCRIPT_DIR/../common/stop-runner.sh" diff --git a/misc/phoenix/README.md b/misc/phoenix/README.md index 51488c89f9..0107f48eac 100644 --- a/misc/phoenix/README.md +++ b/misc/phoenix/README.md @@ -25,15 +25,21 @@ bash check-runners.sh # Detailed table with GitHub API status bash list-runners.sh -# Restart one runner -bash restart-runner.sh login-phoenix-gnr-2 /path/to/actions-runner-3 +# Auto-rebalance across nodes (also restarts offline runners) +bash rebalance-runners.sh # dry run +APPLY=1 bash rebalance-runners.sh # execute -# Restart all runners in place +# Restart all runners in place (e.g. after a node reboot) APPLY=1 bash restart-all.sh -# Auto-rebalance across nodes -bash rebalance-runners.sh # dry run -APPLY=1 bash rebalance-runners.sh # execute +# Restart one specific runner +bash restart-runner.sh login-phoenix-gnr-2 /path/to/actions-runner-3 + +# Move a runner to a different login node +bash move-runner.sh phoenix-3 login-phoenix-gnr-1 + +# Stop and deregister a runner +bash stop-runner.sh phoenix-3 # Create a new runner (needs gh CLI with admin:org scope) bash create-runner.sh phoenix-11 login-phoenix-gnr-2 @@ -50,10 +56,12 @@ APPLY=1 bash rerun-failed.sh # execute | `config.sh` | Shared config: Phoenix constants (`ORG`, `RUNNER_GROUP`, `RUNNER_LABEL`, `NODES`, `CGROUP_LIMIT`, `RUNNER_PARENT_DIRS`) and `find_runner_dirs()`. Sources `../common/runner-lib.sh` for shared functions. | | `check-runners.sh` | Quick per-node health check. One SSH per node. Shows runner names, idle/BUSY status, slurm PATH, RSS, and total cgroup memory. | | `list-runners.sh` | Table combining GitHub API status with live node info from a parallel SSH sweep. Shows slurm status and flags stale `runner.node` entries. | -| `restart-runner.sh` | Stop and restart one runner on a given node with proper login shell PATH and SSH detachment. | -| `restart-all.sh` | Restart all runners in place. Skips BUSY runners unless `FORCE=1`. 
| -| `rebalance-runners.sh` | Auto-compute optimal distribution and move runners. Prefers idle runners. Also places OFFLINE runners. | -| `create-runner.sh` | Download runner binary, register with GitHub via API, start on target node. Only needs runner name and node — org, group, and label come from `config.sh`. | +| `rebalance-runners.sh` | Auto-compute optimal distribution and move runners. Prefers idle runners. Also places OFFLINE runners. Dry run by default. | +| `restart-all.sh` | Restart all runners in place. Skips BUSY runners unless `FORCE=1`. Dry run by default. | +| `restart-runner.sh` | Stop and restart one runner on a given node. Usage: `restart-runner.sh ` | +| `move-runner.sh` | Move a runner to a different login node by name. Usage: `move-runner.sh ` | +| `stop-runner.sh` | Stop the runner process and deregister from GitHub. Usage: `stop-runner.sh ` | +| `create-runner.sh` | Download runner binary, register with GitHub via API, start on target node. Usage: `create-runner.sh [parent-dir]` | | `rerun-failed.sh` | Scan open non-draft PRs and master for failed workflows, rerun failed jobs only. | ## Safety diff --git a/misc/phoenix/move-runner.sh b/misc/phoenix/move-runner.sh new file mode 100644 index 0000000000..bc6d31d49b --- /dev/null +++ b/misc/phoenix/move-runner.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +# Move a Phoenix runner to a different login node. +# Thin wrapper — see misc/common/move-runner.sh for the implementation. +# +# Usage: bash move-runner.sh +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" +source "$SCRIPT_DIR/../common/move-runner.sh" diff --git a/misc/phoenix/restart-all.sh b/misc/phoenix/restart-all.sh index 93e926c6fb..f11b5a1c6e 100755 --- a/misc/phoenix/restart-all.sh +++ b/misc/phoenix/restart-all.sh @@ -1,79 +1,10 @@ -#!/bin/bash -# Restart all Phoenix runners on their current nodes. -# -# Useful after a login node reboot or to pick up environment changes. 
-# Restarts in place — no rebalancing. Skips busy runners unless FORCE=1. +#!/usr/bin/env bash +# Restart all Phoenix runners in place on their current nodes. +# Thin wrapper — see misc/common/restart-all.sh for the implementation. # # Usage: bash restart-all.sh # dry run # APPLY=1 bash restart-all.sh # execute # APPLY=1 FORCE=1 bash restart-all.sh # restart busy runners too - -set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" - -echo "=== Discovering runners ===" -declare -a restart_list=() - -while IFS= read -r dir; do - name=$(get_runner_name "$dir") - [ -z "$name" ] && continue - node=$(find_node "$dir") - - if [ "$node" = "offline" ]; then - echo " $name: OFFLINE (use rebalance-runners.sh to place)" - continue - fi - - worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '$dir' | grep -v grep" 2>/dev/null || true) - if [ -n "$worker" ]; then - echo " $name: BUSY on $node" - if [ "${FORCE:-0}" != "1" ]; then - echo " Skipping. Set FORCE=1 to restart anyway." - continue - fi - else - echo " $name: idle on $node" - fi - - restart_list+=("$node $dir $name") -done < <(find_runner_dirs) - -if [ ${#restart_list[@]} -eq 0 ]; then - echo "Nothing to restart." - exit 0 -fi - -echo "" -echo "${#restart_list[@]} runners will be restarted." - -if [ "${APPLY:-0}" != "1" ]; then - echo "Dry run — set APPLY=1 to execute." 
- exit 0 -fi - -echo "" -echo "=== Restarting ===" -success=0; fail=0 -for entry in "${restart_list[@]}"; do - read -r node dir name <<< "$entry" - echo "--- $name on $node ---" - stop_runner "$node" "$dir" - if start_runner "$node" "$dir"; then - pids=$(find_pids "$node" "$dir") - pid=${pids%% *} - if has_slurm "$node" "$pid"; then - echo " OK: PID $pid, slurm in PATH" - success=$((success + 1)) - else - echo " WARNING: PID $pid but slurm MISSING" - fail=$((fail + 1)) - fi - else - echo " ERROR: Failed to start" - fail=$((fail + 1)) - fi -done - -echo "" -echo "=== Summary: $success succeeded, $fail failed ===" +source "$SCRIPT_DIR/../common/restart-all.sh" diff --git a/misc/phoenix/restart-runner.sh b/misc/phoenix/restart-runner.sh index f3bc8d72b7..c0c9f73549 100755 --- a/misc/phoenix/restart-runner.sh +++ b/misc/phoenix/restart-runner.sh @@ -1,37 +1,8 @@ -#!/bin/bash -# Restart a GitHub Actions runner on a specific Phoenix login node. -# -# Kills any existing instance, then starts a new one with a login shell -# (for /opt/slurm PATH) and full terminal detachment. +#!/usr/bin/env bash +# Restart a single Phoenix runner on a given node. +# Thin wrapper — see misc/common/restart-runner.sh for the implementation. # # Usage: bash restart-runner.sh -# Example: bash restart-runner.sh login-phoenix-gnr-2 /storage/scratch1/6/sbryngelson3/mfc-runners/actions-runner-3 - -set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" - -if [ $# -ne 2 ]; then - echo "Usage: $0 " - echo "Nodes: ${NODES[*]}" - exit 1 -fi - -node="$1" -dir="$2" -name=$(get_runner_name "$dir" 2>/dev/null || basename "$dir") - -echo "Restarting $name on $node..." 
-stop_runner "$node" "$dir" - -if start_runner "$node" "$dir"; then - pids=$(find_pids "$node" "$dir") - pid=${pids%% *} - if has_slurm "$node" "$pid"; then - echo " OK: PID $pid, slurm in PATH" - else - echo " WARNING: PID $pid but slurm MISSING from PATH" - fi -else - echo " ERROR: Failed to start on $node" -fi +source "$SCRIPT_DIR/../common/restart-runner.sh" diff --git a/misc/phoenix/stop-runner.sh b/misc/phoenix/stop-runner.sh new file mode 100644 index 0000000000..d4d1ea6b91 --- /dev/null +++ b/misc/phoenix/stop-runner.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +# Stop and deregister a Phoenix runner. +# Thin wrapper — see misc/common/stop-runner.sh for the implementation. +# +# Usage: bash stop-runner.sh +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" +source "$SCRIPT_DIR/../common/stop-runner.sh" From 92cf71e8cd2e1d1e12fd843b9c33c01c542a0940 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 20:50:38 -0400 Subject: [PATCH 16/20] Fix all PR review issues in runner management scripts - runner-lib.sh: use sys.argv[1] in get_runner_name (prevent path injection); verify process dead after SIGKILL in stop_runner; warn on sweep_all_nodes SSH failure; log SSH failure in start_runner - rebalance-runners.sh: fix busy-runner grep with trailing slash; handle stop_runner failure before attempting move - restart-all.sh: replace space-delimited restart_list with three separate arrays (space-safe); fix busy grep trailing slash; handle stop_runner failure - restart-runner.sh: exit 1 on start failure (was silently exiting 0) - stop-runner.sh: handle gh_list_runners API failure with warning - rerun-failed.sh: guard empty failed_jobs to avoid blank bullet; add per-job display and expired-run guard to master branch block - deploy-runners.sh: track background PIDs; detect and report failures - create-runner.sh: use #!/usr/bin/env bash; atomic curl download - frontier/config.sh: document RUNNER_GROUP="phoenix" is 
intentional - phoenix/config.sh: use #!/usr/bin/env bash Co-Authored-By: Claude Sonnet 4.6 --- misc/common/rebalance-runners.sh | 9 +++++-- misc/common/rerun-failed.sh | 18 ++++++++++--- misc/common/restart-all.sh | 24 +++++++++++------ misc/common/restart-runner.sh | 1 + misc/common/runner-lib.sh | 46 +++++++++++++++++++++++++------- misc/common/stop-runner.sh | 6 ++++- misc/frontier/config.sh | 2 +- misc/frontier/deploy-runners.sh | 17 +++++++++++- misc/phoenix/config.sh | 2 +- misc/phoenix/create-runner.sh | 7 +++-- 10 files changed, 102 insertions(+), 30 deletions(-) diff --git a/misc/common/rebalance-runners.sh b/misc/common/rebalance-runners.sh index 98546387fc..9cc8680479 100644 --- a/misc/common/rebalance-runners.sh +++ b/misc/common/rebalance-runners.sh @@ -47,7 +47,7 @@ for i in "${!dirs[@]}"; do runner_node[$i]="$node" if [ "$node" != "offline" ]; then node_runners[$node]="${node_runners[$node]:-} $i" - worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '${dirs[$i]}' | grep -v grep" 2>/dev/null || true) + worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '${dirs[$i]}/' | grep -v grep" 2>/dev/null || true) [ -n "$worker" ] && runner_busy[$i]=1 || runner_busy[$i]=0 else runner_busy[$i]=0 @@ -155,7 +155,12 @@ echo "=== Executing ===" for move in "${moves[@]}"; do read -r src dst idx <<< "$move" echo "Moving ${names[$idx]}: $src -> $dst" - [ "$src" != "offline" ] && stop_runner "$src" "${dirs[$idx]}" + if [ "$src" != "offline" ]; then + if ! 
stop_runner "$src" "${dirs[$idx]}"; then + echo " ERROR: Failed to stop ${names[$idx]} on $src; skipping move" >&2 + continue + fi + fi if start_runner "$dst" "${dirs[$idx]}"; then echo "$dst" > "${dirs[$idx]}/runner.node" pids=$(find_pids "$dst" "${dirs[$idx]}") diff --git a/misc/common/rerun-failed.sh b/misc/common/rerun-failed.sh index ed56609777..8fc827216d 100755 --- a/misc/common/rerun-failed.sh +++ b/misc/common/rerun-failed.sh @@ -40,9 +40,11 @@ for pr in $prs; do --json jobs --jq '.jobs[] | select(.conclusion == "failure" or .conclusion == "cancelled") | .name' \ 2>/dev/null) || { echo " WARNING: could not fetch jobs for run $run_id, skipping"; continue; } echo " Run $run_id ($run_name):" - while read -r job; do - echo " - $job" - done <<< "$failed_jobs" + if [ -n "$failed_jobs" ]; then + while read -r job; do + echo " - $job" + done <<< "$failed_jobs" + fi if [ "${APPLY:-0}" = "1" ]; then echo " Rerunning failed jobs..." @@ -60,7 +62,15 @@ master_failed=$(gh run list --repo "$REPO" --branch master --limit 5 \ --jq '.[] | select(.conclusion == "failure") | "\(.databaseId) \(.name)"') if [ -n "$master_failed" ]; then while read -r run_id run_name; do - echo " Run $run_id ($run_name)" + failed_jobs=$(gh run view --repo "$REPO" "$run_id" \ + --json jobs --jq '.jobs[] | select(.conclusion == "failure" or .conclusion == "cancelled") | .name' \ + 2>/dev/null) || { echo " WARNING: could not fetch jobs for run $run_id, skipping"; continue; } + echo " Run $run_id ($run_name):" + if [ -n "$failed_jobs" ]; then + while read -r job; do + echo " - $job" + done <<< "$failed_jobs" + fi if [ "${APPLY:-0}" = "1" ]; then echo " Rerunning failed jobs..." 
gh run rerun --repo "$REPO" "$run_id" --failed || echo " WARNING: rerun failed" diff --git a/misc/common/restart-all.sh b/misc/common/restart-all.sh index ff3de91ac9..3254a4f7cb 100644 --- a/misc/common/restart-all.sh +++ b/misc/common/restart-all.sh @@ -16,7 +16,7 @@ declare -f sync_runner_nodes > /dev/null 2>&1 && { } echo "=== Discovering runners ===" -declare -a restart_list=() +declare -a restart_nodes=() restart_dirs=() restart_names=() while IFS= read -r dir; do name=$(get_runner_name "$dir") @@ -28,7 +28,7 @@ while IFS= read -r dir; do continue fi - worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '$dir' | grep -v grep" 2>/dev/null || true) + worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '$dir/' | grep -v grep" 2>/dev/null || true) if [ -n "$worker" ]; then echo " $name: BUSY on $node" if [ "${FORCE:-0}" != "1" ]; then @@ -39,16 +39,18 @@ while IFS= read -r dir; do echo " $name: idle on $node" fi - restart_list+=("$node $dir $name") + restart_nodes+=("$node") + restart_dirs+=("$dir") + restart_names+=("$name") done < <(find_runner_dirs) -if [ ${#restart_list[@]} -eq 0 ]; then +if [ ${#restart_nodes[@]} -eq 0 ]; then echo "Nothing to restart." exit 0 fi echo "" -echo "${#restart_list[@]} runners will be restarted." +echo "${#restart_nodes[@]} runners will be restarted." if [ "${APPLY:-0}" != "1" ]; then echo "Dry run — set APPLY=1 to execute." @@ -58,10 +60,16 @@ fi echo "" echo "=== Restarting ===" success=0; fail=0 -for entry in "${restart_list[@]}"; do - read -r node dir name <<< "$entry" +for i in "${!restart_nodes[@]}"; do + node="${restart_nodes[$i]}" + dir="${restart_dirs[$i]}" + name="${restart_names[$i]}" echo "--- $name on $node ---" - stop_runner "$node" "$dir" + if ! 
stop_runner "$node" "$dir"; then + echo " ERROR: Failed to stop; skipping restart" >&2 + fail=$((fail + 1)) + continue + fi if start_runner "$node" "$dir"; then echo "$node" > "$dir/runner.node" pids=$(find_pids "$node" "$dir") diff --git a/misc/common/restart-runner.sh b/misc/common/restart-runner.sh index 6247fca40f..1517d69d32 100644 --- a/misc/common/restart-runner.sh +++ b/misc/common/restart-runner.sh @@ -32,4 +32,5 @@ if start_runner "$node" "$dir"; then fi else echo " ERROR: Failed to start on $node" + exit 1 fi diff --git a/misc/common/runner-lib.sh b/misc/common/runner-lib.sh index c0560e6bc5..91f5a6adc3 100755 --- a/misc/common/runner-lib.sh +++ b/misc/common/runner-lib.sh @@ -38,20 +38,26 @@ gh_remove_runner() { # --- Local filesystem --- # Get the GitHub runner name from a .runner config file. +# Uses sys.argv to avoid path injection into Python source code. +# Prints the agentName, or empty string if the file is missing or unparsable +# (with a warning to stderr). # Args: $1 = runner directory get_runner_name() { python3 -c " -import json -d = json.loads(open('$1/.runner').read().lstrip('\ufeff')) +import json, sys +d = json.loads(open(sys.argv[1] + '/.runner').read().lstrip('\ufeff')) print(d.get('agentName', '')) -" 2>/dev/null +" "$1" 2>/dev/null \ + || echo "WARNING: could not read runner name from '$1/.runner'" >&2 } # --- Login-node process management --- # Find PIDs of a runner on a node by matching its executable path. -# Matches /proc/$p/exe against $dir/bin/Runner.Listener — intrinsic to -# the binary, independent of CWD or how the process was launched. +# Candidate PIDs are found by grepping ps for Runner.Listener; each +# candidate's /proc/$p/exe is then resolved and matched against +# $dir/bin/Runner.Listener to confirm identity. This makes the confirmation +# step independent of CWD or process arguments. # Output is filtered to numeric lines only to strip SSH MOTD noise. 
# Args: $1 = node, $2 = runner directory # Prints: space-separated PIDs, or empty. @@ -76,9 +82,11 @@ find_node() { # Check if a runner process has a slurm directory in its PATH. # Works across sites regardless of the specific slurm installation path. -# Args: $1 = node, $2 = PID (or "PID rest..." — uses first token only) +# Returns non-zero if slurm is absent OR if the SSH check itself fails +# (callers should treat non-zero as "could not confirm slurm present"). +# Args: $1 = node, $2 = PID has_slurm() { - local node="$1" pid="${2%% *}" + local node="$1" pid="$2" ssh $SSH_OPTS "$node" \ "tr '\0' '\n' < /proc/$pid/environ 2>/dev/null | grep -q '^PATH=.*slurm'" \ 2>/dev/null @@ -88,6 +96,7 @@ has_slurm() { # Each output line: RUNNER # dir = runner directory derived from the Runner.Listener exe path # slurm_ok = "ok" if slurm appears in the process PATH, "MISSING" otherwise +# Warns to stderr for any node whose output file is empty (SSH likely failed). # Caller must create tmpdir and parse the output files. # Args: $1 = tmpdir sweep_all_nodes() { @@ -106,22 +115,34 @@ sweep_all_nodes() { ' 2>/dev/null > "$tmpdir/$node.out" & done wait + for node in "${NODES[@]}"; do + if [ ! -s "$tmpdir/$node.out" ]; then + echo "WARNING: no runner data from $node (SSH may have failed)" >&2 + fi + done } # Start a runner on a node. # Uses a login shell (bash -lc) so site PATH (e.g. SLURM) is available. +# SSH launch failures are logged to stderr but do not prevent the function +# from checking whether the process appeared (find_pids after 3s). # Args: $1 = node, $2 = runner directory -# Returns: 0 if running after start, 1 otherwise. +# Returns: 0 if a Runner.Listener process is found after start, 1 otherwise. start_runner() { local node="$1" dir="$2" timeout 15 ssh $SSH_OPTS "$node" \ "cd $dir && setsid bash -lc 'nohup ./run.sh >> runner.log 2>&1 < /dev/null &'" \ - /dev/null || true + /dev/null \ + || echo "WARNING: SSH launch to $node failed; checking for process anyway..." 
>&2 sleep 3 [ -n "$(find_pids "$node" "$dir")" ] } -# Stop a runner on a node (SIGTERM then SIGKILL). +# Stop a runner on a node (SIGTERM, 3s grace, then SIGKILL). +# After SIGKILL, waits 1s then re-checks with find_pids. If the process is +# still alive (e.g. SSH kill failed, wrong UID, kernel stuck), emits a warning +# to stderr and returns 1 so callers can react. Returns 0 if the runner is +# confirmed stopped or was never running. # Args: $1 = node, $2 = runner directory stop_runner() { local node="$1" dir="$2" pids @@ -136,4 +157,9 @@ stop_runner() { ssh $SSH_OPTS "$node" "kill -9 $pid" 2>/dev/null || true done sleep 1 + pids=$(find_pids "$node" "$dir") + if [ -n "$pids" ]; then + echo "WARNING: process(es) $pids on $node survived SIGKILL; runner may still be running" >&2 + return 1 + fi } diff --git a/misc/common/stop-runner.sh b/misc/common/stop-runner.sh index b3f77b2a6b..b9a8661846 100644 --- a/misc/common/stop-runner.sh +++ b/misc/common/stop-runner.sh @@ -39,9 +39,13 @@ fi # Deregister from GitHub echo "==> Fetching runner ID from GitHub..." runner_id="" +runner_list=$(gh_list_runners 2>/dev/null) || { + echo "WARNING: GitHub API call failed; runner may still be registered on GitHub." >&2 + exit 0 +} while read -r id name _status _busy; do [ "$name" = "$RUNNER_NAME" ] && runner_id="$id" && break -done < <(gh_list_runners) +done <<< "$runner_list" if [ -n "$runner_id" ]; then echo "==> Deregistering runner (ID $runner_id)..." 
diff --git a/misc/frontier/config.sh b/misc/frontier/config.sh index d5c7f4398e..2245f80593 100755 --- a/misc/frontier/config.sh +++ b/misc/frontier/config.sh @@ -6,7 +6,7 @@ # --- Frontier constants --- ORG="MFlowCode" -RUNNER_GROUP="phoenix" +RUNNER_GROUP="phoenix" # Both sites share one GitHub runner group named "phoenix" RUNNER_LABEL="frontier" NODES=(login01 login02 login03 login04 login05 login06 login07 login08 login09 login10 login11) SHARED_DIR="/lustre/orion/cfd154/proj-shared/runners" diff --git a/misc/frontier/deploy-runners.sh b/misc/frontier/deploy-runners.sh index a1cee47f29..b7df0bcea9 100755 --- a/misc/frontier/deploy-runners.sh +++ b/misc/frontier/deploy-runners.sh @@ -33,12 +33,27 @@ if [ ! -f "${SHARED_DIR}/${TARBALL}" ]; then fi export RUNNER_VERSION +declare -a pids=() nums=() deploy_nodes=() for i in "${!TARGET_NODES[@]}"; do NODE="${TARGET_NODES[$i]}" NUM=$((START_NUM + i)) echo "==> Deploying frontier-${NUM} on ${NODE}..." "$SCRIPT_DIR/make-runner.sh" "${NUM}" "${NODE}" & + pids+=($!) + nums+=("$NUM") + deploy_nodes+=("$NODE") done -wait +failed=0 +for i in "${!pids[@]}"; do + if ! wait "${pids[$i]}"; then + echo "ERROR: frontier-${nums[$i]} on ${deploy_nodes[$i]} failed." >&2 + failed=$((failed + 1)) + fi +done + +if [ "$failed" -gt 0 ]; then + echo "==> $failed runner(s) failed to deploy." >&2 + exit 1 +fi echo "==> All runners deployed." diff --git a/misc/phoenix/config.sh b/misc/phoenix/config.sh index e4fadfb225..40065099cc 100755 --- a/misc/phoenix/config.sh +++ b/misc/phoenix/config.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Shared configuration for Phoenix GitHub Actions runner management. # # Sourced by all other scripts. 
Provides Phoenix constants and diff --git a/misc/phoenix/create-runner.sh b/misc/phoenix/create-runner.sh index 9991c43c4d..8def3f339b 100755 --- a/misc/phoenix/create-runner.sh +++ b/misc/phoenix/create-runner.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Create and register a new GitHub Actions runner on Phoenix. # # Downloads the runner binary, registers with MFlowCode org, and starts @@ -64,7 +64,10 @@ echo " Version: $version" mkdir -p "$runner_dir" cd "$runner_dir" -curl -sL "$url" | tar xz +tmp="runner-download.tmp.$$" +curl -fsSL "$url" -o "$tmp" +tar xz < "$tmp" +rm -f "$tmp" echo " Extracted." # Configure From eb9f98a6609a68892553bbdf5a9a30d57c0dd41a Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 20:59:33 -0400 Subject: [PATCH 17/20] Replace 15 duplicate wrapper scripts with single dispatcher All shared commands (check-runners, list-runners, move-runner, rebalance-runners, restart-all, restart-runner, stop-runner, rerun-failed) previously existed as near-identical thin wrappers in both frontier/ and phoenix/. Replace them all with a single dispatcher: bash misc/runner.sh <site> <command> [args...] Site directories now contain only truly site-specific files: - frontier/: config.sh, make-runner.sh, deploy-runners.sh - phoenix/: config.sh, create-runner.sh Also fix rebalance-runners.sh to source common/check-runners.sh directly (was calling the now-deleted site wrapper). Update all READMEs to document the new dispatcher invocation. 
Co-Authored-By: Claude Sonnet 4.6 --- misc/common/README.md | 11 ++++-- misc/common/rebalance-runners.sh | 2 +- misc/frontier/README.md | 57 +++++++++++++++--------------- misc/frontier/check-runners.sh | 8 ----- misc/frontier/list-runners.sh | 8 ----- misc/frontier/move-runner.sh | 8 ----- misc/frontier/rebalance-runners.sh | 11 ------ misc/frontier/restart-all.sh | 10 ------ misc/frontier/restart-runner.sh | 8 ----- misc/frontier/stop-runner.sh | 8 ----- misc/phoenix/README.md | 53 +++++++++++++-------------- misc/phoenix/check-runners.sh | 8 ----- misc/phoenix/list-runners.sh | 8 ----- misc/phoenix/move-runner.sh | 8 ----- misc/phoenix/rebalance-runners.sh | 11 ------ misc/phoenix/rerun-failed.sh | 2 -- misc/phoenix/restart-all.sh | 10 ------ misc/phoenix/restart-runner.sh | 8 ----- misc/phoenix/stop-runner.sh | 8 ----- misc/runner.sh | 55 ++++++++++++++++++++++++++++ 20 files changed, 120 insertions(+), 182 deletions(-) delete mode 100755 misc/frontier/check-runners.sh delete mode 100755 misc/frontier/list-runners.sh delete mode 100755 misc/frontier/move-runner.sh delete mode 100755 misc/frontier/rebalance-runners.sh delete mode 100644 misc/frontier/restart-all.sh delete mode 100644 misc/frontier/restart-runner.sh delete mode 100755 misc/frontier/stop-runner.sh delete mode 100755 misc/phoenix/check-runners.sh delete mode 100755 misc/phoenix/list-runners.sh delete mode 100644 misc/phoenix/move-runner.sh delete mode 100755 misc/phoenix/rebalance-runners.sh delete mode 100755 misc/phoenix/rerun-failed.sh delete mode 100755 misc/phoenix/restart-all.sh delete mode 100755 misc/phoenix/restart-runner.sh delete mode 100644 misc/phoenix/stop-runner.sh create mode 100644 misc/runner.sh diff --git a/misc/common/README.md b/misc/common/README.md index ac9448e6f2..8cc2083054 100644 --- a/misc/common/README.md +++ b/misc/common/README.md @@ -1,8 +1,13 @@ # Common Runner Management Scripts Site-agnostic scripts shared between the Frontier and Phoenix runner setups. 
-All logic lives here; site directories contain only thin wrappers that source -`config.sh` then the relevant script from this directory. +All shared logic lives here; site directories contain only site-specific files +(`config.sh` and scripts unique to that cluster). + +Scripts are invoked via the dispatcher at `misc/runner.sh`: +```bash +bash misc/runner.sh [args...] +``` ## Scripts @@ -10,7 +15,7 @@ All logic lives here; site directories contain only thin wrappers that source |---|---| | `runner-lib.sh` | Shared library: GitHub API helpers, EXE-based process discovery, parallel node sweep, start/stop primitives. Sourced by site `config.sh` files. | | `check-runners.sh` | Per-node health check: Runner.Listener processes with name, idle/BUSY, slurm PATH, RSS. Optional cgroup memory footer. | -| `list-runners.sh` | Full table: GitHub API status × parallel node sweep. Shows slurm status, flags stale `runner.node`. Optional cgroup memory footer. | +| `list-runners.sh` | Full table: GitHub API status × parallel node sweep. Shows slurm status, flags stale `runner.node`. | | `rebalance-runners.sh` | Compute optimal distribution and move runners across nodes. Handles offline runners. Writes `runner.node`. Dry run by default. | | `restart-runner.sh` | Stop and restart one runner on a given node. Verifies slurm in PATH. Writes `runner.node`. | | `restart-all.sh` | Restart all runners in place. Skips busy unless `FORCE=1`. Dry run by default. 
| diff --git a/misc/common/rebalance-runners.sh b/misc/common/rebalance-runners.sh index 9cc8680479..20a20b0f7c 100644 --- a/misc/common/rebalance-runners.sh +++ b/misc/common/rebalance-runners.sh @@ -176,4 +176,4 @@ for move in "${moves[@]}"; do done echo "" -bash "${SITE_SCRIPT_DIR}/check-runners.sh" +source "$(dirname "${BASH_SOURCE[0]}")/check-runners.sh" diff --git a/misc/frontier/README.md b/misc/frontier/README.md index 5dd92877da..43c89592e5 100644 --- a/misc/frontier/README.md +++ b/misc/frontier/README.md @@ -17,50 +17,58 @@ runners. The authoritative source of truth for whether a runner is running (and on which node) is EXE-based process discovery via `/proc/$pid/exe` — not any PID file. -`runner.node` is self-healing: `rebalance-runners.sh` calls `sync_runner_nodes` +`runner.node` is self-healing: `rebalance-runners` calls `sync_runner_nodes` at startup, which sweeps all nodes in parallel and corrects any stale `runner.node` files automatically. Runners occasionally die due to OLCF's firewall/proxy dropping long-lived TCP -connections to GitHub's broker. Run `rebalance-runners.sh` to restart and +connections to GitHub's broker. Run `rebalance-runners` to restart and redistribute them. Login nodes vary in stability — if a runner keeps dying on a particular node, move it to a quieter one (login01 tends to have low load). +All commands are run via the dispatcher at `misc/runner.sh`: + +```bash +bash misc/runner.sh frontier [args...] 
+``` + ## Quick Reference ```bash +R="bash misc/runner.sh frontier" + # List all runners with GitHub status, node, slurm, and memory usage -bash list-runners.sh +$R list-runners # Check runner health across all login nodes -bash check-runners.sh +$R check-runners # Rebalance runners across all 11 nodes (also restarts offline runners) -bash rebalance-runners.sh # dry run -APPLY=1 bash rebalance-runners.sh # execute -APPLY=1 FORCE=1 bash rebalance-runners.sh # move busy runners too +$R rebalance-runners # dry run +APPLY=1 $R rebalance-runners # execute +APPLY=1 FORCE=1 $R rebalance-runners # move busy runners too # Restart all runners in place (e.g. after a node reboot) -APPLY=1 bash restart-all.sh +APPLY=1 $R restart-all # Restart one specific runner -bash restart-runner.sh login01 /path/to/runner-dir +$R restart-runner login01 /path/to/runner-dir # Move a runner to a different login node -bash move-runner.sh frontier-1 login01 +$R move-runner frontier-1 login01 # Stop and deregister a runner -bash stop-runner.sh frontier-12 +$R stop-runner frontier-12 # Deploy a new runner on a specific node -bash make-runner.sh 23 login01 +$R make-runner 23 login01 # Deploy multiple runners across nodes (e.g. runners 23, 24, 25) -bash deploy-runners.sh 23 login01 login02 login03 +$R deploy-runners 23 login01 login02 login03 # Rerun failed CI workflows -bash ../common/rerun-failed.sh -APPLY=1 bash ../common/rerun-failed.sh +$R rerun-failed +APPLY=1 $R rerun-failed ``` ## Scripts @@ -68,29 +76,22 @@ APPLY=1 bash ../common/rerun-failed.sh | Script | Purpose | |---|---| | `config.sh` | Shared configuration: Frontier constants, `find_runner_dirs()`, and `sync_runner_nodes()`. Sources `../common/runner-lib.sh` for shared functions. | -| `check-runners.sh` | SSH to each login node, show Runner.Listener processes with name, status (idle/BUSY), slurm PATH, and RSS memory. 
| -| `list-runners.sh` | List all runners with GitHub API status, actual node (parallel SSH sweep), slurm status, and RSS. Flags stale `runner.node`. | -| `rebalance-runners.sh` | Sync node locations, compute optimal distribution, move runners across all 11 nodes. Handles offline runners. Dry run by default. | -| `restart-all.sh` | Restart all runners in place. Skips busy unless `FORCE=1`. Dry run by default. | -| `restart-runner.sh` | Stop and restart one runner on a given node. Usage: `restart-runner.sh ` | -| `move-runner.sh` | Move a runner to a different login node by name. Usage: `move-runner.sh ` | -| `stop-runner.sh` | Stop the runner process and deregister from GitHub. Usage: `stop-runner.sh ` | -| `make-runner.sh` | Download runner binary, register with GitHub, start on target node. Usage: `make-runner.sh [node]` | -| `deploy-runners.sh` | Deploy multiple runners across nodes in parallel. Usage: `deploy-runners.sh [node2 ...]` | -| `../common/rerun-failed.sh` | Rerun failed GitHub Actions workflows on open PRs and master. | +| `make-runner.sh` | Download runner binary, register with GitHub, start on target node. Usage: `make-runner [node]` | +| `deploy-runners.sh` | Deploy multiple runners across nodes in parallel. Usage: `deploy-runners [node2 ...]` | +| `../common/` | All other commands (`check-runners`, `list-runners`, `rebalance-runners`, etc.) live here and are dispatched via `misc/runner.sh`. | ## Troubleshooting **Runner goes OFFLINE repeatedly on the same node** — That login node may have process culling or high memory pressure. Move it to a different node: ```bash -bash move-runner.sh frontier-1 login01 +bash misc/runner.sh frontier move-runner frontier-1 login01 ``` **Multiple runners OFFLINE at once** — Usually a transient OLCF network blip -to GitHub. Run `rebalance-runners.sh` to recover and redistribute all at once. +to GitHub. Run `rebalance-runners` to recover and redistribute all at once. 
**Runner appears offline on GitHub but process is running** — GitHub status can -lag. `rebalance-runners.sh` uses EXE-based process discovery first: if a +lag. `rebalance-runners` uses EXE-based process discovery first: if a process is found running, it will stop it before restarting, preventing duplicate runner processes. diff --git a/misc/frontier/check-runners.sh b/misc/frontier/check-runners.sh deleted file mode 100755 index 3004508cc8..0000000000 --- a/misc/frontier/check-runners.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -# Check runner health across all Frontier login nodes. -# Thin wrapper — see misc/common/check-runners.sh for the implementation. -# -# Usage: bash check-runners.sh -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/check-runners.sh" diff --git a/misc/frontier/list-runners.sh b/misc/frontier/list-runners.sh deleted file mode 100755 index 1ef2e541e5..0000000000 --- a/misc/frontier/list-runners.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -# List all Frontier runners combining GitHub API status with live node process info. -# Thin wrapper — see misc/common/list-runners.sh for the implementation. -# -# Usage: bash list-runners.sh -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/list-runners.sh" diff --git a/misc/frontier/move-runner.sh b/misc/frontier/move-runner.sh deleted file mode 100755 index 418c292fba..0000000000 --- a/misc/frontier/move-runner.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -# Move a Frontier runner to a different login node. -# Thin wrapper — see misc/common/move-runner.sh for the implementation. 
-# -# Usage: bash move-runner.sh -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/move-runner.sh" diff --git a/misc/frontier/rebalance-runners.sh b/misc/frontier/rebalance-runners.sh deleted file mode 100755 index e567e10e5c..0000000000 --- a/misc/frontier/rebalance-runners.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash -# Automatically rebalance Frontier runners across login nodes. -# Thin wrapper — see misc/common/rebalance-runners.sh for the implementation. -# -# Usage: bash rebalance-runners.sh # dry run -# APPLY=1 bash rebalance-runners.sh # execute -# APPLY=1 FORCE=1 bash rebalance-runners.sh # move busy runners too -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -SITE_SCRIPT_DIR="$SCRIPT_DIR" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/rebalance-runners.sh" diff --git a/misc/frontier/restart-all.sh b/misc/frontier/restart-all.sh deleted file mode 100644 index dd81968bbe..0000000000 --- a/misc/frontier/restart-all.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash -# Restart all Frontier runners in place on their current nodes. -# Thin wrapper — see misc/common/restart-all.sh for the implementation. -# -# Usage: bash restart-all.sh # dry run -# APPLY=1 bash restart-all.sh # execute -# APPLY=1 FORCE=1 bash restart-all.sh # restart busy runners too -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/restart-all.sh" diff --git a/misc/frontier/restart-runner.sh b/misc/frontier/restart-runner.sh deleted file mode 100644 index dc891a1b7e..0000000000 --- a/misc/frontier/restart-runner.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -# Restart a single Frontier runner on a given node. -# Thin wrapper — see misc/common/restart-runner.sh for the implementation. 
-# -# Usage: bash restart-runner.sh -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/restart-runner.sh" diff --git a/misc/frontier/stop-runner.sh b/misc/frontier/stop-runner.sh deleted file mode 100755 index 7fc769c080..0000000000 --- a/misc/frontier/stop-runner.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -# Stop and deregister a Frontier runner. -# Thin wrapper — see misc/common/stop-runner.sh for the implementation. -# -# Usage: bash stop-runner.sh -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/stop-runner.sh" diff --git a/misc/phoenix/README.md b/misc/phoenix/README.md index 0107f48eac..36ec2e61f9 100644 --- a/misc/phoenix/README.md +++ b/misc/phoenix/README.md @@ -16,37 +16,45 @@ requires stopping the process on one node and starting it on another. Runners must be started with a **login shell** (`bash -l`) so they inherit `/opt/slurm/current/bin` in PATH (required for `sbatch`, `squeue`, `sacct`). +All commands are run via the dispatcher at `misc/runner.sh`: + +```bash +bash misc/runner.sh phoenix [args...] +``` + ## Quick Reference ```bash +R="bash misc/runner.sh phoenix" + # Check health (quick, one SSH per node) -bash check-runners.sh +$R check-runners # Detailed table with GitHub API status -bash list-runners.sh +$R list-runners # Auto-rebalance across nodes (also restarts offline runners) -bash rebalance-runners.sh # dry run -APPLY=1 bash rebalance-runners.sh # execute +$R rebalance-runners # dry run +APPLY=1 $R rebalance-runners # execute # Restart all runners in place (e.g. 
after a node reboot) -APPLY=1 bash restart-all.sh +APPLY=1 $R restart-all # Restart one specific runner -bash restart-runner.sh login-phoenix-gnr-2 /path/to/actions-runner-3 +$R restart-runner login-phoenix-gnr-2 /path/to/actions-runner-3 # Move a runner to a different login node -bash move-runner.sh phoenix-3 login-phoenix-gnr-1 +$R move-runner phoenix-3 login-phoenix-gnr-1 # Stop and deregister a runner -bash stop-runner.sh phoenix-3 +$R stop-runner phoenix-3 # Create a new runner (needs gh CLI with admin:org scope) -bash create-runner.sh phoenix-11 login-phoenix-gnr-2 +$R create-runner phoenix-11 login-phoenix-gnr-2 # Rerun failed CI on open PRs -bash rerun-failed.sh # dry run -APPLY=1 bash rerun-failed.sh # execute +$R rerun-failed # dry run +APPLY=1 $R rerun-failed # execute ``` ## Scripts @@ -54,20 +62,13 @@ APPLY=1 bash rerun-failed.sh # execute | Script | Purpose | |---|---| | `config.sh` | Shared config: Phoenix constants (`ORG`, `RUNNER_GROUP`, `RUNNER_LABEL`, `NODES`, `CGROUP_LIMIT`, `RUNNER_PARENT_DIRS`) and `find_runner_dirs()`. Sources `../common/runner-lib.sh` for shared functions. | -| `check-runners.sh` | Quick per-node health check. One SSH per node. Shows runner names, idle/BUSY status, slurm PATH, RSS, and total cgroup memory. | -| `list-runners.sh` | Table combining GitHub API status with live node info from a parallel SSH sweep. Shows slurm status and flags stale `runner.node` entries. | -| `rebalance-runners.sh` | Auto-compute optimal distribution and move runners. Prefers idle runners. Also places OFFLINE runners. Dry run by default. | -| `restart-all.sh` | Restart all runners in place. Skips BUSY runners unless `FORCE=1`. Dry run by default. | -| `restart-runner.sh` | Stop and restart one runner on a given node. Usage: `restart-runner.sh ` | -| `move-runner.sh` | Move a runner to a different login node by name. Usage: `move-runner.sh ` | -| `stop-runner.sh` | Stop the runner process and deregister from GitHub. 
Usage: `stop-runner.sh ` | -| `create-runner.sh` | Download runner binary, register with GitHub via API, start on target node. Usage: `create-runner.sh [parent-dir]` | -| `rerun-failed.sh` | Scan open non-draft PRs and master for failed workflows, rerun failed jobs only. | +| `create-runner.sh` | Download runner binary, register with GitHub via API, start on target node. Usage: `create-runner [parent-dir]` | +| `../common/` | All other commands (`check-runners`, `list-runners`, `rebalance-runners`, etc.) live here and are dispatched via `misc/runner.sh`. | ## Safety -- **Dry run by default**: `rebalance-runners.sh`, `restart-all.sh`, and - `rerun-failed.sh` show what they would do unless `APPLY=1` is set. +- **Dry run by default**: `rebalance-runners`, `restart-all`, and + `rerun-failed` show what they would do unless `APPLY=1` is set. - **Busy runner protection**: Scripts skip BUSY runners unless `FORCE=1`. - **Slurm PATH verification**: After starting, scripts verify `slurm` appears in the runner's PATH and warn if missing. @@ -84,13 +85,13 @@ Edit `config.sh` to change: ## Troubleshooting **"sbatch: command not found"** — Runner started without login shell. -Fix: `bash restart-runner.sh ` +Fix: `bash misc/runner.sh phoenix restart-runner ` **OOM kills** — Too many runners on one node. -Fix: `bash check-runners.sh` then `APPLY=1 bash rebalance-runners.sh` +Fix: `bash misc/runner.sh phoenix check-runners` then `APPLY=1 bash misc/runner.sh phoenix rebalance-runners` **Runner OFFLINE** — Process died or node rebooted. -Fix: `APPLY=1 bash rebalance-runners.sh` (auto-places on least-loaded node) +Fix: `APPLY=1 bash misc/runner.sh phoenix rebalance-runners` (auto-places on least-loaded node) **All runners down** — Node maintenance. 
-Fix: `APPLY=1 bash restart-all.sh` +Fix: `APPLY=1 bash misc/runner.sh phoenix restart-all` diff --git a/misc/phoenix/check-runners.sh b/misc/phoenix/check-runners.sh deleted file mode 100755 index a5e4dbd8a2..0000000000 --- a/misc/phoenix/check-runners.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -# Check runner health across all Phoenix login nodes. -# Thin wrapper — see misc/common/check-runners.sh for the implementation. -# -# Usage: bash check-runners.sh -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/check-runners.sh" diff --git a/misc/phoenix/list-runners.sh b/misc/phoenix/list-runners.sh deleted file mode 100755 index 0d2571803f..0000000000 --- a/misc/phoenix/list-runners.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -# List all Phoenix runners combining GitHub API status with live node process info. -# Thin wrapper — see misc/common/list-runners.sh for the implementation. -# -# Usage: bash list-runners.sh -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/list-runners.sh" diff --git a/misc/phoenix/move-runner.sh b/misc/phoenix/move-runner.sh deleted file mode 100644 index bc6d31d49b..0000000000 --- a/misc/phoenix/move-runner.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -# Move a Phoenix runner to a different login node. -# Thin wrapper — see misc/common/move-runner.sh for the implementation. -# -# Usage: bash move-runner.sh -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/move-runner.sh" diff --git a/misc/phoenix/rebalance-runners.sh b/misc/phoenix/rebalance-runners.sh deleted file mode 100755 index 3e961f622c..0000000000 --- a/misc/phoenix/rebalance-runners.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash -# Automatically rebalance Phoenix runners across login nodes. 
-# Thin wrapper — see misc/common/rebalance-runners.sh for the implementation. -# -# Usage: bash rebalance-runners.sh # dry run -# APPLY=1 bash rebalance-runners.sh # execute -# APPLY=1 FORCE=1 bash rebalance-runners.sh # move busy runners too -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -SITE_SCRIPT_DIR="$SCRIPT_DIR" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/rebalance-runners.sh" diff --git a/misc/phoenix/rerun-failed.sh b/misc/phoenix/rerun-failed.sh deleted file mode 100755 index 15ccc7fc46..0000000000 --- a/misc/phoenix/rerun-failed.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -exec "$(dirname "${BASH_SOURCE[0]}")/../common/rerun-failed.sh" "$@" diff --git a/misc/phoenix/restart-all.sh b/misc/phoenix/restart-all.sh deleted file mode 100755 index f11b5a1c6e..0000000000 --- a/misc/phoenix/restart-all.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash -# Restart all Phoenix runners in place on their current nodes. -# Thin wrapper — see misc/common/restart-all.sh for the implementation. -# -# Usage: bash restart-all.sh # dry run -# APPLY=1 bash restart-all.sh # execute -# APPLY=1 FORCE=1 bash restart-all.sh # restart busy runners too -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/restart-all.sh" diff --git a/misc/phoenix/restart-runner.sh b/misc/phoenix/restart-runner.sh deleted file mode 100755 index c0c9f73549..0000000000 --- a/misc/phoenix/restart-runner.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -# Restart a single Phoenix runner on a given node. -# Thin wrapper — see misc/common/restart-runner.sh for the implementation. 
-# -# Usage: bash restart-runner.sh -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/restart-runner.sh" diff --git a/misc/phoenix/stop-runner.sh b/misc/phoenix/stop-runner.sh deleted file mode 100644 index d4d1ea6b91..0000000000 --- a/misc/phoenix/stop-runner.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -# Stop and deregister a Phoenix runner. -# Thin wrapper — see misc/common/stop-runner.sh for the implementation. -# -# Usage: bash stop-runner.sh -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" -source "$SCRIPT_DIR/../common/stop-runner.sh" diff --git a/misc/runner.sh b/misc/runner.sh new file mode 100644 index 0000000000..501720a768 --- /dev/null +++ b/misc/runner.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# Dispatcher for GitHub Actions runner management scripts. +# +# Loads site-specific configuration then runs the requested command. +# Common commands live in misc/common/; site-specific commands live in +# misc//. All site-specific scripts source their own config, so +# the dispatcher only pre-loads config for common commands. +# +# Usage: bash misc/runner.sh [args...] +# +# Sites: frontier phoenix +# Common: check-runners list-runners move-runner rebalance-runners +# restart-all restart-runner stop-runner rerun-failed +# Frontier: make-runner deploy-runners +# Phoenix: create-runner +# +# Examples: +# bash misc/runner.sh frontier check-runners +# bash misc/runner.sh phoenix list-runners +# APPLY=1 bash misc/runner.sh frontier rebalance-runners +# bash misc/runner.sh frontier restart-runner login01 /path/to/runner +# bash misc/runner.sh frontier make-runner 23 login01 +# bash misc/runner.sh phoenix create-runner phoenix-11 login-phoenix-gnr-2 + +set -euo pipefail + +MISC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +SITE="${1:?Usage: $0 [args...]}" +CMD="${2:?Usage: $0 [args...]}" +shift 2 + +if [ ! 
-f "$MISC_DIR/$SITE/config.sh" ]; then + echo "ERROR: Unknown site '$SITE'. Known sites: frontier, phoenix" >&2 + exit 1 +fi + +# Site-specific scripts are standalone — they source their own config. +if [ -f "$MISC_DIR/$SITE/$CMD.sh" ]; then + exec bash "$MISC_DIR/$SITE/$CMD.sh" "$@" +fi + +# Common scripts need the site config pre-loaded. +if [ -f "$MISC_DIR/common/$CMD.sh" ]; then + SITE_SCRIPT_DIR="$MISC_DIR/$SITE" + source "$MISC_DIR/$SITE/config.sh" + source "$MISC_DIR/common/$CMD.sh" "$@" + exit +fi + +echo "ERROR: Unknown command '$CMD' for site '$SITE'." >&2 +echo "Common: check-runners list-runners move-runner rebalance-runners restart-all restart-runner stop-runner rerun-failed" >&2 +echo "Frontier: make-runner deploy-runners" >&2 +echo "Phoenix: create-runner" >&2 +exit 1 From 26b86eb6747f7bc2a36b762db45d8c1d1d8076b6 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 21:14:39 -0400 Subject: [PATCH 18/20] Move runner scripts into misc/runners/ Consolidates all runner management scripts under misc/runners/ to separate them from unrelated misc/ files. Structure: misc/runners/runner.sh dispatcher misc/runners/common/ shared scripts and library misc/runners/frontier/ Frontier config + make/deploy scripts misc/runners/phoenix/ Phoenix config + create script All paths are computed from ${BASH_SOURCE[0]} so no internal path changes are needed. 
Co-Authored-By: Claude Sonnet 4.6 --- misc/{ => runners}/common/README.md | 0 misc/{ => runners}/common/check-runners.sh | 0 misc/{ => runners}/common/list-runners.sh | 0 misc/{ => runners}/common/move-runner.sh | 0 misc/{ => runners}/common/rebalance-runners.sh | 0 misc/{ => runners}/common/rerun-failed.sh | 0 misc/{ => runners}/common/restart-all.sh | 0 misc/{ => runners}/common/restart-runner.sh | 0 misc/{ => runners}/common/runner-lib.sh | 0 misc/{ => runners}/common/stop-runner.sh | 0 misc/{ => runners}/frontier/README.md | 0 misc/{ => runners}/frontier/config.sh | 0 misc/{ => runners}/frontier/deploy-runners.sh | 0 misc/{ => runners}/frontier/make-runner.sh | 0 misc/{ => runners}/phoenix/README.md | 0 misc/{ => runners}/phoenix/config.sh | 0 misc/{ => runners}/phoenix/create-runner.sh | 0 misc/{ => runners}/runner.sh | 0 18 files changed, 0 insertions(+), 0 deletions(-) rename misc/{ => runners}/common/README.md (100%) rename misc/{ => runners}/common/check-runners.sh (100%) rename misc/{ => runners}/common/list-runners.sh (100%) rename misc/{ => runners}/common/move-runner.sh (100%) rename misc/{ => runners}/common/rebalance-runners.sh (100%) rename misc/{ => runners}/common/rerun-failed.sh (100%) rename misc/{ => runners}/common/restart-all.sh (100%) rename misc/{ => runners}/common/restart-runner.sh (100%) rename misc/{ => runners}/common/runner-lib.sh (100%) rename misc/{ => runners}/common/stop-runner.sh (100%) rename misc/{ => runners}/frontier/README.md (100%) rename misc/{ => runners}/frontier/config.sh (100%) rename misc/{ => runners}/frontier/deploy-runners.sh (100%) rename misc/{ => runners}/frontier/make-runner.sh (100%) rename misc/{ => runners}/phoenix/README.md (100%) rename misc/{ => runners}/phoenix/config.sh (100%) rename misc/{ => runners}/phoenix/create-runner.sh (100%) rename misc/{ => runners}/runner.sh (100%) diff --git a/misc/common/README.md b/misc/runners/common/README.md similarity index 100% rename from misc/common/README.md rename 
to misc/runners/common/README.md diff --git a/misc/common/check-runners.sh b/misc/runners/common/check-runners.sh similarity index 100% rename from misc/common/check-runners.sh rename to misc/runners/common/check-runners.sh diff --git a/misc/common/list-runners.sh b/misc/runners/common/list-runners.sh similarity index 100% rename from misc/common/list-runners.sh rename to misc/runners/common/list-runners.sh diff --git a/misc/common/move-runner.sh b/misc/runners/common/move-runner.sh similarity index 100% rename from misc/common/move-runner.sh rename to misc/runners/common/move-runner.sh diff --git a/misc/common/rebalance-runners.sh b/misc/runners/common/rebalance-runners.sh similarity index 100% rename from misc/common/rebalance-runners.sh rename to misc/runners/common/rebalance-runners.sh diff --git a/misc/common/rerun-failed.sh b/misc/runners/common/rerun-failed.sh similarity index 100% rename from misc/common/rerun-failed.sh rename to misc/runners/common/rerun-failed.sh diff --git a/misc/common/restart-all.sh b/misc/runners/common/restart-all.sh similarity index 100% rename from misc/common/restart-all.sh rename to misc/runners/common/restart-all.sh diff --git a/misc/common/restart-runner.sh b/misc/runners/common/restart-runner.sh similarity index 100% rename from misc/common/restart-runner.sh rename to misc/runners/common/restart-runner.sh diff --git a/misc/common/runner-lib.sh b/misc/runners/common/runner-lib.sh similarity index 100% rename from misc/common/runner-lib.sh rename to misc/runners/common/runner-lib.sh diff --git a/misc/common/stop-runner.sh b/misc/runners/common/stop-runner.sh similarity index 100% rename from misc/common/stop-runner.sh rename to misc/runners/common/stop-runner.sh diff --git a/misc/frontier/README.md b/misc/runners/frontier/README.md similarity index 100% rename from misc/frontier/README.md rename to misc/runners/frontier/README.md diff --git a/misc/frontier/config.sh b/misc/runners/frontier/config.sh similarity index 100% rename 
from misc/frontier/config.sh rename to misc/runners/frontier/config.sh diff --git a/misc/frontier/deploy-runners.sh b/misc/runners/frontier/deploy-runners.sh similarity index 100% rename from misc/frontier/deploy-runners.sh rename to misc/runners/frontier/deploy-runners.sh diff --git a/misc/frontier/make-runner.sh b/misc/runners/frontier/make-runner.sh similarity index 100% rename from misc/frontier/make-runner.sh rename to misc/runners/frontier/make-runner.sh diff --git a/misc/phoenix/README.md b/misc/runners/phoenix/README.md similarity index 100% rename from misc/phoenix/README.md rename to misc/runners/phoenix/README.md diff --git a/misc/phoenix/config.sh b/misc/runners/phoenix/config.sh similarity index 100% rename from misc/phoenix/config.sh rename to misc/runners/phoenix/config.sh diff --git a/misc/phoenix/create-runner.sh b/misc/runners/phoenix/create-runner.sh similarity index 100% rename from misc/phoenix/create-runner.sh rename to misc/runners/phoenix/create-runner.sh diff --git a/misc/runner.sh b/misc/runners/runner.sh similarity index 100% rename from misc/runner.sh rename to misc/runners/runner.sh From 78c4efad32f1a70022d82b091fb475ee583cb0f5 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 21:23:05 -0400 Subject: [PATCH 19/20] Unify create-runner into common; make make-runner a thin wrapper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit frontier/make-runner.sh and phoenix/create-runner.sh were the same concept with different names and minor behavioral differences. Unified into common/create-runner.sh with site-specific behavior driven by two config.sh hooks: TARBALL_CACHE_DIR Frontier: "$SHARED_DIR" — tarball cached on shared Lustre, reused across parallel deployments (prevents download races). Phoenix: "" — each runner downloads independently. 
runner_install_dir <name> [override-dir] Frontier: "$SHARED_DIR/<name>" Phoenix: auto-numbered "$RUNNER_PARENT_DIRS[0]/actions-runner-N" frontier/make-runner.sh is now a 3-line wrapper that derives the name "frontier-<N>" from a number argument and delegates to create-runner. deploy-runners.sh updated to reference $TARBALL_CACHE_DIR instead of $SHARED_DIR directly. Co-Authored-By: Claude Sonnet 4.6 --- misc/runners/common/README.md | 5 +- misc/runners/common/create-runner.sh | 114 ++++++++++++++++++++++++ misc/runners/frontier/README.md | 10 +-- misc/runners/frontier/config.sh | 11 +++ misc/runners/frontier/deploy-runners.sh | 6 +- misc/runners/frontier/make-runner.sh | 73 ++------------- misc/runners/phoenix/README.md | 19 ++-- misc/runners/phoenix/config.sh | 16 ++++ misc/runners/phoenix/create-runner.sh | 103 --------------------- misc/runners/runner.sh | 14 +-- 10 files changed, 177 insertions(+), 194 deletions(-) create mode 100644 misc/runners/common/create-runner.sh delete mode 100755 misc/runners/phoenix/create-runner.sh diff --git a/misc/runners/common/README.md b/misc/runners/common/README.md index 8cc2083054..2e840920a3 100644 --- a/misc/runners/common/README.md +++ b/misc/runners/common/README.md @@ -4,9 +4,9 @@ Site-agnostic scripts shared between the Frontier and Phoenix runner setups. All shared logic lives here; site directories contain only site-specific files (`config.sh` and scripts unique to that cluster). -Scripts are invoked via the dispatcher at `misc/runner.sh`: +Scripts are invoked via the dispatcher at `misc/runners/runner.sh`: ```bash -bash misc/runner.sh [args...] +bash misc/runners/runner.sh [args...] ``` ## Scripts @@ -22,3 +22,4 @@ bash misc/runner.sh [args...] | `move-runner.sh` | Move a runner to a different login node by name. Stops on current node, starts on target. Writes `runner.node`. | | `stop-runner.sh` | Stop a runner process and remove its GitHub registration. 
| | `rerun-failed.sh` | Rerun failed GitHub Actions workflows on open non-draft PRs and master. Dry run by default. | +| `create-runner.sh` | Download, register, and start a new runner. Requires `runner_install_dir()` and `TARBALL_CACHE_DIR` from site config. Usage: `create-runner [install-dir]` | diff --git a/misc/runners/common/create-runner.sh b/misc/runners/common/create-runner.sh new file mode 100644 index 0000000000..2e77c0e4bf --- /dev/null +++ b/misc/runners/common/create-runner.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash +# Create, register, and start a GitHub Actions runner. +# +# Sourced by misc/runners/runner.sh after config is loaded. +# Config must define runner_install_dir() and may set TARBALL_CACHE_DIR. +# +# runner_install_dir [override-dir] +# Returns the directory where the runner should be installed. +# If override-dir is given it is used directly; otherwise the site +# computes the path (e.g. SHARED_DIR/ on Frontier, or an +# auto-numbered actions-runner-N/ directory on Phoenix). +# +# TARBALL_CACHE_DIR +# If non-empty, the runner tarball is cached here and reused across +# installs (useful on Frontier where shared Lustre is visible from all +# login nodes). If empty or unset, a fresh download is made for each +# runner and the temporary file is removed after extraction. +# +# Usage: runner.sh create-runner [install-dir] +# name Runner name (e.g. 
frontier-23, phoenix-11) +# node Login node to start the runner on +# install-dir Optional: override the computed installation directory +set -euo pipefail + +RUNNER_NAME="${1:?Usage: create-runner [install-dir]}" +TARGET_NODE="${2:?Usage: create-runner [install-dir]}" +INSTALL_DIR_OVERRIDE="${3:-}" + +RUNNER_DIR=$(runner_install_dir "$RUNNER_NAME" "$INSTALL_DIR_OVERRIDE") +RUNNER_VERSION="${RUNNER_VERSION:-$(gh_latest_runner_version 2>/dev/null || echo "2.332.0")}" +TARBALL="actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" +TARBALL_URL="https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/${TARBALL}" + +echo "=== Creating runner ===" +echo " Name: $RUNNER_NAME" +echo " Node: $TARGET_NODE" +echo " Directory: $RUNNER_DIR" +echo " Org: $ORG" +echo " Group: $RUNNER_GROUP" +echo " Label: $RUNNER_LABEL" +echo " Version: $RUNNER_VERSION" +echo "" + +if [ -d "$RUNNER_DIR" ]; then + echo "ERROR: Directory already exists: $RUNNER_DIR" >&2 + exit 1 +fi + +# --- Download tarball --- +if [ -n "${TARBALL_CACHE_DIR:-}" ]; then + if [ ! -f "$TARBALL_CACHE_DIR/$TARBALL" ]; then + echo "==> Downloading runner v${RUNNER_VERSION} to cache..." + tmp="$TARBALL_CACHE_DIR/$TARBALL.tmp.$$" + curl -fsSL "$TARBALL_URL" -o "$tmp" + mv "$tmp" "$TARBALL_CACHE_DIR/$TARBALL" + fi + tarball_path="$TARBALL_CACHE_DIR/$TARBALL" +else + echo "==> Downloading runner v${RUNNER_VERSION}..." + mkdir -p "$RUNNER_DIR" + tarball_path="$RUNNER_DIR/runner-download.tmp.$$" + curl -fsSL "$TARBALL_URL" -o "$tarball_path" +fi + +# --- Extract --- +mkdir -p "$RUNNER_DIR" +echo "==> Extracting into $RUNNER_DIR..." +tar xzf "$tarball_path" -C "$RUNNER_DIR" +[ -z "${TARBALL_CACHE_DIR:-}" ] && rm -f "$tarball_path" + +if [ ! -f "$RUNNER_DIR/run.sh" ]; then + echo "ERROR: Extraction failed — run.sh not found in $RUNNER_DIR" >&2 + exit 1 +fi + +# --- Register --- +echo "==> Fetching registration token..." 
+token=$(gh_registration_token) +if [ -z "$token" ]; then + echo "ERROR: Failed to get registration token." >&2 + echo " Run: gh auth refresh -h github.com -s admin:org" >&2 + exit 1 +fi + +echo "==> Configuring runner..." +"$RUNNER_DIR/config.sh" \ + --url "https://github.com/$ORG" \ + --token "$token" \ + --name "$RUNNER_NAME" \ + --runnergroup "$RUNNER_GROUP" \ + --labels "$RUNNER_LABEL" \ + --work "_work" \ + --unattended \ + --replace +echo "==> Configured." + +# --- Start --- +echo "==> Starting on $TARGET_NODE..." +if start_runner "$TARGET_NODE" "$RUNNER_DIR"; then + echo "$TARGET_NODE" > "$RUNNER_DIR/runner.node" + pids=$(find_pids "$TARGET_NODE" "$RUNNER_DIR") + pid=${pids%% *} + if has_slurm "$TARGET_NODE" "$pid"; then + echo "==> OK: $RUNNER_NAME running on $TARGET_NODE (PID $pid, slurm in PATH)" + else + echo "==> WARNING: $RUNNER_NAME running on $TARGET_NODE (PID $pid) but slurm MISSING from PATH" + fi +else + echo "ERROR: $RUNNER_NAME did not start on $TARGET_NODE" >&2 + exit 1 +fi + +echo "" +echo "==> Log: $RUNNER_DIR/runner.log" diff --git a/misc/runners/frontier/README.md b/misc/runners/frontier/README.md index 43c89592e5..ca3bb07ce2 100644 --- a/misc/runners/frontier/README.md +++ b/misc/runners/frontier/README.md @@ -26,16 +26,16 @@ connections to GitHub's broker. Run `rebalance-runners` to restart and redistribute them. Login nodes vary in stability — if a runner keeps dying on a particular node, move it to a quieter one (login01 tends to have low load). -All commands are run via the dispatcher at `misc/runner.sh`: +All commands are run via the dispatcher at `misc/runners/runner.sh`: ```bash -bash misc/runner.sh frontier [args...] +bash misc/runners/runner.sh frontier [args...] 
``` ## Quick Reference ```bash -R="bash misc/runner.sh frontier" +R="bash misc/runners/runner.sh frontier" # List all runners with GitHub status, node, slurm, and memory usage $R list-runners @@ -78,14 +78,14 @@ APPLY=1 $R rerun-failed | `config.sh` | Shared configuration: Frontier constants, `find_runner_dirs()`, and `sync_runner_nodes()`. Sources `../common/runner-lib.sh` for shared functions. | | `make-runner.sh` | Download runner binary, register with GitHub, start on target node. Usage: `make-runner [node]` | | `deploy-runners.sh` | Deploy multiple runners across nodes in parallel. Usage: `deploy-runners [node2 ...]` | -| `../common/` | All other commands (`check-runners`, `list-runners`, `rebalance-runners`, etc.) live here and are dispatched via `misc/runner.sh`. | +| `../common/` | All other commands (`check-runners`, `list-runners`, `rebalance-runners`, etc.) live here and are dispatched via `misc/runners/runner.sh`. | ## Troubleshooting **Runner goes OFFLINE repeatedly on the same node** — That login node may have process culling or high memory pressure. Move it to a different node: ```bash -bash misc/runner.sh frontier move-runner frontier-1 login01 +bash misc/runners/runner.sh frontier move-runner frontier-1 login01 ``` **Multiple runners OFFLINE at once** — Usually a transient OLCF network blip diff --git a/misc/runners/frontier/config.sh b/misc/runners/frontier/config.sh index 2245f80593..ce6962c0ab 100755 --- a/misc/runners/frontier/config.sh +++ b/misc/runners/frontier/config.sh @@ -17,6 +17,17 @@ source "$(dirname "${BASH_SOURCE[0]}")/../common/runner-lib.sh" # --- Local filesystem --- +# Cache downloaded runner tarballs here so parallel deployments don't race. +TARBALL_CACHE_DIR="$SHARED_DIR" + +# Return the directory where a named runner should be installed. 
+# Args: $1 = runner name, $2 = optional override dir +runner_install_dir() { + local name="$1" override="${2:-}" + [ -n "$override" ] && echo "$override" && return + echo "$SHARED_DIR/$name" +} + # Find all runner directories on shared storage. # Prints: one directory path per line. find_runner_dirs() { diff --git a/misc/runners/frontier/deploy-runners.sh b/misc/runners/frontier/deploy-runners.sh index b7df0bcea9..f4ad3ecbb5 100755 --- a/misc/runners/frontier/deploy-runners.sh +++ b/misc/runners/frontier/deploy-runners.sh @@ -23,13 +23,13 @@ fi # concurrently and corrupt it. The tmp+mv ensures an atomic final placement. RUNNER_VERSION="${RUNNER_VERSION:-$(gh_latest_runner_version 2>/dev/null || echo "2.332.0")}" TARBALL="actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" -if [ ! -f "${SHARED_DIR}/${TARBALL}" ]; then +if [ ! -f "${TARBALL_CACHE_DIR}/${TARBALL}" ]; then echo "==> Downloading runner v${RUNNER_VERSION}..." - tmp="${SHARED_DIR}/${TARBALL}.tmp.$$" + tmp="${TARBALL_CACHE_DIR}/${TARBALL}.tmp.$$" curl -fsSL \ "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/${TARBALL}" \ -o "$tmp" - mv "$tmp" "${SHARED_DIR}/${TARBALL}" + mv "$tmp" "${TARBALL_CACHE_DIR}/${TARBALL}" fi export RUNNER_VERSION diff --git a/misc/runners/frontier/make-runner.sh b/misc/runners/frontier/make-runner.sh index 53ca8cdcdb..bfd664da65 100755 --- a/misc/runners/frontier/make-runner.sh +++ b/misc/runners/frontier/make-runner.sh @@ -1,71 +1,16 @@ #!/usr/bin/env bash -# Create, configure, and start a single GitHub Actions runner on Frontier. -# Usage: make-runner.sh [login-node] -# runner-number Sequential number for this runner (e.g. 12) -# login-node Node to run on (default: current host) -# Example: make-runner.sh 12 login03 +# Frontier convenience wrapper: derives runner name "frontier-" from a number. +# For full name control, use: runner.sh frontier create-runner +# +# Usage: runner.sh frontier make-runner [node] +# num Runner number (e.g. 
23 creates "frontier-23") +# node Login node to start on (default: current host) set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/config.sh" -RUNNER_VERSION="${RUNNER_VERSION:-$(gh_latest_runner_version 2>/dev/null || echo "2.332.0")}" +NUM="${1:?Usage: $0 [node]}" +NODE="${2:-$(hostname -s)}" -RUNNER_NUM="${1:?Usage: $0 [login-node]}" -TARGET_NODE="${2:-$(hostname -s)}" - -TARBALL="actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" -RUNNER_NAME="frontier-${RUNNER_NUM}" -RUNNER_DIR="${SHARED_DIR}/${RUNNER_NAME}" - -echo "==> Using runner version ${RUNNER_VERSION}" -echo "==> Setting up runner: ${RUNNER_NAME} on ${TARGET_NODE}" - -# --- Download tarball once to shared dir --- -if [ ! -f "${SHARED_DIR}/${TARBALL}" ]; then - echo "==> Downloading runner v${RUNNER_VERSION}..." - tmp="${SHARED_DIR}/${TARBALL}.tmp.$$" - curl -fsSL \ - "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/${TARBALL}" \ - -o "$tmp" - mv "$tmp" "${SHARED_DIR}/${TARBALL}" -fi - -# --- Extract (filesystem is shared across all nodes) --- -mkdir -p "${RUNNER_DIR}" -echo "==> Extracting runner into ${RUNNER_DIR}..." -tar xzf "${SHARED_DIR}/${TARBALL}" -C "${RUNNER_DIR}" - -if [ ! -f "${RUNNER_DIR}/run.sh" ]; then - echo "ERROR: Extraction failed — run.sh not found in ${RUNNER_DIR}" >&2 - exit 1 -fi - -# --- Configure --- -echo "==> Fetching registration token..." -REG_TOKEN=$(gh_registration_token) - -echo "==> Configuring runner..." -"${RUNNER_DIR}/config.sh" \ - --url "https://github.com/${ORG}" \ - --token "${REG_TOKEN}" \ - --name "${RUNNER_NAME}" \ - --labels "${RUNNER_LABEL}" \ - --runnergroup "${RUNNER_GROUP}" \ - --work "_work" \ - --unattended \ - --replace - -# --- Store which node this runner lives on --- -echo "${TARGET_NODE}" > "${RUNNER_DIR}/runner.node" - -# --- Start runner on target node --- -echo "==> Starting runner on ${TARGET_NODE}..." 
-if start_runner "${TARGET_NODE}" "${RUNNER_DIR}"; then - echo "==> Runner '${RUNNER_NAME}' is running on ${TARGET_NODE}." -else - echo "ERROR: Runner '${RUNNER_NAME}' did not start on ${TARGET_NODE}." >&2 - exit 1 -fi - -echo "==> Log: ${RUNNER_DIR}/runner.log" +source "$SCRIPT_DIR/../common/create-runner.sh" "frontier-$NUM" "$NODE" diff --git a/misc/runners/phoenix/README.md b/misc/runners/phoenix/README.md index 36ec2e61f9..3e632a2ae6 100644 --- a/misc/runners/phoenix/README.md +++ b/misc/runners/phoenix/README.md @@ -16,16 +16,16 @@ requires stopping the process on one node and starting it on another. Runners must be started with a **login shell** (`bash -l`) so they inherit `/opt/slurm/current/bin` in PATH (required for `sbatch`, `squeue`, `sacct`). -All commands are run via the dispatcher at `misc/runner.sh`: +All commands are run via the dispatcher at `misc/runners/runner.sh`: ```bash -bash misc/runner.sh phoenix [args...] +bash misc/runners/runner.sh phoenix [args...] ``` ## Quick Reference ```bash -R="bash misc/runner.sh phoenix" +R="bash misc/runners/runner.sh phoenix" # Check health (quick, one SSH per node) $R check-runners @@ -61,9 +61,8 @@ APPLY=1 $R rerun-failed # execute | Script | Purpose | |---|---| -| `config.sh` | Shared config: Phoenix constants (`ORG`, `RUNNER_GROUP`, `RUNNER_LABEL`, `NODES`, `CGROUP_LIMIT`, `RUNNER_PARENT_DIRS`) and `find_runner_dirs()`. Sources `../common/runner-lib.sh` for shared functions. | -| `create-runner.sh` | Download runner binary, register with GitHub via API, start on target node. Usage: `create-runner [parent-dir]` | -| `../common/` | All other commands (`check-runners`, `list-runners`, `rebalance-runners`, etc.) live here and are dispatched via `misc/runner.sh`. | +| `config.sh` | Shared config: Phoenix constants (`ORG`, `RUNNER_GROUP`, `RUNNER_LABEL`, `NODES`, `CGROUP_LIMIT`, `RUNNER_PARENT_DIRS`), `find_runner_dirs()`, and `runner_install_dir()`. Sources `../common/runner-lib.sh` for shared functions. 
| +| `../common/` | All commands (`check-runners`, `list-runners`, `create-runner`, `rebalance-runners`, etc.) live here and are dispatched via `misc/runners/runner.sh`. | ## Safety @@ -85,13 +84,13 @@ Edit `config.sh` to change: ## Troubleshooting **"sbatch: command not found"** — Runner started without login shell. -Fix: `bash misc/runner.sh phoenix restart-runner ` +Fix: `bash misc/runners/runner.sh phoenix restart-runner ` **OOM kills** — Too many runners on one node. -Fix: `bash misc/runner.sh phoenix check-runners` then `APPLY=1 bash misc/runner.sh phoenix rebalance-runners` +Fix: `bash misc/runners/runner.sh phoenix check-runners` then `APPLY=1 bash misc/runners/runner.sh phoenix rebalance-runners` **Runner OFFLINE** — Process died or node rebooted. -Fix: `APPLY=1 bash misc/runner.sh phoenix rebalance-runners` (auto-places on least-loaded node) +Fix: `APPLY=1 bash misc/runners/runner.sh phoenix rebalance-runners` (auto-places on least-loaded node) **All runners down** — Node maintenance. -Fix: `APPLY=1 bash misc/runner.sh phoenix restart-all` +Fix: `APPLY=1 bash misc/runners/runner.sh phoenix restart-all` diff --git a/misc/runners/phoenix/config.sh b/misc/runners/phoenix/config.sh index 40065099cc..2c5294bdd9 100755 --- a/misc/runners/phoenix/config.sh +++ b/misc/runners/phoenix/config.sh @@ -23,6 +23,22 @@ source "$(dirname "${BASH_SOURCE[0]}")/../common/runner-lib.sh" # --- Local filesystem --- +# No shared cache: each runner downloads its own tarball independently. +TARBALL_CACHE_DIR="" + +# Return the directory where a named runner should be installed. +# Auto-increments the actions-runner-N suffix within RUNNER_PARENT_DIRS[0]. 
+# Args: $1 = runner name (unused; directory is numbered, not named), $2 = optional override dir +runner_install_dir() { + local override="${2:-}" + [ -n "$override" ] && echo "$override" && return + local parent="${RUNNER_PARENT_DIRS[0]}" + local existing next_num + existing=$(ls -d "$parent"/actions-runner-* 2>/dev/null | sed 's/.*actions-runner-//' | sort -n | tail -1) + next_num=$(( ${existing:-0} + 1 )) + echo "$parent/actions-runner-$next_num" +} + # Find all runner directories on shared storage. # Prints: one directory path per line. find_runner_dirs() { diff --git a/misc/runners/phoenix/create-runner.sh b/misc/runners/phoenix/create-runner.sh deleted file mode 100755 index 8def3f339b..0000000000 --- a/misc/runners/phoenix/create-runner.sh +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env bash -# Create and register a new GitHub Actions runner on Phoenix. -# -# Downloads the runner binary, registers with MFlowCode org, and starts -# on the specified login node. Uses config.sh for org/group/label defaults. -# -# Prerequisites: gh CLI with admin:org scope (gh auth refresh -s admin:org) -# -# Usage: bash create-runner.sh [parent-dir] -# -# Examples: -# bash create-runner.sh phoenix-11 login-phoenix-gnr-2 -# bash create-runner.sh phoenix-12 login-phoenix-gnr-3 /storage/project/.../mfc-runners-2 - -set -euo pipefail -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/config.sh" - -if [ $# -lt 2 ]; then - echo "Usage: $0 [parent-dir]" - echo "" - echo " runner-name Name for the runner (e.g. 
phoenix-11)" - echo " node Login node (${NODES[*]})" - echo " parent-dir Parent directory (default: ${RUNNER_PARENT_DIRS[0]})" - exit 1 -fi - -runner_name="$1" -node="$2" -parent_dir="${3:-${RUNNER_PARENT_DIRS[0]}}" - -# Determine next available runner directory number -existing=$(ls -d "$parent_dir"/actions-runner-* 2>/dev/null | sed 's/.*actions-runner-//' | sort -n | tail -1) -next_num=$(( ${existing:-0} + 1 )) -runner_dir="$parent_dir/actions-runner-$next_num" - -echo "=== Creating Phoenix runner ===" -echo " Name: $runner_name" -echo " Node: $node" -echo " Directory: $runner_dir" -echo " Org: $ORG" -echo " Group: $RUNNER_GROUP" -echo " Label: $RUNNER_LABEL" -echo "" - -if [ -d "$runner_dir" ]; then - echo "ERROR: Directory already exists: $runner_dir" - exit 1 -fi - -# Registration token -echo "Getting registration token..." -token=$(gh_registration_token) -if [ -z "$token" ]; then - echo "ERROR: Failed to get token. Run: gh auth refresh -h github.com -s admin:org" - exit 1 -fi - -# Download runner -echo "Downloading latest runner binary..." -version=$(gh_latest_runner_version) -url="https://github.com/actions/runner/releases/download/v${version}/actions-runner-linux-x64-${version}.tar.gz" -echo " Version: $version" - -mkdir -p "$runner_dir" -cd "$runner_dir" -tmp="runner-download.tmp.$$" -curl -fsSL "$url" -o "$tmp" -tar xz < "$tmp" -rm -f "$tmp" -echo " Extracted." - -# Configure -echo "Configuring..." -./config.sh \ - --url "https://github.com/$ORG" \ - --token "$token" \ - --name "$runner_name" \ - --runnergroup "$RUNNER_GROUP" \ - --labels "$RUNNER_LABEL" \ - --work "_work" \ - --unattended \ - --replace -echo " Configured." - -# Start -echo "Starting on $node..." 
-if start_runner "$node" "$runner_dir"; then - echo "$node" > "$runner_dir/runner.node" - pids=$(find_pids "$node" "$runner_dir") - pid=${pids%% *} - if has_slurm "$node" "$pid"; then - echo " OK: PID $pid, slurm in PATH" - else - echo " WARNING: PID $pid but slurm MISSING from PATH" - fi -else - echo " ERROR: Failed to start." - echo " Try: ssh $node 'cd $runner_dir && setsid bash -lc \"nohup ./run.sh >> runner.log 2>&1 < /dev/null &\"'" -fi - -echo "" -echo "Created $runner_name at $runner_dir" diff --git a/misc/runners/runner.sh b/misc/runners/runner.sh index 501720a768..0de3f2d625 100644 --- a/misc/runners/runner.sh +++ b/misc/runners/runner.sh @@ -6,7 +6,7 @@ # misc//. All site-specific scripts source their own config, so # the dispatcher only pre-loads config for common commands. # -# Usage: bash misc/runner.sh [args...] +# Usage: bash misc/runners/runner.sh [args...] # # Sites: frontier phoenix # Common: check-runners list-runners move-runner rebalance-runners @@ -15,12 +15,12 @@ # Phoenix: create-runner # # Examples: -# bash misc/runner.sh frontier check-runners -# bash misc/runner.sh phoenix list-runners -# APPLY=1 bash misc/runner.sh frontier rebalance-runners -# bash misc/runner.sh frontier restart-runner login01 /path/to/runner -# bash misc/runner.sh frontier make-runner 23 login01 -# bash misc/runner.sh phoenix create-runner phoenix-11 login-phoenix-gnr-2 +# bash misc/runners/runner.sh frontier check-runners +# bash misc/runners/runner.sh phoenix list-runners +# APPLY=1 bash misc/runners/runner.sh frontier rebalance-runners +# bash misc/runners/runner.sh frontier restart-runner login01 /path/to/runner +# bash misc/runners/runner.sh frontier make-runner 23 login01 +# bash misc/runners/runner.sh phoenix create-runner phoenix-11 login-phoenix-gnr-2 set -euo pipefail From e0ad7714228f3fc224905b4475329814c747dd3c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Mar 2026 21:24:57 -0400 Subject: [PATCH 20/20] Direct new Phoenix runners to 
mfc-runners-2 All new runner installs now go to RUNNER_PARENT_DIRS[1]: /storage/project/r-sbryngelson3-0/sbryngelson3/mfc-runners-2 Co-Authored-By: Claude Sonnet 4.6 --- misc/runners/phoenix/config.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/runners/phoenix/config.sh b/misc/runners/phoenix/config.sh index 2c5294bdd9..11826019c2 100755 --- a/misc/runners/phoenix/config.sh +++ b/misc/runners/phoenix/config.sh @@ -32,7 +32,7 @@ TARBALL_CACHE_DIR="" runner_install_dir() { local override="${2:-}" [ -n "$override" ] && echo "$override" && return - local parent="${RUNNER_PARENT_DIRS[0]}" + local parent="${RUNNER_PARENT_DIRS[1]}" local existing next_num existing=$(ls -d "$parent"/actions-runner-* 2>/dev/null | sed 's/.*actions-runner-//' | sort -n | tail -1) next_num=$(( ${existing:-0} + 1 ))