From aefcd41a88a8a3a2ffe5a268b22c6c9806163c34 Mon Sep 17 00:00:00 2001 From: rob Date: Fri, 19 Jun 2026 03:52:28 +0000 Subject: [PATCH] status sweep: cap check-health per node (timeout) so one stuck node can't wedge fleet rpc-update A hung check-health.sh (aztec-testnet, looping on an unresponsive reference RPC) blocked show-status.sh's parallel 'wait' for 3.5h, hanging the whole fleet rpc-update and holding the deploy lock. Each curl was bounded (-m 3) and the retry loop capped (3x), but the call itself wasn't time-bounded. - sync-status.sh: wrap each check-health.sh call in 'timeout ${HC_TIMEOUT:-30}' (-> exit 124 + 'timeout' status on overrun). - show-status.sh: wrap the whole per-node sync-status.sh call in 'timeout ${SYNC_TIMEOUT:-60}' so the parallel wait can never block forever. Co-Authored-By: Claude Opus 4.8 (1M context) --- show-status.sh | 4 +++- sync-status.sh | 17 +++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/show-status.sh b/show-status.sh index b2e8d10b..af190464 100755 --- a/show-status.sh +++ b/show-status.sh @@ -19,7 +19,9 @@ any_failure=false # Function to run the command and handle the result check_sync_status() { local part=$1 - result=$("$BASEPATH/sync-status.sh" "${part%.yml}") + # Cap the whole per-node branch (belt-and-suspenders over check-health's own cap), so no single + # node can ever block the 'wait' below — that is what wedged the fleet rpc-update for hours. + result=$(timeout "${SYNC_TIMEOUT:-60}" "$BASEPATH/sync-status.sh" "${part%.yml}") code=0 diff --git a/sync-status.sh b/sync-status.sh index 699a233d..1f0058ec 100755 --- a/sync-status.sh +++ b/sync-status.sh @@ -176,17 +176,22 @@ for path in $pathlist; do unset LAGGING_LAG SYNCING_LAG fi - # Call the health check script with RPC_URL, ref, and chain-type flag + # Call the health check script with RPC_URL, ref, and chain-type flag. + # HARD CAP each call: a single slow/unresponsive reference RPC must never hang the parallel + # fleet status sweep (show-status.sh waits on ALL of these). On timeout -> exit 124, reported. + HC_TIMEOUT="${HC_TIMEOUT:-30}" if $is_aztec; then - $BASEPATH/check-health.sh "$RPC_URL" --aztec $ref + timeout "$HC_TIMEOUT" $BASEPATH/check-health.sh "$RPC_URL" --aztec $ref elif $is_starknet; then - $BASEPATH/check-health.sh "$RPC_URL" --starknet $ref + timeout "$HC_TIMEOUT" $BASEPATH/check-health.sh "$RPC_URL" --starknet $ref elif $is_cosmos; then - $BASEPATH/check-health.sh "$RPC_URL" --cosmos $ref + timeout "$HC_TIMEOUT" $BASEPATH/check-health.sh "$RPC_URL" --cosmos $ref else - $BASEPATH/check-health.sh "$RPC_URL" $ref + timeout "$HC_TIMEOUT" $BASEPATH/check-health.sh "$RPC_URL" $ref fi - exit $? + rc=$? + [ "$rc" -eq 124 ] && echo "timeout (health check exceeded ${HC_TIMEOUT}s)" + exit "$rc" fi done