status sweep: cap check-health per node (timeout) so one stuck node can't wedge fleet rpc-update
A hung check-health.sh (aztec-testnet, looping on an unresponsive reference RPC)
blocked show-status.sh's parallel 'wait' for 3.5h, hanging the whole fleet
rpc-update and holding the deploy lock. Each curl was bounded (-m 3) and the
retry loop capped (3x), but the check-health.sh invocation as a whole had no time limit.
- sync-status.sh: wrap each check-health.sh call in 'timeout ${HC_TIMEOUT:-30}'
  (seconds; on overrun the script exits 124 and prints a 'timeout' status line).
- show-status.sh: wrap the whole per-node sync-status.sh call in
  'timeout ${SYNC_TIMEOUT:-60}' (seconds) so the parallel wait can never block forever.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -19,7 +19,9 @@ any_failure=false
|
|||||||
# Function to run the command and handle the result
|
# Function to run the command and handle the result
|
||||||
check_sync_status() {
|
check_sync_status() {
|
||||||
local part=$1
|
local part=$1
|
||||||
result=$("$BASEPATH/sync-status.sh" "${part%.yml}")
|
# Cap the whole per-node branch (belt-and-suspenders over check-health's own cap), so no single
|
||||||
|
# node can ever block the 'wait' below — that is what wedged the fleet rpc-update for hours.
|
||||||
|
result=$(timeout "${SYNC_TIMEOUT:-60}" "$BASEPATH/sync-status.sh" "${part%.yml}")
|
||||||
|
|
||||||
code=0
|
code=0
|
||||||
|
|
||||||
|
|||||||
@@ -176,17 +176,22 @@ for path in $pathlist; do
|
|||||||
unset LAGGING_LAG SYNCING_LAG
|
unset LAGGING_LAG SYNCING_LAG
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Call the health check script with RPC_URL, ref, and chain-type flag
|
# Call the health check script with RPC_URL, ref, and chain-type flag.
|
||||||
|
# HARD CAP each call: a single slow/unresponsive reference RPC must never hang the parallel
|
||||||
|
# fleet status sweep (show-status.sh waits on ALL of these). On timeout -> exit 124, reported.
|
||||||
|
HC_TIMEOUT="${HC_TIMEOUT:-30}"
|
||||||
if $is_aztec; then
|
if $is_aztec; then
|
||||||
$BASEPATH/check-health.sh "$RPC_URL" --aztec $ref
|
timeout "$HC_TIMEOUT" $BASEPATH/check-health.sh "$RPC_URL" --aztec $ref
|
||||||
elif $is_starknet; then
|
elif $is_starknet; then
|
||||||
$BASEPATH/check-health.sh "$RPC_URL" --starknet $ref
|
timeout "$HC_TIMEOUT" $BASEPATH/check-health.sh "$RPC_URL" --starknet $ref
|
||||||
elif $is_cosmos; then
|
elif $is_cosmos; then
|
||||||
$BASEPATH/check-health.sh "$RPC_URL" --cosmos $ref
|
timeout "$HC_TIMEOUT" $BASEPATH/check-health.sh "$RPC_URL" --cosmos $ref
|
||||||
else
|
else
|
||||||
$BASEPATH/check-health.sh "$RPC_URL" $ref
|
timeout "$HC_TIMEOUT" $BASEPATH/check-health.sh "$RPC_URL" $ref
|
||||||
fi
|
fi
|
||||||
exit $?
|
rc=$?
|
||||||
|
[ "$rc" -eq 124 ] && echo "timeout (health check exceeded ${HC_TIMEOUT}s)"
|
||||||
|
exit "$rc"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user