Files
ethereum-rpc-docker/sync-status.sh
rob aefcd41a88 status sweep: cap check-health per node (timeout) so one stuck node can't wedge fleet rpc-update
A hung check-health.sh (aztec-testnet, looping on an unresponsive reference RPC)
blocked show-status.sh's parallel 'wait' for 3.5h, hanging the whole fleet
rpc-update and holding the deploy lock. Each curl was bounded (-m 3) and the
retry loop capped (3x), but the call itself wasn't time-bounded.
- sync-status.sh: wrap each check-health.sh call in 'timeout ${HC_TIMEOUT:-30}'
  (-> exit 124 + 'timeout' status on overrun).
- show-status.sh: wrap the whole per-node sync-status.sh call in
  'timeout ${SYNC_TIMEOUT:-60}' so the parallel wait can never block forever.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-19 03:57:18 +00:00

200 lines
8.2 KiB
Bash
Executable File

#!/bin/bash
# sync-status.sh <compose-name> [reference-endpoint]
#
# Probes the RPC paths published by $BASEPATH/<compose-name>.yml and prints a
# status for the first path that is not excluded by path-blacklist.txt.

# Directory this script lives in; .env, the compose files and the helper
# scripts are all resolved relative to it.
BASEPATH="$(dirname "$0")"
source "$BASEPATH/.env"

# load_blacklist FILE
# Fill the global array `blacklist` with the non-empty lines of FILE.
#   - Entries are used as extended regexes against each path, and an empty
#     regex matches everything, so blank lines are skipped.
#   - `|| [ -n "$line" ]` keeps a final line that lacks a trailing newline.
# Returns non-zero (leaving `blacklist` empty) when FILE cannot be opened.
load_blacklist() {
  local line
  blacklist=()
  while IFS= read -r line || [ -n "$line" ]; do
    [ -n "$line" ] || continue
    blacklist+=("$line")
  done < "$1"
}

load_blacklist "$BASEPATH/path-blacklist.txt"
# Choose the URL scheme for the probes. A non-empty NO_SSL selects plain
# HTTP and, only in that case, gives DOMAIN a 0.0.0.0 fallback; otherwise
# HTTPS is used and DOMAIN is left untouched.
case "${NO_SSL:+plain}" in
  plain)
    PROTO="http"
    DOMAIN="${DOMAIN:-0.0.0.0}"
    ;;
  *)
    PROTO="https"
    ;;
esac
# rpc_post URL JSON_PAYLOAD
# POST a JSON-RPC payload to URL and print the raw response body. The request
# is capped at one second (-m 1); the exit status is curl's.
rpc_post() {
  curl -L --ipv4 -m 1 -s -X POST -H "Content-Type: application/json" --data "$2" "$1"
}

# ref_lookup ARGS...
# Run the reference-rpc-endpoint.sh helper that sits next to this script.
ref_lookup() {
  "$BASEPATH/reference-rpc-endpoint.sh" "$@"
}

compose_file="$BASEPATH/$1.yml"

# A path is whatever follows the first "/" after "stripprefix.prefixes" in the
# compose file, up to the next double quote.
pathlist=$(grep -oP "stripprefix\.prefixes.*?/\K[^\"]+" "$compose_file")

# $pathlist is split on whitespace on purpose: one word per extracted path.
for path in $pathlist; do
  include=true
  for word in "${blacklist[@]}"; do
    # A blacklist hit excludes the path, except for "viction" paths, which
    # are exempt. `--` keeps an entry starting with "-" from being read as a
    # grep option.
    if grep -qE -- "$word" <<<"$path" && ! grep -qE "viction" <<<"$path"; then
      include=false
    fi
  done
  $include || continue

  RPC_URL="$PROTO://$DOMAIN/$path"

  # Resolve the protocol family from the compose x-upstreams chain label
  # (drpc slug) via the registry. Falls back to legacy path-substring
  # detection for composes without a resolved chain label.
  chain_slug=$(grep -oP '^\s*chain:\s*\K\S+' "$compose_file" 2>/dev/null | head -1)
  protocol=""
  if [ -n "$chain_slug" ]; then
    protocol=$(ref_lookup --protocol "$chain_slug" 2>/dev/null) || protocol=""
  fi
  if [ -z "$protocol" ]; then
    # Legacy detection by path substring (case-insensitive).
    if grep -qi "starknet" <<<"$path"; then
      protocol="starknet"
    elif grep -qi "aztec" <<<"$path"; then
      protocol="aztec"
    else
      protocol="eth"
    fi
  fi

  case "$protocol" in
    eth|starknet|aztec|cosmos)
      ;;
    *)
      # Protocol family known from the registry but no probe support
      # in check-health.sh yet - honest output instead of a bogus
      # eth_chainId error. Add a handler when we deploy such a node.
      echo "unsupported protocol: $protocol ($chain_slug)"
      exit 1
      ;;
  esac

  ref=''
  # Only the eth branch sets this; start empty so a value inherited from the
  # environment can never drive the lag lookup below.
  chain_id_decimal=''
  if [ -n "$2" ]; then
    # An explicit reference endpoint from the caller skips all detection.
    ref="$2"
  else
    case "$protocol" in
      aztec)
        # Aztec: resolve ref by rollup_version from node
        # (result.header.globalVariables.version).
        aztec_block_response=$(rpc_post "$RPC_URL" '{"jsonrpc":"2.0","method":"node_getBlock","params":["latest"],"id":1}')
        if [ $? -ne 0 ]; then
          echo "error: failed to get Aztec block from $RPC_URL"
          exit 1
        fi
        version_hex=$(printf '%s\n' "$aztec_block_response" | jq -r '.result.header.globalVariables.version' 2>/dev/null)
        if [ -z "$version_hex" ] || [ "$version_hex" = "null" ]; then
          echo "error: could not parse rollup version from Aztec node"
          exit 1
        fi
        # Version is a 32-byte hash (0x...); rollup_version in JSON is the low
        # 32 bits as decimal. Strip 0x and keep the last 8 hex chars.
        # ${var: -8} expands to "" when var is shorter than 8 characters, so
        # only truncate longer values.
        version_hex="${version_hex#0x}"
        if [ "${#version_hex}" -gt 8 ]; then
          version_hex="${version_hex: -8}"
        fi
        # Refuse anything that is not pure hex before handing it to $(( )).
        if ! [[ "$version_hex" =~ ^[0-9a-fA-F]+$ ]]; then
          echo "error: could not parse rollup version from Aztec node"
          exit 1
        fi
        version_decimal=$((16#$version_hex))
        ref=$(ref_lookup --rollup-version "$version_decimal" 2>/dev/null)
        if [ $? -ne 0 ] || [ -z "$ref" ]; then
          # Fallback: reference urls by chain slug from the compose label.
          if [ -n "$chain_slug" ]; then
            ref=$(ref_lookup --chain "$chain_slug" 2>/dev/null) || ref=""
          fi
        fi
        if [ -z "$ref" ]; then
          echo "error: no reference endpoint for Aztec rollup_version $version_decimal"
          exit 1
        fi
        ;;
      starknet)
        # Starknet chain ID detection.
        if ! chain_id_response=$(rpc_post "$RPC_URL" '{"jsonrpc":"2.0","method":"starknet_chainId","params":[],"id":1}'); then
          echo "error"
          exit 1
        fi
        chain_id=$(printf '%s\n' "$chain_id_response" | jq -r '.result' 2>/dev/null)
        # Map Starknet chain IDs to reference endpoints. Chain ID can be a
        # plain string or hex-encoded ASCII.
        # NOTE(review): the numeric lookup keys look like float-rounded
        # renderings of the hex chain ids; presumably they match the keys the
        # registry uses - confirm before changing them.
        case "$chain_id" in
          "SN_MAIN"|"0x534e5f4d41494e")
            ref=$(ref_lookup 23448594291968336)
            ;;
          "SN_SEPOLIA"|"0x534e5f5345504f4c4941")
            ref=$(ref_lookup 393402133025997800000000)
            ;;
          *)
            echo "error: unknown starknet chain $chain_id"
            exit 1
            ;;
        esac
        ;;
      cosmos)
        # Cosmos/CometBFT: no chainid. Reference by drpc slug (optional - the
        # node's own sync_info.catching_up is authoritative; ref only adds a
        # head-gap check).
        if [ -n "$chain_slug" ]; then
          ref=$(ref_lookup --chain "$chain_slug" 2>/dev/null) || ref=""
        fi
        ;;
      *)
        # Ethereum chain ID detection.
        if ! chain_id_response=$(rpc_post "$RPC_URL" '{"jsonrpc":"2.0","method":"eth_chainId","params":[],"id":1}'); then
          echo "error"
          exit 1
        fi
        chain_id=$(printf '%s\n' "$chain_id_response" | jq -r '.result' 2>/dev/null)
        if ! [[ "$chain_id" =~ ^0x[0-9a-fA-F]+$ ]]; then
          echo "error"
          exit 1
        fi
        chain_id_decimal=$((16#${chain_id#0x}))
        ref=$(ref_lookup "$chain_id_decimal")
        ;;
    esac
  fi

  # Per-chain dRPC lag thresholds (in blocks) so check-health.sh reports
  # online/lagging/syncing the same way the dRPC gateway does (homogeneous
  # status). Resolve by drpc slug first, then by chain id; empty =>
  # check-health uses its built-in defaults.
  lags=""
  if [ -n "$chain_slug" ]; then
    lags=$(ref_lookup --lags "$chain_slug" 2>/dev/null) || lags=""
  fi
  if [ -z "$lags" ] && [ -n "$chain_id_decimal" ]; then
    lags=$(ref_lookup --lags "$chain_id_decimal" 2>/dev/null) || lags=""
  fi
  if [ -n "$lags" ]; then
    # First space-separated field is the lagging threshold, last the syncing one.
    export LAGGING_LAG="${lags%% *}"
    export SYNCING_LAG="${lags##* }"
  else
    unset LAGGING_LAG SYNCING_LAG
  fi

  # Call the health check script with RPC_URL, chain-type flag and ref.
  # HARD CAP each call: a single slow/unresponsive reference RPC must never
  # hang the parallel fleet status sweep (show-status.sh waits on ALL of
  # these). On timeout -> exit 124, reported.
  HC_TIMEOUT="${HC_TIMEOUT:-30}"
  hc_args=("$RPC_URL")
  case "$protocol" in
    aztec|starknet|cosmos)
      hc_args+=("--$protocol")
      ;;
  esac
  # $ref stays unquoted on purpose: an empty ref must add no argument at all.
  # NOTE(review): it may also carry several whitespace-separated endpoints -
  # confirm against reference-rpc-endpoint.sh.
  timeout "$HC_TIMEOUT" "$BASEPATH/check-health.sh" "${hc_args[@]}" $ref
  rc=$?
  if [ "$rc" -eq 124 ]; then
    echo "timeout (health check exceeded ${HC_TIMEOUT}s)"
  fi
  # Only the first non-blacklisted path is ever probed.
  exit "$rc"
done

# No path survived the blacklist (or none was found in the compose file).
echo "unverified"
exit 1