sync-status: dRPC-homogeneous block-lag status + fix never-used reference fallbacks

Match the dRPC gateway's per-chain "how many blocks behind is ok" model instead of a
fixed 2s/5s timestamp tolerance:
- check-health.sh: compare the reference head vs local head by BLOCK NUMBER and classify
  with the chain's dRPC lag thresholds (LAGGING_LAG/SYNCING_LAG, in blocks, from
  chains.yaml). dRPC uses the two thresholds inconsistently across chains (sometimes
  lagging<syncing, sometimes the reverse), so the smaller is treated as the online
  boundary and the larger as the syncing/drop boundary. Defaults to 2/6 blocks when a
  chain has no thresholds.
- multicurl.sh: also skip responses with result:null (a lagging endpoint lacking the
  requested block) so the fallback reference URLs are actually tried. Previously the first
  endpoint's {"result":null} was accepted as success -> fallbacks never ran, and the null
  reference hash made check-health report false "forked" (the online/forked flapping).
- sync-status.sh: resolve the lag thresholds (by drpc slug or chain id) and export
  LAGGING_LAG/SYNCING_LAG.
- reference-rpc-endpoint.sh: add --lags and --block-time-ms lookups.
- reference-rpc-endpoint.json: regenerated with per-chain block_time_ms + lagging_lag +
  syncing_lag (additive).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-16 05:47:33 +00:00
parent df6c17f5cc
commit 1134a3774a
5 changed files with 1088 additions and 15 deletions

View File

@@ -189,12 +189,12 @@ if [ $? -eq 0 ]; then
response3=$(cat "$response_file3") response3=$(cat "$response_file3")
if $is_starknet; then if $is_starknet; then
latest_block_timestamp_decimal3=$(echo "$response3" | jq -r '.result.timestamp') ref_num=$(echo "$response3" | jq -r '.result.block_number // empty')
elif $is_aztec; then elif $is_aztec; then
latest_block_timestamp_decimal3=$(echo "$response3" | jq -r '.result.header.globalVariables.timestamp') ref_num=$(echo "$response3" | jq -r '.result.header.globalVariables.blockNumber // empty')
else else
latest_block_timestamp3=$(echo "$response3" | jq -r '.result.timestamp') ref_num_hex=$(echo "$response3" | jq -r '.result.number // empty')
latest_block_timestamp_decimal3=$((16#${latest_block_timestamp3#0x})) ref_num=$([ -n "$ref_num_hex" ] && printf '%d' "$ref_num_hex" 2>/dev/null)
fi fi
# echo "refer: $latest_block_timestamp_decimal3" # echo "refer: $latest_block_timestamp_decimal3"
@@ -205,25 +205,35 @@ if [ $? -eq 0 ]; then
response4=$(cat "$response_file4") response4=$(cat "$response_file4")
if $is_starknet; then if $is_starknet; then
latest_block_timestamp_decimal4=$(echo "$response4" | jq -r '.result.timestamp') local_num=$(echo "$response4" | jq -r '.result.block_number // empty')
elif $is_aztec; then elif $is_aztec; then
latest_block_timestamp_decimal4=$(echo "$response4" | jq -r '.result.header.globalVariables.timestamp') local_num=$(echo "$response4" | jq -r '.result.header.globalVariables.blockNumber // empty')
else else
latest_block_timestamp4=$(echo "$response4" | jq -r '.result.timestamp') local_num_hex=$(echo "$response4" | jq -r '.result.number // empty')
latest_block_timestamp_decimal4=$((16#${latest_block_timestamp4#0x})) local_num=$([ -n "$local_num_hex" ] && printf '%d' "$local_num_hex" 2>/dev/null)
fi fi
#echo "local: $latest_block_timestamp_decimal4" #echo "local: $latest_block_timestamp_decimal4"
rm "$response_file4" rm "$response_file4"
time_difference3=$(echo "scale=6; (${latest_block_timestamp_decimal3} - ${request_time3}) - (${latest_block_timestamp_decimal4} - ${request_time4})" | bc) # Lag in BLOCKS between the reference head and the local head
# (positive => local behind). Compare against dRPC's own per-chain
#echo "diff after network latency: $time_difference3 s" # thresholds (LAGGING_LAG / SYNCING_LAG from chains.yaml via
# sync-status.sh) so our status matches the dRPC gateway's view.
if (( $(echo "$time_difference3 < 2" | bc -l) )); then # dRPC uses the two thresholds inconsistently (sometimes
# lagging<syncing, sometimes the reverse), so treat the smaller as
# the online boundary and the larger as the syncing/drop boundary.
if [ -z "$ref_num" ] || [ -z "$local_num" ]; then
echo "error"
exit 1
fi
lag=$(( ref_num - local_num ))
lo=${LAGGING_LAG:-2}; hi=${SYNCING_LAG:-6}
if [ "$lo" -gt "$hi" ]; then tmp=$lo; lo=$hi; hi=$tmp; fi
if [ "$lag" -le "$lo" ]; then
echo "online" echo "online"
exit 0 exit 0
elif (( $(echo "$time_difference3 < 5" | bc -l) )); then elif [ "$lag" -le "$hi" ]; then
echo "lagging" echo "lagging"
exit 0 exit 0
else else

View File

@@ -43,7 +43,11 @@ for url in "${urls[@]}"; do
output=$(eval "curl -s ${options[@]@Q} '$url' --fail") output=$(eval "curl -s ${options[@]@Q} '$url' --fail")
if [[ $? -eq 0 ]]; then if [[ $? -eq 0 ]]; then
if cat "$temp_file" | jq -e 'has("error")' > /dev/null 2>&1; then # Skip and try the next reference URL when the response is a JSON-RPC error OR has a
# null result (a lagging endpoint that doesn't have the requested block/data yet).
# Without the result==null check the first endpoint's {"result":null} was accepted as
# success and the remaining fallback URLs were never tried.
if cat "$temp_file" | jq -e 'has("error") or (.result == null)' > /dev/null 2>&1; then
continue # Try the next URL continue # Try the next URL
fi fi

File diff suppressed because it is too large Load Diff

View File

@@ -44,6 +44,51 @@ if [ "$1" = "--protocol" ]; then
exit 0 exit 0
fi fi
# --block-time-ms <slug|chainid>
# Look up the expected block time (milliseconds) for a registry key (drpc slug) or a
# chain id (decimal, or 0x-prefixed hex) in "$json_file".
#   stdout: the entry's block_time_ms value.
#   exit:   0 when a value was found; 1 when the key argument is missing or no entry
#           carrying block_time_ms matches the key.
# NOTE(review): intended for callers that scale lag thresholds per chain; no caller of
# this option is visible in this change - confirm before relying on it.
if [ "$1" = "--block-time-ms" ]; then
  if [ $# -lt 2 ]; then
    echo "Usage: $0 --block-time-ms <slug|chainid>"
    exit 1
  fi
  key="$2"
  # Try by slug first; a missing entry or missing field yields an empty string.
  bt=$(jq -r --arg k "$key" '.[$k].block_time_ms // empty' "$json_file" 2>/dev/null)
  if [ -z "$bt" ]; then
    # Fall back to lookup by chain id (decimal; convert hex).
    idk="$key"
    if [[ "$idk" == 0x* ]]; then
      # printf prints "0" (or a partial value) and fails on malformed or out-of-range
      # hex such as "0xZZ"; discard that output so it is never mistaken for a real
      # chain id (previously "0xZZ" silently became a lookup of chain id 0).
      idk=$(printf '%d' "$idk" 2>/dev/null) || idk=""
    fi
    if [[ "$idk" =~ ^[0-9]+$ ]]; then
      bt=$(jq -r --arg id "$idk" 'first(.[] | select(.id == ($id | tonumber)) | .block_time_ms) // empty' "$json_file" 2>/dev/null)
    fi
  fi
  [ -z "$bt" ] && exit 1
  echo "$bt"
  exit 0
fi
# --lags <slug|chainid>
# Look up the dRPC lag thresholds (in BLOCKS) for a registry key (slug) or a chain id
# (decimal, or 0x-prefixed hex) in "$json_file".
#   stdout: "<lagging_lag> <syncing_lag>" (space separated).
#   exit:   0 when both thresholds were found; 1 when the key argument is missing or no
#           matching entry has both lagging_lag and syncing_lag set.
# Consumed by sync-status.sh, which exports the pair for check-health.sh so the reported
# online/lagging/syncing status follows the dRPC gateway's per-chain lag model.
if [ "$1" = "--lags" ]; then
  if [ $# -lt 2 ]; then
    echo "Usage: $0 --lags <slug|chainid>"
    exit 1
  fi
  key="$2"
  # Slug lookup; only emits a pair when BOTH thresholds are present.
  lags=$(jq -r --arg k "$key" 'if (.[$k].lagging_lag != null and .[$k].syncing_lag != null) then "\(.[$k].lagging_lag) \(.[$k].syncing_lag)" else empty end' "$json_file" 2>/dev/null)
  if [ -z "$lags" ]; then
    # Fall back to lookup by chain id (decimal; convert hex).
    idk="$key"
    if [[ "$idk" == 0x* ]]; then
      # printf prints "0" (or a partial value) and fails on malformed or out-of-range
      # hex such as "0xZZ"; discard that output so it is never mistaken for a real
      # chain id (previously "0xZZ" silently became a lookup of chain id 0).
      idk=$(printf '%d' "$idk" 2>/dev/null) || idk=""
    fi
    if [[ "$idk" =~ ^[0-9]+$ ]]; then
      lags=$(jq -r --arg id "$idk" 'first(.[] | select(.id == ($id | tonumber)) | select(.lagging_lag != null and .syncing_lag != null) | "\(.lagging_lag) \(.syncing_lag)") // empty' "$json_file" 2>/dev/null)
    fi
  fi
  [ -z "$lags" ] && exit 1
  echo "$lags"
  exit 0
fi
# Look up by rollup_version (for Aztec: version from result.header.globalVariables.version) # Look up by rollup_version (for Aztec: version from result.header.globalVariables.version)
if [ "$1" = "--rollup-version" ]; then if [ "$1" = "--rollup-version" ]; then
if [ $# -lt 2 ]; then if [ $# -lt 2 ]; then

View File

@@ -158,6 +158,24 @@ for path in $pathlist; do
fi fi
fi fi
# Per-chain dRPC lag thresholds (in blocks), exported as LAGGING_LAG / SYNCING_LAG for
# the check-health.sh call that follows, so it classifies online/lagging/syncing the
# same way the dRPC gateway does.
# Resolution order: drpc slug first, then decimal chain id. The helper prints
# "<lagging_lag> <syncing_lag>" and exits non-zero when it has no thresholds.
# When nothing resolves, both variables are unset so check-health.sh falls back to its
# built-in defaults and a value exported for a previous chain in this loop cannot leak
# into this one.
lags=""
if [ -n "$chain_slug" ]; then
  # The helper path is quoted so a BASEPATH containing whitespace is not word-split
  # (unquoted, the lookup failed silently and the thresholds were dropped).
  lags=$("$BASEPATH/reference-rpc-endpoint.sh" --lags "$chain_slug" 2>/dev/null) || lags=""
fi
if [ -z "$lags" ] && [ -n "$chain_id_decimal" ]; then
  lags=$("$BASEPATH/reference-rpc-endpoint.sh" --lags "$chain_id_decimal" 2>/dev/null) || lags=""
fi
if [ -n "$lags" ]; then
  # First space-separated field is the lagging threshold, last is the syncing one.
  export LAGGING_LAG="${lags%% *}"
  export SYNCING_LAG="${lags##* }"
else
  unset LAGGING_LAG SYNCING_LAG
fi
# Call the health check script with RPC_URL, ref, and chain-type flag # Call the health check script with RPC_URL, ref, and chain-type flag
if $is_aztec; then if $is_aztec; then
$BASEPATH/check-health.sh "$RPC_URL" --aztec $ref $BASEPATH/check-health.sh "$RPC_URL" --aztec $ref