diff --git a/scripts/prune-reth.sh b/scripts/prune-reth.sh index d3608fca..8d25493d 100755 --- a/scripts/prune-reth.sh +++ b/scripts/prune-reth.sh @@ -11,88 +11,183 @@ echo "Starting Reth pruning process for static files in $STATIC_FILES_DIR..." mkdir -p "$DELETE_DIR" -# Step 1: List base filenames (without .conf/.off) and sort by starting block number -# Find files, remove extensions, sort uniquely, extract block number for numeric sort, then get original base name -echo "Finding and sorting static files..." -base_files=$(find "$STATIC_FILES_DIR" -maxdepth 1 -type f \( -name '*.conf' -o -name '*.off' -o -name '*[0-9]' \) | \ - sed -E "s/\.(conf|off)$//" | \ - sort -u | \ - awk -F_ '{print $NF+0, $0}' | \ - sort -n | \ - cut -d" " -f2-) +# Step 1: Find all unique base filenames (static_file_{group}_{start}_{end}) +echo "Finding unique static file base names..." +# Use find to get all relevant files, strip extensions, sort uniquely +# Ensure the base name includes the full path for mv later +unique_base_files=$(find "$STATIC_FILES_DIR" -maxdepth 1 -type f -name 'static_file_*_*_*' | \ + sed -E 's/\.(conf|off)$//' | \ + sort -u) -if [ -z "$base_files" ]; then - echo "No static files found to process." +if [ -z "$unique_base_files" ]; then + echo "No static files found matching the pattern 'static_file_*_*_*'." exit 0 fi -# Convert base_files to an array for easier manipulation -readarray -t base_files_array <<< "$base_files" -echo "Found ${#base_files_array[@]} unique base file ranges." +# Convert to array +readarray -t unique_base_files_array <<< "$unique_base_files" +echo "Found ${#unique_base_files_array[@]} unique base file ranges across all groups." -# Step 3: Group files by prefix and block range, keeping only the last two block ranges -declare -A file_groups +# Step 2: Group files by group_name (headers, receipts, transactions) +declare -A groups +echo "Grouping files by type (headers, receipts, transactions)..." +for base in "${unique_base_files_array[@]}"; do + filename=$(basename "$base") # Get just the filename part + # Extract group name assuming format static_file_{group_name}_{startblock}_{endblock} + group_name=$(echo "$filename" | cut -d_ -f3) -# Group files by prefix -echo "Grouping files by prefix..." -for base in "${base_files_array[@]}"; do - prefix=$(echo "$base" | sed -E "s/_([0-9]+)$//") # Get everything before the block range - block_range=$(echo "$base" | sed -E "s/.*_([0-9]+)$//") # Get the block range - file_groups["$prefix"]+="$block_range:$base " + # Store the full path base name, grouped by the extracted group name + if [[ "$group_name" == "headers" || "$group_name" == "receipts" || "$group_name" == "transactions" ]]; then + groups["$group_name"]+="$base " # Append base path with a space separator + else + echo "Warning: Skipping file with unexpected group name: $base" + fi done -# Step 4: Process each group -echo "Processing file groups to identify files for removal..." +# Step 3: Process each group according to retention rules moved_count=0 -for prefix in "${!file_groups[@]}"; do - # Read ranges into an array, sorting numerically by block range (the part before ':') - readarray -t block_ranges < <(echo "${file_groups[$prefix]}" | tr ' ' '\n' | sort -t: -k1,1n) +# Define the expected groups +declare -a group_names=("headers" "receipts" "transactions") - num_files=${#block_ranges[@]} - echo "Processing group '$prefix' with $num_files ranges." +echo "Processing file groups..." +for group_name in "${group_names[@]}"; do + # Get the space-separated list of base paths for the current group, default to empty string if group doesn't exist + group_bases_str="${groups[$group_name]:-}" - # Keep the last 2 block ranges (or fewer if less than 2 exist) - keep_count=2 - if [ "$num_files" -le "$keep_count" ]; then - echo "Keeping all files for group '$prefix' as there are $num_files ranges (<= $keep_count)." - continue - fi + if [ -z "$group_bases_str" ]; then + echo "No files found for group '$group_name'." + echo "--- Finished processing group '$group_name' ---" + continue + fi - num_to_move=$((num_files - keep_count)) - echo "Identified $num_to_move ranges to move for group '$prefix'." + # Sort base names within the group numerically by start block + # Use process substitution, awk for extraction/sorting, and readarray + readarray -t sorted_bases < <( \ + echo "$group_bases_str" | tr ' ' '\n' | \ + awk -F_ '{ + # Extract filename from full path if necessary + split($0, path_parts, "/"); + filename = path_parts[length(path_parts)]; + # Split filename by underscore and get the start block (4th field) + split(filename, name_parts, "_"); + start_block = name_parts[4]; + # Print start block (as number) and the original full base path + print start_block+0, $0 + }' | \ + sort -n | \ + cut -d' ' -f2- \ + ) - # Get the ranges to move (all except the last 'keep_count') - files_to_move=("${block_ranges[@]:0:$num_to_move}") + num_files=${#sorted_bases[@]} + echo "Processing group '$group_name' with $num_files ranges." - # Move files for the current group - for file_range in "${files_to_move[@]}"; do - base="${file_range#*:}" # Remove block range part, keeping the full filename path + # Use an associative array to track which base paths to keep (for efficient lookup) + declare -A files_to_keep + # Use a standard array to store base paths to move + files_to_move=() - # Handle files with extensions .conf and .off first - for ext in .conf .off; do - file="${base}${ext}" - if [[ -f "$file" ]]; then - echo "Moving $file to $DELETE_DIR" - mv "$file" "$DELETE_DIR/" - moved_count=$((moved_count + 1)) - fi + # --- Apply Retention Rules --- + # Rule 1: Always keep the _0_499999 range if it exists + first_range_kept=false + for base in "${sorted_bases[@]}"; do + filename=$(basename "$base") + if [[ "$filename" == *"_0_499999" ]]; then + # Check if already marked to avoid duplicate messages (though harmless) + if [[ -z "${files_to_keep[$base]}" ]]; then + echo "Marking first range '$filename' to keep for group '$group_name'." + files_to_keep["$base"]=1 # Mark this base path for keeping + first_range_kept=true + fi + # Don't break here; let it potentially be kept by Rule 2 as well if it's one of the last two + fi + done + # Add a warning if the expected first range wasn't found (and there were files) + if ! $first_range_kept && [[ $num_files -gt 0 ]]; then + echo "Warning: Did not find the expected first range (_0_499999) for group '$group_name'." + fi + + # Rule 2: Keep the last two ranges (sorted by start block) + keep_last_count=2 + # Determine how many to actually keep (can't keep 2 if only 0 or 1 exist) + num_to_keep_last=$((num_files < keep_last_count ? num_files : keep_last_count)) + + if [[ $num_to_keep_last -gt 0 ]]; then + echo "Marking last $num_to_keep_last range(s) to keep for group '$group_name':" + # Calculate the starting index for the last 'num_to_keep_last' elements + start_index=$((num_files - num_to_keep_last)) + # Loop through the indices of the ranges to keep + for (( i=start_index; i