provisioning/taskservs/polkadot/validator/default/validator-monitor.sh.j2

375 lines
12 KiB
Plaintext
Raw Permalink Normal View History

#!/bin/bash
# Info: Polkadot Validator Monitoring Script
# Author: Provisioning System
#
# Usage: <script> {report|health|system|node|validator|keys|network|blocks|metrics}
# With no argument the "report" command is run.
#
# NOTE: the "{{ ... }}" expressions in this file are template placeholders
# (the file name ends in .j2); presumably they are replaced with concrete
# values when the template is rendered - confirm against the provisioning
# tooling.
#
# Stop at the first command that fails outside of a condition/&&/|| list.
set -e
# Chain name; shown in the report header and in the usage text.
CHAIN="{{ polkadot_validator.network.chain }}"
# Validator name; shown in the report header and in the usage text.
VALIDATOR_NAME="{{ polkadot_validator.name }}"
# Local port queried at http://localhost:<port>/metrics by the metrics check.
PROMETHEUS_PORT="{{ polkadot_validator.monitoring.prometheus_port }}"
# File that log() appends every message to (in addition to stdout).
LOG_FILE="/var/log/polkadot/validator-monitor.log"
# Logging function
#
# Prints a timestamped message on stdout and appends the same line to
# $LOG_FILE.
# Globals:   LOG_FILE (read) - path of the log file to append to
#            _LOG_WRITE_WARNED (written) - set once a write failure was reported
# Arguments: $1 - message text
# Outputs:   "<YYYY-mm-dd HH:MM:SS> - <message>" on stdout; a single warning
#            on stderr the first time the log file cannot be written
# Returns:   always 0. The script runs under `set -e`, so a log file that is
#            missing or unwritable must not abort the monitoring run.
log() {
  local line
  line="$(date '+%Y-%m-%d %H:%M:%S') - ${1-}"

  # The console copy is emitted unconditionally.
  printf '%s\n' "$line"

  # Best-effort file copy: create the parent directory if needed, then append.
  # The group redirect also hides the shell's own ">>" error message.
  if ! {
    mkdir -p -- "$(dirname -- "${LOG_FILE:-}")" \
      && printf '%s\n' "$line" >> "${LOG_FILE:-}"
  } 2>/dev/null; then
    if [ -z "${_LOG_WRITE_WARNED:-}" ]; then
      _LOG_WRITE_WARNED=1
      printf 'warning: cannot write to log file: %s\n' "${LOG_FILE:-}" >&2
    fi
  fi

  return 0
}
# Check system resources
#
# Logs a snapshot of CPU, memory, disk and load figures for this host.
# Globals:   none written (all working variables are local)
# Arguments: none
# Outputs:   one log line per metric, then an empty line on stdout
# Returns:   status of the final echo (0)
check_system_resources() {
  local cpu_usage memory_usage disk_usage load_avg

  log "=== System Resources ==="

  # CPU usage: second whitespace-separated field of top's "Cpu(s)" summary
  # line, with anything from the first "%" onwards removed.
  cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
  log "CPU Usage: ${cpu_usage}%"

  # Memory usage: used/total from the second line of `free -m`, in percent.
  memory_usage=$(free -m | awk 'NR==2{printf "%.1f%%", $3*100/$2}')
  log "Memory Usage: $memory_usage"

  # Disk usage of the filesystem holding the validator base path.
  # -P forces one output line per filesystem; without it a long device name
  # is wrapped onto its own line and field 5 of line 2 is empty. The path is
  # quoted (and preceded by --) so spaces or a leading dash cannot break it.
  disk_usage=$(df -hP -- "{{ polkadot_validator.base_path }}" | awk 'NR==2{print $5}')
  log "Disk Usage: $disk_usage"

  # Load average: everything after "load average:" in the uptime output.
  load_avg=$(uptime | awk -F'load average:' '{print $2}')
  log "Load Average:$load_avg"

  echo ""
}
# Check node health
#
# Verifies that the polkadot-validator systemd unit is active and that the
# local JSON-RPC endpoint answers system_health, then logs the sync state and
# peer count.
# Globals:   none written (all working variables are local)
# Arguments: none
# Outputs:   log lines, then an empty line on stdout on success
# Returns:   0 when the service runs and a health result was obtained,
#            1 when the service is not active or no health result is available
check_node_health() {
  local health is_syncing peers should_have_peers

  log "=== Node Health ==="

  # Service status
  if systemctl is-active --quiet polkadot-validator; then
    log "✅ Validator service: Running"
  else
    log "❌ Validator service: Not running"
    return 1
  fi

  # RPC health check.
  # - `// empty` turns a missing/null result (e.g. a JSON-RPC error reply)
  #   into an empty string instead of the literal text "null".
  # - `|| health=""` keeps a jq failure (non-JSON reply, jq missing) from
  #   aborting the script under `set -e`; it is reported below instead.
  # - --max-time bounds the wait if the node accepts the connection but
  #   never answers.
  health=$(curl -s --max-time 10 -H "Content-Type: application/json" \
    -d '{"id":1, "jsonrpc":"2.0", "method": "system_health", "params":[]}' \
    http://localhost:9933 | jq -c '.result // empty' 2>/dev/null) || health=""

  if [ -z "$health" ]; then
    log "❌ Cannot reach node RPC"
    return 1
  fi

  is_syncing=$(printf '%s\n' "$health" | jq -r '.isSyncing' 2>/dev/null || echo "true")
  peers=$(printf '%s\n' "$health" | jq -r '.peers // 0' 2>/dev/null || echo "0")
  should_have_peers=$(printf '%s\n' "$health" | jq -r '.shouldHavePeers' 2>/dev/null || echo "true")

  # The numeric comparison below needs a plain non-negative integer.
  case "$peers" in
    '' | *[!0-9]*) peers=0 ;;
  esac

  log "Syncing: $is_syncing"
  log "Peers: $peers"
  log "Should have peers: $should_have_peers"

  if [ "$is_syncing" = "false" ] && [ "$peers" -gt 0 ]; then
    log "✅ Node is healthy and synced"
  else
    log "⚠️ Node may have sync issues"
  fi

  echo ""
}
# Check validator status
#
# Asks the local JSON-RPC endpoint for the chain name, node version and node
# name, logging each answer as it arrives.
# Globals:   CHAIN_INFO, VERSION, NODE_NAME (written) - the `.result` value
#            of system_chain, system_version and system_name respectively
# Arguments: none
# Outputs:   log lines, then an empty line on stdout
check_validator_status() {
  local entry rpc_method var_name label answer

  log "=== Validator Status ==="

  # One row per query: <rpc method>|<global that receives it>|<log label>
  for entry in \
    "system_chain|CHAIN_INFO|Chain" \
    "system_version|VERSION|Version" \
    "system_name|NODE_NAME|Node name"; do
    IFS='|' read -r rpc_method var_name label <<<"$entry"

    # Plain assignment on purpose: its exit status is the pipeline's, exactly
    # as when each query was written out by hand.
    answer=$(curl -s -H "Content-Type: application/json" \
      -d "{\"id\":1, \"jsonrpc\":\"2.0\", \"method\": \"${rpc_method}\", \"params\":[]}" \
      http://localhost:9933 | jq -r '.result' 2>/dev/null)

    printf -v "$var_name" '%s' "$answer"
    log "${label}: ${answer}"
  done

  # Active-set membership cannot be determined from these calls.
  log "Note: Use Polkadot.js Apps or polkadot-js-api to check validator active status"
  echo ""
}
# Check session keys
#
# Reads the stored session keys, asks the node (author_hasSessionKeys)
# whether it holds them, and logs the age of the keys file.
# Globals:   none written (all working variables are local)
# Arguments: $1 - optional path of the session keys file; defaults to the
#                 templated keys_file setting
# Outputs:   log lines, then an empty line on stdout
# Returns:   0 (problems are reported through log lines only)
check_session_keys() {
  local keys_file="{{ polkadot_validator.session_keys.keys_file | default('/var/lib/polkadot/session-keys') }}"
  local session_keys payload has_keys now file_time age_hours

  if [ -n "${1:-}" ]; then
    keys_file="$1"
  fi

  log "=== Session Keys ==="

  if [ ! -f "$keys_file" ]; then
    log "❌ Session keys file not found"
    echo ""
    return 0
  fi

  # Drop all whitespace (trailing newline, CR from CRLF files, padding) so it
  # cannot end up inside the JSON string.
  # NOTE(review): assumes the file holds one hex string - confirm.
  session_keys=$(tr -d '[:space:]' < "$keys_file") || session_keys=""
  log "Session keys file exists"

  if [ -z "$session_keys" ]; then
    log "❌ Session keys file is empty"
  else
    log "Keys: ${session_keys:0:20}..."

    # Check if keys are loaded in node. jq builds the request body so the
    # file content is always a correctly escaped JSON string.
    has_keys="false"
    payload=$(jq -cn --arg keys "$session_keys" \
      '{id: 1, jsonrpc: "2.0", method: "author_hasSessionKeys", params: [$keys]}' \
      2>/dev/null) || payload=""
    if [ -n "$payload" ]; then
      has_keys=$(curl -s --max-time 10 -H "Content-Type: application/json" \
        -d "$payload" \
        http://localhost:9933 | jq -r '.result' 2>/dev/null) || has_keys="false"
    fi

    if [ "$has_keys" = "true" ]; then
      log "✅ Session keys are loaded in the node"
    else
      log "❌ Session keys are NOT loaded in the node"
    fi
  fi

  # Check key age from the file's modification time. When stat fails the age
  # is reported as unknown rather than computed from the epoch.
  now=$(date +%s)
  if file_time=$(stat -c %Y -- "$keys_file" 2>/dev/null); then
    age_hours=$(((now - file_time) / 3600))
    log "Session keys age: $((age_hours / 24)) days, $((age_hours % 24)) hours"
  else
    log "Session keys age: unknown"
  fi

  echo ""
}
# Check network connectivity
#
# Fetches system_networkState from the local JSON-RPC endpoint, logs how many
# peers are connected and lists up to five of them (first 20 characters of
# each key).
# Globals:   NETWORK_STATE, PEER_COUNT (written)
# Arguments: none
# Outputs:   log lines, then an empty line on stdout
check_network() {
  log "=== Network Connectivity ==="

  # Get network state
  NETWORK_STATE=$(curl -s -H "Content-Type: application/json" \
    -d '{"id":1, "jsonrpc":"2.0", "method": "system_networkState", "params":[]}' \
    http://localhost:9933 | jq -r '.result' 2>/dev/null)

  # Nothing came back: report it and stop here.
  if [ -z "$NETWORK_STATE" ]; then
    log "❌ Cannot get network state"
    echo ""
    return
  fi

  PEER_COUNT=$(echo "$NETWORK_STATE" | jq -r '.connectedPeers | length' 2>/dev/null || echo "0")
  log "Connected peers: $PEER_COUNT"

  # Show peer info (limited to the first five keys)
  if [ "$PEER_COUNT" -gt 0 ]; then
    local peer_id
    while read -r peer_id; do
      log "Peer: ${peer_id:0:20}..."
    done < <(echo "$NETWORK_STATE" | jq -r '.connectedPeers | keys | .[:5][]' 2>/dev/null)
  fi

  echo ""
}
# Prints the number of the header returned by chain_getHeader (called without
# parameters) as a decimal integer, or prints nothing when the node cannot be
# reached or the reply carries no usable number. Always returns 0 so callers
# under `set -e` can test the output instead of the status.
_current_block_number() {
  local raw

  # `// empty` maps a missing/null number (e.g. a JSON-RPC error reply) to an
  # empty string instead of the literal text "null".
  raw=$(curl -s --max-time 10 -H "Content-Type: application/json" \
    -d '{"id":1, "jsonrpc":"2.0", "method": "chain_getHeader", "params":[]}' \
    http://localhost:9933 | jq -r '.result.number // empty' 2>/dev/null) || return 0

  # Validate before converting: printf '%d' prints "0" AND fails on garbage,
  # which previously combined with `|| echo 0` into the bogus value "00".
  if [[ "$raw" =~ ^0[xX][0-9a-fA-F]+$ ]]; then
    printf '%d\n' "$raw"
  elif [[ "$raw" =~ ^[0-9]+$ ]]; then
    printf '%d\n' "$((10#$raw))"
  fi
}

# Check block production
#
# Samples the current block number twice, a fixed interval apart, and logs
# whether the chain advanced in between (simplified check).
# Globals:   none written (all working variables are local)
# Arguments: $1 - optional sampling interval in whole seconds (default 30;
#                 non-numeric values fall back to 30)
# Outputs:   log lines, then an empty line on stdout
# Returns:   0 (problems are reported through log lines only)
check_block_production() {
  local interval="${1:-30}"
  local block_num new_block_num diff

  case "$interval" in
    '' | *[!0-9]*) interval=30 ;;
  esac

  log "=== Block Production ==="

  # Get current block
  block_num=$(_current_block_number)
  if [ -z "$block_num" ]; then
    log "❌ Cannot get current block"
    echo ""
    return 0
  fi
  log "Current block: $block_num"

  # Check if we're producing blocks (simplified check)
  sleep "$interval"

  new_block_num=$(_current_block_number)
  if [ -z "$new_block_num" ]; then
    # The second sample used to fail silently; say so.
    log "❌ Cannot get current block"
  else
    diff=$((new_block_num - block_num))
    log "Block progression in ${interval}s: $diff blocks"
    if [ "$diff" -gt 0 ]; then
      log "✅ Chain is progressing"
    else
      log "⚠️ Chain may be stalled"
    fi
  fi

  echo ""
}
# Get Prometheus metrics
#
# Downloads the node's metrics page once and logs a few selected values.
# Globals:   PROMETHEUS_PORT (read)
# Arguments: none
# Outputs:   log lines, then an empty line on stdout
# Returns:   0 (an unavailable endpoint is reported through a log line)
check_prometheus_metrics() {
  local metrics block_height ready_txs db_cache_bytes db_cache_mb

  log "=== Prometheus Metrics ==="

  # Single fetch (the page used to be downloaded twice). -f makes an HTTP
  # error status count as "not available" instead of parsing an error page;
  # --max-time bounds the wait on a hung endpoint.
  if metrics=$(curl -sf --max-time 10 "http://localhost:$PROMETHEUS_PORT/metrics"); then
    log "✅ Prometheus metrics available at :$PROMETHEUS_PORT/metrics"

    # Block height: value of the last "substrate_block_height{...}" sample.
    block_height=$(printf '%s\n' "$metrics" \
      | awk 'index($0, "substrate_block_height{") == 1 { v = $2 } END { if (v != "") print v }')
    if [ -n "$block_height" ]; then
      log "Block height (Prometheus): $block_height"
    fi

    # Ready transactions: first matching sample only, so several series
    # cannot produce a multi-line value.
    ready_txs=$(printf '%s\n' "$metrics" \
      | awk 'index($0, "substrate_ready_transactions_number") == 1 && !seen { print $2; seen = 1 }')
    if [ -n "$ready_txs" ]; then
      log "Ready transactions: $ready_txs"
    fi

    # Database cache size: first matching sample, converted to whole MB in
    # awk so that float/exponent notation does not break shell arithmetic.
    db_cache_bytes=$(printf '%s\n' "$metrics" \
      | awk 'index($0, "substrate_database_cache_bytes") == 1 && !seen { print $2; seen = 1 }')
    if [ -n "$db_cache_bytes" ]; then
      db_cache_mb=$(awk -v bytes="$db_cache_bytes" 'BEGIN { printf "%d", bytes / 1024 / 1024 }')
      log "Database cache: ${db_cache_mb}MB"
    fi
  else
    log "❌ Prometheus metrics not available"
  fi

  echo ""
}
# Generate summary report
#
# Logs a report header, runs every individual check in a fixed order and
# closes with an end marker.
# Globals:   VALIDATOR_NAME, CHAIN (read)
# Arguments: none
# Outputs:   everything the individual checks log, framed by header/footer
# Returns:   0 when every check succeeded, 1 when at least one check returned
#            non-zero
#
# The script runs under `set -e`. Calling a check as a bare command meant the
# first failing check (for example check_node_health returning 1) terminated
# the whole script in the middle of the report. Each check is therefore run
# inside a condition, its failure is recorded, and the remaining checks and
# the end marker are still produced. Note that bash does not apply `set -e`
# to commands inside a function invoked from a condition, so the checks run
# to completion on their own error handling.
generate_report() {
  local check failed=0

  log "=== VALIDATOR MONITORING REPORT ==="
  log "Validator: $VALIDATOR_NAME"
  log "Chain: $CHAIN"
  log "Timestamp: $(date)"
  log "Report generated by: $0"
  echo ""

  for check in \
    check_system_resources \
    check_node_health \
    check_validator_status \
    check_session_keys \
    check_network \
    check_block_production \
    check_prometheus_metrics; do
    if ! "$check"; then
      failed=1
      log "⚠️ ${check} reported a failure"
      echo ""
    fi
  done

  log "=== END REPORT ==="
  return "$failed"
}
# Send alert
#
# Publishes one alert on two channels: the monitor log (via log) and syslog
# (via logger, tagged "polkadot-validator-alert").
# Arguments: $1 - severity label
#            $2 - alert text
# Outputs:   whatever log prints for the alert line
# Returns:   exit status of logger
send_alert() {
  local level text
  level="$1"
  text="$2"

  # Channel 1: monitor log / console.
  log "ALERT [${level}]: ${text}"

  # Channel 2: syslog.
  logger -t polkadot-validator-alert "[${level}] ${text}"

  # Extension point: further channels (email, Slack, PagerDuty, etc.) can be
  # hooked in here.
}
# Health check with alerting
#
# Runs the checks that warrant an alert and raises one through send_alert
# for every problem found.
# Globals:   none written (all working variables are local)
# Arguments: $1 - optional path of the session keys file; defaults to the
#                 templated keys_file setting
# Outputs:   log lines and alerts
# Returns:   0 when no CRITICAL condition was found (WARNING alerts do not
#            affect the status), 1 otherwise. Service-down and RPC-down stop
#            the check immediately, as before; the remaining CRITICAL
#            conditions are all evaluated first and now also yield 1, so the
#            exit status agrees with the alerts that were sent.
health_check() {
  local keys_file="{{ polkadot_validator.session_keys.keys_file | default('/var/lib/polkadot/session-keys') }}"
  local response health is_syncing peers session_keys payload has_keys
  local critical=0

  if [ -n "${1:-}" ]; then
    keys_file="$1"
  fi

  log "Running health check..."

  # Check if service is running
  if ! systemctl is-active --quiet polkadot-validator; then
    send_alert "CRITICAL" "Validator service is not running"
    return 1
  fi

  # Check RPC connectivity with the real system_health POST. The previous
  # probe was a bare GET with -f, which fails on any HTTP error status; a
  # JSON-RPC endpoint that only accepts POST would therefore always look
  # "down" (NOTE(review): confirm how the deployed node answers GET).
  response=$(curl -s --max-time 10 -H "Content-Type: application/json" \
    -d '{"id":1, "jsonrpc":"2.0", "method": "system_health", "params":[]}' \
    http://localhost:9933) || response=""
  if [ -z "$response" ]; then
    send_alert "CRITICAL" "Node RPC is not responding"
    return 1
  fi

  # Check sync status. `// empty` maps a missing/null result (JSON-RPC error
  # reply) to an empty string; a jq failure must not abort under `set -e`.
  health=$(printf '%s\n' "$response" | jq -c '.result // empty' 2>/dev/null) || health=""
  if [ -z "$health" ]; then
    # Previously skipped without any alert.
    send_alert "CRITICAL" "Node RPC returned no system_health result"
    critical=1
  else
    is_syncing=$(printf '%s\n' "$health" | jq -r '.isSyncing' 2>/dev/null || echo "true")
    peers=$(printf '%s\n' "$health" | jq -r '.peers // 0' 2>/dev/null || echo "0")

    # A non-numeric value would make the -lt test error out and silently
    # suppress the low-peer alert; treat it as zero peers instead.
    case "$peers" in
      '' | *[!0-9]*) peers=0 ;;
    esac

    if [ "$is_syncing" = "true" ]; then
      send_alert "WARNING" "Node is still syncing"
    fi
    if [ "$peers" -lt 3 ]; then
      send_alert "WARNING" "Low peer count: $peers"
    fi
  fi

  # Check session keys
  if [ -f "$keys_file" ]; then
    # Strip whitespace (newline, CR) and let jq build the request so the file
    # content is always a correctly escaped JSON string.
    # NOTE(review): assumes the file holds one hex string - confirm.
    session_keys=$(tr -d '[:space:]' < "$keys_file") || session_keys=""
    has_keys="false"
    if [ -n "$session_keys" ]; then
      payload=$(jq -cn --arg keys "$session_keys" \
        '{id: 1, jsonrpc: "2.0", method: "author_hasSessionKeys", params: [$keys]}' \
        2>/dev/null) || payload=""
      if [ -n "$payload" ]; then
        has_keys=$(curl -s --max-time 10 -H "Content-Type: application/json" \
          -d "$payload" \
          http://localhost:9933 | jq -r '.result' 2>/dev/null) || has_keys="false"
      fi
    fi
    if [ "$has_keys" != "true" ]; then
      send_alert "CRITICAL" "Session keys are not loaded in the node"
      critical=1
    fi
  else
    send_alert "CRITICAL" "Session keys file not found"
    critical=1
  fi

  log "Health check completed"
  return "$critical"
}
# Main command handling
#
# The first command-line argument selects what to run; without an argument
# the full report is produced. Any unrecognised word prints the usage text
# and terminates with status 1.

# Prints the usage text together with the effective configuration values.
_print_usage() {
  cat <<EOF
Usage: $0 {report|health|system|node|validator|keys|network|blocks|metrics}

Commands:
 report Generate complete monitoring report
 health Run health check with alerting
 system Check system resources
 node Check node health
 validator Check validator status
 keys Check session keys
 network Check network connectivity
 blocks Check block production
 metrics Check Prometheus metrics

Configuration:
 Validator: $VALIDATOR_NAME
 Chain: $CHAIN
 Prometheus: :$PROMETHEUS_PORT
 Log file: $LOG_FILE
EOF
}

case "${1:-report}" in
  report)    generate_report ;;
  health)    health_check ;;
  system)    check_system_resources ;;
  node)      check_node_health ;;
  validator) check_validator_status ;;
  keys)      check_session_keys ;;
  network)   check_network ;;
  blocks)    check_block_production ;;
  metrics)   check_prometheus_metrics ;;
  *)
    _print_usage
    exit 1
    ;;
esac