#!/bin/bash
# Info: Polkadot Validator Monitoring Script
# Author: Provisioning System

# Abort on the first unhandled command failure.
set -e

# Deployment settings. The brace-delimited expressions are Jinja placeholders
# that are substituted when the template is rendered, before the script runs.
# They are marked readonly because nothing in this script reassigns them.
readonly CHAIN="{{ polkadot_validator.network.chain }}"
readonly VALIDATOR_NAME="{{ polkadot_validator.name }}"
readonly PROMETHEUS_PORT="{{ polkadot_validator.monitoring.prometheus_port }}"
readonly LOG_FILE="/var/log/polkadot/validator-monitor.log"

# Logging function
#
# Print a timestamped message to stdout and append the same line to LOG_FILE.
# Globals:   LOG_FILE (read), LOG_WRITE_WARNED (written once)
# Arguments: $1 - message text
# Outputs:   the formatted line on stdout; one warning on stderr the first
#            time the log file cannot be written
# Returns:   always 0, so that a missing or read-only log directory cannot
#            abort the caller under `set -e`
log() {
  local entry
  entry="$(date '+%Y-%m-%d %H:%M:%S') - $1"

  printf '%s\n' "$entry"

  # Appending is best-effort: the console copy above is already out, and a
  # monitoring run should not die because its own log file is unavailable.
  if ! { printf '%s\n' "$entry" >>"$LOG_FILE"; } 2>/dev/null; then
    if [ -z "${LOG_WRITE_WARNED:-}" ]; then
      LOG_WRITE_WARNED=1
      printf 'warning: cannot write to log file %s\n' "$LOG_FILE" >&2
    fi
  fi

  return 0
}

# Check system resources
#
# Log a one-shot snapshot of CPU, memory, disk and load figures.
# Globals:   none written (all working variables are local)
# Arguments: none
# Outputs:   one log line per figure, followed by a blank line on stdout
check_system_resources() {
  # Kept in a variable so the rendered path is passed to df as ONE argument
  # even when it contains spaces.
  local base_path="{{ polkadot_validator.base_path }}"
  local cpu_usage memory_usage disk_usage load_avg

  log "=== System Resources ==="

  # CPU usage: second field of top's "Cpu(s)" summary line, with any
  # trailing "%..." suffix cut off.
  cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
  log "CPU Usage: ${cpu_usage}%"

  # Memory usage: used/total from the second line of `free -m`. The total is
  # checked first so a zero value cannot cause a division error in awk.
  memory_usage=$(free -m | awk 'NR==2 && $2 > 0 {printf "%.1f%%", $3*100/$2}')
  log "Memory Usage: ${memory_usage:-unknown}"

  # Disk usage: -P keeps each filesystem on a single line so the Use% column
  # is always field 5 of line 2, even with long device names.
  disk_usage=$(df -hP -- "$base_path" | awk 'NR==2{print $5}')
  log "Disk Usage: ${disk_usage:-unknown}"

  # Load average: everything after the "load average:" label of uptime.
  load_avg=$(uptime | awk -F'load average:' '{print $2}')
  log "Load Average:$load_avg"

  echo ""
}

# Check node health
#
# Verify that the validator service is active and that the node answers the
# system_health RPC, then log its sync state and peer count.
# Globals:   none written (all working variables are local)
# Arguments: none
# Returns:   1 if the service is not active or the RPC gives no usable
#            result; 0 otherwise (including the "may have sync issues" case)
check_node_health() {
  local health is_syncing peers should_have_peers

  log "=== Node Health ==="

  # Service status
  if systemctl is-active --quiet polkadot-validator; then
    log "✅ Validator service: Running"
  else
    log "❌ Validator service: Not running"
    return 1
  fi

  # RPC health check. --max-time stops a hung node from blocking the script.
  # "// empty" turns a missing/null result (e.g. a JSON-RPC error reply) into
  # empty output, and the trailing fallback keeps a jq parse failure from
  # aborting the script under `set -e`.
  health=$(curl -s --max-time 10 -H "Content-Type: application/json" \
    -d '{"id":1, "jsonrpc":"2.0", "method": "system_health", "params":[]}' \
    http://localhost:9933 | jq -c '.result // empty' 2>/dev/null) || health=""

  if [ -z "$health" ]; then
    log "❌ Cannot reach node RPC"
    return 1
  fi

  is_syncing=$(printf '%s' "$health" | jq -r '.isSyncing' 2>/dev/null) || is_syncing="true"
  peers=$(printf '%s' "$health" | jq -r '.peers' 2>/dev/null) || peers="0"
  should_have_peers=$(printf '%s' "$health" | jq -r '.shouldHavePeers' 2>/dev/null) || should_have_peers="true"

  # A missing field comes back as "null"; normalise anything non-numeric to 0
  # so the integer comparison below cannot fail.
  case "$peers" in
    '' | *[!0-9]*) peers=0 ;;
  esac

  log "Syncing: $is_syncing"
  log "Peers: $peers"
  log "Should have peers: $should_have_peers"

  if [ "$is_syncing" = "false" ] && [ "$peers" -gt 0 ]; then
    log "✅ Node is healthy and synced"
  else
    log "⚠️ Node may have sync issues"
  fi

  echo ""
}

# Check validator status
#
# Log the chain name, node version and node name reported by the local RPC.
# Each value is fetched independently; one that cannot be obtained is logged
# as "unknown" instead of aborting the run or printing a literal "null".
# Globals:   none written (all working variables are local)
# Arguments: none
check_validator_status() {
  local entry label method value

  log "=== Validator Status ==="

  # Each entry is "<log label>=<RPC method>".
  for entry in \
    "Chain=system_chain" \
    "Version=system_version" \
    "Node name=system_name"; do
    label=${entry%%=*}
    method=${entry#*=}

    # --max-time bounds a hung node; the fallback keeps a jq failure on a
    # non-JSON reply from tripping `set -e`.
    value=$(curl -s --max-time 10 -H "Content-Type: application/json" \
      -d "{\"id\":1, \"jsonrpc\":\"2.0\", \"method\": \"$method\", \"params\":[]}" \
      http://localhost:9933 | jq -r '.result // empty' 2>/dev/null) || value=""

    log "$label: ${value:-unknown}"
  done

  # Check if validator is in active set (requires additional tooling)
  log "Note: Use Polkadot.js Apps or polkadot-js-api to check validator active status"

  echo ""
}

# Check session keys
#
# Report whether the session-keys file exists, whether the node says it holds
# those keys (author_hasSessionKeys RPC), and how old the file is.
# Globals:   none written (all working variables are local)
# Arguments: none
# Returns:   0 in all handled cases; problems are reported through the log
check_session_keys() {
  local keys_file="{{ polkadot_validator.session_keys.keys_file | default('/var/lib/polkadot/session-keys') }}"
  local session_keys payload has_keys now mtime age_hours

  log "=== Session Keys ==="

  if [ -f "$keys_file" ]; then
    # Strip all whitespace (trailing newline, CR from Windows-edited files)
    # so it does not end up inside the RPC parameter.
    session_keys=$(tr -d '[:space:]' <"$keys_file") || session_keys=""
    log "Session keys file exists"
    log "Keys: ${session_keys:0:20}..."

    # Check if keys are loaded in node. The request body is built by jq so
    # the key text is JSON-escaped instead of being spliced into a string.
    payload=$(jq -nc --arg keys "$session_keys" \
      '{id: 1, jsonrpc: "2.0", method: "author_hasSessionKeys", params: [$keys]}' \
      2>/dev/null) || payload=""
    has_keys=$(curl -s --max-time 10 -H "Content-Type: application/json" \
      -d "$payload" \
      http://localhost:9933 | jq -r '.result' 2>/dev/null) || has_keys="false"

    if [ "$has_keys" = "true" ]; then
      log "✅ Session keys are loaded in the node"
    else
      log "❌ Session keys are NOT loaded in the node"
    fi

    # Check key age, based on the file's modification time. If the time
    # cannot be read the age is reported as unknown rather than computed
    # from 0, which would show an age measured from the Unix epoch.
    now=$(date +%s)
    if mtime=$(stat -c %Y "$keys_file" 2>/dev/null); then
      age_hours=$(((now - mtime) / 3600))
      log "Session keys age: $((age_hours / 24)) days, $((age_hours % 24)) hours"
    else
      log "Session keys age: unknown"
    fi
  else
    log "❌ Session keys file not found"
  fi

  echo ""
}

# Check network connectivity
#
# Query system_networkState and log the number of connected peers plus a
# shortened identifier for up to five of them.
# Globals:   none written (all working variables are local)
# Arguments: none
# Returns:   0 in all handled cases; an unavailable network state is logged
check_network() {
  local network_state peer_count peer

  log "=== Network Connectivity ==="

  # Get network state. "// empty" makes a reply without a result (for example
  # a JSON-RPC error) count as "no state" instead of being read as 0 peers.
  network_state=$(curl -s --max-time 10 -H "Content-Type: application/json" \
    -d '{"id":1, "jsonrpc":"2.0", "method": "system_networkState", "params":[]}' \
    http://localhost:9933 | jq -c '.result // empty' 2>/dev/null) || network_state=""

  if [ -n "$network_state" ]; then
    peer_count=$(printf '%s' "$network_state" \
      | jq -r '.connectedPeers | length' 2>/dev/null) || peer_count="0"
    # Guard the integer comparison below against empty/non-numeric output.
    case "$peer_count" in
      '' | *[!0-9]*) peer_count=0 ;;
    esac
    log "Connected peers: $peer_count"

    # Show peer info (limited): first five keys, truncated to 20 characters.
    if [ "$peer_count" -gt 0 ]; then
      while IFS= read -r peer; do
        log "Peer: ${peer:0:20}..."
      done < <(printf '%s' "$network_state" \
        | jq -r '.connectedPeers | keys | .[:5][]' 2>/dev/null)
    fi
  else
    log "❌ Cannot get network state"
  fi

  echo ""
}

# Check block production

# Helper: print the block number from chain_getHeader in decimal.
# Returns 1 (printing nothing) when the RPC gives no usable number, so
# callers never have to interpret placeholder values such as "null".
_current_block_number() {
  local raw
  local number_re='^(0x[0-9a-fA-F]+|0|[1-9][0-9]*)$'

  raw=$(curl -s --max-time 10 -H "Content-Type: application/json" \
    -d '{"id":1, "jsonrpc":"2.0", "method": "chain_getHeader", "params":[]}' \
    http://localhost:9933 | jq -r '.result.number // empty' 2>/dev/null) || return 1

  # Accept only 0x-prefixed hex or plain decimal; printf %d converts both.
  [[ "$raw" =~ $number_re ]] || return 1
  printf '%d\n' "$raw"
}

# Sample the head block number twice and log how far it advanced
# (a simplified liveness check, not proof that this validator authors blocks).
# Globals:   none written (all working variables are local)
# Arguments: $1 - seconds to wait between the two samples (optional,
#                 default 30)
# Returns:   0 in all handled cases; failures to read a block are logged
check_block_production() {
  local interval="${1:-30}"
  local first_block second_block diff

  log "=== Block Production ==="

  # Get current block
  if first_block=$(_current_block_number); then
    log "Current block: $first_block"

    # Check if we're producing blocks (simplified check)
    sleep "$interval"

    if second_block=$(_current_block_number); then
      diff=$((second_block - first_block))
      log "Block progression in ${interval}s: $diff blocks"

      if [ "$diff" -gt 0 ]; then
        log "✅ Chain is progressing"
      else
        log "⚠️ Chain may be stalled"
      fi
    else
      # Losing the RPC between samples is reported as such, not as a stall.
      log "❌ Cannot get current block"
    fi
  else
    log "❌ Cannot get current block"
  fi

  echo ""
}

# Get Prometheus metrics
#
# Fetch the node's metrics endpoint once and log a few key values.
# Globals:   PROMETHEUS_PORT (read)
# Arguments: none
# Returns:   0 in all handled cases; an unreachable endpoint is logged
check_prometheus_metrics() {
  local metrics block_height ready_txs db_cache_mb

  log "=== Prometheus Metrics ==="

  # One request serves both the availability check and the parsing below.
  # -f makes HTTP error statuses count as "not available"; --max-time bounds
  # a hung endpoint.
  if metrics=$(curl -sf --max-time 10 "http://localhost:$PROMETHEUS_PORT/metrics"); then
    log "✅ Prometheus metrics available at :$PROMETHEUS_PORT/metrics"

    # Block height: the metric is exposed once per status label, so prefer
    # the status="best" series and fall back to the last series seen.
    block_height=$(printf '%s\n' "$metrics" | awk '
      /^substrate_block_height[{]/ { last = $2; if ($0 ~ /status="best"/) best = $2 }
      END { if (best != "") print best; else if (last != "") print last }')
    if [ -n "$block_height" ]; then
      log "Block height (Prometheus): $block_height"
    fi

    # Ready transactions (first matching series only, so the value stays on
    # one line).
    ready_txs=$(printf '%s\n' "$metrics" \
      | awk '/^substrate_ready_transactions_number/ {print $2; exit}')
    if [ -n "$ready_txs" ]; then
      log "Ready transactions: $ready_txs"
    fi

    # Database cache size. The conversion is done in awk because the metric
    # value may be written as a float or in exponent notation, which shell
    # integer arithmetic cannot parse.
    db_cache_mb=$(printf '%s\n' "$metrics" \
      | awk '/^substrate_database_cache_bytes/ {printf "%d", $2 / 1048576; exit}')
    if [ -n "$db_cache_mb" ]; then
      log "Database cache: ${db_cache_mb}MB"
    fi
  else
    log "❌ Prometheus metrics not available"
  fi

  echo ""
}

# Generate summary report
#
# Log a report header, run every check in a fixed order, and log a footer.
# Globals:   VALIDATOR_NAME, CHAIN (read)
# Arguments: none
# Returns:   1 if any check returned non-zero, 0 otherwise
generate_report() {
  local check failed=0

  log "=== VALIDATOR MONITORING REPORT ==="
  log "Validator: $VALIDATOR_NAME"
  log "Chain: $CHAIN"
  log "Timestamp: $(date)"
  log "Report generated by: $0"
  echo ""

  # Each check runs on the left of `||` so that a failing one (for example
  # check_node_health when the service is down) is recorded instead of
  # letting `set -e` end the script before the remaining checks and the
  # footer. Note that `set -e` is also inactive inside a check called this
  # way, so the checks must report their own failures via return status.
  for check in \
    check_system_resources \
    check_node_health \
    check_validator_status \
    check_session_keys \
    check_network \
    check_block_production \
    check_prometheus_metrics; do
    "$check" || failed=1
  done

  log "=== END REPORT ==="

  return "$failed"
}

# Send alert
#
# Record an alert in the monitor log and forward it to syslog.
# Arguments: $1 - severity label (the callers in this script pass "CRITICAL"
#                 or "WARNING")
#            $2 - alert message
# Outputs:   one log line; a warning on stderr if syslog forwarding fails
# Returns:   always 0. Forwarding is best-effort so that a missing or failing
#            `logger` cannot abort the calling health check under `set -e`.
send_alert() {
  local severity="$1"
  local message="$2"

  log "ALERT [$severity]: $message"

  # Send to syslog
  if ! command -v logger >/dev/null 2>&1 \
    || ! logger -t polkadot-validator-alert "[$severity] $message"; then
    printf 'warning: could not forward alert to syslog\n' >&2
  fi

  # Additional alerting can be added here
  # Examples: email, Slack, PagerDuty, etc.

  return 0
}

# Health check with alerting
#
# Run the alerting checks: service state, RPC reachability, sync state, peer
# count and session keys. Problems are reported through send_alert.
# Globals:   none written (all working variables are local)
# Arguments: none
# Returns:   1 if the service is not active or the RPC gives no usable
#            answer (later checks are skipped); 0 otherwise, even when
#            WARNING or session-key alerts were raised
health_check() {
  local keys_file="{{ polkadot_validator.session_keys.keys_file | default('/var/lib/polkadot/session-keys') }}"
  local health is_syncing peers session_keys payload has_keys

  log "Running health check..."

  # Every send_alert call is followed by `|| true` so that a failure inside
  # the alert path cannot cut the health check short under `set -e`.

  # Check if service is running
  if ! systemctl is-active --quiet polkadot-validator; then
    send_alert "CRITICAL" "Validator service is not running" || true
    return 1
  fi

  # Check RPC connectivity using the same POST request that is needed for
  # the sync check. A bare GET with `curl -f` is not a usable probe here:
  # JSON-RPC HTTP endpoints typically reject GET with an error status, which
  # would raise this alert even on a healthy node.
  health=$(curl -s --max-time 10 -H "Content-Type: application/json" \
    -d '{"id":1, "jsonrpc":"2.0", "method": "system_health", "params":[]}' \
    http://localhost:9933 | jq -c '.result // empty' 2>/dev/null) || health=""

  if [ -z "$health" ]; then
    send_alert "CRITICAL" "Node RPC is not responding" || true
    return 1
  fi

  # Check sync status
  is_syncing=$(printf '%s' "$health" | jq -r '.isSyncing' 2>/dev/null) || is_syncing="true"
  peers=$(printf '%s' "$health" | jq -r '.peers' 2>/dev/null) || peers="0"
  # Treat a missing/non-numeric peer count as 0 so the comparison below is
  # always valid and the low-peer alert fires.
  case "$peers" in
    '' | *[!0-9]*) peers=0 ;;
  esac

  if [ "$is_syncing" = "true" ]; then
    send_alert "WARNING" "Node is still syncing" || true
  fi

  if [ "$peers" -lt 3 ]; then
    send_alert "WARNING" "Low peer count: $peers" || true
  fi

  # Check session keys
  if [ -f "$keys_file" ]; then
    # Whitespace is stripped and the request body is built by jq so the key
    # text is JSON-escaped rather than spliced into a string.
    session_keys=$(tr -d '[:space:]' <"$keys_file") || session_keys=""
    payload=$(jq -nc --arg keys "$session_keys" \
      '{id: 1, jsonrpc: "2.0", method: "author_hasSessionKeys", params: [$keys]}' \
      2>/dev/null) || payload=""
    has_keys=$(curl -s --max-time 10 -H "Content-Type: application/json" \
      -d "$payload" \
      http://localhost:9933 | jq -r '.result' 2>/dev/null) || has_keys="false"

    if [ "$has_keys" != "true" ]; then
      send_alert "CRITICAL" "Session keys are not loaded in the node" || true
    fi
  else
    send_alert "CRITICAL" "Session keys file not found" || true
  fi

  log "Health check completed"
}

# Main command handling

# Print the command summary and the active configuration to stdout.
# Globals: VALIDATOR_NAME, CHAIN, PROMETHEUS_PORT, LOG_FILE (read)
show_usage() {
  echo "Usage: $0 {report|health|system|node|validator|keys|network|blocks|metrics}"
  echo ""
  echo "Commands:"
  echo " report Generate complete monitoring report"
  echo " health Run health check with alerting"
  echo " system Check system resources"
  echo " node Check node health"
  echo " validator Check validator status"
  echo " keys Check session keys"
  echo " network Check network connectivity"
  echo " blocks Check block production"
  echo " metrics Check Prometheus metrics"
  echo ""
  echo "Configuration:"
  echo " Validator: $VALIDATOR_NAME"
  echo " Chain: $CHAIN"
  echo " Prometheus: :$PROMETHEUS_PORT"
  echo " Log file: $LOG_FILE"
}

# Dispatch on the first argument; with no argument the full report is run.
# An explicit help request prints the usage and succeeds. Any other unknown
# command prints the usage to stderr and exits with status 1.
case "${1:-report}" in
  report)
    generate_report
    ;;
  health)
    health_check
    ;;
  system)
    check_system_resources
    ;;
  node)
    check_node_health
    ;;
  validator)
    check_validator_status
    ;;
  keys)
    check_session_keys
    ;;
  network)
    check_network
    ;;
  blocks)
    check_block_production
    ;;
  metrics)
    check_prometheus_metrics
    ;;
  help | -h | --help)
    show_usage
    ;;
  *)
    show_usage >&2
    exit 1
    ;;
esac