#!/bin/bash
# Info: Polkadot Validator Monitoring Script
# Author: Provisioning System
#
# Usage: $0 {report|health|system|node|validator|keys|network|blocks|metrics}
#
# This file is a template: the double-brace placeholders below are filled in
# by the provisioning system before the script is installed.
#
# Optional environment:
#   BLOCK_CHECK_INTERVAL  seconds to wait between the two block-height samples
#                         taken by the "blocks" check (default: 30)
#
# Requires: curl, jq, systemctl, logger.
#
# NOTE: pipefail is deliberately not enabled; several pipelines rely on the
# last stage (awk) succeeding even when an earlier stage finds nothing.
set -eu

readonly CHAIN="{{ polkadot_validator.network.chain }}"
readonly VALIDATOR_NAME="{{ polkadot_validator.name }}"
readonly PROMETHEUS_PORT="{{ polkadot_validator.monitoring.prometheus_port }}"
readonly BASE_PATH="{{ polkadot_validator.base_path }}"
readonly SESSION_KEYS_FILE="{{ polkadot_validator.session_keys.keys_file | default('/var/lib/polkadot/session-keys') }}"
readonly LOG_FILE="/var/log/polkadot/validator-monitor.log"
readonly SERVICE_NAME="polkadot-validator"
readonly RPC_URL="http://localhost:9933"
# Upper bound (seconds) for any single HTTP request so a hung node cannot
# hang the monitor.
readonly HTTP_TIMEOUT=10

# Set to 1 after the first failed write to LOG_FILE so the warning is printed
# only once per run.
LOG_FILE_WARNED=0

#######################################
# Print a timestamped message to stdout and append it to LOG_FILE.
# Writing to the log file is best effort: a missing or unwritable file must
# not abort a monitoring run.
# Globals:   LOG_FILE (read), LOG_FILE_WARNED (written)
# Arguments: $1 - message
# Outputs:   the timestamped line on stdout
# Returns:   0
#######################################
log() {
  local line
  line="$(date '+%Y-%m-%d %H:%M:%S') - $1"
  printf '%s\n' "$line"
  # 2>/dev/null comes first so the shell's own "cannot open" message for a
  # failed redirection is suppressed as well.
  if ! printf '%s\n' "$line" 2>/dev/null >>"$LOG_FILE"; then
    if [[ "$LOG_FILE_WARNED" -eq 0 ]]; then
      printf 'warning: cannot write to %s\n' "$LOG_FILE" >&2
      LOG_FILE_WARNED=1
    fi
  fi
  return 0
}

#######################################
# Test whether a string is a non-negative decimal integer.
# Arguments: $1 - candidate string
# Returns:   0 if it consists only of digits, 1 otherwise
#######################################
is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

#######################################
# Convert a 0x-prefixed hex string or a decimal string to decimal.
# Arguments: $1 - number as text
# Outputs:   decimal value on stdout
# Returns:   1 if $1 is neither hex nor decimal (nothing is printed)
#######################################
to_decimal() {
  if [[ "$1" =~ ^0[xX][0-9a-fA-F]+$ ]]; then
    printf '%d\n' "$1"
  elif [[ "$1" =~ ^[0-9]+$ ]]; then
    # 10# forces base 10 so a leading zero is not read as octal.
    printf '%d\n' "$((10#$1))"
  else
    return 1
  fi
}

#######################################
# Perform a JSON-RPC 2.0 call against the local node.
# Globals:   RPC_URL, HTTP_TIMEOUT (read)
# Arguments: $1 - method name
#            $2 - params as a JSON array (default: [])
# Outputs:   the "result" member on stdout (raw string for string results)
# Returns:   1 if the node is unreachable, the reply is not valid JSON, or
#            the reply carries no non-null "result" (e.g. an RPC error)
#######################################
rpc_call() {
  local method="$1"
  local params="${2:-[]}"
  local payload response result

  payload=$(jq -cn --arg method "$method" --argjson params "$params" \
    '{id: 1, jsonrpc: "2.0", method: $method, params: $params}') || return 1
  response=$(curl -s --max-time "$HTTP_TIMEOUT" \
    -H "Content-Type: application/json" \
    -d "$payload" "$RPC_URL") || return 1
  # Not `.result // empty`: that would also drop a legitimate `false`.
  result=$(jq -r 'if type == "object" and has("result") and .result != null
                  then .result else empty end' <<<"$response" 2>/dev/null) \
    || return 1
  [[ -n "$result" ]] || return 1
  printf '%s\n' "$result"
}

#######################################
# Ask the node whether it holds the given session keys.
# The JSON params are built with jq so the key material is always escaped
# correctly instead of being pasted into a JSON string.
# Arguments: $1 - session keys string
# Returns:   0 if author_hasSessionKeys answers true, 1 otherwise
#######################################
session_keys_loaded() {
  local params answer
  params=$(jq -cn --arg keys "$1" '[$keys]') || return 1
  answer=$(rpc_call author_hasSessionKeys "$params") || return 1
  [[ "$answer" == "true" ]]
}

#######################################
# Fetch the current best block number from the node.
# Outputs:   block number in decimal on stdout
# Returns:   1 if the header cannot be fetched or its number is not numeric
#######################################
current_block_number() {
  local header number
  header=$(rpc_call chain_getHeader) || return 1
  number=$(jq -r '.number' <<<"$header" 2>/dev/null) || return 1
  to_decimal "$number"
}

# Check system resources: CPU, memory, disk usage of BASE_PATH, load average.
check_system_resources() {
  local cpu_usage memory_info disk_usage load_avg

  log "=== System Resources ==="

  # CPU usage: second field of top's "Cpu(s)" line, without any "%..." suffix.
  cpu_usage=$(top -bn1 | awk '/Cpu\(s\)/ { split($2, part, "%"); print part[1]; exit }')
  log "CPU Usage: ${cpu_usage}%"

  # Memory usage: used/total from the second line of `free -m`.
  memory_info=$(free -m | awk 'NR==2{printf "%.1f%%", $3*100/$2}')
  log "Memory Usage: $memory_info"

  # Disk usage; -P keeps each filesystem on a single line so NR==2 is reliable.
  disk_usage=$(df -hP "$BASE_PATH" 2>/dev/null | awk 'NR==2{print $5}')
  log "Disk Usage: $disk_usage"

  # Load average (the value keeps uptime's leading space).
  load_avg=$(uptime | awk -F'load average:' '{print $2}')
  log "Load Average:$load_avg"

  echo ""
}

# Check node health: systemd service state plus the system_health RPC.
# Returns 1 if the service is not running or the RPC cannot be reached.
check_node_health() {
  local health is_syncing peers should_have_peers

  log "=== Node Health ==="

  # Service status
  if systemctl is-active --quiet "$SERVICE_NAME"; then
    log "✅ Validator service: Running"
  else
    log "❌ Validator service: Not running"
    return 1
  fi

  # RPC health check
  if ! health=$(rpc_call system_health); then
    log "❌ Cannot reach node RPC"
    return 1
  fi

  is_syncing=$(jq -r '.isSyncing' <<<"$health" 2>/dev/null) || is_syncing="true"
  peers=$(jq -r '.peers' <<<"$health" 2>/dev/null) || peers="0"
  should_have_peers=$(jq -r '.shouldHavePeers' <<<"$health" 2>/dev/null) \
    || should_have_peers="true"
  # A missing field comes back as "null"; treat it as zero peers.
  is_uint "$peers" || peers=0

  log "Syncing: $is_syncing"
  log "Peers: $peers"
  log "Should have peers: $should_have_peers"

  if [[ "$is_syncing" == "false" ]] && ((peers > 0)); then
    log "✅ Node is healthy and synced"
  else
    log "⚠️ Node may have sync issues"
  fi

  echo ""
}

# Check validator status: chain, node version and node name as reported by
# the node. A value that cannot be fetched is logged as "unknown".
check_validator_status() {
  local chain_info version node_name

  log "=== Validator Status ==="

  # Get chain info
  chain_info=$(rpc_call system_chain) || chain_info="unknown"
  log "Chain: $chain_info"

  # Get node version
  version=$(rpc_call system_version) || version="unknown"
  log "Version: $version"

  # Get node name
  node_name=$(rpc_call system_name) || node_name="unknown"
  log "Node name: $node_name"

  # Check if validator is in active set (requires additional tooling)
  log "Note: Use Polkadot.js Apps or polkadot-js-api to check validator active status"

  echo ""
}

# Check session keys: file presence, whether the node holds them, file age.
check_session_keys() {
  local session_keys current_time file_time hours_old days_old

  log "=== Session Keys ==="

  if [[ ! -f "$SESSION_KEYS_FILE" ]]; then
    log "❌ Session keys file not found"
    echo ""
    return 0
  fi

  if ! session_keys=$(cat -- "$SESSION_KEYS_FILE" 2>/dev/null); then
    log "❌ Session keys file is not readable"
    echo ""
    return 0
  fi

  log "Session keys file exists"
  log "Keys: ${session_keys:0:20}..."

  # Check if keys are loaded in node
  if session_keys_loaded "$session_keys"; then
    log "✅ Session keys are loaded in the node"
  else
    log "❌ Session keys are NOT loaded in the node"
  fi

  # Check key age (based on the file's modification time)
  current_time=$(date +%s)
  file_time=$(stat -c %Y "$SESSION_KEYS_FILE" 2>/dev/null) || file_time=0
  hours_old=$(((current_time - file_time) / 3600))
  days_old=$((hours_old / 24))
  log "Session keys age: $days_old days, $((hours_old % 24)) hours"

  echo ""
}

# Check network connectivity: connected peer count and up to five peer ids.
check_network() {
  local network_state peer_count peer

  log "=== Network Connectivity ==="

  # Get network state
  if network_state=$(rpc_call system_networkState); then
    peer_count=$(jq -r '.connectedPeers | length' <<<"$network_state" 2>/dev/null) \
      || peer_count=0
    is_uint "$peer_count" || peer_count=0
    log "Connected peers: $peer_count"

    # Show peer info (limited)
    if ((peer_count > 0)); then
      while IFS= read -r peer; do
        log "Peer: ${peer:0:20}..."
      done < <(jq -r '.connectedPeers | keys | .[:5][]' <<<"$network_state" 2>/dev/null)
    fi
  else
    log "❌ Cannot get network state"
  fi

  echo ""
}

# Check block production by sampling the best block twice,
# BLOCK_CHECK_INTERVAL seconds apart (default 30).
check_block_production() {
  local interval="${BLOCK_CHECK_INTERVAL:-30}"
  local block_num new_block_num diff

  is_uint "$interval" || interval=30

  log "=== Block Production ==="

  # Get current block
  if ! block_num=$(current_block_number); then
    log "❌ Cannot get current block"
    echo ""
    return 0
  fi
  log "Current block: $block_num"

  # Check if we're producing blocks (simplified check)
  sleep "$interval"

  if new_block_num=$(current_block_number); then
    diff=$((new_block_num - block_num))
    log "Block progression in ${interval}s: $diff blocks"

    if ((diff > 0)); then
      log "✅ Chain is progressing"
    else
      log "⚠️ Chain may be stalled"
    fi
  else
    log "❌ Cannot get current block"
  fi

  echo ""
}

# Get Prometheus metrics: availability plus a few key values.
check_prometheus_metrics() {
  local metrics block_height ready_txs db_cache db_cache_mb
  local number_re='^[0-9]+([.][0-9]+)?([eE][+-]?[0-9]+)?$'

  log "=== Prometheus Metrics ==="

  # One fetch serves both the availability check and the parsing below.
  # -f makes an HTTP error status count as "not available".
  if metrics=$(curl -sf --max-time "$HTTP_TIMEOUT" \
    "http://localhost:$PROMETHEUS_PORT/metrics"); then
    log "✅ Prometheus metrics available at :$PROMETHEUS_PORT/metrics"

    # Block height: prefer the status="best" series; if none is exposed fall
    # back to the last substrate_block_height series.
    block_height=$(grep '^substrate_block_height{' <<<"$metrics" \
      | grep 'status="best"' | awk 'NR==1 {print $2}')
    if [[ -z "$block_height" ]]; then
      block_height=$(grep '^substrate_block_height{' <<<"$metrics" \
        | tail -1 | awk '{print $2}')
    fi
    if [[ -n "$block_height" ]]; then
      log "Block height (Prometheus): $block_height"
    fi

    # Ready transactions (first matching series only)
    ready_txs=$(grep '^substrate_ready_transactions_number' <<<"$metrics" \
      | awk 'NR==1 {print $2}')
    if [[ -n "$ready_txs" ]]; then
      log "Ready transactions: $ready_txs"
    fi

    # Database cache size. The division is done in awk because the exported
    # value may be a float or in exponent notation, which $(( )) rejects.
    db_cache=$(grep '^substrate_database_cache_bytes' <<<"$metrics" \
      | awk 'NR==1 {print $2}')
    if [[ "$db_cache" =~ $number_re ]]; then
      db_cache_mb=$(awk -v bytes="$db_cache" 'BEGIN { printf "%d", bytes / 1048576 }')
      log "Database cache: ${db_cache_mb}MB"
    fi
  else
    log "❌ Prometheus metrics not available"
  fi

  echo ""
}

# Generate summary report.
# Runs every check even if an earlier one fails, so one failure (for example
# a stopped service) does not hide the rest of the report.
# Returns 1 if any check reported a failure, 0 otherwise.
generate_report() {
  local check failed=0

  log "=== VALIDATOR MONITORING REPORT ==="
  log "Validator: $VALIDATOR_NAME"
  log "Chain: $CHAIN"
  log "Timestamp: $(date)"
  log "Report generated by: $0"
  echo ""

  for check in \
    check_system_resources \
    check_node_health \
    check_validator_status \
    check_session_keys \
    check_network \
    check_block_production \
    check_prometheus_metrics; do
    if ! "$check"; then
      failed=1
      log "⚠️ $check reported a failure; continuing with remaining checks"
      echo ""
    fi
  done

  log "=== END REPORT ==="
  return "$failed"
}

#######################################
# Send alert: log it and forward it to syslog.
# Arguments: $1 - severity (e.g. WARNING, CRITICAL)
#            $2 - message
#######################################
send_alert() {
  local severity="$1"
  local message="$2"

  log "ALERT [$severity]: $message"

  # Send to syslog; a syslog failure must not abort the remaining checks.
  logger -t polkadot-validator-alert "[$severity] $message" \
    || log "WARNING: could not forward alert to syslog"

  # Additional alerting can be added here
  # Examples: email, Slack, PagerDuty, etc.
}

# Health check with alerting.
# Returns 1 (after a CRITICAL alert) when the service is down or the RPC does
# not answer; the remaining findings raise alerts but do not change the exit
# status.
health_check() {
  local health is_syncing peers session_keys

  log "Running health check..."

  # Check if service is running
  if ! systemctl is-active --quiet "$SERVICE_NAME"; then
    send_alert "CRITICAL" "Validator service is not running"
    return 1
  fi

  # Check RPC connectivity with a real JSON-RPC POST. A bare GET with
  # `curl -f` fails against RPC servers that only accept POST, which would
  # raise a false CRITICAL alert on a healthy node.
  if ! health=$(rpc_call system_health); then
    send_alert "CRITICAL" "Node RPC is not responding"
    return 1
  fi

  # Check sync status
  is_syncing=$(jq -r '.isSyncing' <<<"$health" 2>/dev/null) || is_syncing="true"
  peers=$(jq -r '.peers' <<<"$health" 2>/dev/null) || peers="0"
  # A missing or non-numeric peer count must still trigger the low-peer alert.
  is_uint "$peers" || peers=0

  if [[ "$is_syncing" == "true" ]]; then
    send_alert "WARNING" "Node is still syncing"
  fi

  if ((peers < 3)); then
    send_alert "WARNING" "Low peer count: $peers"
  fi

  # Check session keys
  if [[ ! -f "$SESSION_KEYS_FILE" ]]; then
    send_alert "CRITICAL" "Session keys file not found"
  elif ! session_keys=$(cat -- "$SESSION_KEYS_FILE" 2>/dev/null); then
    send_alert "CRITICAL" "Session keys file is not readable"
  elif ! session_keys_loaded "$session_keys"; then
    send_alert "CRITICAL" "Session keys are not loaded in the node"
  fi

  log "Health check completed"
}

# Main command handling
main() {
  case "${1:-report}" in
    "report")
      generate_report
      ;;
    "health")
      health_check
      ;;
    "system")
      check_system_resources
      ;;
    "node")
      check_node_health
      ;;
    "validator")
      check_validator_status
      ;;
    "keys")
      check_session_keys
      ;;
    "network")
      check_network
      ;;
    "blocks")
      check_block_production
      ;;
    "metrics")
      check_prometheus_metrics
      ;;
    *)
      echo "Usage: $0 {report|health|system|node|validator|keys|network|blocks|metrics}"
      echo ""
      echo "Commands:"
      echo "  report     Generate complete monitoring report"
      echo "  health     Run health check with alerting"
      echo "  system     Check system resources"
      echo "  node       Check node health"
      echo "  validator  Check validator status"
      echo "  keys       Check session keys"
      echo "  network    Check network connectivity"
      echo "  blocks     Check block production"
      echo "  metrics    Check Prometheus metrics"
      echo ""
      echo "Configuration:"
      echo "  Validator: $VALIDATOR_NAME"
      echo "  Chain: $CHAIN"
      echo "  Prometheus: :$PROMETHEUS_PORT"
      echo "  Log file: $LOG_FILE"
      exit 1
      ;;
  esac
}

main "$@"