# Telemetry and Monitoring Integration for Nushell Infrastructure
# Secure telemetry collection and forwarding capabilities

# Send telemetry data to configured endpoints
export def send-telemetry [
    data: record
    --endpoint(-e): string          # Override default endpoint
    --format(-f): string = "json"   # json, prometheus, influx
    --timeout(-t): int = 30         # Request timeout in seconds
    --retry(-r): int = 3            # Number of retries
]: nothing -> record {
    let telemetry_endpoint = ($endpoint | default ($env.NUSHELL_TELEMETRY_ENDPOINT? | default ""))

    if ($telemetry_endpoint | is-empty) {
        return {
            success: false
            error: "No telemetry endpoint configured"
            data_sent: false
        }
    }

    # Prepare data based on format
    let formatted_data = match $format {
        "prometheus" => (convert-to-prometheus $data)   # Prometheus exposition format
        "influx" => (convert-to-influx $data)           # InfluxDB line protocol
        _ => $data                                      # Default JSON format (record is serialized with the payload)
    }

    # Add metadata
    let telemetry_payload = {
        timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
        hostname: ($env.HOSTNAME? | default "unknown")
        agent: "nushell-provisioning"
        version: "1.0.0"
        format: $format
        data: $formatted_data
    }

    # Send data with retries (quadratic backoff between attempts)
    for attempt in 1..$retry {
        try {
            # --max-time expects a duration in recent Nushell releases
            let response = (http post $telemetry_endpoint ($telemetry_payload | to json) --max-time ($timeout * 1sec) --headers {"Content-Type": "application/json"})

            return {
                success: true
                endpoint: $telemetry_endpoint
                response_status: 200
                attempt: $attempt
                data_sent: true
                timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
            }
        } catch { |err|
            if $attempt == $retry {
                return {
                    success: false
                    error: ($err | get msg)
                    endpoint: $telemetry_endpoint
                    attempts: $attempt
                    data_sent: false
                }
            }
            # Wait before retry (quadratic backoff)
            let wait_time = ($attempt * $attempt * 2)
            sleep ($wait_time * 1sec)
        }
    }

    return {
        success: false
        error: "Max retries exceeded"
        attempts: $retry
        data_sent: false
    }
}

# Convert metrics to Prometheus exposition format
def convert-to-prometheus [data: record]: nothing -> string {
    mut prometheus_output = ""

    # Process system metrics if available
    if ($data | get -i system | is-not-empty) {
        let sys = ($data | get system)

        # CPU metrics
        if ($sys | get -i cpu | is-not-empty) {
            let cpu = ($sys | get cpu)
            $prometheus_output = $prometheus_output + $"# HELP system_load_1m System load average over 1 minute\n"
            $prometheus_output = $prometheus_output + $"# TYPE system_load_1m gauge\n"
            $prometheus_output = $prometheus_output + $"system_load_1m{hostname=\"($env.HOSTNAME? | default 'unknown')\"} ($cpu.load_1m? | default 0)\n"
            $prometheus_output = $prometheus_output + $"# HELP system_load_5m System load average over 5 minutes\n"
            $prometheus_output = $prometheus_output + $"# TYPE system_load_5m gauge\n"
            $prometheus_output = $prometheus_output + $"system_load_5m{hostname=\"($env.HOSTNAME? | default 'unknown')\"} ($cpu.load_5m? | default 0)\n"
        }

        # Memory metrics
        if ($sys | get -i memory | is-not-empty) {
            let mem = ($sys | get memory)
            $prometheus_output = $prometheus_output + $"# HELP system_memory_usage_percent Memory usage percentage\n"
            $prometheus_output = $prometheus_output + $"# TYPE system_memory_usage_percent gauge\n"
            $prometheus_output = $prometheus_output + $"system_memory_usage_percent{hostname=\"($env.HOSTNAME? | default 'unknown')\"} ($mem.usage_percent? | default 0)\n"
            $prometheus_output = $prometheus_output + $"# HELP system_memory_total_bytes Total memory in bytes\n"
            $prometheus_output = $prometheus_output + $"# TYPE system_memory_total_bytes gauge\n"
            $prometheus_output = $prometheus_output + $"system_memory_total_bytes{hostname=\"($env.HOSTNAME? | default 'unknown')\"} (($mem.total_kb? | default 0) * 1024)\n"
        }
    }

    return $prometheus_output
}
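
# Usage sketch (illustrative, kept as comments so importing this module has no side effects):
# the nested record below mirrors the fields the converters read; the hostname and the
# numbers are made up, and the endpoint must be configured or passed explicitly.
#
#   $env.NUSHELL_TELEMETRY_ENDPOINT = "https://telemetry.example.com/ingest"
#   let sample = {system: {cpu: {load_1m: 0.42, load_5m: 0.37}, memory: {usage_percent: 61.5, total_kb: 16384000}}}
#   send-telemetry $sample --format prometheus --timeout 10 --retry 2
#
# With --format prometheus the payload's data field carries exposition text such as:
#
#   system_load_1m{hostname="web-01"} 0.42
#   system_memory_usage_percent{hostname="web-01"} 61.5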
# Convert metrics to InfluxDB line protocol
def convert-to-influx [data: record]: nothing -> string {
    mut influx_lines = []
    # Epoch nanoseconds: chrono's %f is the 9-digit fractional second, so %s%f forms an ns timestamp
    let timestamp = (date now | format date "%s%f")
    let hostname = ($env.HOSTNAME? | default "unknown")

    # Process system metrics
    if ($data | get -i system | is-not-empty) {
        let sys = ($data | get system)

        # CPU metrics
        if ($sys | get -i cpu | is-not-empty) {
            let cpu = ($sys | get cpu)
            $influx_lines = ($influx_lines | append $"system_cpu,hostname=($hostname) load_1m=($cpu.load_1m? | default 0),load_5m=($cpu.load_5m? | default 0),load_15m=($cpu.load_15m? | default 0) ($timestamp)")
        }

        # Memory metrics
        if ($sys | get -i memory | is-not-empty) {
            let mem = ($sys | get memory)
            $influx_lines = ($influx_lines | append $"system_memory,hostname=($hostname) usage_percent=($mem.usage_percent? | default 0),total_kb=($mem.total_kb? | default 0),used_kb=($mem.used_kb? | default 0) ($timestamp)")
        }

        # Process metrics
        if ($sys | get -i processes | is-not-empty) {
            let proc = ($sys | get processes)
            $influx_lines = ($influx_lines | append $"system_processes,hostname=($hostname) total=($proc.total? | default 0) ($timestamp)")
        }
    }

    return ($influx_lines | str join "\n")
}

# Create and manage telemetry batches
export def batch-telemetry [
    --max-batch-size(-s): int = 100   # Maximum items per batch
    --max-wait-time(-w): int = 30     # Maximum wait time in seconds
    --output-file(-o): string         # File to store batched data
]: nothing -> nothing {
    mut batch = []
    mut batch_start_time = (date now)

    print $"📊 Starting telemetry batching \(max size: ($max_batch_size), max wait: ($max_wait_time)s\)"

    # Monitor for telemetry data
    while true {
        # Check if we have data to batch (this would typically come from external sources)
        # For demonstration, we collect sample data ourselves
        let current_time = (date now)

        # Collect current metrics
        try {
            use ../observability/collect.nu *
            let metrics = (collect-system-metrics)

            # Add to batch
            $batch = ($batch | append {
                timestamp: ($current_time | format date "%Y-%m-%dT%H:%M:%S.%fZ")
                type: "system_metrics"
                data: $metrics
            })

            # Check batch conditions
            let batch_size = ($batch | length)
            let elapsed_time = (($current_time - $batch_start_time) / 1sec)

            if $batch_size >= $max_batch_size or $elapsed_time >= $max_wait_time {
                # Send batch (only pass --output-file when one was given)
                let batch_result = if ($output_file | is-empty) {
                    send-batch $batch
                } else {
                    send-batch $batch --output-file $output_file
                }

                if $batch_result.success {
                    print $"✅ Batch sent successfully: ($batch_size) items"
                } else {
                    print $"❌ Batch send failed: ($batch_result.error)"
                }

                # Reset batch
                $batch = []
                $batch_start_time = (date now)
            }
        } catch { |err|
            print $"⚠️ Error collecting metrics: ($err | get msg)"
        }

        # Wait before next collection
        sleep 10sec
    }
}

# Send a batch of telemetry data
def send-batch [
    batch: list
    --output-file(-o): string
]: nothing -> record {
    if ($batch | length) == 0 {
        return {success: true, message: "Empty batch, nothing to send"}
    }

    let batch_payload = {
        batch_id: (random uuid)
        batch_size: ($batch | length)
        batch_timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
        hostname: ($env.HOSTNAME? | default "unknown")
        agent: "nushell-telemetry"
        items: $batch
    }

    # Save to file if specified
    if ($output_file | is-not-empty) {
        try {
            $batch_payload | to json | save -a $output_file
            return {
                success: true
                message: $"Batch saved to file: ($output_file)"
                batch_size: ($batch | length)
            }
        } catch { |err|
            return {
                success: false
                error: $"Failed to save batch: ($err | get msg)"
            }
        }
    }

    # Send to telemetry endpoint
    let endpoint = ($env.NUSHELL_TELEMETRY_ENDPOINT? | default "")
    if ($endpoint | is-not-empty) {
        return (send-telemetry $batch_payload --endpoint $endpoint)
    } else {
        return {
            success: false
            error: "No telemetry endpoint configured"
        }
    }
}
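
# Usage sketch (illustrative): batch-telemetry loops forever, so run it from a dedicated
# session; the path and sizes below are examples, not defaults. When --output-file is set
# the batch is appended to that file instead of being posted.
#
#   batch-telemetry --max-batch-size 50 --max-wait-time 15 --output-file /var/log/telemetry-batches.json
#
# Each batch item converted with the InfluxDB line protocol renders as one line per
# measurement, for example:
#
#   system_cpu,hostname=web-01 load_1m=0.42,load_5m=0.37,load_15m=0.30 1700000000123456789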
| default "unknown") agent: "nushell-telemetry" items: $batch } # Save to file if specified if ($output_file | is-not-empty) { try { $batch_payload | to json | save -a $output_file return { success: true message: $"Batch saved to file: ($output_file)" batch_size: ($batch | length) } } catch { |err| return { success: false error: $"Failed to save batch: ($err | get msg)" } } } # Send to telemetry endpoint let endpoint = ($env.NUSHELL_TELEMETRY_ENDPOINT? | default "") if ($endpoint | is-not-empty) { return (send-telemetry $batch_payload --endpoint $endpoint) } else { return { success: false error: "No telemetry endpoint configured" } } } # Monitor system health and send alerts export def health-monitoring [ --alert-threshold(-t): record = {cpu: 80, memory: 90, disk: 95} # Alert thresholds --check-interval(-i): int = 60 # Check interval in seconds --alert-endpoint(-e): string # Alert webhook endpoint ] -> nothing { print $"🔍 Starting health monitoring with thresholds: ($alert_threshold)" while true { try { use ../observability/collect.nu * let status = (status-check) # Check for threshold violations mut alerts = [] # CPU check if ($status.metrics.system.cpu.load_1m? | default 0) > ($alert_threshold.cpu / 10.0) { $alerts = ($alerts | append { type: "cpu_high" severity: "warning" message: $"High CPU load: ($status.metrics.system.cpu.load_1m)" threshold: $alert_threshold.cpu current_value: $status.metrics.system.cpu.load_1m }) } # Memory check if ($status.metrics.system.memory.usage_percent? | default 0) > $alert_threshold.memory { $alerts = ($alerts | append { type: "memory_high" severity: "critical" message: $"High memory usage: ($status.metrics.system.memory.usage_percent)%" threshold: $alert_threshold.memory current_value: $status.metrics.system.memory.usage_percent }) } # Disk check try { let high_disk_usage = ($status.metrics.system.disk | where {|disk| ($disk.percent | str replace "%" "" | into float) > $alert_threshold.disk }) if ($high_disk_usage | length) > 0 { for disk in $high_disk_usage { $alerts = ($alerts | append { type: "disk_high" severity: "critical" message: $"High disk usage on ($disk.mount): ($disk.percent)" threshold: $alert_threshold.disk current_value: ($disk.percent | str replace "%" "" | into float) filesystem: $disk.filesystem mount: $disk.mount }) } } } catch {} # Send alerts if any if ($alerts | length) > 0 { let alert_payload = { timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ") hostname: ($env.HOSTNAME? | default "unknown") alert_count: ($alerts | length) alerts: $alerts system_status: $status } # Send to telemetry endpoint let result = (send-telemetry $alert_payload --endpoint ($alert_endpoint | default ($env.NUSHELL_TELEMETRY_ENDPOINT? 
| default ""))) if $result.success { print $"🚨 Sent ($alerts | length) alerts to monitoring system" } else { print $"❌ Failed to send alerts: ($result.error)" } # Also log alerts locally $alerts | each { |alert| print $"⚠️ ALERT: ($alert.type) - ($alert.message)" } } # Send regular health status let health_payload = { type: "health_check" timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ") status: $status } send-telemetry $health_payload | ignore } catch { |err| print $"❌ Health monitoring error: ($err | get msg)" } sleep ($check_interval * 1sec) } } # Initialize telemetry configuration export def init-telemetry [ --endpoint(-e): string # Telemetry endpoint URL --format(-f): string = "json" # Default format --enable-health(-h) # Enable health monitoring --config-file(-c): string # Save configuration to file ] -> record { let config = { endpoint: ($endpoint | default "") format: $format health_monitoring: ($enable_health | default false) created: (date now | format date "%Y-%m-%d %H:%M:%S") version: "1.0.0" } # Set environment variables $env.NUSHELL_TELEMETRY_ENDPOINT = ($endpoint | default "") $env.NUSHELL_TELEMETRY_FORMAT = $format $env.NUSHELL_TELEMETRY_ENABLED = "true" # Save configuration if file specified if ($config_file | is-not-empty) { try { $config | to json | save $config_file print $"📝 Telemetry configuration saved to ($config_file)" } catch { |err| print $"⚠️ Failed to save configuration: ($err | get msg)" } } print $"🔧 Telemetry initialized:" print $" Endpoint: ($config.endpoint)" print $" Format: ($config.format)" print $" Health monitoring: ($config.health_monitoring)" return $config }