# provisioning/taskservs/nushell/observability/telemetry.nu

# Telemetry and Monitoring Integration for Nushell Infrastructure
# Secure telemetry collection and forwarding capabilities

# Send telemetry data to configured endpoints
export def send-telemetry [
    data: record
    --endpoint (-e): string        # Override default endpoint
    --format (-f): string = "json" # json, prometheus, influx
    --timeout (-t): int = 30       # Request timeout in seconds
    --retry (-r): int = 3          # Number of retries
]: nothing -> record {
    let telemetry_endpoint = ($endpoint | default ($env.NUSHELL_TELEMETRY_ENDPOINT? | default ""))
    if ($telemetry_endpoint | is-empty) {
        return {
            success: false
            error: "No telemetry endpoint configured"
            data_sent: false
        }
    }

    # Prepare data based on format
    let formatted_data = match $format {
        "prometheus" => (convert-to-prometheus $data) # Prometheus exposition format
        "influx" => (convert-to-influx $data)         # InfluxDB line protocol
        _ => $data                                    # Default JSON format (serialized with the payload below)
    }

    # Add metadata
    let telemetry_payload = {
        timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
        hostname: ($env.HOSTNAME? | default "unknown")
        agent: "nushell-provisioning"
        version: "1.0.0"
        data: $formatted_data
    }

    # Send data with retries (exponential backoff between attempts)
    mut attempt = 1
    while $attempt <= $retry {
        # Copy the loop counter into an immutable binding so the catch closure can capture it
        let current_attempt = $attempt
        try {
            # http post expects a duration for --max-time
            let response = (http post $telemetry_endpoint ($telemetry_payload | to json) --max-time ($timeout * 1sec) --headers {"Content-Type": "application/json"} --full)
            return {
                success: true
                endpoint: $telemetry_endpoint
                response_status: ($response.status? | default 200)
                attempt: $current_attempt
                data_sent: true
                timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
            }
        } catch { |err|
            if $current_attempt == $retry {
                return {
                    success: false
                    error: ($err | get msg)
                    endpoint: $telemetry_endpoint
                    attempts: $current_attempt
                    data_sent: false
                }
            }
            # Wait before retry (exponential backoff)
            let wait_time = ($current_attempt * $current_attempt * 2)
            sleep ($wait_time * 1sec)
        }
        $attempt = ($attempt + 1)
    }

    return {
        success: false
        error: "Max retries exceeded"
        attempts: $retry
        data_sent: false
    }
}
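
# Example usage (illustrative; the endpoint URL below is a placeholder, not a real
# collector endpoint):
#
#   $env.NUSHELL_TELEMETRY_ENDPOINT = "https://telemetry.example.local/ingest"
#   send-telemetry {deploy: "web-frontend", duration_s: 42, success: true}
#   send-telemetry {system: {cpu: {load_1m: 0.42}}} --format prometheus --retry 5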

# Convert metrics to Prometheus exposition format
def convert-to-prometheus [data: record]: nothing -> string {
    mut prometheus_output = ""
    let hostname = ($env.HOSTNAME? | default "unknown")

    # Process system metrics if available
    if ($data | get -i system | is-not-empty) {
        let sys = ($data | get system)

        # CPU metrics
        if ($sys | get -i cpu | is-not-empty) {
            let cpu = ($sys | get cpu)
            $prometheus_output += $"# HELP system_load_1m System load average over 1 minute\n"
            $prometheus_output += $"# TYPE system_load_1m gauge\n"
            $prometheus_output += $"system_load_1m{hostname=\"($hostname)\"} ($cpu.load_1m? | default 0)\n"
            $prometheus_output += $"# HELP system_load_5m System load average over 5 minutes\n"
            $prometheus_output += $"# TYPE system_load_5m gauge\n"
            $prometheus_output += $"system_load_5m{hostname=\"($hostname)\"} ($cpu.load_5m? | default 0)\n"
        }

        # Memory metrics
        if ($sys | get -i memory | is-not-empty) {
            let mem = ($sys | get memory)
            $prometheus_output += $"# HELP system_memory_usage_percent Memory usage percentage\n"
            $prometheus_output += $"# TYPE system_memory_usage_percent gauge\n"
            $prometheus_output += $"system_memory_usage_percent{hostname=\"($hostname)\"} ($mem.usage_percent? | default 0)\n"
            $prometheus_output += $"# HELP system_memory_total_bytes Total memory in bytes\n"
            $prometheus_output += $"# TYPE system_memory_total_bytes gauge\n"
            $prometheus_output += $"system_memory_total_bytes{hostname=\"($hostname)\"} (($mem.total_kb? | default 0) * 1024)\n"
        }
    }
    return $prometheus_output
}
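
# Illustrative output for the converter above (values are made up), assuming the input
# record contains system.cpu and system.memory sections:
#
#   # HELP system_load_1m System load average over 1 minute
#   # TYPE system_load_1m gauge
#   system_load_1m{hostname="node-01"} 0.42
#   ...
#   system_memory_usage_percent{hostname="node-01"} 63.5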

# Convert metrics to InfluxDB line protocol
def convert-to-influx [data: record]: nothing -> string {
    mut influx_lines = []
    # Epoch timestamp in nanoseconds (%s = seconds, %f = 9-digit nanosecond fraction)
    let timestamp = (date now | format date "%s%f")
    let hostname = ($env.HOSTNAME? | default "unknown")

    # Process system metrics
    if ($data | get -i system | is-not-empty) {
        let sys = ($data | get system)

        # CPU metrics
        if ($sys | get -i cpu | is-not-empty) {
            let cpu = ($sys | get cpu)
            $influx_lines = ($influx_lines | append $"system_cpu,hostname=($hostname) load_1m=($cpu.load_1m? | default 0),load_5m=($cpu.load_5m? | default 0),load_15m=($cpu.load_15m? | default 0) ($timestamp)")
        }

        # Memory metrics
        if ($sys | get -i memory | is-not-empty) {
            let mem = ($sys | get memory)
            $influx_lines = ($influx_lines | append $"system_memory,hostname=($hostname) usage_percent=($mem.usage_percent? | default 0),total_kb=($mem.total_kb? | default 0),used_kb=($mem.used_kb? | default 0) ($timestamp)")
        }

        # Process metrics
        if ($sys | get -i processes | is-not-empty) {
            let proc = ($sys | get processes)
            $influx_lines = ($influx_lines | append $"system_processes,hostname=($hostname) total=($proc.total? | default 0) ($timestamp)")
        }
    }
    return ($influx_lines | str join "\n")
}
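
# Illustrative output for the converter above (one line per measurement, values made up,
# timestamp in nanoseconds since the epoch):
#
#   system_cpu,hostname=node-01 load_1m=0.42,load_5m=0.37,load_15m=0.3 1718000000123456789
#   system_memory,hostname=node-01 usage_percent=63.5,total_kb=16309248,used_kb=10356352 1718000000123456789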

# Create and manage telemetry batches
export def batch-telemetry [
    --max-batch-size (-s): int = 100 # Maximum items per batch
    --max-wait-time (-w): int = 30   # Maximum wait time in seconds
    --output-file (-o): string       # File to store batched data
]: nothing -> nothing {
    mut batch = []
    mut batch_start_time = (date now)

    print $"📊 Starting telemetry batching \(max size: ($max_batch_size), max wait: ($max_wait_time)s\)"

    # Monitor for telemetry data
    while true {
        # Check if we have data to batch (this would typically come from external sources);
        # for demonstration, we collect the local system metrics
        let current_time = (date now)

        # Collect current metrics
        try {
            use ../observability/collect.nu *
            let metrics = (collect-system-metrics)

            # Add to batch
            $batch = ($batch | append {
                timestamp: ($current_time | format date "%Y-%m-%dT%H:%M:%S.%fZ")
                type: "system_metrics"
                data: $metrics
            })

            # Check batch conditions
            let batch_size = ($batch | length)
            let elapsed_time = (($current_time - $batch_start_time) / 1sec)
            if $batch_size >= $max_batch_size or $elapsed_time >= $max_wait_time {
                # Send batch ("" means no output file was given)
                let batch_result = (send-batch $batch --output-file ($output_file | default ""))
                if $batch_result.success {
                    print $"✅ Batch sent successfully: ($batch_size) items"
                } else {
                    print $"❌ Batch send failed: ($batch_result.error)"
                }
                # Reset batch
                $batch = []
                $batch_start_time = (date now)
            }
        } catch { |err|
            print $"⚠️ Error collecting metrics: ($err | get msg)"
        }

        # Wait before next collection
        sleep 10sec
    }
}
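
# Example usage (illustrative; runs until interrupted, and the output path is a placeholder):
#
#   batch-telemetry --max-batch-size 50 --max-wait-time 60 --output-file /var/lib/telemetry/batches.jsonl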

# Send a batch of telemetry data
def send-batch [
    batch: list<record>
    --output-file (-o): string
]: nothing -> record {
    if ($batch | length) == 0 {
        return {success: true, message: "Empty batch, nothing to send"}
    }

    let batch_payload = {
        batch_id: (random uuid)
        batch_size: ($batch | length)
        batch_timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
        hostname: ($env.HOSTNAME? | default "unknown")
        agent: "nushell-telemetry"
        items: $batch
    }

    # Save to file if specified (one JSON document per line, so the file stays parseable across appends)
    if ($output_file | is-not-empty) {
        try {
            $"($batch_payload | to json --raw)\n" | save --append $output_file
            return {
                success: true
                message: $"Batch saved to file: ($output_file)"
                batch_size: ($batch | length)
            }
        } catch { |err|
            return {
                success: false
                error: $"Failed to save batch: ($err | get msg)"
            }
        }
    }

    # Send to telemetry endpoint
    let endpoint = ($env.NUSHELL_TELEMETRY_ENDPOINT? | default "")
    if ($endpoint | is-not-empty) {
        return (send-telemetry $batch_payload --endpoint $endpoint)
    } else {
        return {
            success: false
            error: "No telemetry endpoint configured"
        }
    }
}
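
# Note: because batches are appended one JSON document per line, a saved batch file can be
# read back as JSON Lines (the path is a placeholder):
#
#   open --raw /var/lib/telemetry/batches.jsonl | lines | each { |line| $line | from json }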

# Monitor system health and send alerts
export def health-monitoring [
    --alert-threshold (-t): record = {cpu: 80, memory: 90, disk: 95} # Alert thresholds
    --check-interval (-i): int = 60 # Check interval in seconds
    --alert-endpoint (-e): string   # Alert webhook endpoint
]: nothing -> nothing {
    print $"🔍 Starting health monitoring with thresholds: ($alert_threshold)"

    while true {
        try {
            use ../observability/collect.nu *
            let status = (status-check)

            # Check for threshold violations
            mut alerts = []

            # CPU check: the percentage threshold is scaled down to compare against
            # the 1-minute load average reported by the collector
            if ($status.metrics.system.cpu.load_1m? | default 0) > ($alert_threshold.cpu / 10.0) {
                $alerts = ($alerts | append {
                    type: "cpu_high"
                    severity: "warning"
                    message: $"High CPU load: ($status.metrics.system.cpu.load_1m)"
                    threshold: $alert_threshold.cpu
                    current_value: $status.metrics.system.cpu.load_1m
                })
            }

            # Memory check
            if ($status.metrics.system.memory.usage_percent? | default 0) > $alert_threshold.memory {
                $alerts = ($alerts | append {
                    type: "memory_high"
                    severity: "critical"
                    message: $"High memory usage: ($status.metrics.system.memory.usage_percent)%"
                    threshold: $alert_threshold.memory
                    current_value: $status.metrics.system.memory.usage_percent
                })
            }

            # Disk check
            try {
                let high_disk_usage = ($status.metrics.system.disk | where {|disk|
                    ($disk.percent | str replace "%" "" | into float) > $alert_threshold.disk
                })
                if ($high_disk_usage | length) > 0 {
                    for disk in $high_disk_usage {
                        $alerts = ($alerts | append {
                            type: "disk_high"
                            severity: "critical"
                            message: $"High disk usage on ($disk.mount): ($disk.percent)"
                            threshold: $alert_threshold.disk
                            current_value: ($disk.percent | str replace "%" "" | into float)
                            filesystem: $disk.filesystem
                            mount: $disk.mount
                        })
                    }
                }
            } catch {}

            # Send alerts if any
            if ($alerts | length) > 0 {
                let alert_payload = {
                    timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
                    hostname: ($env.HOSTNAME? | default "unknown")
                    alert_count: ($alerts | length)
                    alerts: $alerts
                    system_status: $status
                }

                # Send to telemetry endpoint
                let result = (send-telemetry $alert_payload --endpoint ($alert_endpoint | default ($env.NUSHELL_TELEMETRY_ENDPOINT? | default "")))
                if $result.success {
                    print $"🚨 Sent ($alerts | length) alerts to monitoring system"
                } else {
                    print $"❌ Failed to send alerts: ($result.error)"
                }

                # Also log alerts locally
                $alerts | each { |alert|
                    print $"⚠️ ALERT: ($alert.type) - ($alert.message)"
                } | ignore
            }

            # Send regular health status
            let health_payload = {
                type: "health_check"
                timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
                status: $status
            }
            send-telemetry $health_payload | ignore
        } catch { |err|
            print $"❌ Health monitoring error: ($err | get msg)"
        }

        sleep ($check_interval * 1sec)
    }
}
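
# Example usage (illustrative; runs until interrupted, and the webhook URL is a placeholder):
#
#   health-monitoring --alert-threshold {cpu: 75, memory: 85, disk: 90} --check-interval 120 --alert-endpoint "https://alerts.example.local/webhook"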

# Initialize telemetry configuration
# Uses `def --env` so the NUSHELL_TELEMETRY_* variables persist in the caller's environment
export def --env init-telemetry [
    --endpoint (-e): string        # Telemetry endpoint URL
    --format (-f): string = "json" # Default format
    --enable-health                # Enable health monitoring
    --config-file (-c): string     # Save configuration to file
]: nothing -> record {
    let config = {
        endpoint: ($endpoint | default "")
        format: $format
        health_monitoring: ($enable_health | default false)
        created: (date now | format date "%Y-%m-%d %H:%M:%S")
        version: "1.0.0"
    }

    # Set environment variables
    $env.NUSHELL_TELEMETRY_ENDPOINT = ($endpoint | default "")
    $env.NUSHELL_TELEMETRY_FORMAT = $format
    $env.NUSHELL_TELEMETRY_ENABLED = "true"

    # Save configuration if file specified
    if ($config_file | is-not-empty) {
        try {
            $config | to json | save $config_file
            print $"📝 Telemetry configuration saved to ($config_file)"
        } catch { |err|
            print $"⚠️ Failed to save configuration: ($err | get msg)"
        }
    }

    print $"🔧 Telemetry initialized:"
    print $"  Endpoint: ($config.endpoint)"
    print $"  Format: ($config.format)"
    print $"  Health monitoring: ($config.health_monitoring)"
    return $config
}
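
# Example usage (illustrative; the endpoint URL and config path are placeholders):
#
#   init-telemetry --endpoint "https://telemetry.example.local/ingest" --enable-health --config-file /etc/provisioning/telemetry.json
#   health-monitoring --check-interval 300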