
# Telemetry and Monitoring Integration for Nushell Infrastructure
# Secure telemetry collection and forwarding capabilities
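#
# The functions below read the collector endpoint from $env.NUSHELL_TELEMETRY_ENDPOINT
# unless one is passed explicitly. Quick-start sketch (the URL is a placeholder, not a
# real collector):
#
#   init-telemetry --endpoint "https://collector.example.internal/ingest" --format json
#   send-telemetry {service: "provisioning", event: "taskserv_created"}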

# Send telemetry data to configured endpoints
export def send-telemetry [
    data: record
    --endpoint(-e): string          # Override default endpoint
    --format(-f): string = "json"   # json, prometheus, influx
    --timeout(-t): int = 30         # Request timeout in seconds
    --retry(-r): int = 3            # Number of retries
]: nothing -> record {
    let telemetry_endpoint = ($endpoint | default ($env.NUSHELL_TELEMETRY_ENDPOINT? | default ""))

    if ($telemetry_endpoint | is-empty) {
        return {
            success: false
            error: "No telemetry endpoint configured"
            data_sent: false
        }
    }

    # Prepare data based on format
    let formatted_data = match $format {
        "prometheus" => {
            # Convert to Prometheus exposition format
            convert-to-prometheus $data
        }
        "influx" => {
            # Convert to InfluxDB line protocol
            convert-to-influx $data
        }
        _ => {
            # Default JSON format
            $data | to json
        }
    }

    # Add metadata
    let telemetry_payload = {
        timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
        hostname: ($env.HOSTNAME? | default "unknown")
        agent: "nushell-provisioning"
        version: "1.0.0"
        data: $data
    }

    # Send data with retries. The catch closure only maps the error to a string so
    # that no mutable variable is captured inside a closure.
    mut attempt = 1
    while $attempt <= $retry {
        let send_error = (try {
            # --max-time takes a duration in recent Nushell releases
            http post $telemetry_endpoint ($telemetry_payload | to json) --max-time ($timeout * 1sec) --headers {"Content-Type": "application/json"} | ignore
            ""
        } catch { |err| $err.msg })

        if ($send_error | is-empty) {
            return {
                success: true
                endpoint: $telemetry_endpoint
                attempt: $attempt
                data_sent: true
                timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
            }
        }

        if $attempt == $retry {
            return {
                success: false
                error: $send_error
                endpoint: $telemetry_endpoint
                attempts: $attempt
                data_sent: false
            }
        }

        # Wait before retry (exponential backoff)
        let wait_time = ($attempt * $attempt * 2)
        sleep ($wait_time * 1sec)
        $attempt = ($attempt + 1)
    }

    return {
        success: false
        error: "Max retries exceeded"
        attempts: $retry
        data_sent: false
    }
}
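
# Illustrative call (payload and endpoint are placeholders, not part of any deployed system):
#   send-telemetry {status: "ok", component: "provisioning"} --endpoint "https://collector.example.internal/ingest" --retry 2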

# Convert metrics to Prometheus exposition format
def convert-to-prometheus [data: record]: nothing -> string {
    mut prometheus_output = ""

    # Process system metrics if available
    if ($data | get -i system | is-not-empty) {
        let sys = ($data | get system)

        # CPU metrics
        if ($sys | get -i cpu | is-not-empty) {
            let cpu = ($sys | get cpu)
            $prometheus_output = $prometheus_output + $"# HELP system_load_1m System load average over 1 minute\n"
            $prometheus_output = $prometheus_output + $"# TYPE system_load_1m gauge\n"
            $prometheus_output = $prometheus_output + $"system_load_1m{hostname=\"($env.HOSTNAME? | default 'unknown')\"} ($cpu.load_1m? | default 0)\n"

            $prometheus_output = $prometheus_output + $"# HELP system_load_5m System load average over 5 minutes\n"
            $prometheus_output = $prometheus_output + $"# TYPE system_load_5m gauge\n"
            $prometheus_output = $prometheus_output + $"system_load_5m{hostname=\"($env.HOSTNAME? | default 'unknown')\"} ($cpu.load_5m? | default 0)\n"
        }

        # Memory metrics
        if ($sys | get -i memory | is-not-empty) {
            let mem = ($sys | get memory)
            $prometheus_output = $prometheus_output + $"# HELP system_memory_usage_percent Memory usage percentage\n"
            $prometheus_output = $prometheus_output + $"# TYPE system_memory_usage_percent gauge\n"
            $prometheus_output = $prometheus_output + $"system_memory_usage_percent{hostname=\"($env.HOSTNAME? | default 'unknown')\"} ($mem.usage_percent? | default 0)\n"

            $prometheus_output = $prometheus_output + $"# HELP system_memory_total_bytes Total memory in bytes\n"
            $prometheus_output = $prometheus_output + $"# TYPE system_memory_total_bytes gauge\n"
            $prometheus_output = $prometheus_output + $"system_memory_total_bytes{hostname=\"($env.HOSTNAME? | default 'unknown')\"} (($mem.total_kb? | default 0) * 1024)\n"
        }
    }

    return $prometheus_output
}
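
# For reference, the exposition text built above looks like this (values illustrative):
#   # HELP system_load_1m System load average over 1 minute
#   # TYPE system_load_1m gauge
#   system_load_1m{hostname="node-01"} 0.42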

# Convert metrics to InfluxDB line protocol
def convert-to-influx [data: record]: nothing -> string {
    mut influx_lines = []
    # Epoch timestamp in nanoseconds, as expected by the line protocol
    let timestamp = (date now | into int)
    let hostname = ($env.HOSTNAME? | default "unknown")

    # Process system metrics
    if ($data | get -i system | is-not-empty) {
        let sys = ($data | get system)

        # CPU metrics
        if ($sys | get -i cpu | is-not-empty) {
            let cpu = ($sys | get cpu)
            $influx_lines = ($influx_lines | append $"system_cpu,hostname=($hostname) load_1m=($cpu.load_1m? | default 0),load_5m=($cpu.load_5m? | default 0),load_15m=($cpu.load_15m? | default 0) ($timestamp)")
        }

        # Memory metrics
        if ($sys | get -i memory | is-not-empty) {
            let mem = ($sys | get memory)
            $influx_lines = ($influx_lines | append $"system_memory,hostname=($hostname) usage_percent=($mem.usage_percent? | default 0),total_kb=($mem.total_kb? | default 0),used_kb=($mem.used_kb? | default 0) ($timestamp)")
        }

        # Process metrics
        if ($sys | get -i processes | is-not-empty) {
            let proc = ($sys | get processes)
            $influx_lines = ($influx_lines | append $"system_processes,hostname=($hostname) total=($proc.total? | default 0) ($timestamp)")
        }
    }

    return ($influx_lines | str join "\n")
}
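
# For reference, each appended line follows the InfluxDB line protocol (values illustrative):
#   system_memory,hostname=node-01 usage_percent=61.3,total_kb=16384000,used_kb=10043392 1735689600123456789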

# Create and manage telemetry batches
export def batch-telemetry [
    --max-batch-size(-s): int = 100   # Maximum items per batch
    --max-wait-time(-w): int = 30     # Maximum wait time in seconds
    --output-file(-o): string         # File to store batched data
]: nothing -> nothing {
    mut batch = []
    mut batch_start_time = (date now)

    print $"📊 Starting telemetry batching \(max size: ($max_batch_size), max wait: ($max_wait_time)s)"

    # Monitor for telemetry data
    while true {
        # Collect current system metrics on each pass; in a full deployment the batch
        # would typically also be fed from external sources
        let current_time = (date now)

        try {
            use ../observability/collect.nu *
            let metrics = (collect-system-metrics)

            # Add to batch
            $batch = ($batch | append {
                timestamp: ($current_time | format date "%Y-%m-%dT%H:%M:%S.%fZ")
                type: "system_metrics"
                data: $metrics
            })

            # Check batch conditions
            let batch_size = ($batch | length)
            let elapsed_time = (($current_time - $batch_start_time) / 1sec)

            if $batch_size >= $max_batch_size or $elapsed_time >= $max_wait_time {
                # Send batch
                let batch_result = (send-batch $batch --output-file $output_file)

                if $batch_result.success {
                    print $"✅ Batch sent successfully: ($batch_size) items"
                } else {
                    print $"❌ Batch send failed: ($batch_result.error)"
                }

                # Reset batch
                $batch = []
                $batch_start_time = (date now)
            }
        } catch { |err|
            print $"⚠️ Error collecting metrics: ($err | get msg)"
        }

        # Wait before next collection
        sleep 10sec
    }
}
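
# Illustrative invocation (output path is a placeholder; runs until interrupted):
#   batch-telemetry --max-batch-size 50 --max-wait-time 60 --output-file "/tmp/telemetry-batches.json"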

# Send a batch of telemetry data
def send-batch [
    batch: list<record>
    --output-file(-o): string
]: nothing -> record {
    if ($batch | length) == 0 {
        return {success: true, message: "Empty batch, nothing to send"}
    }

    let batch_payload = {
        batch_id: (random uuid)
        batch_size: ($batch | length)
        batch_timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
        hostname: ($env.HOSTNAME? | default "unknown")
        agent: "nushell-telemetry"
        items: $batch
    }

    # Save to file if specified; the catch closure only maps the error to a string
    if ($output_file | is-not-empty) {
        let save_error = (try {
            $batch_payload | to json | save -a $output_file
            ""
        } catch { |err| $err.msg })

        if ($save_error | is-empty) {
            return {
                success: true
                message: $"Batch saved to file: ($output_file)"
                batch_size: ($batch | length)
            }
        }

        return {
            success: false
            error: $"Failed to save batch: ($save_error)"
        }
    }

    # Send to telemetry endpoint
    let endpoint = ($env.NUSHELL_TELEMETRY_ENDPOINT? | default "")
    if ($endpoint | is-not-empty) {
        return (send-telemetry $batch_payload --endpoint $endpoint)
    } else {
        return {
            success: false
            error: "No telemetry endpoint configured"
        }
    }
}
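
# send-batch is internal; batch-telemetry calls it when a batch fills up or times out, e.g.:
#   send-batch $batch --output-file "/var/log/telemetry/batches.json"   # path illustrative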

# Monitor system health and send alerts
export def health-monitoring [
    --alert-threshold(-t): record = {cpu: 80, memory: 90, disk: 95}  # Alert thresholds
    --check-interval(-i): int = 60    # Check interval in seconds
    --alert-endpoint(-e): string      # Alert webhook endpoint
]: nothing -> nothing {
    print $"🔍 Starting health monitoring with thresholds: ($alert_threshold)"

    while true {
        try {
            use ../observability/collect.nu *
            let status = (status-check)

            # Check for threshold violations
            mut alerts = []

            # CPU check (the percent threshold is scaled down by 10 to approximate a load-average threshold)
            if ($status.metrics.system.cpu.load_1m? | default 0) > ($alert_threshold.cpu / 10.0) {
                $alerts = ($alerts | append {
                    type: "cpu_high"
                    severity: "warning"
                    message: $"High CPU load: ($status.metrics.system.cpu.load_1m)"
                    threshold: $alert_threshold.cpu
                    current_value: $status.metrics.system.cpu.load_1m
                })
            }

            # Memory check
            if ($status.metrics.system.memory.usage_percent? | default 0) > $alert_threshold.memory {
                $alerts = ($alerts | append {
                    type: "memory_high"
                    severity: "critical"
                    message: $"High memory usage: ($status.metrics.system.memory.usage_percent)%"
                    threshold: $alert_threshold.memory
                    current_value: $status.metrics.system.memory.usage_percent
                })
            }

            # Disk check
            try {
                let high_disk_usage = ($status.metrics.system.disk | where {|disk|
                    ($disk.percent | str replace "%" "" | into float) > $alert_threshold.disk
                })

                if ($high_disk_usage | length) > 0 {
                    for disk in $high_disk_usage {
                        $alerts = ($alerts | append {
                            type: "disk_high"
                            severity: "critical"
                            message: $"High disk usage on ($disk.mount): ($disk.percent)"
                            threshold: $alert_threshold.disk
                            current_value: ($disk.percent | str replace "%" "" | into float)
                            filesystem: $disk.filesystem
                            mount: $disk.mount
                        })
                    }
                }
            } catch {}

            # Send alerts if any
            if ($alerts | length) > 0 {
                let alert_payload = {
                    timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
                    hostname: ($env.HOSTNAME? | default "unknown")
                    alert_count: ($alerts | length)
                    alerts: $alerts
                    system_status: $status
                }

                # Send to telemetry endpoint
                let result = (send-telemetry $alert_payload --endpoint ($alert_endpoint | default ($env.NUSHELL_TELEMETRY_ENDPOINT? | default "")))

                if $result.success {
                    print $"🚨 Sent ($alerts | length) alerts to monitoring system"
                } else {
                    print $"❌ Failed to send alerts: ($result.error)"
                }

                # Also log alerts locally
                $alerts | each { |alert|
                    print $"⚠️ ALERT: ($alert.type) - ($alert.message)"
                }
            }

            # Send regular health status
            let health_payload = {
                type: "health_check"
                timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
                status: $status
            }

            send-telemetry $health_payload | ignore

        } catch { |err|
            print $"❌ Health monitoring error: ($err | get msg)"
        }

        sleep ($check_interval * 1sec)
    }
}
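
# Illustrative invocation (webhook URL is a placeholder; loops until interrupted):
#   health-monitoring --alert-threshold {cpu: 70, memory: 85, disk: 90} --check-interval 120 --alert-endpoint "https://hooks.example.internal/alerts"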

# Initialize telemetry configuration
# Declared with --env so the NUSHELL_TELEMETRY_* variables persist in the caller's scope
export def --env init-telemetry [
    --endpoint(-e): string          # Telemetry endpoint URL
    --format(-f): string = "json"   # Default format
    --enable-health(-h)             # Enable health monitoring
    --config-file(-c): string       # Save configuration to file
]: nothing -> record {
    let config = {
        endpoint: ($endpoint | default "")
        format: $format
        health_monitoring: ($enable_health | default false)
        created: (date now | format date "%Y-%m-%d %H:%M:%S")
        version: "1.0.0"
    }

    # Set environment variables
    $env.NUSHELL_TELEMETRY_ENDPOINT = ($endpoint | default "")
    $env.NUSHELL_TELEMETRY_FORMAT = $format
    $env.NUSHELL_TELEMETRY_ENABLED = "true"

    # Save configuration if file specified
    if ($config_file | is-not-empty) {
        try {
            $config | to json | save $config_file
            print $"📝 Telemetry configuration saved to ($config_file)"
        } catch { |err|
            print $"⚠️ Failed to save configuration: ($err | get msg)"
        }
    }

    print "🔧 Telemetry initialized:"
    print $"  Endpoint: ($config.endpoint)"
    print $"  Format: ($config.format)"
    print $"  Health monitoring: ($config.health_monitoring)"

    return $config
}
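
# Illustrative setup (URL and config path are placeholders):
#   init-telemetry --endpoint "https://collector.example.internal/ingest" --enable-health --config-file "/etc/provisioning/telemetry.json"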