
# Telemetry and Monitoring Integration for Nushell Infrastructure
# Secure telemetry collection and forwarding capabilities
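#
# The functions below read the collector endpoint from $env.NUSHELL_TELEMETRY_ENDPOINT
# unless one is passed explicitly. Quick-start sketch (the URL is a placeholder, not a
# real collector):
#
#   init-telemetry --endpoint "https://collector.example.internal/ingest" --format json
#   send-telemetry {service: "provisioning", event: "taskserv_created"}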

# Send telemetry data to configured endpoints
export def send-telemetry [
    data: record
    --endpoint(-e): string          # Override default endpoint
    --format(-f): string = "json"   # json, prometheus, influx
    --timeout(-t): int = 30         # Request timeout in seconds
    --retry(-r): int = 3            # Number of retries
]: nothing -> record {
    let telemetry_endpoint = ($endpoint | default ($env.NUSHELL_TELEMETRY_ENDPOINT? | default ""))

    if ($telemetry_endpoint | is-empty) {
        return {
            success: false
            error: "No telemetry endpoint configured"
            data_sent: false
        }
    }

    # Prepare data based on format
    let formatted_data = match $format {
        "prometheus" => {
            # Convert to Prometheus exposition format
            convert-to-prometheus $data
        }
        "influx" => {
            # Convert to InfluxDB line protocol
            convert-to-influx $data
        }
        _ => {
            # Default JSON format
            $data | to json
        }
    }

    # Add metadata
    let telemetry_payload = {
        timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
        hostname: ($env.HOSTNAME? | default "unknown")
        agent: "nushell-provisioning"
        version: "1.0.0"
        data: $data
    }

    # Send data with retries. The catch closure only maps the error to a string so
    # that no mutable variable is captured inside a closure.
    mut attempt = 1
    while $attempt <= $retry {
        let send_error = (try {
            # --max-time takes a duration in recent Nushell releases
            http post $telemetry_endpoint ($telemetry_payload | to json) --max-time ($timeout * 1sec) --headers {"Content-Type": "application/json"} | ignore
            ""
        } catch { |err| $err.msg })

        if ($send_error | is-empty) {
            return {
                success: true
                endpoint: $telemetry_endpoint
                attempt: $attempt
                data_sent: true
                timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
            }
        }

        if $attempt == $retry {
            return {
                success: false
                error: $send_error
                endpoint: $telemetry_endpoint
                attempts: $attempt
                data_sent: false
            }
        }

        # Wait before retry (exponential backoff)
        let wait_time = ($attempt * $attempt * 2)
        sleep ($wait_time * 1sec)
        $attempt = ($attempt + 1)
    }

    return {
        success: false
        error: "Max retries exceeded"
        attempts: $retry
        data_sent: false
    }
}
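
# Illustrative call (payload and endpoint are placeholders, not part of any deployed system):
#   send-telemetry {status: "ok", component: "provisioning"} --endpoint "https://collector.example.internal/ingest" --retry 2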

# Convert metrics to Prometheus exposition format
def convert-to-prometheus [data: record]: nothing -> string {
    mut prometheus_output = ""

    # Process system metrics if available
    if ($data | get -i system | is-not-empty) {
        let sys = ($data | get system)

        # CPU metrics
        if ($sys | get -i cpu | is-not-empty) {
            let cpu = ($sys | get cpu)
            $prometheus_output = $prometheus_output + $"# HELP system_load_1m System load average over 1 minute\n"
            $prometheus_output = $prometheus_output + $"# TYPE system_load_1m gauge\n"
            $prometheus_output = $prometheus_output + $"system_load_1m{hostname=\"($env.HOSTNAME? | default 'unknown')\"} ($cpu.load_1m? | default 0)\n"

            $prometheus_output = $prometheus_output + $"# HELP system_load_5m System load average over 5 minutes\n"
            $prometheus_output = $prometheus_output + $"# TYPE system_load_5m gauge\n"
            $prometheus_output = $prometheus_output + $"system_load_5m{hostname=\"($env.HOSTNAME? | default 'unknown')\"} ($cpu.load_5m? | default 0)\n"
        }

        # Memory metrics
        if ($sys | get -i memory | is-not-empty) {
            let mem = ($sys | get memory)
            $prometheus_output = $prometheus_output + $"# HELP system_memory_usage_percent Memory usage percentage\n"
            $prometheus_output = $prometheus_output + $"# TYPE system_memory_usage_percent gauge\n"
            $prometheus_output = $prometheus_output + $"system_memory_usage_percent{hostname=\"($env.HOSTNAME? | default 'unknown')\"} ($mem.usage_percent? | default 0)\n"

            $prometheus_output = $prometheus_output + $"# HELP system_memory_total_bytes Total memory in bytes\n"
            $prometheus_output = $prometheus_output + $"# TYPE system_memory_total_bytes gauge\n"
            $prometheus_output = $prometheus_output + $"system_memory_total_bytes{hostname=\"($env.HOSTNAME? | default 'unknown')\"} (($mem.total_kb? | default 0) * 1024)\n"
        }
    }

    return $prometheus_output
}
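
# For reference, the exposition text built above looks like this (values illustrative):
#   # HELP system_load_1m System load average over 1 minute
#   # TYPE system_load_1m gauge
#   system_load_1m{hostname="node-01"} 0.42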

# Convert metrics to InfluxDB line protocol
def convert-to-influx [data: record]: nothing -> string {
    mut influx_lines = []
    # Epoch timestamp in nanoseconds, as expected by the line protocol
    let timestamp = (date now | into int)
    let hostname = ($env.HOSTNAME? | default "unknown")

    # Process system metrics
    if ($data | get -i system | is-not-empty) {
        let sys = ($data | get system)

        # CPU metrics
        if ($sys | get -i cpu | is-not-empty) {
            let cpu = ($sys | get cpu)
            $influx_lines = ($influx_lines | append $"system_cpu,hostname=($hostname) load_1m=($cpu.load_1m? | default 0),load_5m=($cpu.load_5m? | default 0),load_15m=($cpu.load_15m? | default 0) ($timestamp)")
        }

        # Memory metrics
        if ($sys | get -i memory | is-not-empty) {
            let mem = ($sys | get memory)
            $influx_lines = ($influx_lines | append $"system_memory,hostname=($hostname) usage_percent=($mem.usage_percent? | default 0),total_kb=($mem.total_kb? | default 0),used_kb=($mem.used_kb? | default 0) ($timestamp)")
        }

        # Process metrics
        if ($sys | get -i processes | is-not-empty) {
            let proc = ($sys | get processes)
            $influx_lines = ($influx_lines | append $"system_processes,hostname=($hostname) total=($proc.total? | default 0) ($timestamp)")
        }
    }

    return ($influx_lines | str join "\n")
}
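
# For reference, each appended line follows the InfluxDB line protocol (values illustrative):
#   system_memory,hostname=node-01 usage_percent=61.3,total_kb=16384000,used_kb=10043392 1735689600123456789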

# Create and manage telemetry batches
export def batch-telemetry [
    --max-batch-size(-s): int = 100   # Maximum items per batch
    --max-wait-time(-w): int = 30     # Maximum wait time in seconds
    --output-file(-o): string         # File to store batched data
]: nothing -> nothing {
    mut batch = []
    mut batch_start_time = (date now)

    print $"📊 Starting telemetry batching \(max size: ($max_batch_size), max wait: ($max_wait_time)s)"

    # Monitor for telemetry data
    while true {
        # Collect current system metrics on each pass; in a full deployment the batch
        # would typically also be fed from external sources
        let current_time = (date now)

        try {
            use ../observability/collect.nu *
            let metrics = (collect-system-metrics)

            # Add to batch
            $batch = ($batch | append {
                timestamp: ($current_time | format date "%Y-%m-%dT%H:%M:%S.%fZ")
                type: "system_metrics"
                data: $metrics
            })

            # Check batch conditions
            let batch_size = ($batch | length)
            let elapsed_time = (($current_time - $batch_start_time) / 1sec)

            if $batch_size >= $max_batch_size or $elapsed_time >= $max_wait_time {
                # Send batch
                let batch_result = (send-batch $batch --output-file $output_file)

                if $batch_result.success {
                    print $"✅ Batch sent successfully: ($batch_size) items"
                } else {
                    print $"❌ Batch send failed: ($batch_result.error)"
                }

                # Reset batch
                $batch = []
                $batch_start_time = (date now)
            }
        } catch { |err|
            print $"⚠️ Error collecting metrics: ($err | get msg)"
        }

        # Wait before next collection
        sleep 10sec
    }
}
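
# Illustrative invocation (output path is a placeholder; runs until interrupted):
#   batch-telemetry --max-batch-size 50 --max-wait-time 60 --output-file "/tmp/telemetry-batches.json"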

# Send a batch of telemetry data
def send-batch [
    batch: list<record>
    --output-file(-o): string
]: nothing -> record {
    if ($batch | length) == 0 {
        return {success: true, message: "Empty batch, nothing to send"}
    }

    let batch_payload = {
        batch_id: (random uuid)
        batch_size: ($batch | length)
        batch_timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
        hostname: ($env.HOSTNAME? | default "unknown")
        agent: "nushell-telemetry"
        items: $batch
    }

    # Save to file if specified; the catch closure only maps the error to a string
    if ($output_file | is-not-empty) {
        let save_error = (try {
            $batch_payload | to json | save -a $output_file
            ""
        } catch { |err| $err.msg })

        if ($save_error | is-empty) {
            return {
                success: true
                message: $"Batch saved to file: ($output_file)"
                batch_size: ($batch | length)
            }
        }

        return {
            success: false
            error: $"Failed to save batch: ($save_error)"
        }
    }

    # Send to telemetry endpoint
    let endpoint = ($env.NUSHELL_TELEMETRY_ENDPOINT? | default "")
    if ($endpoint | is-not-empty) {
        return (send-telemetry $batch_payload --endpoint $endpoint)
    } else {
        return {
            success: false
            error: "No telemetry endpoint configured"
        }
    }
}
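
# send-batch is internal; batch-telemetry calls it when a batch fills up or times out, e.g.:
#   send-batch $batch --output-file "/var/log/telemetry/batches.json"   # path illustrative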

# Monitor system health and send alerts
export def health-monitoring [
    --alert-threshold(-t): record = {cpu: 80, memory: 90, disk: 95}  # Alert thresholds
    --check-interval(-i): int = 60    # Check interval in seconds
    --alert-endpoint(-e): string      # Alert webhook endpoint
]: nothing -> nothing {
    print $"🔍 Starting health monitoring with thresholds: ($alert_threshold)"

    while true {
        try {
            use ../observability/collect.nu *
            let status = (status-check)

            # Check for threshold violations
            mut alerts = []

            # CPU check (the percent threshold is scaled down by 10 to approximate a load-average threshold)
            if ($status.metrics.system.cpu.load_1m? | default 0) > ($alert_threshold.cpu / 10.0) {
                $alerts = ($alerts | append {
                    type: "cpu_high"
                    severity: "warning"
                    message: $"High CPU load: ($status.metrics.system.cpu.load_1m)"
                    threshold: $alert_threshold.cpu
                    current_value: $status.metrics.system.cpu.load_1m
                })
            }

            # Memory check
            if ($status.metrics.system.memory.usage_percent? | default 0) > $alert_threshold.memory {
                $alerts = ($alerts | append {
                    type: "memory_high"
                    severity: "critical"
                    message: $"High memory usage: ($status.metrics.system.memory.usage_percent)%"
                    threshold: $alert_threshold.memory
                    current_value: $status.metrics.system.memory.usage_percent
                })
            }

            # Disk check
            try {
                let high_disk_usage = ($status.metrics.system.disk | where {|disk|
                    ($disk.percent | str replace "%" "" | into float) > $alert_threshold.disk
                })

                if ($high_disk_usage | length) > 0 {
                    for disk in $high_disk_usage {
                        $alerts = ($alerts | append {
                            type: "disk_high"
                            severity: "critical"
                            message: $"High disk usage on ($disk.mount): ($disk.percent)"
                            threshold: $alert_threshold.disk
                            current_value: ($disk.percent | str replace "%" "" | into float)
                            filesystem: $disk.filesystem
                            mount: $disk.mount
                        })
                    }
                }
            } catch {}

            # Send alerts if any
            if ($alerts | length) > 0 {
                let alert_payload = {
                    timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
                    hostname: ($env.HOSTNAME? | default "unknown")
                    alert_count: ($alerts | length)
                    alerts: $alerts
                    system_status: $status
                }

                # Send to telemetry endpoint
                let result = (send-telemetry $alert_payload --endpoint ($alert_endpoint | default ($env.NUSHELL_TELEMETRY_ENDPOINT? | default "")))

                if $result.success {
                    print $"🚨 Sent ($alerts | length) alerts to monitoring system"
                } else {
                    print $"❌ Failed to send alerts: ($result.error)"
                }

                # Also log alerts locally
                $alerts | each { |alert|
                    print $"⚠️ ALERT: ($alert.type) - ($alert.message)"
                }
            }

            # Send regular health status
            let health_payload = {
                type: "health_check"
                timestamp: (date now | format date "%Y-%m-%dT%H:%M:%S.%fZ")
                status: $status
            }

            send-telemetry $health_payload | ignore

        } catch { |err|
            print $"❌ Health monitoring error: ($err | get msg)"
        }

        sleep ($check_interval * 1sec)
    }
}
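
# Illustrative invocation (webhook URL is a placeholder; loops until interrupted):
#   health-monitoring --alert-threshold {cpu: 70, memory: 85, disk: 90} --check-interval 120 --alert-endpoint "https://hooks.example.internal/alerts"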

# Initialize telemetry configuration
# Declared with --env so the NUSHELL_TELEMETRY_* variables persist in the caller's scope
export def --env init-telemetry [
    --endpoint(-e): string          # Telemetry endpoint URL
    --format(-f): string = "json"   # Default format
    --enable-health(-h)             # Enable health monitoring
    --config-file(-c): string       # Save configuration to file
]: nothing -> record {
    let config = {
        endpoint: ($endpoint | default "")
        format: $format
        health_monitoring: ($enable_health | default false)
        created: (date now | format date "%Y-%m-%d %H:%M:%S")
        version: "1.0.0"
    }

    # Set environment variables
    $env.NUSHELL_TELEMETRY_ENDPOINT = ($endpoint | default "")
    $env.NUSHELL_TELEMETRY_FORMAT = $format
    $env.NUSHELL_TELEMETRY_ENABLED = "true"

    # Save configuration if file specified
    if ($config_file | is-not-empty) {
        try {
            $config | to json | save $config_file
            print $"📝 Telemetry configuration saved to ($config_file)"
        } catch { |err|
            print $"⚠️ Failed to save configuration: ($err | get msg)"
        }
    }

    print "🔧 Telemetry initialized:"
    print $"  Endpoint: ($config.endpoint)"
    print $"  Format: ($config.format)"
    print $"  Health monitoring: ($config.health_monitoring)"

    return $config
}
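
# Illustrative setup (URL and config path are placeholders):
#   init-telemetry --endpoint "https://collector.example.internal/ingest" --enable-health --config-file "/etc/provisioning/telemetry.json"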