# Observability Collection Scripts for Nushell Infrastructure
# Secure collection of system metrics, logs, and telemetry data
# Collect comprehensive system metrics (CPU, memory, disk, network, processes).
# Each section is collected best-effort: a failure is recorded as an
# `{error: ...}` sub-record instead of aborting the whole collection.
export def collect-system-metrics []: nothing -> record {
    let timestamp = (date now)

    # BUG FIX: the original built an unused immutable `base_metrics` record and
    # then assigned into an undeclared `$metrics`; declare the record mutably.
    mut metrics = {
        timestamp: ($timestamp | format date "%Y-%m-%d %H:%M:%S")
        hostname: ($env.HOSTNAME? | default "unknown")
        collection_version: "1.0.0"
    }

    # Each section is computed as a `try`/`catch` EXPRESSION and inserted
    # afterwards: a `catch` handler is a closure and cannot assign to a
    # captured `mut` variable, so `try { $metrics = ... } catch { $metrics = ... }`
    # does not compile in Nushell.

    # CPU metrics: core count, model name, and 1/5/15-minute load averages.
    let cpu_metrics = try {
        let cpu_info = (cat /proc/cpuinfo | lines | where $it =~ "processor|model name|cpu MHz" | parse "{key}: {value}")
        let cpu_count = ($cpu_info | where key == "processor" | length)
        let cpu_model = ($cpu_info | where key =~ "model name" | first | get value)

        # /proc/loadavg begins with the three load-average figures.
        let loadavg = (cat /proc/loadavg | split row " ")

        {
            cores: $cpu_count
            model: $cpu_model
            load_1m: ($loadavg | get 0 | into float)
            load_5m: ($loadavg | get 1 | into float)
            load_15m: ($loadavg | get 2 | into float)
        }
    } catch {
        {error: "Failed to collect CPU metrics"}
    }
    # BUG FIX: cpu_metrics was computed but never merged into the result.
    $metrics = ($metrics | insert cpu $cpu_metrics)

    # Memory metrics from /proc/meminfo (values reported in kB).
    let memory_metrics = try {
        let meminfo = (cat /proc/meminfo | lines | parse "{key}: {value} kB")
        let total_mem = ($meminfo | where key == "MemTotal" | first | get value | into int)
        let free_mem = ($meminfo | where key == "MemFree" | first | get value | into int)
        let available_mem = ($meminfo | where key == "MemAvailable" | first | get value | into int)
        let buffers = ($meminfo | where key == "Buffers" | first | get value | into int)
        let cached = ($meminfo | where key == "Cached" | first | get value | into int)

        {
            total_kb: $total_mem
            free_kb: $free_mem
            available_kb: $available_mem
            buffers_kb: $buffers
            cached_kb: $cached
            used_kb: ($total_mem - $free_mem)
            usage_percent: (($total_mem - $free_mem) / $total_mem * 100 | math round --precision 2)
        }
    } catch {
        {error: "Failed to collect memory metrics"}
    }
    $metrics = ($metrics | insert memory $memory_metrics)

    # Disk metrics via `df -k` (one row per mounted filesystem).
    let disk_metrics = try {
        df -k | lines | skip 1
            | parse "{filesystem} {total} {used} {available} {percent} {mount}"
            | select filesystem total used available percent mount
    } catch {
        {error: "Failed to collect disk metrics"}
    }
    $metrics = ($metrics | insert disk $disk_metrics)

    # Network metrics: per-interface byte/packet counters from /proc/net/dev.
    let network_metrics = try {
        cat /proc/net/dev | lines | skip 2
            | parse "{interface}: {rx_bytes} {rx_packets} {rx_errs} {rx_drop} {rx_fifo} {rx_frame} {rx_compressed} {rx_multicast} {tx_bytes} {tx_packets} {tx_errs} {tx_drop} {tx_fifo} {tx_colls} {tx_carrier} {tx_compressed}"
            | select interface rx_bytes tx_bytes rx_packets tx_packets
    } catch {
        {error: "Failed to collect network metrics"}
    }
    $metrics = ($metrics | insert network $network_metrics)

    # Process count: numeric directories under /proc are live PIDs.
    # BUG FIX: `ls /proc` may report full paths in `name`, so match on the
    # basename rather than the raw name column.
    let process_metrics = try {
        {total: (ls /proc | where {|row| ($row.name | path basename) =~ '^[0-9]+$'} | length)}
    } catch {
        {error: "Failed to collect process metrics"}
    }
    $metrics = ($metrics | insert processes $process_metrics)

    return $metrics
}
# Collect container metrics (if running in a containerized environment).
# Probes docker, podman, and kubectl; each probe is best-effort and silently
# skipped when the tool is missing or fails.
export def collect-container-metrics []: nothing -> record {
    let timestamp = (date now)

    mut metrics = {
        timestamp: ($timestamp | format date "%Y-%m-%d %H:%M:%S")
        container_runtime: "unknown"
    }

    # Check for Docker
    try {
        if (which docker | is-not-empty) {
            let containers = (docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" | lines | skip 1)
            $metrics = ($metrics | insert docker {
                available: true
                containers: ($containers | length)
                running: ($containers | where $it =~ "Up" | length)
            })
            # BUG FIX: `insert` errors on an existing key, and
            # `container_runtime` is pre-seeded with "unknown" — use `update`.
            $metrics = ($metrics | update container_runtime "docker")
        }
    } catch {}

    # Check for Podman
    try {
        if (which podman | is-not-empty) {
            let containers = (podman ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" | lines | skip 1)
            $metrics = ($metrics | insert podman {
                available: true
                containers: ($containers | length)
                running: ($containers | where $it =~ "Up" | length)
            })
            # Only claim the runtime slot if docker did not already take it.
            if ($metrics.container_runtime == "unknown") {
                # BUG FIX: same `insert`-on-existing-key error as above.
                $metrics = ($metrics | update container_runtime "podman")
            }
        }
    } catch {}

    # Check for Kubernetes
    try {
        if (which kubectl | is-not-empty) {
            let pods = (kubectl get pods --all-namespaces --no-headers | lines)
            $metrics = ($metrics | insert kubernetes {
                available: true
                pods_total: ($pods | length)
                pods_running: ($pods | where $it =~ "Running" | length)
                pods_pending: ($pods | where $it =~ "Pending" | length)
                pods_failed: ($pods | where $it =~ "Failed" | length)
            })
        }
    } catch {}

    return $metrics
}
# Collect application logs with filtering from the systemd journal, a docker
# container (when --service is given), and common /var/log files.
# Returns at most --lines records; every source is best-effort.
export def collect-logs [
    --service(-s): string       # Specific service to collect logs from
    --since: string = "1h"      # Time range (1h, 30m, etc.)
    --level: string = "error"   # Log level filter
    --lines(-l): int = 100      # Maximum lines to collect
]: nothing -> list<record> {
    mut logs = []

    # Systemd journal logs
    try {
        mut journalctl_cmd = ["journalctl", "--output=json", "--no-pager", $"--since=($since)"]

        if ($service | is-not-empty) {
            $journalctl_cmd = ($journalctl_cmd | append ["-u", $service])
        }

        if (($level | is-not-empty) and ($level != "all")) {
            $journalctl_cmd = ($journalctl_cmd | append ["-p", $level])
        }

        if ($lines | is-not-empty) {
            $journalctl_cmd = ($journalctl_cmd | append ["-n", ($lines | into string)])
        }

        # BUG FIX: the original spread only element 1 (`...$journalctl_cmd.1`),
        # silently dropping every other flag; spread the whole argument tail.
        let journal_logs = (^($journalctl_cmd | first) ...($journalctl_cmd | skip 1)
            | lines
            | where $it != ""
            | each { |line| $line | from json })
        $logs = ($logs | append $journal_logs)
    } catch {}

    # Container logs (Docker)
    try {
        # BUG FIX: `which docker | is-not-empty and (...)` piped the whole
        # `and` expression into the pipeline; parenthesize each operand.
        if ((which docker | is-not-empty) and ($service | is-not-empty)) {
            # BUG FIX: `2>/dev/null` was passed to docker as a literal
            # argument; Nushell stderr redirection is `err>`.
            let container_logs = (docker logs --since $since --tail $lines $service err> /dev/null
                | lines
                | enumerate
                | each { |item|
                    {
                        timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
                        source: "docker"
                        container: $service
                        message: $item.item
                        line_number: $item.index
                    }
                })
            $logs = ($logs | append $container_logs)
        }
    } catch {}

    # File-based logs (common locations)
    let log_files = [
        "/var/log/syslog"
        "/var/log/messages"
        "/var/log/kern.log"
        "/var/log/auth.log"
    ]

    for log_file in $log_files {
        try {
            if ($log_file | path exists) {
                let file_logs = (tail -n $lines $log_file
                    | lines
                    | enumerate
                    | each { |item|
                        {
                            timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
                            source: "file"
                            file: $log_file
                            message: $item.item
                            line_number: $item.index
                        }
                    })
                $logs = ($logs | append $file_logs)
            }
        } catch {}
    }

    # ROBUSTNESS: `first n` can error when fewer than n rows exist, so clamp
    # the request to the actual number of collected records.
    return ($logs | first ([$lines, ($logs | length)] | math min))
}
# Process and analyze log patterns: error-keyword counts, per-source
# distribution, and a count of records from the last hour.
# Expects records shaped like those produced by collect-logs
# (string `timestamp`, `source`, `message`).
export def analyze-logs [logs: list<record>]: nothing -> record {
    let total_logs = ($logs | length)

    if $total_logs == 0 {
        return {
            total: 0
            analysis: "No logs to analyze"
        }
    }

    # Error pattern analysis: case-insensitive match of each keyword.
    let error_patterns = ["error", "failed", "exception", "critical", "fatal"]
    mut error_counts = {}

    for pattern in $error_patterns {
        # BUG FIX: `$"(?i)($pattern)"` tried to interpolate `?i` as an
        # expression and failed to parse; build the regex by concatenation.
        let count = ($logs | where message =~ ("(?i)" + $pattern) | length)
        $error_counts = ($error_counts | insert $pattern $count)
    }

    # Source distribution
    let source_dist = ($logs | group-by source | transpose key value | each { |item|
        {source: $item.key, count: ($item.value | length)}
    })

    # Time-based analysis (last hour).
    # BUG FIX: timestamps are stored as formatted strings, so comparing them
    # directly against a datetime failed; parse each one first and skip rows
    # whose timestamp cannot be parsed.
    let recent_logs = ($logs | where {|row|
        (try { ($row.timestamp | into datetime) > ((date now) - 1hr) } catch { false })
    })

    return {
        total: $total_logs
        recent_count: ($recent_logs | length)
        error_patterns: $error_counts
        source_distribution: $source_dist
        analysis_timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
    }
}
# Export metrics in various formats.
# Renders the record as json (default), yaml, or csv; writes to --output when
# given (printing a confirmation), otherwise returns the rendered text.
export def export-metrics [
    metrics: record
    --format(-f): string = "json"   # json, yaml, csv
    --output(-o): string            # Output file path
]: nothing -> any {
    let rendered = if $format == "yaml" {
        $metrics | to yaml
    } else if $format == "csv" {
        # Flatten metrics for CSV export
        $metrics | flatten | transpose key value | to csv
    } else {
        # Any other value (including the default) falls back to JSON.
        $metrics | to json
    }

    if ($output | is-not-empty) {
        $rendered | save $output
        print $"Metrics exported to ($output)"
    } else {
        $rendered
    }
}
# Health monitoring loop: collects system and container metrics every
# --interval seconds for --duration seconds. With --output, appends one JSON
# document per line (JSON Lines); otherwise prints a one-line status summary.
export def health-monitor [
    --interval(-i): int = 60    # Collection interval in seconds
    --duration(-d): int = 300   # Total monitoring duration in seconds
    --output(-o): string        # Output file for continuous monitoring
]: nothing -> nothing {
    let start_time = (date now)
    let end_time = ($start_time + ($duration * 1sec))

    print $"🔍 Starting health monitoring for ($duration) seconds with ($interval)s intervals"
    print $"📊 Collecting system and container metrics"

    while (date now) < $end_time {
        let current_time = (date now)
        let system_metrics = (collect-system-metrics)
        let container_metrics = (collect-container-metrics)

        let combined_metrics = {
            collection_time: ($current_time | format date "%Y-%m-%d %H:%M:%S")
            system: $system_metrics
            containers: $container_metrics
        }

        if ($output | is-not-empty) {
            # BUG FIX: appending pretty-printed JSON documents back-to-back
            # produced an unparseable file; emit one compact JSON document per
            # line (JSON Lines) instead.
            (($combined_metrics | to json --raw) + "\n") | save --append $output
        } else {
            # `?` keeps this from failing when a metrics section errored out.
            # BUG FIX: the 1-minute load average is not a percentage, so it is
            # no longer suffixed with "%".
            print $"⏰ ($current_time | format date '%H:%M:%S') - Load: ($system_metrics.cpu.load_1m?) | Memory: ($system_metrics.memory.usage_percent?)%"
        }

        sleep ($interval * 1sec)
    }

    print "✅ Health monitoring completed"
}
# Quick system status check.
# Gathers a single snapshot of system and container metrics, derives an
# overall health verdict ("healthy" / "warning" / "critical") plus a list of
# alert strings, and returns everything in one record.
export def status-check []: nothing -> record {
    let system = (collect-system-metrics)
    let containers = (collect-container-metrics)

    # Start optimistic; each check below may downgrade the verdict.
    mut health_status = "healthy"
    mut alerts = []

    # CPU check: a 1-minute load average above 4.0 flags a warning.
    let load_1m = ($system.cpu.load_1m? | default 0)
    if $load_1m > 4.0 {
        $health_status = "warning"
        $alerts = ($alerts | append "High CPU load")
    }

    # Memory check: more than 90% usage is treated as critical.
    let mem_pct = ($system.memory.usage_percent? | default 0)
    if $mem_pct > 90 {
        $health_status = "critical"
        $alerts = ($alerts | append "High memory usage")
    }

    # Disk check: any filesystem above 90% raises a warning. Wrapped in `try`
    # because disk collection may have produced an error record instead of a
    # table.
    try {
        let full_mounts = ($system.disk | where {|row| ($row.percent | str replace "%" "" | into float) > 90})
        if ($full_mounts | length) > 0 {
            $health_status = "warning"
            $alerts = ($alerts | append "High disk usage")
        }
    } catch {}

    return {
        status: $health_status
        alerts: $alerts
        metrics: {
            system: $system
            containers: $containers
        }
        timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
    }
}