# Observability Collection Scripts for Nushell Infrastructure
# Secure collection of system metrics, logs, and telemetry data

# Collect comprehensive system metrics
#
# Returns one record with the keys: timestamp, hostname, collection_version,
# cpu, memory, disk, network, processes.
# Every section is collected independently. A section that cannot be read is
# reported as `{error: "..."}` so one failure never aborts the whole collection.
export def collect-system-metrics []: nothing -> record {
    let timestamp = (date now)

    let base_metrics = {
        timestamp: ($timestamp | format date "%Y-%m-%d %H:%M:%S")
        hostname: ($env.HOSTNAME? | default "unknown")
        collection_version: "1.0.0"
    }

    # CPU metrics
    let cpu_metrics = try {
        # /proc/cpuinfo pads its keys with tabs ("processor\t: 0"), so split on
        # the colon with a regex that strips the surrounding whitespace.
        let cpu_info = (
            ^cat /proc/cpuinfo
            | lines
            | parse --regex '^(?P<key>[^:]*?)\s*:\s*(?P<value>.*)$'
        )
        let cpu_count = ($cpu_info | where key == "processor" | length)

        # Not every architecture exposes a "model name" line.
        let model_rows = ($cpu_info | where key == "model name")
        let cpu_model = if ($model_rows | is-empty) {
            "unknown"
        } else {
            $model_rows | first | get value
        }

        # Load average: the first three fields are the 1/5/15 minute values
        let loadavg = (^cat /proc/loadavg | str trim | split row " ")

        {
            cores: $cpu_count
            model: $cpu_model
            load_1m: ($loadavg | get 0 | into float)
            load_5m: ($loadavg | get 1 | into float)
            load_15m: ($loadavg | get 2 | into float)
        }
    } catch {
        {error: "Failed to collect CPU metrics"}
    }

    # Memory metrics (values are in kB)
    let memory_metrics = try {
        let meminfo = (
            ^cat /proc/meminfo
            | lines
            | parse --regex '^(?P<key>[^:]+):\s*(?P<value>\d+)'
        )
        # Look up one /proc/meminfo entry by name and convert it to an integer.
        let read_kb = {|name| $meminfo | where key == $name | first | get value | into int }

        let total_mem = (do $read_kb "MemTotal")
        let free_mem = (do $read_kb "MemFree")
        let available_mem = (do $read_kb "MemAvailable")
        let buffers = (do $read_kb "Buffers")
        let cached = (do $read_kb "Cached")
        let used_mem = ($total_mem - $free_mem)

        {
            total_kb: $total_mem
            free_kb: $free_mem
            available_kb: $available_mem
            buffers_kb: $buffers
            cached_kb: $cached
            used_kb: $used_mem
            usage_percent: (($used_mem / $total_mem * 100) | math round --precision 2)
        }
    } catch {
        {error: "Failed to collect memory metrics"}
    }

    # Disk metrics
    let disk_metrics = try {
        # -P keeps every filesystem on a single line; columns are separated by
        # a variable number of spaces, hence the \s+ based regex.
        (
            ^df -kP
            | lines
            | skip 1
            | parse --regex '^(?P<filesystem>.*?)\s+(?P<total>\d+)\s+(?P<used>\d+)\s+(?P<available>\d+)\s+(?P<percent>\S+)\s+(?P<mount>.*)$'
        )
    } catch {
        {error: "Failed to collect disk metrics"}
    }

    # Network metrics (basic)
    let network_metrics = try {
        # /proc/net/dev: "iface: <8 rx counters> <8 tx counters>". Only the
        # byte/packet counters are kept; the six rx counters in between are skipped.
        (
            ^cat /proc/net/dev
            | lines
            | skip 2
            | parse --regex '^\s*(?P<interface>[^:\s]+):\s*(?P<rx_bytes>\d+)\s+(?P<rx_packets>\d+)(?:\s+\d+){6}\s+(?P<tx_bytes>\d+)\s+(?P<tx_packets>\d+)'
            | select interface rx_bytes tx_bytes rx_packets tx_packets
        )
    } catch {
        {error: "Failed to collect network metrics"}
    }

    # Process count
    let process_metrics = try {
        # `ls /proc` yields full paths, so match the numeric PID on the basename.
        let process_count = (ls /proc | get name | path basename | where $it =~ '^[0-9]+$' | length)
        {total: $process_count}
    } catch {
        {error: "Failed to collect process metrics"}
    }

    $base_metrics | merge {
        cpu: $cpu_metrics
        memory: $memory_metrics
        disk: $disk_metrics
        network: $network_metrics
        processes: $process_metrics
    }
}

# Collect container metrics (if running in containerized environment)
#
# Probes for the `docker`, `podman` and `kubectl` executables. Each tool that is
# found and answers successfully contributes a summary record under its own key
# (docker, podman, kubernetes). `container_runtime` is "docker" when the Docker
# probe succeeded, otherwise "podman" when the Podman probe succeeded,
# otherwise "unknown". A probe that fails is silently omitted.
export def collect-container-metrics []: nothing -> record {
    let timestamp = (date now)

    # Check for Docker (null when unavailable or when the probe failed)
    let docker = try {
        if (which docker | is-not-empty) {
            # "table" format prints a header row, which `skip 1` drops.
            let containers = (^docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" | lines | skip 1)
            {
                available: true
                containers: ($containers | length)
                running: ($containers | where $it =~ "Up" | length)
            }
        } else {
            null
        }
    } catch {
        null
    }

    # Check for Podman
    let podman = try {
        if (which podman | is-not-empty) {
            let containers = (^podman ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" | lines | skip 1)
            {
                available: true
                containers: ($containers | length)
                running: ($containers | where $it =~ "Up" | length)
            }
        } else {
            null
        }
    } catch {
        null
    }

    # Check for Kubernetes
    let kubernetes = try {
        if (which kubectl | is-not-empty) {
            let pods = (^kubectl get pods --all-namespaces --no-headers | lines)
            {
                available: true
                pods_total: ($pods | length)
                pods_running: ($pods | where $it =~ "Running" | length)
                pods_pending: ($pods | where $it =~ "Pending" | length)
                pods_failed: ($pods | where $it =~ "Failed" | length)
            }
        } else {
            null
        }
    } catch {
        null
    }

    # Docker takes precedence over Podman when both are present.
    let runtime = if $docker != null {
        "docker"
    } else if $podman != null {
        "podman"
    } else {
        "unknown"
    }

    mut metrics = {
        timestamp: ($timestamp | format date "%Y-%m-%d %H:%M:%S")
        container_runtime: $runtime
    }

    # `upsert` rather than `insert`: it never fails on an already-present column.
    if $docker != null {
        $metrics = ($metrics | upsert docker $docker)
    }
    if $podman != null {
        $metrics = ($metrics | upsert podman $podman)
    }
    if $kubernetes != null {
        $metrics = ($metrics | upsert kubernetes $kubernetes)
    }

    return $metrics
}

# Collect application logs with filtering
#
# Gathers log entries from three sources, in this order:
#   1. the systemd journal (one record per JSON entry emitted by journalctl),
#   2. `docker logs` for `--service`, when docker exists and a service is given,
#   3. the tail of a fixed list of files under /var/log.
# Sources that are missing or fail are skipped. The combined list is truncated
# to at most `--lines` entries before it is returned.
export def collect-logs [
    --service(-s): string      # Specific service to collect logs from
    --since: string = "1h"     # Time range (1h, 30m, etc.)
    --level: string = "error"  # Log level filter
    --lines(-l): int = 100     # Maximum lines to collect
]: nothing -> list<record> {
    mut logs = []
    let max_lines = ($lines | into string)

    # Systemd journal logs
    let journal_logs = try {
        # journalctl uses syslog priority names ("err", not "error"); translate
        # the common long spellings and pass anything else through unchanged.
        let priority = match $level {
            "error" => "err"
            "critical" => "crit"
            "warn" => "warning"
            "emergency" => "emerg"
            _ => $level
        }

        # journalctl wants relative times with a leading "-" (e.g. "-1h"),
        # while docker accepts the bare "1h" form. NOTE(review): confirm
        # against systemd.time(7) for the systemd versions in use.
        let journal_since = if ($since =~ '^\d+\s*[A-Za-z]+$') {
            $"-($since)"
        } else {
            $since
        }

        let journal_args = (
            ["--output=json" "--no-pager" $"--since=($journal_since)"]
            | append (if ($service | is-not-empty) { ["-u" $service] } else { [] })
            | append (if (($level | is-not-empty) and ($level != "all")) { ["-p" $priority] } else { [] })
            | append ["-n" $max_lines]
        )

        # Spread the whole argument list, one argument per element.
        ^journalctl ...$journal_args | lines | where $it != "" | each { |line| $line | from json }
    } catch {
        []
    }
    $logs = ($logs | append $journal_logs)

    # Container logs (Docker)
    let container_logs = try {
        if (which docker | is-not-empty) and ($service | is-not-empty) {
            # `complete` captures stdout and stderr separately; only stdout is
            # kept, which discards the stderr stream.
            (
                do { ^docker logs --since $since --tail $max_lines $service }
                | complete
                | get stdout
                | lines
                | enumerate
                | each { |item|
                    {
                        timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
                        source: "docker"
                        container: $service
                        message: $item.item
                        line_number: $item.index
                    }
                }
            )
        } else {
            []
        }
    } catch {
        []
    }
    $logs = ($logs | append $container_logs)

    # File-based logs (common locations)
    let log_files = [
        "/var/log/syslog"
        "/var/log/messages"
        "/var/log/kern.log"
        "/var/log/auth.log"
    ]

    for log_file in $log_files {
        let file_logs = try {
            if ($log_file | path exists) {
                # The timestamp is the collection time, not the time of the log line.
                ^tail -n $max_lines $log_file | lines | enumerate | each { |item|
                    {
                        timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
                        source: "file"
                        file: $log_file
                        message: $item.item
                        line_number: $item.index
                    }
                }
            } else {
                []
            }
        } catch {
            []
        }
        $logs = ($logs | append $file_logs)
    }

    return ($logs | first $lines)
}

# Process and analyze log patterns
#
# Parameters:
#   logs - log records, e.g. the output of `collect-logs`. The fields used are
#          `message` (or journald's `MESSAGE`), `source` and `timestamp`; each
#          one is optional on any individual record.
#
# Returns `{total: 0, analysis: "No logs to analyze"}` for an empty list,
# otherwise a record with:
#   total               - number of input records
#   recent_count        - records whose timestamp falls within the last hour
#   error_patterns      - case-insensitive match count per keyword
#   source_distribution - list of {source, count}; records without a source
#                         are counted under "unknown"
#   analysis_timestamp  - when the analysis ran
export def analyze-logs [logs: list<record>]: nothing -> record {
    let total_logs = ($logs | length)

    if $total_logs == 0 {
        return {
            total: 0
            analysis: "No logs to analyze"
        }
    }

    # Normalise message text once. Journald JSON entries carry `MESSAGE`
    # rather than `message`; non-string payloads are ignored for matching.
    let messages = ($logs | each { |entry|
        let raw = ($entry.message? | default ($entry.MESSAGE? | default ""))
        if ($raw | describe) == "string" { $raw } else { "" }
    })

    # Error pattern analysis
    let error_patterns = ["error", "failed", "exception", "critical", "fatal"]
    mut error_counts = {}

    for pattern in $error_patterns {
        # Build the regex by concatenation: inside $"..." the text "(?i)" would
        # be evaluated as a subexpression instead of being a regex flag.
        let regex = ('(?i)' + $pattern)
        let count = ($messages | where $it =~ $regex | length)
        $error_counts = ($error_counts | insert $pattern $count)
    }

    # Source distribution
    let source_dist = (
        $logs
        | each { |entry| $entry.source? | default "unknown" | into string }
        | uniq --count
        | rename source count
    )

    # Time-based analysis (last hour). Timestamps may be datetimes or strings
    # such as "%Y-%m-%d %H:%M:%S"; anything that cannot be parsed is not recent.
    # NOTE(review): strings without a zone are presumably read as local time.
    let cutoff = ((date now) - 1hr)
    let recent_logs = ($logs | where { |entry|
        let parsed = try { $entry.timestamp? | into datetime } catch { null }
        if $parsed == null { false } else { $parsed > $cutoff }
    })

    return {
        total: $total_logs
        recent_count: ($recent_logs | length)
        error_patterns: $error_counts
        source_distribution: $source_dist
        analysis_timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
    }
}

# Export metrics in various formats
#
# Serializes `metrics` as YAML, CSV or JSON; any unrecognised `--format` value
# falls back to JSON. With `--output` the text is saved to that path and a
# confirmation line is printed; without it the serialized text is returned.
export def export-metrics [
    metrics: record
    --format(-f): string = "json"  # json, yaml, csv
    --output(-o): string           # Output file path
]: nothing -> any {
    let serialized = if $format == "yaml" {
        $metrics | to yaml
    } else if $format == "csv" {
        # CSV needs a flat shape: flatten nested values, then emit key/value rows
        $metrics | flatten | transpose key value | to csv
    } else {
        $metrics | to json
    }

    # No destination given: hand the text back to the caller.
    if ($output | is-empty) {
        return $serialized
    }

    $serialized | save $output
    print $"Metrics exported to ($output)"
}

# Health monitoring function
#
# Repeatedly calls `collect-system-metrics` and `collect-container-metrics`
# until `--duration` seconds have elapsed, sleeping `--interval` seconds after
# every sample. With `--output`, each sample is appended to that file as one
# compact JSON document per line (JSON Lines); otherwise a one-line summary is
# printed per sample.
export def health-monitor [
    --interval(-i): int = 60  # Collection interval in seconds
    --duration(-d): int = 300 # Total monitoring duration in seconds
    --output(-o): string      # Output file for continuous monitoring
]: nothing -> nothing {
    let start_time = (date now)
    let end_time = ($start_time + ($duration * 1sec))

    print $"🔍 Starting health monitoring for ($duration) seconds with ($interval)s intervals"
    print $"📊 Collecting system and container metrics"

    while (date now) < $end_time {
        let current_time = (date now)
        let system_metrics = (collect-system-metrics)
        let container_metrics = (collect-container-metrics)

        let combined_metrics = {
            collection_time: ($current_time | format date "%Y-%m-%d %H:%M:%S")
            system: $system_metrics
            containers: $container_metrics
        }

        if ($output | is-not-empty) {
            # Raw (single-line) JSON plus a newline keeps the appended file
            # parseable; pretty-printed documents would run into each other.
            $"($combined_metrics | to json --raw)\n" | save --append $output
        } else {
            let clock = ($current_time | format date "%H:%M:%S")
            # Optional access on every level: a section may be missing or may
            # be an `{error: ...}` record, in which case the value prints empty.
            print $"⏰ ($clock) - CPU: ($system_metrics.cpu?.load_1m?)% | Memory: ($system_metrics.memory?.usage_percent?)%"
        }

        sleep ($interval * 1sec)
    }

    print "✅ Health monitoring completed"
}

# Quick system status check
#
# Collects system and container metrics once and derives an overall status:
#   "critical" - memory usage above 90 percent
#   "warning"  - 1-minute load above 4.0, or any filesystem above 90 percent
#   "healthy"  - none of the above
# The most severe condition wins. Returns a record with `status`, `alerts`
# (one message per triggered check), `metrics` (the raw `system` and
# `containers` records) and `timestamp`.
export def status-check []: nothing -> record {
    let system = (collect-system-metrics)
    let containers = (collect-container-metrics)

    # Optional access throughout: a section may be absent or an {error: ...}
    # record, in which case the check simply does not trigger.
    let cpu_high = (($system.cpu?.load_1m? | default 0) > 4.0)
    let memory_high = (($system.memory?.usage_percent? | default 0) > 90)

    # Disk usage check
    let disk_high = try {
        $system.disk | any { |row|
            # A row whose percentage cannot be parsed (e.g. "-") is skipped
            # instead of aborting the check for every other filesystem.
            let used_pct = try { $row.percent | str replace "%" "" | into float } catch { 0.0 }
            $used_pct > 90
        }
    } catch {
        false
    }

    mut alerts = []
    if $cpu_high {
        $alerts = ($alerts | append "High CPU load")
    }
    if $memory_high {
        $alerts = ($alerts | append "High memory usage")
    }
    if $disk_high {
        $alerts = ($alerts | append "High disk usage")
    }

    # Determine overall health; a later warning must never downgrade "critical".
    let health_status = if $memory_high {
        "critical"
    } else if ($cpu_high or $disk_high) {
        "warning"
    } else {
        "healthy"
    }

    return {
        status: $health_status
        alerts: $alerts
        metrics: {
            system: $system
            containers: $containers
        }
        timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
    }
}