# provisioning/taskservs/nushell/observability/collect.nu
# Observability Collection Scripts for Nushell Infrastructure
# Secure collection of system metrics, logs, and telemetry data
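#
# Illustrative usage (a sketch; assumes this file is importable from the current
# directory as a module):
#   use collect.nu *
#   collect-system-metrics | to json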
# Collect comprehensive system metrics
export def collect-system-metrics []: nothing -> record {
    let timestamp = (date now)
    mut metrics = {
        timestamp: ($timestamp | format date "%Y-%m-%d %H:%M:%S")
        hostname: ($env.HOSTNAME? | default "unknown")
        collection_version: "1.0.0"
    }

    # CPU metrics
    let cpu_metrics = try {
        # /proc/cpuinfo keys are tab-padded, so trim them after parsing
        let cpu_info = (cat /proc/cpuinfo | lines | where $it =~ "processor|model name|cpu MHz" | parse "{key}:{value}" | update key { str trim } | update value { str trim })
        let cpu_count = ($cpu_info | where key == "processor" | length)
        let cpu_model = ($cpu_info | where key == "model name" | first | get value)

        # Load average
        let loadavg = (cat /proc/loadavg | split row " ")
        {
            cores: $cpu_count
            model: $cpu_model
            load_1m: ($loadavg | get 0 | into float)
            load_5m: ($loadavg | get 1 | into float)
            load_15m: ($loadavg | get 2 | into float)
        }
    } catch {
        {error: "Failed to collect CPU metrics"}
    }
    $metrics = ($metrics | insert cpu $cpu_metrics)

    # Memory metrics
    try {
        let meminfo = (cat /proc/meminfo | lines | parse "{key}: {value} kB" | update value { str trim })
        let total_mem = ($meminfo | where key == "MemTotal" | first | get value | into int)
        let free_mem = ($meminfo | where key == "MemFree" | first | get value | into int)
        let available_mem = ($meminfo | where key == "MemAvailable" | first | get value | into int)
        let buffers = ($meminfo | where key == "Buffers" | first | get value | into int)
        let cached = ($meminfo | where key == "Cached" | first | get value | into int)

        $metrics = ($metrics | insert memory {
            total_kb: $total_mem
            free_kb: $free_mem
            available_kb: $available_mem
            buffers_kb: $buffers
            cached_kb: $cached
            used_kb: ($total_mem - $free_mem)
            usage_percent: (($total_mem - $free_mem) / $total_mem * 100 | math round --precision 2)
        })
    } catch {
        $metrics = ($metrics | insert memory {error: "Failed to collect memory metrics"})
    }

    # Disk metrics
    try {
        # df output is whitespace-aligned, so parse it with a regex instead of fixed single spaces
        let disk_usage = (df -k | lines | skip 1 | parse --regex '(?P<filesystem>\S+)\s+(?P<total>\d+)\s+(?P<used>\d+)\s+(?P<available>\d+)\s+(?P<percent>\S+)\s+(?P<mount>.+)')
        $metrics = ($metrics | insert disk ($disk_usage | select filesystem total used available percent mount))
    } catch {
        $metrics = ($metrics | insert disk {error: "Failed to collect disk metrics"})
    }

    # Network metrics (basic)
    try {
        # /proc/net/dev columns are whitespace-separated; split each row and pick the fields we need
        let network_stats = (cat /proc/net/dev | lines | skip 2 | each { |line|
            let fields = ($line | str trim | split row --regex '[\s:]+')
            {
                interface: ($fields | get 0)
                rx_bytes: ($fields | get 1)
                tx_bytes: ($fields | get 9)
                rx_packets: ($fields | get 2)
                tx_packets: ($fields | get 10)
            }
        })
        $metrics = ($metrics | insert network $network_stats)
    } catch {
        $metrics = ($metrics | insert network {error: "Failed to collect network metrics"})
    }

    # Process count (numeric entries under /proc are PIDs)
    try {
        let process_count = (ls /proc | where {|entry| ($entry.name | path basename) =~ '^[0-9]+$'} | length)
        $metrics = ($metrics | insert processes {
            total: $process_count
        })
    } catch {
        $metrics = ($metrics | insert processes {error: "Failed to collect process metrics"})
    }

    return $metrics
}
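
# Illustrative usage (a sketch, not part of the module's API; assumes a Linux
# host with /proc mounted):
#   collect-system-metrics | get memory.usage_percent
#   collect-system-metrics | select cpu memory | to yaml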
# Collect container metrics (if running in containerized environment)
export def collect-container-metrics []: nothing -> record {
    let timestamp = (date now)
    mut metrics = {
        timestamp: ($timestamp | format date "%Y-%m-%d %H:%M:%S")
        container_runtime: "unknown"
    }

    # Check for Docker
    try {
        if (which docker | is-not-empty) {
            let containers = (docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" | lines | skip 1)
            $metrics = ($metrics | insert docker {
                available: true
                containers: ($containers | length)
                running: ($containers | where $it =~ "Up" | length)
            })
            $metrics = ($metrics | update container_runtime "docker")
        }
    } catch {}

    # Check for Podman
    try {
        if (which podman | is-not-empty) {
            let containers = (podman ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" | lines | skip 1)
            $metrics = ($metrics | insert podman {
                available: true
                containers: ($containers | length)
                running: ($containers | where $it =~ "Up" | length)
            })
            if ($metrics.container_runtime == "unknown") {
                $metrics = ($metrics | update container_runtime "podman")
            }
        }
    } catch {}

    # Check for Kubernetes
    try {
        if (which kubectl | is-not-empty) {
            let pods = (kubectl get pods --all-namespaces --no-headers | lines)
            $metrics = ($metrics | insert kubernetes {
                available: true
                pods_total: ($pods | length)
                pods_running: ($pods | where $it =~ "Running" | length)
                pods_pending: ($pods | where $it =~ "Pending" | length)
                pods_failed: ($pods | where $it =~ "Failed" | length)
            })
        }
    } catch {}

    return $metrics
}
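
# Illustrative usage (only runtimes found on PATH are probed; the docker and
# kubernetes fields exist only when detected):
#   collect-container-metrics | get container_runtime
#   collect-container-metrics | get docker?.running?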
# Collect application logs with filtering
export def collect-logs [
    --service(-s): string       # Specific service to collect logs from
    --since: string = "1h"      # Time range (1h, 30m, etc.)
    --level: string = "error"   # Log level filter
    --lines(-l): int = 100      # Maximum lines to collect
]: nothing -> list<record> {
    mut logs = []

    # Systemd journal logs
    try {
        mut journal_args = ["--output=json", "--no-pager", $"--since=($since)"]
        if ($service | is-not-empty) {
            $journal_args = ($journal_args | append ["-u", $service])
        }
        if (($level | is-not-empty) and ($level != "all")) {
            $journal_args = ($journal_args | append ["-p", $level])
        }
        if $lines > 0 {
            $journal_args = ($journal_args | append ["-n", ($lines | into string)])
        }

        let journal_logs = (^journalctl ...$journal_args | lines | where $it != "" | each { |line| $line | from json })
        $logs = ($logs | append $journal_logs)
    } catch {}

    # Container logs (Docker)
    try {
        if ((which docker | is-not-empty) and ($service | is-not-empty)) {
            let container_logs = (docker logs --since $since --tail $lines $service err> /dev/null | lines | enumerate | each { |item|
                {
                    timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
                    source: "docker"
                    container: $service
                    message: $item.item
                    line_number: $item.index
                }
            })
            $logs = ($logs | append $container_logs)
        }
    } catch {}

    # File-based logs (common locations)
    let log_files = [
        "/var/log/syslog"
        "/var/log/messages"
        "/var/log/kern.log"
        "/var/log/auth.log"
    ]

    for log_file in $log_files {
        try {
            if ($log_file | path exists) {
                let file_logs = (^tail -n $lines $log_file | lines | enumerate | each { |item|
                    {
                        timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
                        source: "file"
                        file: $log_file
                        message: $item.item
                        line_number: $item.index
                    }
                })
                $logs = ($logs | append $file_logs)
            }
        } catch {}
    }

    return ($logs | first $lines)
}
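
# Illustrative usage ("nginx" is just a placeholder service name):
#   collect-logs --since 30m --level warning --lines 200
#   collect-logs --service nginx --since 1h | where source == "docker"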
# Process and analyze log patterns
export def analyze-logs [logs: list<record>]: nothing -> record {
    let total_logs = ($logs | length)

    if $total_logs == 0 {
        return {
            total: 0
            analysis: "No logs to analyze"
        }
    }

    # Error pattern analysis (not every source provides a message field, so default to "")
    let error_patterns = ["error", "failed", "exception", "critical", "fatal"]
    mut error_counts = {}

    for pattern in $error_patterns {
        let count = ($logs | where {|l| ($l.message? | default "") =~ $"(?i)($pattern)"} | length)
        $error_counts = ($error_counts | insert $pattern $count)
    }

    # Source distribution
    let source_dist = ($logs | group-by {|l| $l.source? | default "unknown"} | transpose key value | each { |item|
        {source: $item.key, count: ($item.value | length)}
    })

    # Time-based analysis (last hour); timestamps are stored as strings, so parse them first
    let recent_logs = ($logs | where {|l|
        let ts = (try { $l.timestamp? | into datetime } catch { null })
        ($ts != null) and ($ts > ((date now) - 1hr))
    })

    return {
        total: $total_logs
        recent_count: ($recent_logs | length)
        error_patterns: $error_counts
        source_distribution: $source_dist
        analysis_timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
    }
}
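
# Illustrative usage (feeds the collector above into the analyzer):
#   let recent = (collect-logs --since 2h --level all)
#   analyze-logs $recent | get error_patterns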
# Export metrics in various formats
export def export-metrics [
    metrics: record
    --format(-f): string = "json"  # json, yaml, csv
    --output(-o): string           # Output file path
]: nothing -> any {
    let formatted_data = match $format {
        "yaml" => ($metrics | to yaml)
        "csv" => {
            # Flatten nested metrics into key/value rows for CSV export
            let flattened = ($metrics | flatten | transpose key value)
            $flattened | to csv
        }
        _ => ($metrics | to json)
    }

    if ($output | is-not-empty) {
        $formatted_data | save $output
        print $"Metrics exported to ($output)"
    } else {
        $formatted_data
    }
}
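
# Illustrative usage (the output path is just an example):
#   export-metrics (collect-system-metrics) --format yaml --output /tmp/metrics.yaml
#   export-metrics (collect-system-metrics) --format csv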
# Health monitoring function
export def health-monitor [
    --interval(-i): int = 60   # Collection interval in seconds
    --duration(-d): int = 300  # Total monitoring duration in seconds
    --output(-o): string       # Output file for continuous monitoring
]: nothing -> nothing {
    let start_time = (date now)
    let end_time = ($start_time + ($duration * 1sec))

    print $"🔍 Starting health monitoring for ($duration) seconds with ($interval)s intervals"
    print "📊 Collecting system and container metrics"

    while (date now) < $end_time {
        let current_time = (date now)
        let system_metrics = (collect-system-metrics)
        let container_metrics = (collect-container-metrics)

        let combined_metrics = {
            collection_time: ($current_time | format date "%Y-%m-%d %H:%M:%S")
            system: $system_metrics
            containers: $container_metrics
        }

        if ($output | is-not-empty) {
            # Append one compact JSON object per line (JSON Lines) so the file stays parseable
            (($combined_metrics | to json --raw) + "\n") | save --append $output
        } else {
            let time_str = ($current_time | format date "%H:%M:%S")
            print $"⏰ ($time_str) - Load 1m: ($system_metrics.cpu.load_1m?) | Memory: ($system_metrics.memory.usage_percent?)%"
        }

        sleep ($interval * 1sec)
    }

    print "✅ Health monitoring completed"
}
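
# Illustrative usage (writes one JSON object per line when --output is set):
#   health-monitor --interval 30 --duration 600 --output /tmp/health.jsonl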
# Quick system status check
export def status-check []: nothing -> record {
    let system = (collect-system-metrics)
    let containers = (collect-container-metrics)

    # Determine overall health
    mut health_status = "healthy"
    mut alerts = []

    # CPU load check
    if (($system.cpu.load_1m? | default 0) > 4.0) {
        $health_status = "warning"
        $alerts = ($alerts | append "High CPU load")
    }

    # Memory usage check
    if (($system.memory.usage_percent? | default 0) > 90) {
        $health_status = "critical"
        $alerts = ($alerts | append "High memory usage")
    }

    # Disk usage check (never downgrade an existing critical status)
    try {
        let high_disk = ($system.disk | where {|x| ($x.percent | str replace "%" "" | into float) > 90})
        if ($high_disk | length) > 0 {
            if $health_status != "critical" {
                $health_status = "warning"
            }
            $alerts = ($alerts | append "High disk usage")
        }
    } catch {}

    return {
        status: $health_status
        alerts: $alerts
        metrics: {
            system: $system
            containers: $containers
        }
        timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
    }
}
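
# Illustrative usage (a simple gate on the aggregated status):
#   let check = (status-check)
#   if $check.status != "healthy" { print $"Alerts: ($check.alerts | str join ', ')" }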