# Observability Collection Scripts for Nushell Infrastructure
# Secure collection of system metrics, logs, and telemetry data
# Collect comprehensive system metrics (CPU, memory, disk, network, processes).
# Each section is collected best-effort: a failure is recorded as an
# `{error: ...}` sub-record instead of aborting the whole collection.
export def collect-system-metrics []: nothing -> record {
    let timestamp = (date now)

    # BUG FIX: the original built an unused immutable `base_metrics` record and
    # then assigned into an undeclared `$metrics`; declare the record mutably.
    mut metrics = {
        timestamp: ($timestamp | format date "%Y-%m-%d %H:%M:%S")
        hostname: ($env.HOSTNAME? | default "unknown")
        collection_version: "1.0.0"
    }

    # Each section is computed as a `try`/`catch` EXPRESSION and inserted
    # afterwards: a `catch` handler is a closure and cannot assign to a
    # captured `mut` variable, so `try { $metrics = ... } catch { $metrics = ... }`
    # does not compile in Nushell.

    # CPU metrics: core count, model name, and 1/5/15-minute load averages.
    let cpu_metrics = try {
        let cpu_info = (cat /proc/cpuinfo | lines | where $it =~ "processor|model name|cpu MHz" | parse "{key}: {value}")
        let cpu_count = ($cpu_info | where key == "processor" | length)
        let cpu_model = ($cpu_info | where key =~ "model name" | first | get value)

        # /proc/loadavg begins with the three load-average figures.
        let loadavg = (cat /proc/loadavg | split row " ")

        {
            cores: $cpu_count
            model: $cpu_model
            load_1m: ($loadavg | get 0 | into float)
            load_5m: ($loadavg | get 1 | into float)
            load_15m: ($loadavg | get 2 | into float)
        }
    } catch {
        {error: "Failed to collect CPU metrics"}
    }
    # BUG FIX: cpu_metrics was computed but never merged into the result.
    $metrics = ($metrics | insert cpu $cpu_metrics)

    # Memory metrics from /proc/meminfo (values reported in kB).
    let memory_metrics = try {
        let meminfo = (cat /proc/meminfo | lines | parse "{key}: {value} kB")
        let total_mem = ($meminfo | where key == "MemTotal" | first | get value | into int)
        let free_mem = ($meminfo | where key == "MemFree" | first | get value | into int)
        let available_mem = ($meminfo | where key == "MemAvailable" | first | get value | into int)
        let buffers = ($meminfo | where key == "Buffers" | first | get value | into int)
        let cached = ($meminfo | where key == "Cached" | first | get value | into int)

        {
            total_kb: $total_mem
            free_kb: $free_mem
            available_kb: $available_mem
            buffers_kb: $buffers
            cached_kb: $cached
            used_kb: ($total_mem - $free_mem)
            usage_percent: (($total_mem - $free_mem) / $total_mem * 100 | math round --precision 2)
        }
    } catch {
        {error: "Failed to collect memory metrics"}
    }
    $metrics = ($metrics | insert memory $memory_metrics)

    # Disk metrics via `df -k` (one row per mounted filesystem).
    let disk_metrics = try {
        df -k | lines | skip 1
            | parse "{filesystem} {total} {used} {available} {percent} {mount}"
            | select filesystem total used available percent mount
    } catch {
        {error: "Failed to collect disk metrics"}
    }
    $metrics = ($metrics | insert disk $disk_metrics)

    # Network metrics: per-interface byte/packet counters from /proc/net/dev.
    let network_metrics = try {
        cat /proc/net/dev | lines | skip 2
            | parse "{interface}: {rx_bytes} {rx_packets} {rx_errs} {rx_drop} {rx_fifo} {rx_frame} {rx_compressed} {rx_multicast} {tx_bytes} {tx_packets} {tx_errs} {tx_drop} {tx_fifo} {tx_colls} {tx_carrier} {tx_compressed}"
            | select interface rx_bytes tx_bytes rx_packets tx_packets
    } catch {
        {error: "Failed to collect network metrics"}
    }
    $metrics = ($metrics | insert network $network_metrics)

    # Process count: numeric directories under /proc are live PIDs.
    # BUG FIX: `ls /proc` may report full paths in `name`, so match on the
    # basename rather than the raw name column.
    let process_metrics = try {
        {total: (ls /proc | where {|row| ($row.name | path basename) =~ '^[0-9]+$'} | length)}
    } catch {
        {error: "Failed to collect process metrics"}
    }
    $metrics = ($metrics | insert processes $process_metrics)

    return $metrics
}
# Collect container metrics (if running in a containerized environment).
# Probes docker, podman, and kubectl; each probe is best-effort and silently
# skipped when the tool is missing or fails.
export def collect-container-metrics []: nothing -> record {
    let timestamp = (date now)

    mut metrics = {
        timestamp: ($timestamp | format date "%Y-%m-%d %H:%M:%S")
        container_runtime: "unknown"
    }

    # Check for Docker
    try {
        if (which docker | is-not-empty) {
            let containers = (docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" | lines | skip 1)
            $metrics = ($metrics | insert docker {
                available: true
                containers: ($containers | length)
                running: ($containers | where $it =~ "Up" | length)
            })
            # BUG FIX: `insert` errors on an existing key, and
            # `container_runtime` is pre-seeded with "unknown" — use `update`.
            $metrics = ($metrics | update container_runtime "docker")
        }
    } catch {}

    # Check for Podman
    try {
        if (which podman | is-not-empty) {
            let containers = (podman ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" | lines | skip 1)
            $metrics = ($metrics | insert podman {
                available: true
                containers: ($containers | length)
                running: ($containers | where $it =~ "Up" | length)
            })
            # Only claim the runtime slot if docker did not already take it.
            if ($metrics.container_runtime == "unknown") {
                # BUG FIX: same `insert`-on-existing-key error as above.
                $metrics = ($metrics | update container_runtime "podman")
            }
        }
    } catch {}

    # Check for Kubernetes
    try {
        if (which kubectl | is-not-empty) {
            let pods = (kubectl get pods --all-namespaces --no-headers | lines)
            $metrics = ($metrics | insert kubernetes {
                available: true
                pods_total: ($pods | length)
                pods_running: ($pods | where $it =~ "Running" | length)
                pods_pending: ($pods | where $it =~ "Pending" | length)
                pods_failed: ($pods | where $it =~ "Failed" | length)
            })
        }
    } catch {}

    return $metrics
}
# Collect application logs with filtering from the systemd journal, a docker
# container (when --service is given), and common /var/log files.
# Returns at most --lines records; every source is best-effort.
export def collect-logs [
    --service(-s): string       # Specific service to collect logs from
    --since: string = "1h"      # Time range (1h, 30m, etc.)
    --level: string = "error"   # Log level filter
    --lines(-l): int = 100      # Maximum lines to collect
]: nothing -> list<record> {
    mut logs = []

    # Systemd journal logs
    try {
        mut journalctl_cmd = ["journalctl", "--output=json", "--no-pager", $"--since=($since)"]

        if ($service | is-not-empty) {
            $journalctl_cmd = ($journalctl_cmd | append ["-u", $service])
        }

        if (($level | is-not-empty) and ($level != "all")) {
            $journalctl_cmd = ($journalctl_cmd | append ["-p", $level])
        }

        if ($lines | is-not-empty) {
            $journalctl_cmd = ($journalctl_cmd | append ["-n", ($lines | into string)])
        }

        # BUG FIX: the original spread only element 1 (`...$journalctl_cmd.1`),
        # silently dropping every other flag; spread the whole argument tail.
        let journal_logs = (^($journalctl_cmd | first) ...($journalctl_cmd | skip 1)
            | lines
            | where $it != ""
            | each { |line| $line | from json })
        $logs = ($logs | append $journal_logs)
    } catch {}

    # Container logs (Docker)
    try {
        # BUG FIX: `which docker | is-not-empty and (...)` piped the whole
        # `and` expression into the pipeline; parenthesize each operand.
        if ((which docker | is-not-empty) and ($service | is-not-empty)) {
            # BUG FIX: `2>/dev/null` was passed to docker as a literal
            # argument; Nushell stderr redirection is `err>`.
            let container_logs = (docker logs --since $since --tail $lines $service err> /dev/null
                | lines
                | enumerate
                | each { |item|
                    {
                        timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
                        source: "docker"
                        container: $service
                        message: $item.item
                        line_number: $item.index
                    }
                })
            $logs = ($logs | append $container_logs)
        }
    } catch {}

    # File-based logs (common locations)
    let log_files = [
        "/var/log/syslog"
        "/var/log/messages"
        "/var/log/kern.log"
        "/var/log/auth.log"
    ]

    for log_file in $log_files {
        try {
            if ($log_file | path exists) {
                let file_logs = (tail -n $lines $log_file
                    | lines
                    | enumerate
                    | each { |item|
                        {
                            timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
                            source: "file"
                            file: $log_file
                            message: $item.item
                            line_number: $item.index
                        }
                    })
                $logs = ($logs | append $file_logs)
            }
        } catch {}
    }

    # ROBUSTNESS: `first n` can error when fewer than n rows exist, so clamp
    # the request to the actual number of collected records.
    return ($logs | first ([$lines, ($logs | length)] | math min))
}
# Process and analyze log patterns: error-keyword counts, per-source
# distribution, and a count of records from the last hour.
# Expects records shaped like those produced by collect-logs
# (string `timestamp`, `source`, `message`).
export def analyze-logs [logs: list<record>]: nothing -> record {
    let total_logs = ($logs | length)

    if $total_logs == 0 {
        return {
            total: 0
            analysis: "No logs to analyze"
        }
    }

    # Error pattern analysis: case-insensitive match of each keyword.
    let error_patterns = ["error", "failed", "exception", "critical", "fatal"]
    mut error_counts = {}

    for pattern in $error_patterns {
        # BUG FIX: `$"(?i)($pattern)"` tried to interpolate `?i` as an
        # expression and failed to parse; build the regex by concatenation.
        let count = ($logs | where message =~ ("(?i)" + $pattern) | length)
        $error_counts = ($error_counts | insert $pattern $count)
    }

    # Source distribution
    let source_dist = ($logs | group-by source | transpose key value | each { |item|
        {source: $item.key, count: ($item.value | length)}
    })

    # Time-based analysis (last hour).
    # BUG FIX: timestamps are stored as formatted strings, so comparing them
    # directly against a datetime failed; parse each one first and skip rows
    # whose timestamp cannot be parsed.
    let recent_logs = ($logs | where {|row|
        (try { ($row.timestamp | into datetime) > ((date now) - 1hr) } catch { false })
    })

    return {
        total: $total_logs
        recent_count: ($recent_logs | length)
        error_patterns: $error_counts
        source_distribution: $source_dist
        analysis_timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
    }
}
# Export metrics in various formats.
# Renders the record as json (default), yaml, or csv; writes to --output when
# given (printing a confirmation), otherwise returns the rendered text.
export def export-metrics [
    metrics: record
    --format(-f): string = "json"   # json, yaml, csv
    --output(-o): string            # Output file path
]: nothing -> any {
    let rendered = if $format == "yaml" {
        $metrics | to yaml
    } else if $format == "csv" {
        # Flatten metrics for CSV export
        $metrics | flatten | transpose key value | to csv
    } else {
        # Any other value (including the default) falls back to JSON.
        $metrics | to json
    }

    if ($output | is-not-empty) {
        $rendered | save $output
        print $"Metrics exported to ($output)"
    } else {
        $rendered
    }
}
# Health monitoring loop: collects system and container metrics every
# --interval seconds for --duration seconds. With --output, appends one JSON
# document per line (JSON Lines); otherwise prints a one-line status summary.
export def health-monitor [
    --interval(-i): int = 60    # Collection interval in seconds
    --duration(-d): int = 300   # Total monitoring duration in seconds
    --output(-o): string        # Output file for continuous monitoring
]: nothing -> nothing {
    let start_time = (date now)
    let end_time = ($start_time + ($duration * 1sec))

    print $"🔍 Starting health monitoring for ($duration) seconds with ($interval)s intervals"
    print $"📊 Collecting system and container metrics"

    while (date now) < $end_time {
        let current_time = (date now)
        let system_metrics = (collect-system-metrics)
        let container_metrics = (collect-container-metrics)

        let combined_metrics = {
            collection_time: ($current_time | format date "%Y-%m-%d %H:%M:%S")
            system: $system_metrics
            containers: $container_metrics
        }

        if ($output | is-not-empty) {
            # BUG FIX: appending pretty-printed JSON documents back-to-back
            # produced an unparseable file; emit one compact JSON document per
            # line (JSON Lines) instead.
            (($combined_metrics | to json --raw) + "\n") | save --append $output
        } else {
            # `?` keeps this from failing when a metrics section errored out.
            # BUG FIX: the 1-minute load average is not a percentage, so it is
            # no longer suffixed with "%".
            print $"⏰ ($current_time | format date '%H:%M:%S') - Load: ($system_metrics.cpu.load_1m?) | Memory: ($system_metrics.memory.usage_percent?)%"
        }

        sleep ($interval * 1sec)
    }

    print "✅ Health monitoring completed"
}
# Quick system status check.
# Gathers a single snapshot of system and container metrics, derives an
# overall health verdict ("healthy" / "warning" / "critical") plus a list of
# alert strings, and returns everything in one record.
export def status-check []: nothing -> record {
    let system = (collect-system-metrics)
    let containers = (collect-container-metrics)

    # Start optimistic; each check below may downgrade the verdict.
    mut health_status = "healthy"
    mut alerts = []

    # CPU check: a 1-minute load average above 4.0 flags a warning.
    let load_1m = ($system.cpu.load_1m? | default 0)
    if $load_1m > 4.0 {
        $health_status = "warning"
        $alerts = ($alerts | append "High CPU load")
    }

    # Memory check: more than 90% usage is treated as critical.
    let mem_pct = ($system.memory.usage_percent? | default 0)
    if $mem_pct > 90 {
        $health_status = "critical"
        $alerts = ($alerts | append "High memory usage")
    }

    # Disk check: any filesystem above 90% raises a warning. Wrapped in `try`
    # because disk collection may have produced an error record instead of a
    # table.
    try {
        let full_mounts = ($system.disk | where {|row| ($row.percent | str replace "%" "" | into float) > 90})
        if ($full_mounts | length) > 0 {
            $health_status = "warning"
            $alerts = ($alerts | append "High disk usage")
        }
    } catch {}

    return {
        status: $health_status
        alerts: $alerts
        metrics: {
            system: $system
            containers: $containers
        }
        timestamp: (date now | format date "%Y-%m-%d %H:%M:%S")
    }
}