# provisioning/config-examples/config.prod.toml

# Production Environment Configuration Template
# Copy this file to config.prod.toml for production-ready settings
#
# This template provides secure, performance-optimized settings for production:
# - Minimal logging to reduce overhead
# - Security-focused configurations
# - Production provider defaults
# - Optimized performance settings
# - Robust error handling and validation
# =============================================================================
# PRODUCTION CORE CONFIGURATION
# =============================================================================
[core]
# Version string carried by this configuration
# (presumably the config/schema version - confirm against the loader).
version = "1.0.0"

# Identifier for this deployment; the "-prod" suffix marks it as production.
name = "provisioning-system-prod"
# =============================================================================
# PRODUCTION PATHS
# =============================================================================
# Configured for production deployment standards
[paths]
# Installation root. Every other entry in [paths] and [paths.files] is
# written relative to this value. Other common choices:
#   /usr/local/provisioning   alternative system-wide location
#   /app/provisioning         container deployment
#   /srv/provisioning         service directory
base = "/opt/provisioning"

# NOTE: TOML has no interpolation. "{{paths.base}}" is stored as a literal
# string; expanding it is the job of whatever loads this file.
kloud = "{{paths.base}}/infra"
providers = "{{paths.base}}/providers"
taskservs = "{{paths.base}}/taskservs"
# NOTE(review): the directory is singular ("cluster") while the key is plural;
# confirm this matches the on-disk layout.
clusters = "{{paths.base}}/cluster"
resources = "{{paths.base}}/resources"
templates = "{{paths.base}}/templates"
tools = "{{paths.base}}/tools"
core = "{{paths.base}}/core"

[paths.files]
# Individual files, likewise expressed relative to paths.base.
settings = "{{paths.base}}/kcl/settings.k"
# Production-specific key file (note the "prod-" prefix).
keys = "{{paths.base}}/keys/prod-keys.yaml"
requirements = "{{paths.base}}/requirements.yaml"
notify_icon = "{{paths.base}}/resources/icon.png"
# =============================================================================
# PRODUCTION SECURITY AND DEBUGGING
# =============================================================================
# Minimal debugging for security and performance
[debug]
# Every debugging switch is off in this production template.
enabled = false
metadata = false
check = false
remote = false

# Log warnings and above only: keeps log volume low while still
# surfacing operationally relevant events.
log_level = "warn"

# false leaves terminal features available.
no_terminal = false
# =============================================================================
# PRODUCTION OUTPUT CONFIGURATION
# =============================================================================
[output]
# Pager used to display files.
file_viewer = "less"

# Default output format; YAML is chosen for human readability.
format = "yaml"
# =============================================================================
# PRODUCTION SOPS CONFIGURATION
# =============================================================================
# Secure secrets management for production
[sops]
# Secrets are managed through SOPS in production.
use_sops = true

# Location of the SOPS rules file, relative to paths.base.
config_path = "{{paths.base}}/.sops.yaml"

# Candidate locations for the age key file. Keep this list restricted to
# trusted locations.
# (Presumably searched in the order listed - confirm against the loader.)
key_search_paths = [
    "/etc/sops/age/keys.txt",
    "{{paths.base}}/keys/age.txt",
    "/var/lib/provisioning/keys/age.txt",
]
# =============================================================================
# PRODUCTION RUNTIME CONFIGURATION
# =============================================================================
[taskservs]
# Runtime directory for task services, kept under /var/lib.
# Make sure the directory exists with appropriate permissions.
run_path = "/var/lib/provisioning/taskservs"
[clusters]
# Runtime directory for clusters; placed under /var/lib so that its
# contents persist.
run_path = "/var/lib/provisioning/clusters"
[generation]
# Directory that receives generated artifacts.
dir_path = "/var/lib/provisioning/generated"

# Name of the production definitions file
# (presumably resolved inside dir_path - confirm).
defs_file = "prod-defs.toml"
# =============================================================================
# PRODUCTION PROVIDER CONFIGURATION
# =============================================================================
# Production-ready cloud provider settings
[providers]
# Provider used when none is specified. Set this to your primary
# production cloud.
default = "aws"

# --- AWS ---
[providers.aws]
# Left empty (presumably selects the default AWS endpoints - confirm).
api_url = ""
# Left empty; authentication is expected to come from IAM roles or
# instance profiles rather than from this file.
auth = ""
interface = "CLI"

# --- UpCloud ---
[providers.upcloud]
# Standard UpCloud API endpoint.
api_url = "https://api.upcloud.com/1.3"
# Left empty; API keys are expected to come from the environment or SOPS.
auth = ""
interface = "CLI"

# --- Local ---
# Not typically used in production.
# NOTE(review): nothing in this table actually disables the provider; it is
# merely not the default. Confirm whether an explicit switch is needed.
[providers.local]
api_url = ""
auth = ""
interface = "CLI"
# =============================================================================
# PRODUCTION ENVIRONMENT SETTINGS
# =============================================================================
# Production environment defaults
[environments.prod]
# Production profile; these values mirror the top-level [debug],
# [providers] and [output] tables.
debug.enabled = false
debug.log_level = "warn"
debug.metadata = false
debug.check = false
debug.remote = false
providers.default = "aws"
output.format = "yaml"
output.file_viewer = "less"

[environments.dev]
# Development profile, available as an override when a production
# problem needs debugging: debug on, check mode on, local provider.
debug.enabled = true
debug.log_level = "info"
debug.check = true
providers.default = "local"
output.format = "json"

[environments.test]
# Test profile for validating production settings: debug stays off,
# but check mode is on and logging is raised to "info".
debug.enabled = false
debug.log_level = "info"
debug.check = true
providers.default = "aws"
output.format = "yaml"
# =============================================================================
# PRODUCTION PERFORMANCE OPTIMIZATION
# =============================================================================
# Performance settings optimized for production workloads
[performance]
# Number of operations allowed to run in parallel.
parallel_operations = 8

# Operation timeout in seconds (600 s = 10 minutes).
timeout_seconds = 600

# Caching: enabled, stored under /var/cache, entries kept for 24 hours.
cache_enabled = true
cache_dir = "/var/cache/provisioning"
cache_retention_hours = 24
# =============================================================================
# PRODUCTION SECURITY CONFIGURATION
# =============================================================================
# Security settings for production environment
[security]
# Destructive operations must be confirmed.
require_confirmation = true

# Sensitive data must never reach the logs.
log_sensitive_data = false

# Strict validation is on in production.
strict_validation = true

# Automatic, encrypted backups.
# NOTE(review): backup_retention_days is 30 here, while the [backup] table
# sets retention_days = 90 for the same directory - confirm which applies.
auto_backup = true
backup_dir = "/var/backups/provisioning"
backup_retention_days = 30
backup_encryption = true

# Audit trail.
audit_enabled = true
audit_log_path = "/var/log/provisioning/audit.log"
# =============================================================================
# PRODUCTION MONITORING AND ALERTING
# =============================================================================
# Production monitoring configuration
[monitoring]
enabled = true

# Metrics endpoint. The example.com host is a placeholder - replace it.
endpoint = "https://metrics.example.com/provisioning"

# Monitoring interval, given as a duration string.
interval = "60s"

# Health check on port 8080.
health_check_enabled = true
health_check_port = 8080

# Log aggregation endpoint. The example.com host is a placeholder - replace it.
log_endpoint = "https://logs.example.com/provisioning"
# Production alerting
[alerting]
enabled = true

# Email channel. The example.com recipients are placeholders.
email_enabled = true
email_recipients = ["ops@example.com", "devops@example.com"]

# Slack channel. The webhook URL is a placeholder and must be replaced.
slack_enabled = true
slack_webhook = "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"

# PagerDuty channel. The key is a placeholder; supply the real value
# through SOPS rather than in plain text.
pagerduty_enabled = true
pagerduty_key = "SOPS_ENCRYPTED_KEY"

# Alert thresholds.
# NOTE(review): warning_threshold (10) is higher than error_threshold (5);
# confirm these count different event types and are not inverted.
error_threshold = 5
warning_threshold = 10
# =============================================================================
# PRODUCTION BACKUP AND DISASTER RECOVERY
# =============================================================================
# Production backup configuration
[backup]
enabled = true

# Cron expression: every day at 02:00.
schedule = "0 2 * * *"

# Days a backup is kept.
# NOTE(review): [security].backup_retention_days is 30 for the same
# directory - confirm which value is authoritative.
retention_days = 90

# Local backup directory.
location = "/var/backups/provisioning"

# Off-site copy. The bucket name is an example - replace with your own.
remote_enabled = true
remote_location = "s3://company-backups/provisioning/"

# Backups are encrypted and verified.
encryption_enabled = true
verification_enabled = true
# Disaster recovery settings
[disaster_recovery]
enabled = true

# Site used for disaster recovery.
dr_site = "us-west-2"

# Recovery objectives, in minutes:
# RTO (recovery time) 60, RPO (recovery point) 15.
rto_minutes = 60
rpo_minutes = 15

# Cron expression: 03:00 on the 1st of every month (monthly DR test).
test_schedule = "0 3 1 * *"
# =============================================================================
# PRODUCTION COMPLIANCE AND GOVERNANCE
# =============================================================================
# Compliance settings for production
[compliance]
enabled = true

# Frameworks this deployment is monitored against.
frameworks = ["SOC2", "PCI-DSS", "GDPR"]

# Compliance reports are produced monthly.
reporting_enabled = true
report_frequency = "monthly"

# 7 years expressed in days (7 * 365).
data_retention_days = 2555

# Encryption is required both at rest and in transit.
encryption_at_rest = true
encryption_in_transit = true
# Governance settings
[governance]
# Changes require approval.
change_approval_required = true

# Configuration drift is checked every 24 hours.
drift_detection_enabled = true
drift_check_interval = "24h"

# Policies are enforced.
policy_enforcement_enabled = true

# Tags required on resources.
required_tags = ["Environment", "Owner", "Project", "CostCenter"]
# =============================================================================
# PRODUCTION INTEGRATION SETTINGS
# =============================================================================
# CI/CD integration for production
[cicd]
enabled = true

# Configuration changes trigger the pipeline.
trigger_on_config_change = true

# Deployment gate: approval is required.
require_approval = true

# Automated tests.
# NOTE(review): test_timeout is presumably in seconds (1800 s = 30 min) - confirm.
run_tests = true
test_timeout = 1800

# Automatic rollback is enabled.
auto_rollback_enabled = true
# ITSM integration
[itsm]
# ServiceNow integration. The instance URL is an example - replace with your own.
servicenow_enabled = true
servicenow_instance = "https://company.service-now.com"

# Change requests and incidents are created automatically.
auto_create_change_requests = true
auto_create_incidents = true
# =============================================================================
# PRODUCTION RESOURCE MANAGEMENT
# =============================================================================
# Resource quotas and limits for production
[resources]
# Upper bounds on resources. Units are given by each key's suffix.
max_cpu_cores = 32
max_memory_gb = 128
max_storage_gb = 1000
max_bandwidth_mbps = 1000
max_instances = 100
# Cost management
[cost_management]
enabled = true

# Budget alerts.
# NOTE(review): the currency of monthly_budget_limit is not stated - confirm.
budget_alerts_enabled = true
monthly_budget_limit = 10000

# Automatic optimization is off; a weekly review is scheduled instead.
auto_optimize = false
# Cron expression: Sundays at 04:00.
# NOTE(review): this falls inside the Sunday 03:00 maintenance window
# ([maintenance], 4 hours) - confirm that is intended.
optimization_schedule = "0 4 * * 0"
# =============================================================================
# PRODUCTION OPERATIONAL PROCEDURES
# =============================================================================
# Maintenance windows
[maintenance]
enabled = true

# Cron expression: Sundays at 03:00.
schedule = "0 3 * * 0"

# Length of the maintenance window, in hours.
duration_hours = 4

# Hours of advance notice given before maintenance starts.
notification_hours = 24
# Incident response
[incident_response]
# Automated incident response is enabled.
enabled = true

# Contacts. The example.com addresses are placeholders - replace them.
primary_contact = "ops@example.com"
escalation_contact = "management@example.com"

# Targets: respond within 15 minutes, resolve within 4 hours.
response_time_minutes = 15
resolution_time_hours = 4
# =============================================================================
# PRODUCTION USAGE GUIDELINES
# =============================================================================
#
# Production Deployment Checklist:
# --------------------------------
#
# 1. Security Review:
# □ SOPS keys properly secured
# □ IAM roles configured with least privilege
# □ Network security groups configured
# □ Audit logging enabled
#
# 2. Performance Validation:
# □ Resource quotas set appropriately
# □ Monitoring and alerting configured
# □ Backup and DR procedures tested
# □ Load testing completed
#
# 3. Compliance Verification:
# □ Required tags applied to all resources
# □ Data encryption enabled
# □ Compliance frameworks configured
# □ Change management processes in place
#
# 4. Operational Readiness:
# □ Runbooks created and tested
# □ On-call procedures established
# □ Incident response tested
# □ Documentation updated
#
# Production Operations Commands:
# ------------------------------
#
# 1. Health Check:
# ./core/nulib/provisioning validate config --strict
#
# 2. Deploy Infrastructure:
# ./core/nulib/provisioning server create --infra production
#
# 3. Monitor Operations:
# ./core/nulib/provisioning show servers --infra production --out yaml
#
# 4. Backup Configuration:
# ./core/nulib/provisioning backup create --infra production
#
# 5. Emergency Procedures:
# ./core/nulib/provisioning cluster delete --infra production --emergency
#
# =============================================================================
# PRODUCTION TROUBLESHOOTING
# =============================================================================
#
# Common Production Issues:
# ------------------------
#
# 1. Authentication Failures:
# - Check IAM roles and policies
# - Verify SOPS key access
# - Validate provider credentials
#
# 2. Performance Issues:
# - Review parallel_operations setting
# - Check timeout_seconds values
# - Monitor resource utilization
#
# 3. Security Alerts:
# - Review audit logs
# - Check compliance status
# - Validate encryption settings
#
# 4. Backup Failures:
# - Verify backup storage access
# - Check retention policies
# - Test recovery procedures
#
# 5. Monitoring Gaps:
# - Validate monitoring endpoints
# - Check alert configurations
# - Test notification channels