# Production Environment Configuration Template
# Copy this file to config.prod.toml for production-ready settings
#
# This template provides secure, performance-optimized settings for production:
# - Minimal logging to reduce overhead
# - Security-focused configurations
# - Production provider defaults
# - Optimized performance settings
# - Robust error handling and validation
#
# NOTE: values of the form "{{paths.base}}/..." are template placeholders.
# TOML itself performs no interpolation, so they are plain strings here and
# must be expanded by the application that loads this file.

# =============================================================================
# PRODUCTION CORE CONFIGURATION
# =============================================================================

[core]
version = "1.0.0"
name = "provisioning-system-prod"

# =============================================================================
# PRODUCTION PATHS
# =============================================================================
# Configured for production deployment standards

[paths]
# Production base path - typically system-wide installation
# Standard production locations:
# base = "/opt/provisioning"        # Standard system location
# base = "/usr/local/provisioning"  # Alternative system location
# base = "/app/provisioning"        # Container deployment
# base = "/srv/provisioning"        # Service directory
base = "/opt/provisioning"

# Production paths follow security best practices
# All paths inherit from base for consistency
kloud = "{{paths.base}}/infra"
providers = "{{paths.base}}/providers"
taskservs = "{{paths.base}}/taskservs"
# NOTE(review): directory name is singular ("cluster") while the key is
# plural - confirm this matches the on-disk layout.
clusters = "{{paths.base}}/cluster"
resources = "{{paths.base}}/resources"
templates = "{{paths.base}}/templates"
tools = "{{paths.base}}/tools"
core = "{{paths.base}}/core"

[paths.files]
# Production configuration files with secure defaults
settings = "{{paths.base}}/kcl/settings.k"
keys = "{{paths.base}}/keys/prod-keys.yaml"
requirements = "{{paths.base}}/requirements.yaml"
notify_icon = "{{paths.base}}/resources/icon.png"

# =============================================================================
# PRODUCTION SECURITY AND DEBUGGING
# =============================================================================
# Minimal debugging for security and performance

[debug]
# Disable debug mode in production for security
enabled = false

# Never show metadata in production logs
metadata = false

# Never enable check mode by default in production
check = false

# Disable remote debugging in production
remote = false

# Use warning level logging to capture only important events
# This reduces log volume while maintaining operational visibility
log_level = "warn"

# Ensure terminal features work properly in production
no_terminal = false

# =============================================================================
# PRODUCTION OUTPUT CONFIGURATION
# =============================================================================

[output]
# Use less for reliable paging in production environments
file_viewer = "less"

# YAML format for human-readable production output
format = "yaml"

# =============================================================================
# PRODUCTION SOPS CONFIGURATION
# =============================================================================
# Secure secrets management for production

[sops]
# Enable SOPS for production secret management
use_sops = true

# Production SOPS configuration with strict security
config_path = "{{paths.base}}/.sops.yaml"

# Secure key search paths for production
# Only search trusted, secure locations
key_search_paths = [
    "/etc/sops/age/keys.txt",
    "{{paths.base}}/keys/age.txt",
    "/var/lib/provisioning/keys/age.txt",
]

# =============================================================================
# PRODUCTION RUNTIME CONFIGURATION
# =============================================================================

[taskservs]
# Production runtime directory with proper permissions
run_path = "/var/lib/provisioning/taskservs"

[clusters]
# Production cluster runtime with persistence
run_path = "/var/lib/provisioning/clusters"

[generation]
# Production generation directory
dir_path = "/var/lib/provisioning/generated"
defs_file = "prod-defs.toml"

# =============================================================================
# PRODUCTION PROVIDER CONFIGURATION
# =============================================================================
# Production-ready cloud provider settings

[providers]
# Default to AWS for production deployments
# Change to your primary production cloud provider
default = "aws"

# AWS Production Configuration
[providers.aws]
# Use default AWS endpoints for production
api_url = ""
# Use IAM roles/instance profiles for authentication
auth = ""
# Use CLI interface for production stability
interface = "CLI"

# UpCloud Production Configuration
[providers.upcloud]
# Standard UpCloud API endpoint
api_url = "https://api.upcloud.com/1.3"
# Use API keys stored in environment/SOPS
auth = ""
# Use CLI interface for production
interface = "CLI"

# Local Provider (disabled in production)
# NOTE(review): nothing in this table actually disables the provider; it is
# only "disabled" in the sense that providers.default does not select it.
[providers.local]
# Not typically used in production
api_url = ""
auth = ""
interface = "CLI"

# =============================================================================
# PRODUCTION ENVIRONMENT SETTINGS
# =============================================================================
# Per-environment overrides. The dotted keys below create nested tables
# (e.g. environments.prod.debug.enabled).

# Production environment defaults
[environments.prod]
debug.enabled = false
debug.log_level = "warn"
debug.metadata = false
debug.check = false
debug.remote = false
providers.default = "aws"
output.format = "yaml"
output.file_viewer = "less"

# Development override (if needed for production debugging)
[environments.dev]
debug.enabled = true
debug.log_level = "info"
debug.check = true
providers.default = "local"
output.format = "json"

# Testing environment for production validation
[environments.test]
debug.enabled = false
debug.log_level = "info"
debug.check = true
providers.default = "aws"
output.format = "yaml"

# =============================================================================
# PRODUCTION PERFORMANCE OPTIMIZATION
# =============================================================================
# Performance settings optimized for production workloads

[performance]
# Higher parallelism for production efficiency
parallel_operations = 8

# Longer timeouts for production reliability
timeout_seconds = 600

# Enable caching for better performance
cache_enabled = true

# Production cache directory
cache_dir = "/var/cache/provisioning"

# Cache retention for production
cache_retention_hours = 24

# =============================================================================
# PRODUCTION SECURITY CONFIGURATION
# =============================================================================
# Security settings for production environment

[security]
# Always require confirmation for destructive operations
require_confirmation = true

# Never log sensitive data in production
log_sensitive_data = false

# Enable strict validation in production
strict_validation = true

# Production backup settings
auto_backup = true
backup_dir = "/var/backups/provisioning"

# Backup retention policy
# NOTE(review): [backup].retention_days is 90 and [backup].location is the
# same directory as backup_dir above - confirm which retention value the
# consumer applies, or align the two.
backup_retention_days = 30

# Encrypt backups in production
backup_encryption = true

# Audit logging for production
audit_enabled = true
audit_log_path = "/var/log/provisioning/audit.log"

# =============================================================================
# PRODUCTION MONITORING AND ALERTING
# =============================================================================

# Production monitoring configuration
[monitoring]
# Enable comprehensive monitoring
enabled = true

# Production metrics endpoint
endpoint = "https://metrics.example.com/provisioning"

# Monitoring interval
interval = "60s"

# Health check configuration
health_check_enabled = true
health_check_port = 8080

# Log aggregation for production
log_endpoint = "https://logs.example.com/provisioning"

# Production alerting
[alerting]
# Enable production alerting
enabled = true

# Alert channels
email_enabled = true
email_recipients = ["ops@example.com", "devops@example.com"]

# Placeholder URL - replace with the real webhook before enabling Slack alerts.
slack_enabled = true
slack_webhook = "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"

# PagerDuty integration
# Placeholder value - supply the real key via SOPS; do not commit it in clear
# text.
pagerduty_enabled = true
pagerduty_key = "SOPS_ENCRYPTED_KEY"

# Alert thresholds
error_threshold = 5
warning_threshold = 10

# =============================================================================
# PRODUCTION BACKUP AND DISASTER RECOVERY
# =============================================================================
# Schedules in this file are five-field cron-style expressions.
# NOTE(review): the timezone they are evaluated in is not specified here -
# confirm with the scheduler.

# Production backup configuration
[backup]
# Enable automated backups
enabled = true

# Backup schedule (production frequency)
schedule = "0 2 * * *"  # Daily at 2 AM

# Backup retention policy
# NOTE(review): differs from [security].backup_retention_days (30) - confirm.
retention_days = 90

# Backup storage location
location = "/var/backups/provisioning"

# Remote backup storage
remote_enabled = true
remote_location = "s3://company-backups/provisioning/"

# Backup encryption
encryption_enabled = true

# Backup verification
verification_enabled = true

# Disaster recovery settings
[disaster_recovery]
# Enable DR procedures
enabled = true

# DR site configuration
dr_site = "us-west-2"

# RTO and RPO targets
rto_minutes = 60
rpo_minutes = 15

# DR testing schedule
test_schedule = "0 3 1 * *"  # Monthly DR testing

# =============================================================================
# PRODUCTION COMPLIANCE AND GOVERNANCE
# =============================================================================

# Compliance settings for production
[compliance]
# Enable compliance monitoring
enabled = true

# Compliance frameworks
frameworks = ["SOC2", "PCI-DSS", "GDPR"]

# Compliance reporting
reporting_enabled = true
report_frequency = "monthly"

# Data retention policies
data_retention_days = 2555  # 7 years

# Encryption requirements
encryption_at_rest = true
encryption_in_transit = true

# Governance settings
[governance]
# Change management
change_approval_required = true

# Configuration drift detection
drift_detection_enabled = true
drift_check_interval = "24h"

# Policy enforcement
policy_enforcement_enabled = true

# Resource tagging requirements
required_tags = ["Environment", "Owner", "Project", "CostCenter"]

# =============================================================================
# PRODUCTION INTEGRATION SETTINGS
# =============================================================================

# CI/CD integration for production
[cicd]
# Enable CI/CD integration
enabled = true

# Pipeline triggers
trigger_on_config_change = true

# Deployment gates
require_approval = true

# Automated testing
run_tests = true
test_timeout = 1800  # presumably seconds (cf. performance.timeout_seconds) - confirm

# Rollback capability
auto_rollback_enabled = true

# ITSM integration
[itsm]
# ServiceNow integration
servicenow_enabled = true
servicenow_instance = "https://company.service-now.com"

# Change request automation
auto_create_change_requests = true

# Incident management
auto_create_incidents = true

# =============================================================================
# PRODUCTION RESOURCE MANAGEMENT
# =============================================================================

# Resource quotas and limits for production
[resources]
# CPU limits
max_cpu_cores = 32

# Memory limits
max_memory_gb = 128

# Storage limits
max_storage_gb = 1000

# Network limits
max_bandwidth_mbps = 1000

# Instance limits
max_instances = 100

# Cost management
[cost_management]
# Enable cost tracking
enabled = true

# Budget alerts
# NOTE(review): currency of monthly_budget_limit is not stated - confirm.
budget_alerts_enabled = true
monthly_budget_limit = 10000

# Cost optimization
# NOTE(review): Sunday 04:00 falls inside the [maintenance] window
# (Sunday 03:00 for 4 hours) - confirm the overlap is intended.
auto_optimize = false
optimization_schedule = "0 4 * * 0"  # Weekly optimization review

# =============================================================================
# PRODUCTION OPERATIONAL PROCEDURES
# =============================================================================

# Maintenance windows
[maintenance]
# Scheduled maintenance
enabled = true

# Maintenance window schedule
# NOTE(review): the monthly DR test ("0 3 1 * *") starts at the same time
# whenever the 1st of the month is a Sunday - confirm this is acceptable.
schedule = "0 3 * * 0"  # Sunday 3 AM

# Maintenance duration
duration_hours = 4

# Notification before maintenance
notification_hours = 24

# Incident response
[incident_response]
# Enable automated incident response
enabled = true

# Response team notifications
primary_contact = "ops@example.com"
escalation_contact = "management@example.com"

# Response time targets
response_time_minutes = 15
resolution_time_hours = 4

# =============================================================================
# PRODUCTION USAGE GUIDELINES
# =============================================================================
#
# Production Deployment Checklist:
# --------------------------------
#
# 1. Security Review:
#    □ SOPS keys properly secured
#    □ IAM roles configured with least privilege
#    □ Network security groups configured
#    □ Audit logging enabled
#
# 2. Performance Validation:
#    □ Resource quotas set appropriately
#    □ Monitoring and alerting configured
#    □ Backup and DR procedures tested
#    □ Load testing completed
#
# 3. Compliance Verification:
#    □ Required tags applied to all resources
#    □ Data encryption enabled
#    □ Compliance frameworks configured
#    □ Change management processes in place
#
# 4. Operational Readiness:
#    □ Runbooks created and tested
#    □ On-call procedures established
#    □ Incident response tested
#    □ Documentation updated
#
# Production Operations Commands:
# ------------------------------
#
# 1. Health Check:
#    ./core/nulib/provisioning validate config --strict
#
# 2. Deploy Infrastructure:
#    ./core/nulib/provisioning server create --infra production
#
# 3. Monitor Operations:
#    ./core/nulib/provisioning show servers --infra production --out yaml
#
# 4. Backup Configuration:
#    ./core/nulib/provisioning backup create --infra production
#
# 5. Emergency Procedures (DESTRUCTIVE - deletes the production cluster):
#    ./core/nulib/provisioning cluster delete --infra production --emergency
#
# =============================================================================
# PRODUCTION TROUBLESHOOTING
# =============================================================================
#
# Common Production Issues:
# ------------------------
#
# 1. Authentication Failures:
#    - Check IAM roles and policies
#    - Verify SOPS key access
#    - Validate provider credentials
#
# 2. Performance Issues:
#    - Review parallel_operations setting
#    - Check timeout_seconds values
#    - Monitor resource utilization
#
# 3. Security Alerts:
#    - Review audit logs
#    - Check compliance status
#    - Validate encryption settings
#
# 4. Backup Failures:
#    - Verify backup storage access
#    - Check retention policies
#    - Test recovery procedures
#
# 5. Monitoring Gaps:
#    - Validate monitoring endpoints
#    - Check alert configurations
#    - Test notification channels