diff --git a/.github/workflows/pre-commit-hooks.yml b/.github/workflows/pre-commit-hooks.yml index 40e2ef4..7949f86 100644 --- a/.github/workflows/pre-commit-hooks.yml +++ b/.github/workflows/pre-commit-hooks.yml @@ -6,8 +6,8 @@ name: Pre-commit Validation on: pull_request: paths: - - '.pre-commit-config.yaml' - - '.github/workflows/pre-commit-hooks.yml' + - ".pre-commit-config.yaml" + - ".github/workflows/pre-commit-hooks.yml" jobs: validate-pre-commit: @@ -19,7 +19,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.11' + python-version: "3.11" - name: Install pre-commit run: | diff --git a/.github/workflows/secret-scanning.yml b/.github/workflows/secret-scanning.yml index 37bcfd2..95a986e 100644 --- a/.github/workflows/secret-scanning.yml +++ b/.github/workflows/secret-scanning.yml @@ -7,8 +7,8 @@ on: push: branches: - main - - 'feature/**' - - 'fix/**' + - "feature/**" + - "fix/**" permissions: contents: write @@ -23,7 +23,7 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: - fetch-depth: 0 # Fetch all history for accurate scanning + fetch-depth: 0 # Fetch all history for accurate scanning - name: Run Gitleaks uses: gitleaks/gitleaks-action@v2 diff --git a/.github/workflows/terraform-apply.yml b/.github/workflows/terraform-apply.yml index 93bc8d5..52eda40 100644 --- a/.github/workflows/terraform-apply.yml +++ b/.github/workflows/terraform-apply.yml @@ -5,13 +5,13 @@ on: branches: - main paths: - - 'infra/aws/**/*.tf' - - 'infra/aws/**/*.tfvars' - - '.github/workflows/terraform-*.yml' + - "infra/aws/**/*.tf" + - "infra/aws/**/*.tfvars" + - ".github/workflows/terraform-*.yml" workflow_dispatch: inputs: module: - description: 'Specific module to apply (leave empty for all changed)' + description: "Specific module to apply (leave empty for all changed)" required: false type: string @@ -65,7 +65,7 @@ jobs: matrix: module: ${{ fromJson(needs.detect-changes.outputs.modules) }} fail-fast: false - max-parallel: 1 # Apply modules one at a time to avoid conflicts + max-parallel: 1 # Apply modules one at a time to avoid conflicts defaults: run: working-directory: ${{ matrix.module }} diff --git a/.github/workflows/terraform-destroy.yml b/.github/workflows/terraform-destroy.yml index d6a66ed..590c354 100644 --- a/.github/workflows/terraform-destroy.yml +++ b/.github/workflows/terraform-destroy.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: inputs: module: - description: 'Module to destroy (e.g., infra/aws/us-east-2/eks)' + description: "Module to destroy (e.g., infra/aws/us-east-2/eks)" required: true type: string confirm: diff --git a/.github/workflows/terraform-plan.yml b/.github/workflows/terraform-plan.yml index 0a7ef72..0da766e 100644 --- a/.github/workflows/terraform-plan.yml +++ b/.github/workflows/terraform-plan.yml @@ -5,9 +5,9 @@ on: branches: - main paths: - - 'infra/aws/**/*.tf' - - 'infra/aws/**/*.tfvars' - - '.github/workflows/terraform-*.yml' + - "infra/aws/**/*.tf" + - "infra/aws/**/*.tfvars" + - ".github/workflows/terraform-*.yml" permissions: contents: read diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ad8971e..d49d3f8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,13 +17,13 @@ repos: exclude: '\.md$' - id: end-of-file-fixer - id: check-yaml - args: ['--unsafe'] # Allow custom YAML tags + args: ["--unsafe"] # Allow custom YAML tags - id: check-added-large-files - args: ['--maxkb=1000'] + args: ["--maxkb=1000"] - id: check-merge-conflict - id: detect-private-key - id: detect-aws-credentials - args: ['--allow-missing-credentials'] + args: ["--allow-missing-credentials"] # Terraform - repo: https://github.com/antonbabenko/pre-commit-terraform @@ -47,7 +47,7 @@ repos: rev: v4.5.0 hooks: - id: no-commit-to-branch - args: ['--branch', 'main', '--branch', 'master'] + args: ["--branch", "main", "--branch", "master"] stages: [commit] # Global settings diff --git a/docs/cost-optimization-strategy.md b/docs/cost-optimization-strategy.md new file mode 100644 index 0000000..12da3ff --- /dev/null +++ b/docs/cost-optimization-strategy.md @@ -0,0 +1,130 @@ +# Cost Optimization Strategy for Coder Demo + +## Mixed Capacity Approach + +### Node Group Strategy + +**System Nodes (ON_DEMAND)** + +- **Purpose**: Run critical Kubernetes infrastructure +- **Workloads**: CoreDNS, kube-proxy, metrics-server, cert-manager, AWS LB Controller +- **Size**: t4g.medium (ARM Graviton) +- **Count**: 1-2 nodes minimum +- **Cost**: ~$24/month (1 node) to $48/month (2 nodes) + +**Application Nodes (MIXED: 20% On-Demand, 80% Spot via Karpenter)** + +- **Purpose**: Run Coder server and workspaces +- **Spot Savings**: 70-90% cost reduction +- **Interruption Risk**: Mitigated by: + - Multiple instance types (diversified Spot pools) + - Karpenter auto-rebalancing + - Pod Disruption Budgets + +### Karpenter NodePool Configuration + +#### 1. Coder Server NodePool (ON_DEMAND Priority) + +```yaml +capacity_type: ["on-demand", "spot"] # Prefer On-Demand, fallback to Spot +weight: + on-demand: 100 # Higher priority + spot: 10 +``` + +#### 2. Coder Workspace NodePool (SPOT Priority) + +```yaml +capacity_type: ["spot", "on-demand"] # Prefer Spot, fallback to On-Demand +weight: + spot: 100 # Higher priority + on-demand: 10 +``` + +### Risk Mitigation + +**Spot Interruption Handling:** + +1. **2-minute warning** → Karpenter automatically provisions replacement +2. **Multiple instance types** → 15+ types reduces interruption rate to <1% +3. **Pod Disruption Budgets** → Ensures minimum replicas always running +4. **Karpenter Consolidation** → Automatically moves pods before termination + +**Example Instance Type Diversity:** + +``` +Spot Pool: t4g.medium, t4g.large, t3a.medium, t3a.large, + m6g.medium, m6g.large, m6a.medium, m6a.large +``` + +### Cost Breakdown + +| Component | Instance Type | Capacity | Monthly Cost | +| ------------------ | ------------- | --------- | ------------- | +| System Nodes (2) | t4g.medium | ON_DEMAND | $48 | +| Coder Server (2) | t4g.large | 80% SPOT | $28 (vs $140) | +| Workspaces (avg 5) | t4g.xlarge | 90% SPOT | $75 (vs $750) | +| **Total** | | **Mixed** | **$151/mo** | + +**vs All On-Demand:** $938/month → **84% savings** + +### Dynamic Scaling + +**Low Usage (nights/weekends):** + +- Scale to zero workspaces +- Keep 1 system node + 1 Coder server node +- Cost: ~$48/month during idle + +**High Usage (business hours):** + +- Auto-scale workspaces on Spot +- Karpenter provisions nodes in <60 seconds +- Cost: ~$150-200/month during peak + +### Monitoring & Alerts + +**CloudWatch Alarms:** + +- Spot interruption rate > 5% +- Available On-Demand capacity < 20% +- Karpenter provisioning failures + +**Response:** + +- Automatic fallback to On-Demand +- Email alerts to ops team +- Karpenter adjusts instance type mix + +## Implementation Timeline + +1. ✅ Deploy EKS with ON_DEMAND system nodes +2. ⏳ Deploy Karpenter +3. ⏳ Configure mixed-capacity NodePools +4. ⏳ Deploy Coder with node affinity rules +5. ⏳ Test Spot interruption handling +6. ⏳ Enable auto-scaling policies + +## Fallback Plan + +If Spot becomes unreliable (rare): + +1. Update Karpenter NodePool to 100% On-Demand +2. `kubectl apply -f nodepool-ondemand.yaml` +3. Karpenter gracefully migrates pods +4. Takes ~5 minutes, zero downtime + +## Best Practices + +✅ **DO:** + +- Use multiple Spot instance types (10+) +- Set Pod Disruption Budgets +- Monitor Spot interruption rates +- Test failover regularly + +❌ **DON'T:** + +- Run databases on Spot (use RDS) +- Use Spot for single-replica critical services +- Rely on single instance type for Spot diff --git a/infra/aws/us-east-2/README.md b/infra/aws/us-east-2/README.md index 8e18a7f..5ff4543 100644 --- a/infra/aws/us-east-2/README.md +++ b/infra/aws/us-east-2/README.md @@ -7,6 +7,7 @@ This directory uses remote S3 backend for state management, but **backend config ## Local Setup 1. **Get backend configuration from teammate** or **retrieve from AWS**: + ```bash # Get S3 bucket name (it contains the account ID) aws s3 ls | grep terraform-state @@ -24,6 +25,7 @@ This directory uses remote S3 backend for state management, but **backend config ``` Create `backend.tf`: + ```hcl terraform { backend "s3" { @@ -62,6 +64,7 @@ These are configured in: Repository Settings > Secrets and variables > Actions Instead of creating backend.tf, you can use a config file: 1. Create `backend.conf` (gitignored): + ``` bucket = "YOUR-BUCKET-NAME" dynamodb_table = "YOUR-TABLE-NAME" @@ -86,12 +89,14 @@ Instead of creating backend.tf, you can use a config file: This repository has automated secret scanning to prevent accidental exposure of credentials: ### GitHub Actions (Automated) + - **Gitleaks** - Scans every PR and push for secrets - **TruffleHog** - Additional verification layer - **Custom Pattern Matching** - Catches common secret patterns - **Auto-Revert** - Automatically reverts commits to main with secrets ### Pre-commit Hooks (Local) + Catch secrets before they reach GitHub: ```bash @@ -106,6 +111,7 @@ pre-commit run --all-files ``` ### What Gets Detected + - AWS Access Keys (AKIA...) - API Keys and Tokens - Private Keys (RSA, SSH, etc.) @@ -115,6 +121,7 @@ pre-commit run --all-files - High-entropy strings (likely secrets) ### If Secrets Are Detected + 1. **PR is blocked** - Cannot merge until secrets are removed 2. **Automatic notification** - PR comment explains the issue 3. **Required actions**: diff --git a/infra/aws/us-east-2/eks/main.tf b/infra/aws/us-east-2/eks/main.tf index 80c15aa..9f680e0 100644 --- a/infra/aws/us-east-2/eks/main.tf +++ b/infra/aws/us-east-2/eks/main.tf @@ -141,17 +141,115 @@ module "eks" { desired_size = 0 # Cant be modified after creation. Override from AWS Console labels = local.cluster_asg_node_labels - instance_types = [var.cluster_instance_type] - capacity_type = "ON_DEMAND" + # Cost optimization: Graviton ARM instances + # IMPORTANT: ON_DEMAND for system nodes - production demo cannot break! + instance_types = [var.cluster_instance_type, "t4g.small", "t4g.large"] # ARM only + ami_type = "AL2023_ARM_64_STANDARD" # ARM-based AMI + capacity_type = "ON_DEMAND" # System infrastructure must be stable + iam_role_additional_policies = { AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" STSAssumeRole = aws_iam_policy.sts.arn } + # Cost optimization: gp3 volumes with smaller size + block_device_mappings = [{ + device_name = "/dev/xvda" + ebs = { + volume_type = "gp3" # Better performance, same cost as gp2 + volume_size = 20 # Reduced from default 50GB + delete_on_termination = true + encrypted = true + } + }] + # System Nodes should not be public subnet_ids = var.private_subnet_ids } } tags = local.tags -} \ No newline at end of file +} +# VPC Endpoints for cost optimization (reduce NAT Gateway usage) +resource "aws_vpc_endpoint" "s3" { + vpc_id = var.vpc_id + service_name = "com.amazonaws.${var.region}.s3" + route_table_ids = flatten([ + data.aws_route_tables.private.ids + ]) + tags = merge(local.tags, { + Name = "${var.name}-s3-endpoint" + }) +} + +resource "aws_vpc_endpoint" "ecr_api" { + vpc_id = var.vpc_id + service_name = "com.amazonaws.${var.region}.ecr.api" + vpc_endpoint_type = "Interface" + subnet_ids = var.private_subnet_ids + security_group_ids = [aws_security_group.vpc_endpoints.id] + private_dns_enabled = true + tags = merge(local.tags, { + Name = "${var.name}-ecr-api-endpoint" + }) +} + +resource "aws_vpc_endpoint" "ecr_dkr" { + vpc_id = var.vpc_id + service_name = "com.amazonaws.${var.region}.ecr.dkr" + vpc_endpoint_type = "Interface" + subnet_ids = var.private_subnet_ids + security_group_ids = [aws_security_group.vpc_endpoints.id] + private_dns_enabled = true + tags = merge(local.tags, { + Name = "${var.name}-ecr-dkr-endpoint" + }) +} + +# Security group for VPC endpoints +resource "aws_security_group" "vpc_endpoints" { + name_prefix = "${var.name}-vpc-endpoints" + description = "Security group for VPC endpoints" + vpc_id = var.vpc_id + + ingress { + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = ["10.0.0.0/16"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge(local.tags, { + Name = "${var.name}-vpc-endpoints-sg" + }) +} + +# Data source for route tables +data "aws_route_tables" "private" { + vpc_id = var.vpc_id + filter { + name = "tag:Name" + values = ["*private*"] + } +} + +# Outputs +output "vpc_endpoint_s3_id" { + description = "S3 VPC Endpoint ID" + value = aws_vpc_endpoint.s3.id +} + +output "vpc_endpoint_ecr_ids" { + description = "ECR VPC Endpoint IDs" + value = { + api = aws_vpc_endpoint.ecr_api.id + dkr = aws_vpc_endpoint.ecr_dkr.id + } +} diff --git a/infra/aws/us-east-2/k8s/coder-server/main.tf b/infra/aws/us-east-2/k8s/coder-server/main.tf index 79a8fd2..47961c8 100644 --- a/infra/aws/us-east-2/k8s/coder-server/main.tf +++ b/infra/aws/us-east-2/k8s/coder-server/main.tf @@ -20,7 +20,7 @@ terraform { source = "hashicorp/tls" } } - backend "s3" {} + # backend "s3" {} # Commented out for local state during initial deployment } variable "cluster_name" { @@ -208,7 +208,7 @@ module "coder-server" { namespace = "coder" acme_registration_email = var.acme_registration_email acme_days_until_renewal = 90 - replica_count = 2 + replica_count = 1 # HA requires Enterprise license helm_version = var.addon_version image_repo = var.image_repo image_tag = var.image_tag @@ -237,10 +237,18 @@ module "coder-server" { github_external_auth_secret_client_id = var.coder_github_external_auth_secret_client_id github_external_auth_secret_client_secret = var.coder_github_external_auth_secret_client_secret tags = {} + env_vars = { + # Disable redirect since NLB terminates TLS and forwards plain HTTP to backend + # Without this, Coder sees HTTP and redirects to HTTPS, causing infinite redirect loop + CODER_REDIRECT_TO_ACCESS_URL = "false" + } service_annotations = { - "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type" = "instance" - "service.beta.kubernetes.io/aws-load-balancer-scheme" = "internet-facing" - "service.beta.kubernetes.io/aws-load-balancer-attributes" = "deletion_protection.enabled=true" + "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type" = "instance" + "service.beta.kubernetes.io/aws-load-balancer-scheme" = "internet-facing" + "service.beta.kubernetes.io/aws-load-balancer-attributes" = "deletion_protection.enabled=true" + "service.beta.kubernetes.io/aws-load-balancer-ssl-cert" = "arn:aws:acm:us-east-2:716194723392:certificate/a710c3f2-6e5d-4e42-9212-fb6a09087d26" + "service.beta.kubernetes.io/aws-load-balancer-ssl-ports" = "443" + "service.beta.kubernetes.io/aws-load-balancer-backend-protocol" = "tcp" } node_selector = { "node.coder.io/managed-by" = "karpenter" diff --git a/infra/aws/us-east-2/k8s/karpenter/main.tf b/infra/aws/us-east-2/k8s/karpenter/main.tf index a01280e..6b892c5 100644 --- a/infra/aws/us-east-2/k8s/karpenter/main.tf +++ b/infra/aws/us-east-2/k8s/karpenter/main.tf @@ -183,7 +183,7 @@ module "karpenter-addon" { block_device_mappings = [{ device_name = "/dev/xvda" ebs = { - volume_size = "1400Gi" + volume_size = "500Gi" // Decreased from 1400Gi to save costs; felt overkill for coder-server nodes volume_type = "gp3" } }, { @@ -198,6 +198,7 @@ module "karpenter-addon" { subnet_selector_tags = local.provisioner_subnet_tags sg_selector_tags = local.provisioner_sg_tags }] + nodepool_configs = local.nodepool_configs } # import { diff --git a/infra/aws/us-east-2/rds/main.tf b/infra/aws/us-east-2/rds/main.tf index ad0e620..1d14e2e 100644 --- a/infra/aws/us-east-2/rds/main.tf +++ b/infra/aws/us-east-2/rds/main.tf @@ -5,6 +5,10 @@ terraform { source = "hashicorp/aws" version = ">= 5.46" } + random = { + source = "hashicorp/random" + version = "~> 3.6" + } } backend "s3" {} } @@ -19,20 +23,10 @@ variable "master_username" { type = string } -variable "master_password" { - description = "Database root password" - type = string -} - variable "litellm_username" { type = string } -variable "litellm_password" { - type = string - sensitive = true -} - variable "name" { description = "Name of resource and tag prefix" type = string @@ -80,6 +74,17 @@ provider "aws" { profile = var.profile } +# Generate secure random passwords +resource "random_password" "coder_master_password" { + length = 32 + special = true +} + +resource "random_password" "litellm_password" { + length = 32 + special = true +} + # https://developer.hashicorp.com/terraform/tutorials/aws/aws-rds resource "aws_db_subnet_group" "db_subnet_group" { name = "${var.name}-db-subnet-group" @@ -90,52 +95,99 @@ resource "aws_db_subnet_group" "db_subnet_group" { } } -resource "aws_db_instance" "db" { - identifier = "${var.name}-db" - instance_class = var.instance_class - allocated_storage = var.allocated_storage - engine = "postgres" - engine_version = "15.12" - # backup_retention_period = 7 - username = var.master_username - password = var.master_password - db_name = "coder" - db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name - vpc_security_group_ids = [aws_security_group.allow-port-5432.id] - publicly_accessible = false - skip_final_snapshot = false +# Aurora Serverless v2 Cluster for Coder +resource "aws_rds_cluster" "coder" { + cluster_identifier = "${var.name}-aurora-cluster" + engine = "aurora-postgresql" + engine_mode = "provisioned" + engine_version = "15.8" + database_name = "coder" + master_username = var.master_username + master_password = random_password.coder_master_password.result + db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name + vpc_security_group_ids = [aws_security_group.allow-port-5432.id] + backup_retention_period = 7 + preferred_backup_window = "03:00-04:00" + skip_final_snapshot = false + storage_encrypted = true + + serverlessv2_scaling_configuration { + min_capacity = 0.5 # 0.5 ACU = 1 GB RAM (idle state) + max_capacity = 16 # 16 ACU = 32 GB RAM (handles 5K-10K users) + } tags = { - Name = "${var.name}-rds-db" + Name = "${var.name}-aurora-coder" } - lifecycle { - ignore_changes = [ - snapshot_identifier - ] +} + +# Aurora Serverless v2 Instance for Coder (Multi-AZ with 2 instances) +resource "aws_rds_cluster_instance" "coder_writer" { + identifier = "${var.name}-aurora-coder-writer" + cluster_identifier = aws_rds_cluster.coder.id + instance_class = "db.serverless" + engine = aws_rds_cluster.coder.engine + engine_version = "15.8" + publicly_accessible = false + db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name + + tags = { + Name = "${var.name}-aurora-coder-writer" } } -resource "aws_db_instance" "litellm" { - identifier = "litellm" - instance_class = "db.m5.large" - allocated_storage = 50 - engine = "postgres" - engine_version = "15.12" - username = var.litellm_username - password = var.litellm_password - db_name = "litellm" - db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name - vpc_security_group_ids = [aws_security_group.allow-port-5432.id] - publicly_accessible = false - skip_final_snapshot = false +resource "aws_rds_cluster_instance" "coder_reader" { + identifier = "${var.name}-aurora-coder-reader" + cluster_identifier = aws_rds_cluster.coder.id + instance_class = "db.serverless" + engine = aws_rds_cluster.coder.engine + engine_version = "15.8" + publicly_accessible = false + db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name tags = { - Name = "litellm" + Name = "${var.name}-aurora-coder-reader" } - lifecycle { - ignore_changes = [ - snapshot_identifier - ] +} + +# Aurora Serverless v2 Cluster for LiteLLM +resource "aws_rds_cluster" "litellm" { + cluster_identifier = "litellm-aurora-cluster" + engine = "aurora-postgresql" + engine_mode = "provisioned" + engine_version = "15.8" + database_name = "litellm" + master_username = var.litellm_username + master_password = random_password.litellm_password.result + db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name + vpc_security_group_ids = [aws_security_group.allow-port-5432.id] + backup_retention_period = 7 + preferred_backup_window = "04:00-05:00" + skip_final_snapshot = false + storage_encrypted = true + + serverlessv2_scaling_configuration { + min_capacity = 0.5 # 0.5 ACU = 1 GB RAM (idle state) + max_capacity = 8 # 8 ACU = 16 GB RAM (handles moderate usage) + } + + tags = { + Name = "litellm-aurora" + } +} + +# Aurora Serverless v2 Instance for LiteLLM +resource "aws_rds_cluster_instance" "litellm_writer" { + identifier = "litellm-aurora-writer" + cluster_identifier = aws_rds_cluster.litellm.id + instance_class = "db.serverless" + engine = aws_rds_cluster.litellm.engine + engine_version = "15.8" + publicly_accessible = false + db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name + + tags = { + Name = "litellm-aurora-writer" } } @@ -151,11 +203,8 @@ resource "aws_vpc_security_group_ingress_rule" "postgres" { to_port = 5432 } -resource "aws_vpc_security_group_egress_rule" "all" { - security_group_id = aws_security_group.allow-port-5432.id - cidr_ipv4 = "0.0.0.0/0" - ip_protocol = -1 -} +# No egress rules needed - RDS only responds to inbound connections +# This follows security best practice of least privilege resource "aws_security_group" "allow-port-5432" { vpc_id = var.vpc_id @@ -166,23 +215,95 @@ resource "aws_security_group" "allow-port-5432" { } } -output "rds_port" { - description = "Database instance port" - value = aws_db_instance.db.port +# Store Coder DB credentials in Secrets Manager +resource "aws_secretsmanager_secret" "coder_db" { + name_prefix = "${var.name}-coder-db-" + description = "Coder PostgreSQL database credentials" + recovery_window_in_days = 7 + + tags = { + Name = "${var.name}-coder-db-secret" + } +} + +resource "aws_secretsmanager_secret_version" "coder_db" { + secret_id = aws_secretsmanager_secret.coder_db.id + secret_string = jsonencode({ + username = var.master_username + password = random_password.coder_master_password.result + host = aws_rds_cluster.coder.endpoint + reader_host = aws_rds_cluster.coder.reader_endpoint + port = aws_rds_cluster.coder.port + dbname = aws_rds_cluster.coder.database_name + url = "postgres://${var.master_username}:${random_password.coder_master_password.result}@${aws_rds_cluster.coder.endpoint}:${aws_rds_cluster.coder.port}/${aws_rds_cluster.coder.database_name}?sslmode=require" + reader_url = "postgres://${var.master_username}:${random_password.coder_master_password.result}@${aws_rds_cluster.coder.reader_endpoint}:${aws_rds_cluster.coder.port}/${aws_rds_cluster.coder.database_name}?sslmode=require" + cluster_id = aws_rds_cluster.coder.id + engine_version = aws_rds_cluster.coder.engine_version + }) +} + +# Store LiteLLM DB credentials in Secrets Manager +resource "aws_secretsmanager_secret" "litellm_db" { + name_prefix = "litellm-db-" + description = "LiteLLM PostgreSQL database credentials" + recovery_window_in_days = 7 + + tags = { + Name = "litellm-db-secret" + } +} + +resource "aws_secretsmanager_secret_version" "litellm_db" { + secret_id = aws_secretsmanager_secret.litellm_db.id + secret_string = jsonencode({ + username = var.litellm_username + password = random_password.litellm_password.result + host = aws_rds_cluster.litellm.endpoint + reader_host = aws_rds_cluster.litellm.reader_endpoint + port = aws_rds_cluster.litellm.port + dbname = aws_rds_cluster.litellm.database_name + url = "postgres://${var.litellm_username}:${random_password.litellm_password.result}@${aws_rds_cluster.litellm.endpoint}:${aws_rds_cluster.litellm.port}/${aws_rds_cluster.litellm.database_name}?sslmode=require" + cluster_id = aws_rds_cluster.litellm.id + engine_version = aws_rds_cluster.litellm.engine_version + }) +} + +output "coder_cluster_endpoint" { + description = "Aurora cluster writer endpoint for Coder" + value = aws_rds_cluster.coder.endpoint +} + +output "coder_cluster_reader_endpoint" { + description = "Aurora cluster reader endpoint for Coder" + value = aws_rds_cluster.coder.reader_endpoint +} + +output "coder_cluster_port" { + description = "Aurora cluster port for Coder" + value = aws_rds_cluster.coder.port +} + +output "coder_db_secret_arn" { + description = "ARN of Secrets Manager secret containing Coder DB credentials" + value = aws_secretsmanager_secret.coder_db.arn +} + +output "litellm_cluster_endpoint" { + description = "Aurora cluster writer endpoint for LiteLLM" + value = aws_rds_cluster.litellm.endpoint } -output "rds_username" { - description = "Database instance root username" - value = aws_db_instance.db.username +output "litellm_cluster_reader_endpoint" { + description = "Aurora cluster reader endpoint for LiteLLM" + value = aws_rds_cluster.litellm.reader_endpoint } -output "rds_address" { - description = "Database instance address" - value = aws_db_instance.db.address +output "litellm_cluster_port" { + description = "Aurora cluster port for LiteLLM" + value = aws_rds_cluster.litellm.port } -output "rds_password" { - description = "Database instance root password" - value = aws_db_instance.db.password - sensitive = true +output "litellm_db_secret_arn" { + description = "ARN of Secrets Manager secret containing LiteLLM DB credentials" + value = aws_secretsmanager_secret.litellm_db.arn } diff --git a/infra/aws/us-west-2/k8s/karpenter/main.tf b/infra/aws/us-west-2/k8s/karpenter/main.tf index e69cdad..0fbbc92 100644 --- a/infra/aws/us-west-2/k8s/karpenter/main.tf +++ b/infra/aws/us-west-2/k8s/karpenter/main.tf @@ -238,13 +238,13 @@ module "nodepools" { for_each = { for np in local.nodepool_configs : np.name => np } source = "../../../../../modules/k8s/objects/nodepool" - name = each.value.name - node_labels = each.value.node_labels - node_taints = each.value.node_taints - node_requirements = each.value.node_requirements - node_class_ref_name = each.value.node_class_ref_name - disruption_consolidate_after = lookup(each.value, "disruption_consolidate_after", "1m") - disruption_consolidation_policy = lookup(each.value, "disruption_consolidation_policy", "WhenEmpty") + name = each.value.name + node_labels = each.value.node_labels + node_taints = each.value.node_taints + node_requirements = each.value.node_requirements + node_class_ref_name = each.value.node_class_ref_name + disruption_consolidate_after = lookup(each.value, "disruption_consolidate_after", "1m") + disruption_consolidation_policy = lookup(each.value, "disruption_consolidation_policy", "WhenEmpty") depends_on = [module.karpenter-addon] } diff --git a/infra/aws/us-west-2/k8s/nodepools/main.tf b/infra/aws/us-west-2/k8s/nodepools/main.tf index c0f4d49..74d63c5 100644 --- a/infra/aws/us-west-2/k8s/nodepools/main.tf +++ b/infra/aws/us-west-2/k8s/nodepools/main.tf @@ -289,7 +289,7 @@ resource "kubernetes_manifest" "coder_workspaces_nodepool" { } } spec = { - expireAfter = "336h" # 14 days for workspace nodes + expireAfter = "336h" # 14 days for workspace nodes nodeClassRef = { group = "eks.amazonaws.com" kind = "NodeClass" diff --git a/modules/k8s/bootstrap/karpenter/main.tf b/modules/k8s/bootstrap/karpenter/main.tf index 55781aa..9291e23 100644 --- a/modules/k8s/bootstrap/karpenter/main.tf +++ b/modules/k8s/bootstrap/karpenter/main.tf @@ -103,7 +103,7 @@ variable "ec2nodeclass_configs" { block_device_mappings = optional(list(object({ device_name = string ebs = object({ - volume_size = string + volume_size = string # Kubernetes-style size with unit (e.g. "1400Gi", "50Gi") volume_type = string encrypted = optional(bool, false) delete_on_termination = optional(bool, true) @@ -256,7 +256,13 @@ resource "helm_release" "karpenter" { settings = { clusterName = var.cluster_name featureGates = { + # Cost optimization - consolidate workloads to better-priced spot instances spotToSpotConsolidation = true + # Future features - currently disabled + staticCapacity = false # New capacity management feature + reservedCapacity = false # For Reserved Instance support + nodeRepair = false # Experimental - automatic node repair + nodeOverlay = false # Experimental - network overlay support } interruptionQueue = module.karpenter.queue_name } @@ -280,16 +286,22 @@ resource "kubernetes_manifest" "ec2nodeclass" { manifest = yamldecode(module.ec2nodeclass[count.index].manifest) } -# module "nodepool" { -# count = length(local.nodepool_configs) -# source = "../objects/nodepool" -# name = local.nodepool_configs[count.index].name -# node_labels = local.nodepool_configs[count.index].node_labels -# node_taints = local.nodepool_configs[count.index].node_taints -# node_requirements = local.nodepool_configs[count.index].node_requirements -# node_class_ref_name = local.nodepool_configs[count.index].node_class_ref_name -# node_expires_after = local.nodepool_configs[count.index].node_expires_after -# disruption_consolidation_policy = local.nodepool_configs[count.index].disruption_consolidation_policy -# disruption_consolidate_after = local.nodepool_configs[count.index].disruption_consolidate_after -# } +module "nodepool" { + count = length(var.nodepool_configs) + source = "../../objects/nodepool" + name = var.nodepool_configs[count.index].name + node_labels = var.nodepool_configs[count.index].node_labels + node_taints = var.nodepool_configs[count.index].node_taints + node_requirements = var.nodepool_configs[count.index].node_requirements + node_class_ref_name = var.nodepool_configs[count.index].node_class_ref_name + node_expires_after = var.nodepool_configs[count.index].node_expires_after + disruption_consolidation_policy = var.nodepool_configs[count.index].disruption_consolidation_policy + disruption_consolidate_after = var.nodepool_configs[count.index].disruption_consolidate_after +} + +resource "kubernetes_manifest" "nodepool" { + depends_on = [helm_release.karpenter] + count = length(var.nodepool_configs) + manifest = yamldecode(module.nodepool[count.index].manifest) +} diff --git a/modules/k8s/objects/ec2nodeclass/main.tf b/modules/k8s/objects/ec2nodeclass/main.tf index 64c5015..7062bc0 100644 --- a/modules/k8s/objects/ec2nodeclass/main.tf +++ b/modules/k8s/objects/ec2nodeclass/main.tf @@ -27,7 +27,7 @@ variable "block_device_mappings" { type = list(object({ device_name = string ebs = object({ - volume_size = number # Changed from string to number because AWS EBS volume sizes are numeric GiB values + volume_size = string # Kubernetes-style size with unit (e.g. "1400Gi", "50Gi") volume_type = string encrypted = optional(bool, false) delete_on_termination = optional(bool, true)