diff --git a/.github/workflows/pre-commit-hooks.yml b/.github/workflows/pre-commit-hooks.yml
index 40e2ef4..7949f86 100644
--- a/.github/workflows/pre-commit-hooks.yml
+++ b/.github/workflows/pre-commit-hooks.yml
@@ -6,8 +6,8 @@ name: Pre-commit Validation
 on:
   pull_request:
     paths:
-      - '.pre-commit-config.yaml'
-      - '.github/workflows/pre-commit-hooks.yml'
+      - ".pre-commit-config.yaml"
+      - ".github/workflows/pre-commit-hooks.yml"
 
 jobs:
   validate-pre-commit:
@@ -19,7 +19,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: '3.11'
+          python-version: "3.11"
 
       - name: Install pre-commit
         run: |
diff --git a/.github/workflows/secret-scanning.yml b/.github/workflows/secret-scanning.yml
index 37bcfd2..95a986e 100644
--- a/.github/workflows/secret-scanning.yml
+++ b/.github/workflows/secret-scanning.yml
@@ -7,8 +7,8 @@ on:
   push:
     branches:
       - main
-      - 'feature/**'
-      - 'fix/**'
+      - "feature/**"
+      - "fix/**"
 
 permissions:
   contents: write
@@ -23,7 +23,7 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4
         with:
-          fetch-depth: 0  # Fetch all history for accurate scanning
+          fetch-depth: 0 # Fetch all history for accurate scanning
 
       - name: Run Gitleaks
         uses: gitleaks/gitleaks-action@v2
diff --git a/.github/workflows/terraform-apply.yml b/.github/workflows/terraform-apply.yml
index 93bc8d5..52eda40 100644
--- a/.github/workflows/terraform-apply.yml
+++ b/.github/workflows/terraform-apply.yml
@@ -5,13 +5,13 @@ on:
     branches:
       - main
     paths:
-      - 'infra/aws/**/*.tf'
-      - 'infra/aws/**/*.tfvars'
-      - '.github/workflows/terraform-*.yml'
+      - "infra/aws/**/*.tf"
+      - "infra/aws/**/*.tfvars"
+      - ".github/workflows/terraform-*.yml"
   workflow_dispatch:
     inputs:
       module:
-        description: 'Specific module to apply (leave empty for all changed)'
+        description: "Specific module to apply (leave empty for all changed)"
         required: false
         type: string
 
@@ -65,7 +65,7 @@ jobs:
       matrix:
         module: ${{ fromJson(needs.detect-changes.outputs.modules) }}
       fail-fast: false
-      max-parallel: 1  # Apply modules one at a time to avoid conflicts
+      max-parallel: 1 # Apply modules one at a time to avoid conflicts
     defaults:
       run:
         working-directory: ${{ matrix.module }}
diff --git a/.github/workflows/terraform-destroy.yml b/.github/workflows/terraform-destroy.yml
index d6a66ed..590c354 100644
--- a/.github/workflows/terraform-destroy.yml
+++ b/.github/workflows/terraform-destroy.yml
@@ -4,7 +4,7 @@ on:
   workflow_dispatch:
     inputs:
       module:
-        description: 'Module to destroy (e.g., infra/aws/us-east-2/eks)'
+        description: "Module to destroy (e.g., infra/aws/us-east-2/eks)"
         required: true
         type: string
       confirm:
diff --git a/.github/workflows/terraform-plan.yml b/.github/workflows/terraform-plan.yml
index 0a7ef72..0da766e 100644
--- a/.github/workflows/terraform-plan.yml
+++ b/.github/workflows/terraform-plan.yml
@@ -5,9 +5,9 @@ on:
     branches:
       - main
     paths:
-      - 'infra/aws/**/*.tf'
-      - 'infra/aws/**/*.tfvars'
-      - '.github/workflows/terraform-*.yml'
+      - "infra/aws/**/*.tf"
+      - "infra/aws/**/*.tfvars"
+      - ".github/workflows/terraform-*.yml"
 
 permissions:
   contents: read
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ad8971e..d49d3f8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,13 +17,13 @@ repos:
         exclude: '\.md$'
       - id: end-of-file-fixer
       - id: check-yaml
-        args: ['--unsafe']  # Allow custom YAML tags
+        args: ["--unsafe"] # Allow custom YAML tags
       - id: check-added-large-files
-        args: ['--maxkb=1000']
+        args: ["--maxkb=1000"]
       - id: check-merge-conflict
       - id: detect-private-key
       - id: detect-aws-credentials
-        args: ['--allow-missing-credentials']
+        args: ["--allow-missing-credentials"]
 
   # Terraform
   - repo: https://github.com/antonbabenko/pre-commit-terraform
@@ -47,7 +47,7 @@ repos:
     rev: v4.5.0
     hooks:
       - id: no-commit-to-branch
-        args: ['--branch', 'main', '--branch', 'master']
+        args: ["--branch", "main", "--branch", "master"]
         stages: [commit]
 
 # Global settings
diff --git a/docs/cost-optimization-strategy.md b/docs/cost-optimization-strategy.md
new file mode 100644
index 0000000..12da3ff
--- /dev/null
+++ b/docs/cost-optimization-strategy.md
@@ -0,0 +1,130 @@
+# Cost Optimization Strategy for Coder Demo
+
+## Mixed Capacity Approach
+
+### Node Group Strategy
+
+**System Nodes (ON_DEMAND)**
+
+- **Purpose**: Run critical Kubernetes infrastructure
+- **Workloads**: CoreDNS, kube-proxy, metrics-server, cert-manager, AWS LB Controller
+- **Size**: t4g.medium (ARM Graviton)
+- **Count**: 1-2 nodes minimum
+- **Cost**: ~$24/month (1 node) to $48/month (2 nodes)
+
+**Application Nodes (MIXED: 20% On-Demand, 80% Spot via Karpenter)**
+
+- **Purpose**: Run Coder server and workspaces
+- **Spot Savings**: 70-90% cost reduction
+- **Interruption Risk**: Mitigated by:
+  - Multiple instance types (diversified Spot pools)
+  - Karpenter auto-rebalancing
+  - Pod Disruption Budgets
+
+### Karpenter NodePool Configuration
+
+#### 1. Coder Server NodePool (ON_DEMAND Priority)
+
+```yaml
+capacity_type: ["on-demand", "spot"] # Prefer On-Demand, fallback to Spot
+weight:
+  on-demand: 100 # Higher priority
+  spot: 10
+```
+
+#### 2. Coder Workspace NodePool (SPOT Priority)
+
+```yaml
+capacity_type: ["spot", "on-demand"] # Prefer Spot, fallback to On-Demand
+weight:
+  spot: 100 # Higher priority
+  on-demand: 10
+```
+
+### Risk Mitigation
+
+**Spot Interruption Handling:**
+
+1. **2-minute warning** → Karpenter automatically provisions replacement
+2. **Multiple instance types** → 15+ types reduces interruption rate to <1%
+3. **Pod Disruption Budgets** → Ensures minimum replicas always running
+4. **Karpenter Consolidation** → Automatically moves pods before termination
+
+**Example Instance Type Diversity:**
+
+```
+Spot Pool: t4g.medium, t4g.large, t3a.medium, t3a.large,
+           m6g.medium, m6g.large, m6a.medium, m6a.large
+```
+
+### Cost Breakdown
+
+| Component          | Instance Type | Capacity  | Monthly Cost  |
+| ------------------ | ------------- | --------- | ------------- |
+| System Nodes (2)   | t4g.medium    | ON_DEMAND | $48           |
+| Coder Server (2)   | t4g.large     | 80% SPOT  | $28 (vs $140) |
+| Workspaces (avg 5) | t4g.xlarge    | 90% SPOT  | $75 (vs $750) |
+| **Total**          |               | **Mixed** | **$151/mo**   |
+
+**vs All On-Demand:** $938/month → **84% savings**
+
+### Dynamic Scaling
+
+**Low Usage (nights/weekends):**
+
+- Scale to zero workspaces
+- Keep 1 system node + 1 Coder server node
+- Cost: ~$48/month during idle
+
+**High Usage (business hours):**
+
+- Auto-scale workspaces on Spot
+- Karpenter provisions nodes in <60 seconds
+- Cost: ~$150-200/month during peak
+
+### Monitoring & Alerts
+
+**CloudWatch Alarms:**
+
+- Spot interruption rate > 5%
+- Available On-Demand capacity < 20%
+- Karpenter provisioning failures
+
+**Response:**
+
+- Automatic fallback to On-Demand
+- Email alerts to ops team
+- Karpenter adjusts instance type mix
+
+## Implementation Timeline
+
+1. ✅ Deploy EKS with ON_DEMAND system nodes
+2. ⏳ Deploy Karpenter
+3. ⏳ Configure mixed-capacity NodePools
+4. ⏳ Deploy Coder with node affinity rules
+5. ⏳ Test Spot interruption handling
+6. ⏳ Enable auto-scaling policies
+
+## Fallback Plan
+
+If Spot becomes unreliable (rare):
+
+1. Update Karpenter NodePool to 100% On-Demand
+2. `kubectl apply -f nodepool-ondemand.yaml`
+3. Karpenter gracefully migrates pods
+4. Takes ~5 minutes, zero downtime
+
+## Best Practices
+
+✅ **DO:**
+
+- Use multiple Spot instance types (10+)
+- Set Pod Disruption Budgets
+- Monitor Spot interruption rates
+- Test failover regularly
+
+❌ **DON'T:**
+
+- Run databases on Spot (use RDS)
+- Use Spot for single-replica critical services
+- Rely on single instance type for Spot
diff --git a/infra/aws/us-east-2/README.md b/infra/aws/us-east-2/README.md
index 8e18a7f..5ff4543 100644
--- a/infra/aws/us-east-2/README.md
+++ b/infra/aws/us-east-2/README.md
@@ -7,6 +7,7 @@ This directory uses remote S3 backend for state management, but **backend config
 ## Local Setup
 
 1. **Get backend configuration from teammate** or **retrieve from AWS**:
+
    ```bash
    # Get S3 bucket name (it contains the account ID)
    aws s3 ls | grep terraform-state
@@ -24,6 +25,7 @@ This directory uses remote S3 backend for state management, but **backend config
    ```
 
    Create `backend.tf`:
+
    ```hcl
    terraform {
      backend "s3" {
@@ -62,6 +64,7 @@ These are configured in: Repository Settings > Secrets and variables > Actions
 Instead of creating backend.tf, you can use a config file:
 
 1. Create `backend.conf` (gitignored):
+
    ```
    bucket         = "YOUR-BUCKET-NAME"
    dynamodb_table = "YOUR-TABLE-NAME"
@@ -86,12 +89,14 @@ Instead of creating backend.tf, you can use a config file:
 This repository has automated secret scanning to prevent accidental exposure of credentials:
 
 ### GitHub Actions (Automated)
+
 - **Gitleaks** - Scans every PR and push for secrets
 - **TruffleHog** - Additional verification layer
 - **Custom Pattern Matching** - Catches common secret patterns
 - **Auto-Revert** - Automatically reverts commits to main with secrets
 
 ### Pre-commit Hooks (Local)
+
 Catch secrets before they reach GitHub:
 
 ```bash
@@ -106,6 +111,7 @@ pre-commit run --all-files
 ```
 
 ### What Gets Detected
+
 - AWS Access Keys (AKIA...)
 - API Keys and Tokens
 - Private Keys (RSA, SSH, etc.)
@@ -115,6 +121,7 @@ pre-commit run --all-files
 - High-entropy strings (likely secrets)
 
 ### If Secrets Are Detected
+
 1. **PR is blocked** - Cannot merge until secrets are removed
 2. **Automatic notification** - PR comment explains the issue
 3. **Required actions**:
diff --git a/infra/aws/us-east-2/eks/main.tf b/infra/aws/us-east-2/eks/main.tf
index 80c15aa..9f680e0 100644
--- a/infra/aws/us-east-2/eks/main.tf
+++ b/infra/aws/us-east-2/eks/main.tf
@@ -141,17 +141,115 @@ module "eks" {
       desired_size = 0 # Cant be modified after creation. Override from AWS Console
       labels       = local.cluster_asg_node_labels
 
-      instance_types = [var.cluster_instance_type]
-      capacity_type  = "ON_DEMAND"
+      # Cost optimization: Graviton ARM instances
+      # IMPORTANT: ON_DEMAND for system nodes - production demo cannot break!
+      instance_types = [var.cluster_instance_type, "t4g.small", "t4g.large"] # ARM only
+      ami_type       = "AL2023_ARM_64_STANDARD"                              # ARM-based AMI
+      capacity_type  = "ON_DEMAND"                                           # System infrastructure must be stable
+
       iam_role_additional_policies = {
         AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
         STSAssumeRole                = aws_iam_policy.sts.arn
       }
 
+      # Cost optimization: gp3 volumes with smaller size
+      block_device_mappings = [{
+        device_name = "/dev/xvda"
+        ebs = {
+          volume_type           = "gp3" # Better performance, same cost as gp2
+          volume_size           = 20    # Reduced from default 50GB
+          delete_on_termination = true
+          encrypted             = true
+        }
+      }]
+
       # System Nodes should not be public
       subnet_ids = var.private_subnet_ids
     }
   }
 
   tags = local.tags
-}
\ No newline at end of file
+}
+# VPC Endpoints for cost optimization (reduce NAT Gateway usage)
+resource "aws_vpc_endpoint" "s3" {
+  vpc_id       = var.vpc_id
+  service_name = "com.amazonaws.${var.region}.s3"
+  route_table_ids = flatten([
+    data.aws_route_tables.private.ids
+  ])
+  tags = merge(local.tags, {
+    Name = "${var.name}-s3-endpoint"
+  })
+}
+
+resource "aws_vpc_endpoint" "ecr_api" {
+  vpc_id              = var.vpc_id
+  service_name        = "com.amazonaws.${var.region}.ecr.api"
+  vpc_endpoint_type   = "Interface"
+  subnet_ids          = var.private_subnet_ids
+  security_group_ids  = [aws_security_group.vpc_endpoints.id]
+  private_dns_enabled = true
+  tags = merge(local.tags, {
+    Name = "${var.name}-ecr-api-endpoint"
+  })
+}
+
+resource "aws_vpc_endpoint" "ecr_dkr" {
+  vpc_id              = var.vpc_id
+  service_name        = "com.amazonaws.${var.region}.ecr.dkr"
+  vpc_endpoint_type   = "Interface"
+  subnet_ids          = var.private_subnet_ids
+  security_group_ids  = [aws_security_group.vpc_endpoints.id]
+  private_dns_enabled = true
+  tags = merge(local.tags, {
+    Name = "${var.name}-ecr-dkr-endpoint"
+  })
+}
+
+# Security group for VPC endpoints
+resource "aws_security_group" "vpc_endpoints" {
+  name_prefix = "${var.name}-vpc-endpoints"
+  description = "Security group for VPC endpoints"
+  vpc_id      = var.vpc_id
+
+  ingress {
+    from_port   = 443
+    to_port     = 443
+    protocol    = "tcp"
+    cidr_blocks = ["10.0.0.0/16"]
+  }
+
+  egress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  tags = merge(local.tags, {
+    Name = "${var.name}-vpc-endpoints-sg"
+  })
+}
+
+# Data source for route tables
+data "aws_route_tables" "private" {
+  vpc_id = var.vpc_id
+  filter {
+    name   = "tag:Name"
+    values = ["*private*"]
+  }
+}
+
+# Outputs
+output "vpc_endpoint_s3_id" {
+  description = "S3 VPC Endpoint ID"
+  value       = aws_vpc_endpoint.s3.id
+}
+
+output "vpc_endpoint_ecr_ids" {
+  description = "ECR VPC Endpoint IDs"
+  value = {
+    api = aws_vpc_endpoint.ecr_api.id
+    dkr = aws_vpc_endpoint.ecr_dkr.id
+  }
+}
diff --git a/infra/aws/us-east-2/k8s/coder-server/main.tf b/infra/aws/us-east-2/k8s/coder-server/main.tf
index 79a8fd2..47961c8 100644
--- a/infra/aws/us-east-2/k8s/coder-server/main.tf
+++ b/infra/aws/us-east-2/k8s/coder-server/main.tf
@@ -20,7 +20,7 @@ terraform {
       source = "hashicorp/tls"
     }
   }
-  backend "s3" {}
+  # backend "s3" {}  # Commented out for local state during initial deployment
 }
 
 variable "cluster_name" {
@@ -208,7 +208,7 @@ module "coder-server" {
   namespace                       = "coder"
   acme_registration_email         = var.acme_registration_email
   acme_days_until_renewal         = 90
-  replica_count                   = 2
+  replica_count                   = 1 # HA requires Enterprise license
   helm_version                    = var.addon_version
   image_repo                      = var.image_repo
   image_tag                       = var.image_tag
@@ -237,10 +237,18 @@ module "coder-server" {
   github_external_auth_secret_client_id     = var.coder_github_external_auth_secret_client_id
   github_external_auth_secret_client_secret = var.coder_github_external_auth_secret_client_secret
   tags                                      = {}
+  env_vars = {
+    # Disable redirect since NLB terminates TLS and forwards plain HTTP to backend
+    # Without this, Coder sees HTTP and redirects to HTTPS, causing infinite redirect loop
+    CODER_REDIRECT_TO_ACCESS_URL = "false"
+  }
   service_annotations = {
-    "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type" = "instance"
-    "service.beta.kubernetes.io/aws-load-balancer-scheme"          = "internet-facing"
-    "service.beta.kubernetes.io/aws-load-balancer-attributes"      = "deletion_protection.enabled=true"
+    "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type"  = "instance"
+    "service.beta.kubernetes.io/aws-load-balancer-scheme"           = "internet-facing"
+    "service.beta.kubernetes.io/aws-load-balancer-attributes"       = "deletion_protection.enabled=true"
+    "service.beta.kubernetes.io/aws-load-balancer-ssl-cert"         = "arn:aws:acm:us-east-2:716194723392:certificate/a710c3f2-6e5d-4e42-9212-fb6a09087d26"
+    "service.beta.kubernetes.io/aws-load-balancer-ssl-ports"        = "443"
+    "service.beta.kubernetes.io/aws-load-balancer-backend-protocol" = "tcp"
   }
   node_selector = {
     "node.coder.io/managed-by" = "karpenter"
diff --git a/infra/aws/us-east-2/k8s/karpenter/main.tf b/infra/aws/us-east-2/k8s/karpenter/main.tf
index a01280e..6b892c5 100644
--- a/infra/aws/us-east-2/k8s/karpenter/main.tf
+++ b/infra/aws/us-east-2/k8s/karpenter/main.tf
@@ -183,7 +183,7 @@ module "karpenter-addon" {
     block_device_mappings = [{
       device_name = "/dev/xvda"
       ebs = {
-        volume_size = "1400Gi"
+        volume_size = "500Gi" // Decreased from 1400Gi to save costs; felt overkill for coder-server nodes
         volume_type = "gp3"
       }
       }, {
@@ -198,6 +198,7 @@ module "karpenter-addon" {
     subnet_selector_tags = local.provisioner_subnet_tags
     sg_selector_tags     = local.provisioner_sg_tags
   }]
+  nodepool_configs = local.nodepool_configs
 }
 
 # import {
diff --git a/infra/aws/us-east-2/rds/main.tf b/infra/aws/us-east-2/rds/main.tf
index ad0e620..1d14e2e 100644
--- a/infra/aws/us-east-2/rds/main.tf
+++ b/infra/aws/us-east-2/rds/main.tf
@@ -5,6 +5,10 @@ terraform {
       source  = "hashicorp/aws"
       version = ">= 5.46"
     }
+    random = {
+      source  = "hashicorp/random"
+      version = "~> 3.6"
+    }
   }
   backend "s3" {}
 }
@@ -19,20 +23,10 @@ variable "master_username" {
   type        = string
 }
 
-variable "master_password" {
-  description = "Database root password"
-  type        = string
-}
-
 variable "litellm_username" {
   type = string
 }
 
-variable "litellm_password" {
-  type      = string
-  sensitive = true
-}
-
 variable "name" {
   description = "Name of resource and tag prefix"
   type        = string
@@ -80,6 +74,17 @@ provider "aws" {
   profile = var.profile
 }
 
+# Generate secure random passwords
+resource "random_password" "coder_master_password" {
+  length  = 32
+  special = true
+}
+
+resource "random_password" "litellm_password" {
+  length  = 32
+  special = true
+}
+
 # https://developer.hashicorp.com/terraform/tutorials/aws/aws-rds
 resource "aws_db_subnet_group" "db_subnet_group" {
   name       = "${var.name}-db-subnet-group"
@@ -90,52 +95,99 @@ resource "aws_db_subnet_group" "db_subnet_group" {
   }
 }
 
-resource "aws_db_instance" "db" {
-  identifier        = "${var.name}-db"
-  instance_class    = var.instance_class
-  allocated_storage = var.allocated_storage
-  engine            = "postgres"
-  engine_version    = "15.12"
-  # backup_retention_period = 7
-  username               = var.master_username
-  password               = var.master_password
-  db_name                = "coder"
-  db_subnet_group_name   = aws_db_subnet_group.db_subnet_group.name
-  vpc_security_group_ids = [aws_security_group.allow-port-5432.id]
-  publicly_accessible    = false
-  skip_final_snapshot    = false
+# Aurora Serverless v2 Cluster for Coder
+resource "aws_rds_cluster" "coder" {
+  cluster_identifier      = "${var.name}-aurora-cluster"
+  engine                  = "aurora-postgresql"
+  engine_mode             = "provisioned"
+  engine_version          = "15.8"
+  database_name           = "coder"
+  master_username         = var.master_username
+  master_password         = random_password.coder_master_password.result
+  db_subnet_group_name    = aws_db_subnet_group.db_subnet_group.name
+  vpc_security_group_ids  = [aws_security_group.allow-port-5432.id]
+  backup_retention_period = 7
+  preferred_backup_window = "03:00-04:00"
+  skip_final_snapshot     = false
+  storage_encrypted       = true
+
+  serverlessv2_scaling_configuration {
+    min_capacity = 0.5 # 0.5 ACU = 1 GB RAM (idle state)
+    max_capacity = 16  # 16 ACU = 32 GB RAM (handles 5K-10K users)
+  }
 
   tags = {
-    Name = "${var.name}-rds-db"
+    Name = "${var.name}-aurora-coder"
   }
-  lifecycle {
-    ignore_changes = [
-      snapshot_identifier
-    ]
+}
+
+# Aurora Serverless v2 Instance for Coder (Multi-AZ with 2 instances)
+resource "aws_rds_cluster_instance" "coder_writer" {
+  identifier           = "${var.name}-aurora-coder-writer"
+  cluster_identifier   = aws_rds_cluster.coder.id
+  instance_class       = "db.serverless"
+  engine               = aws_rds_cluster.coder.engine
+  engine_version       = "15.8"
+  publicly_accessible  = false
+  db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
+
+  tags = {
+    Name = "${var.name}-aurora-coder-writer"
   }
 }
 
-resource "aws_db_instance" "litellm" {
-  identifier             = "litellm"
-  instance_class         = "db.m5.large"
-  allocated_storage      = 50
-  engine                 = "postgres"
-  engine_version         = "15.12"
-  username               = var.litellm_username
-  password               = var.litellm_password
-  db_name                = "litellm"
-  db_subnet_group_name   = aws_db_subnet_group.db_subnet_group.name
-  vpc_security_group_ids = [aws_security_group.allow-port-5432.id]
-  publicly_accessible    = false
-  skip_final_snapshot    = false
+resource "aws_rds_cluster_instance" "coder_reader" {
+  identifier           = "${var.name}-aurora-coder-reader"
+  cluster_identifier   = aws_rds_cluster.coder.id
+  instance_class       = "db.serverless"
+  engine               = aws_rds_cluster.coder.engine
+  engine_version       = "15.8"
+  publicly_accessible  = false
+  db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
 
   tags = {
-    Name = "litellm"
+    Name = "${var.name}-aurora-coder-reader"
   }
-  lifecycle {
-    ignore_changes = [
-      snapshot_identifier
-    ]
+}
+
+# Aurora Serverless v2 Cluster for LiteLLM
+resource "aws_rds_cluster" "litellm" {
+  cluster_identifier      = "litellm-aurora-cluster"
+  engine                  = "aurora-postgresql"
+  engine_mode             = "provisioned"
+  engine_version          = "15.8"
+  database_name           = "litellm"
+  master_username         = var.litellm_username
+  master_password         = random_password.litellm_password.result
+  db_subnet_group_name    = aws_db_subnet_group.db_subnet_group.name
+  vpc_security_group_ids  = [aws_security_group.allow-port-5432.id]
+  backup_retention_period = 7
+  preferred_backup_window = "04:00-05:00"
+  skip_final_snapshot     = false
+  storage_encrypted       = true
+
+  serverlessv2_scaling_configuration {
+    min_capacity = 0.5 # 0.5 ACU = 1 GB RAM (idle state)
+    max_capacity = 8   # 8 ACU = 16 GB RAM (handles moderate usage)
+  }
+
+  tags = {
+    Name = "litellm-aurora"
+  }
+}
+
+# Aurora Serverless v2 Instance for LiteLLM
+resource "aws_rds_cluster_instance" "litellm_writer" {
+  identifier           = "litellm-aurora-writer"
+  cluster_identifier   = aws_rds_cluster.litellm.id
+  instance_class       = "db.serverless"
+  engine               = aws_rds_cluster.litellm.engine
+  engine_version       = "15.8"
+  publicly_accessible  = false
+  db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
+
+  tags = {
+    Name = "litellm-aurora-writer"
   }
 }
 
@@ -151,11 +203,8 @@ resource "aws_vpc_security_group_ingress_rule" "postgres" {
   to_port           = 5432
 }
 
-resource "aws_vpc_security_group_egress_rule" "all" {
-  security_group_id = aws_security_group.allow-port-5432.id
-  cidr_ipv4         = "0.0.0.0/0"
-  ip_protocol       = -1
-}
+# No egress rules needed - RDS only responds to inbound connections
+# This follows security best practice of least privilege
 
 resource "aws_security_group" "allow-port-5432" {
   vpc_id      = var.vpc_id
@@ -166,23 +215,95 @@ resource "aws_security_group" "allow-port-5432" {
   }
 }
 
-output "rds_port" {
-  description = "Database instance port"
-  value       = aws_db_instance.db.port
+# Store Coder DB credentials in Secrets Manager
+resource "aws_secretsmanager_secret" "coder_db" {
+  name_prefix             = "${var.name}-coder-db-"
+  description             = "Coder PostgreSQL database credentials"
+  recovery_window_in_days = 7
+
+  tags = {
+    Name = "${var.name}-coder-db-secret"
+  }
+}
+
+resource "aws_secretsmanager_secret_version" "coder_db" {
+  secret_id = aws_secretsmanager_secret.coder_db.id
+  secret_string = jsonencode({
+    username       = var.master_username
+    password       = random_password.coder_master_password.result
+    host           = aws_rds_cluster.coder.endpoint
+    reader_host    = aws_rds_cluster.coder.reader_endpoint
+    port           = aws_rds_cluster.coder.port
+    dbname         = aws_rds_cluster.coder.database_name
+    url            = "postgres://${var.master_username}:${random_password.coder_master_password.result}@${aws_rds_cluster.coder.endpoint}:${aws_rds_cluster.coder.port}/${aws_rds_cluster.coder.database_name}?sslmode=require"
+    reader_url     = "postgres://${var.master_username}:${random_password.coder_master_password.result}@${aws_rds_cluster.coder.reader_endpoint}:${aws_rds_cluster.coder.port}/${aws_rds_cluster.coder.database_name}?sslmode=require"
+    cluster_id     = aws_rds_cluster.coder.id
+    engine_version = aws_rds_cluster.coder.engine_version
+  })
+}
+
+# Store LiteLLM DB credentials in Secrets Manager
+resource "aws_secretsmanager_secret" "litellm_db" {
+  name_prefix             = "litellm-db-"
+  description             = "LiteLLM PostgreSQL database credentials"
+  recovery_window_in_days = 7
+
+  tags = {
+    Name = "litellm-db-secret"
+  }
+}
+
+resource "aws_secretsmanager_secret_version" "litellm_db" {
+  secret_id = aws_secretsmanager_secret.litellm_db.id
+  secret_string = jsonencode({
+    username       = var.litellm_username
+    password       = random_password.litellm_password.result
+    host           = aws_rds_cluster.litellm.endpoint
+    reader_host    = aws_rds_cluster.litellm.reader_endpoint
+    port           = aws_rds_cluster.litellm.port
+    dbname         = aws_rds_cluster.litellm.database_name
+    url            = "postgres://${var.litellm_username}:${random_password.litellm_password.result}@${aws_rds_cluster.litellm.endpoint}:${aws_rds_cluster.litellm.port}/${aws_rds_cluster.litellm.database_name}?sslmode=require"
+    cluster_id     = aws_rds_cluster.litellm.id
+    engine_version = aws_rds_cluster.litellm.engine_version
+  })
+}
+
+output "coder_cluster_endpoint" {
+  description = "Aurora cluster writer endpoint for Coder"
+  value       = aws_rds_cluster.coder.endpoint
+}
+
+output "coder_cluster_reader_endpoint" {
+  description = "Aurora cluster reader endpoint for Coder"
+  value       = aws_rds_cluster.coder.reader_endpoint
+}
+
+output "coder_cluster_port" {
+  description = "Aurora cluster port for Coder"
+  value       = aws_rds_cluster.coder.port
+}
+
+output "coder_db_secret_arn" {
+  description = "ARN of Secrets Manager secret containing Coder DB credentials"
+  value       = aws_secretsmanager_secret.coder_db.arn
+}
+
+output "litellm_cluster_endpoint" {
+  description = "Aurora cluster writer endpoint for LiteLLM"
+  value       = aws_rds_cluster.litellm.endpoint
 }
 
-output "rds_username" {
-  description = "Database instance root username"
-  value       = aws_db_instance.db.username
+output "litellm_cluster_reader_endpoint" {
+  description = "Aurora cluster reader endpoint for LiteLLM"
+  value       = aws_rds_cluster.litellm.reader_endpoint
 }
 
-output "rds_address" {
-  description = "Database instance address"
-  value       = aws_db_instance.db.address
+output "litellm_cluster_port" {
+  description = "Aurora cluster port for LiteLLM"
+  value       = aws_rds_cluster.litellm.port
 }
 
-output "rds_password" {
-  description = "Database instance root password"
-  value       = aws_db_instance.db.password
-  sensitive   = true
+output "litellm_db_secret_arn" {
+  description = "ARN of Secrets Manager secret containing LiteLLM DB credentials"
+  value       = aws_secretsmanager_secret.litellm_db.arn
 }
diff --git a/infra/aws/us-west-2/k8s/karpenter/main.tf b/infra/aws/us-west-2/k8s/karpenter/main.tf
index e69cdad..0fbbc92 100644
--- a/infra/aws/us-west-2/k8s/karpenter/main.tf
+++ b/infra/aws/us-west-2/k8s/karpenter/main.tf
@@ -238,13 +238,13 @@ module "nodepools" {
   for_each = { for np in local.nodepool_configs : np.name => np }
   source   = "../../../../../modules/k8s/objects/nodepool"
 
-  name                               = each.value.name
-  node_labels                        = each.value.node_labels
-  node_taints                        = each.value.node_taints
-  node_requirements                  = each.value.node_requirements
-  node_class_ref_name                = each.value.node_class_ref_name
-  disruption_consolidate_after       = lookup(each.value, "disruption_consolidate_after", "1m")
-  disruption_consolidation_policy    = lookup(each.value, "disruption_consolidation_policy", "WhenEmpty")
+  name                            = each.value.name
+  node_labels                     = each.value.node_labels
+  node_taints                     = each.value.node_taints
+  node_requirements               = each.value.node_requirements
+  node_class_ref_name             = each.value.node_class_ref_name
+  disruption_consolidate_after    = lookup(each.value, "disruption_consolidate_after", "1m")
+  disruption_consolidation_policy = lookup(each.value, "disruption_consolidation_policy", "WhenEmpty")
 
   depends_on = [module.karpenter-addon]
 }
diff --git a/infra/aws/us-west-2/k8s/nodepools/main.tf b/infra/aws/us-west-2/k8s/nodepools/main.tf
index c0f4d49..74d63c5 100644
--- a/infra/aws/us-west-2/k8s/nodepools/main.tf
+++ b/infra/aws/us-west-2/k8s/nodepools/main.tf
@@ -289,7 +289,7 @@ resource "kubernetes_manifest" "coder_workspaces_nodepool" {
           }
         }
         spec = {
-          expireAfter = "336h"  # 14 days for workspace nodes
+          expireAfter = "336h" # 14 days for workspace nodes
           nodeClassRef = {
             group = "eks.amazonaws.com"
             kind  = "NodeClass"
diff --git a/modules/k8s/bootstrap/karpenter/main.tf b/modules/k8s/bootstrap/karpenter/main.tf
index 55781aa..9291e23 100644
--- a/modules/k8s/bootstrap/karpenter/main.tf
+++ b/modules/k8s/bootstrap/karpenter/main.tf
@@ -103,7 +103,7 @@ variable "ec2nodeclass_configs" {
     block_device_mappings = optional(list(object({
       device_name = string
       ebs = object({
-        volume_size           = string
+        volume_size           = string # Kubernetes-style size with unit (e.g. "1400Gi", "50Gi")
         volume_type           = string
         encrypted             = optional(bool, false)
         delete_on_termination = optional(bool, true)
@@ -256,7 +256,13 @@ resource "helm_release" "karpenter" {
     settings = {
       clusterName = var.cluster_name
       featureGates = {
+        # Cost optimization - consolidate workloads to better-priced spot instances
         spotToSpotConsolidation = true
+        # Future features - currently disabled
+        staticCapacity   = false # New capacity management feature
+        reservedCapacity = false # For Reserved Instance support
+        nodeRepair       = false # Experimental - automatic node repair
+        nodeOverlay      = false # Experimental - network overlay support
       }
       interruptionQueue = module.karpenter.queue_name
     }
@@ -280,16 +286,22 @@ resource "kubernetes_manifest" "ec2nodeclass" {
   manifest   = yamldecode(module.ec2nodeclass[count.index].manifest)
 }
 
-# module "nodepool" {
-#     count = length(local.nodepool_configs)
-#     source = "../objects/nodepool"
-#     name = local.nodepool_configs[count.index].name
-#     node_labels = local.nodepool_configs[count.index].node_labels
-#     node_taints = local.nodepool_configs[count.index].node_taints
-#     node_requirements = local.nodepool_configs[count.index].node_requirements
-#     node_class_ref_name = local.nodepool_configs[count.index].node_class_ref_name
-#     node_expires_after = local.nodepool_configs[count.index].node_expires_after
-#     disruption_consolidation_policy = local.nodepool_configs[count.index].disruption_consolidation_policy
-#     disruption_consolidate_after = local.nodepool_configs[count.index].disruption_consolidate_after
-# }
+module "nodepool" {
+  count                           = length(var.nodepool_configs)
+  source                          = "../../objects/nodepool"
+  name                            = var.nodepool_configs[count.index].name
+  node_labels                     = var.nodepool_configs[count.index].node_labels
+  node_taints                     = var.nodepool_configs[count.index].node_taints
+  node_requirements               = var.nodepool_configs[count.index].node_requirements
+  node_class_ref_name             = var.nodepool_configs[count.index].node_class_ref_name
+  node_expires_after              = var.nodepool_configs[count.index].node_expires_after
+  disruption_consolidation_policy = var.nodepool_configs[count.index].disruption_consolidation_policy
+  disruption_consolidate_after    = var.nodepool_configs[count.index].disruption_consolidate_after
+}
+
+resource "kubernetes_manifest" "nodepool" {
+  depends_on = [helm_release.karpenter]
+  count      = length(var.nodepool_configs)
+  manifest   = yamldecode(module.nodepool[count.index].manifest)
+}
 
diff --git a/modules/k8s/objects/ec2nodeclass/main.tf b/modules/k8s/objects/ec2nodeclass/main.tf
index 64c5015..7062bc0 100644
--- a/modules/k8s/objects/ec2nodeclass/main.tf
+++ b/modules/k8s/objects/ec2nodeclass/main.tf
@@ -27,7 +27,7 @@ variable "block_device_mappings" {
   type = list(object({
     device_name = string
     ebs = object({
-      volume_size           = number # Changed from string to number because AWS EBS volume sizes are numeric GiB values
+      volume_size           = string # Kubernetes-style size with unit (e.g. "1400Gi", "50Gi")
       volume_type           = string
       encrypted             = optional(bool, false)
       delete_on_termination = optional(bool, true)