diff --git a/.github/workflows/pre-commit-hooks.yml b/.github/workflows/pre-commit-hooks.yml
new file mode 100644
index 0000000..7949f86
--- /dev/null
+++ b/.github/workflows/pre-commit-hooks.yml
@@ -0,0 +1,56 @@
+# Optional: Pre-commit hooks workflow
+# This provides guidance for setting up local pre-commit hooks
+
+name: Pre-commit Validation
+
+on:
+ pull_request:
+ paths:
+ - ".pre-commit-config.yaml"
+ - ".github/workflows/pre-commit-hooks.yml"
+
+jobs:
+ validate-pre-commit:
+ name: Validate Pre-commit Configuration
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.11"
+
+ - name: Install pre-commit
+ run: |
+ pip install pre-commit
+ pre-commit --version
+
+ - name: Run pre-commit on all files
+ run: pre-commit run --all-files
+ continue-on-error: true
+
+ - name: Show pre-commit setup instructions
+ if: always()
+ run: |
+ echo "## π Setting up Pre-commit Hooks Locally"
+ echo ""
+ echo "Pre-commit hooks help catch secrets BEFORE they reach GitHub."
+ echo ""
+ echo "### Installation:"
+ echo "\`\`\`bash"
+ echo "# Install pre-commit"
+ echo "pip install pre-commit"
+ echo ""
+ echo "# Install the git hooks"
+ echo "pre-commit install"
+ echo ""
+ echo "# (Optional) Run against all files"
+ echo "pre-commit run --all-files"
+ echo "\`\`\`"
+ echo ""
+ echo "### What it does:"
+ echo "- Scans for secrets before each commit"
+ echo "- Validates Terraform formatting"
+ echo "- Checks for merge conflicts"
+ echo "- Prevents large files from being committed"
diff --git a/.github/workflows/secret-scanning.yml b/.github/workflows/secret-scanning.yml
new file mode 100644
index 0000000..95a986e
--- /dev/null
+++ b/.github/workflows/secret-scanning.yml
@@ -0,0 +1,282 @@
+name: Secret Scanning
+
+on:
+ pull_request:
+ branches:
+ - main
+ push:
+ branches:
+ - main
+ - "feature/**"
+ - "fix/**"
+
+permissions:
+ contents: write
+ pull-requests: write
+ issues: write
+
+jobs:
+ gitleaks:
+ name: Gitleaks Secret Scanning
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0 # Fetch all history for accurate scanning
+
+ - name: Run Gitleaks
+ uses: gitleaks/gitleaks-action@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ GITLEAKS_ENABLE_COMMENTS: true
+
+ - name: Upload Gitleaks Report
+ if: failure()
+ uses: actions/upload-artifact@v4
+ with:
+ name: gitleaks-report
+ path: results.sarif
+ retention-days: 7
+
+ trufflehog:
+ name: TruffleHog Secret Scanning
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+
+ - name: TruffleHog OSS
+ uses: trufflesecurity/trufflehog@main
+ with:
+ path: ./
+ base: ${{ github.event.repository.default_branch }}
+ head: HEAD
+ extra_args: --debug --only-verified
+
+ custom-pattern-check:
+ name: Custom Pattern Detection
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+
+ - name: Check for common secret patterns
+ id: secret_check
+ run: |
+ echo "Scanning for common secret patterns..."
+
+ # Define patterns to search for
+ PATTERNS=(
+ "aws_access_key_id"
+ "aws_secret_access_key"
+ "AKIA[0-9A-Z]{16}" # AWS Access Key
+ "(?i)api[_-]?key.*['\"][0-9a-zA-Z]{32,}['\"]" # Generic API keys
+ "(?i)password.*['\"][^'\"]{8,}['\"]" # Passwords in quotes
+ "(?i)secret.*['\"][0-9a-zA-Z]{32,}['\"]" # Generic secrets
+ "(?i)token.*['\"][0-9a-zA-Z]{32,}['\"]" # Tokens
+ "private[_-]?key"
+ "-----BEGIN (RSA|OPENSSH|DSA|EC) PRIVATE KEY-----" # Private keys
+ "ghp_[0-9a-zA-Z]{36}" # GitHub Personal Access Token
+ "ghs_[0-9a-zA-Z]{36}" # GitHub OAuth Secret
+ "sk_live_[0-9a-zA-Z]{24,}" # Stripe Live Secret Key
+ "pk_live_[0-9a-zA-Z]{24,}" # Stripe Live Public Key
+ )
+
+ FOUND_SECRETS=0
+ REPORT_FILE="secret_scan_report.txt"
+
+ echo "=== Secret Scanning Report ===" > $REPORT_FILE
+ echo "Timestamp: $(date)" >> $REPORT_FILE
+ echo "" >> $REPORT_FILE
+
+ # Get list of changed files
+ if [ "${{ github.event_name }}" = "pull_request" ]; then
+ FILES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD)
+ else
+ FILES=$(git diff --name-only HEAD~1 HEAD)
+ fi
+
+ # Skip certain file types and directories
+ FILES=$(echo "$FILES" | grep -v ".terraform/" | grep -v ".git/" | grep -v "node_modules/" || true)
+
+ for FILE in $FILES; do
+ if [ -f "$FILE" ]; then
+ echo "Scanning: $FILE" >> $REPORT_FILE
+
+ for PATTERN in "${PATTERNS[@]}"; do
+ MATCHES=$(grep -niE "$PATTERN" "$FILE" 2>/dev/null || true)
+ if [ ! -z "$MATCHES" ]; then
+ FOUND_SECRETS=1
+ echo " β FOUND POTENTIAL SECRET:" >> $REPORT_FILE
+ echo " Pattern: $PATTERN" >> $REPORT_FILE
+ echo "$MATCHES" | while IFS= read -r line; do
+ # Redact the actual secret value
+ REDACTED=$(echo "$line" | sed -E 's/['\''"][0-9a-zA-Z]{8,}['\''"]/***REDACTED***/g')
+ echo " $REDACTED" >> $REPORT_FILE
+ done
+ echo "" >> $REPORT_FILE
+ fi
+ done
+ fi
+ done
+
+ if [ $FOUND_SECRETS -eq 1 ]; then
+ echo "status=failed" >> $GITHUB_OUTPUT
+ cat $REPORT_FILE
+ echo ""
+ echo "β SECRETS DETECTED! Please remove sensitive data before committing."
+ exit 1
+ else
+ echo "status=passed" >> $GITHUB_OUTPUT
+ echo "β
No secrets detected"
+ fi
+
+ - name: Comment on PR with findings
+ if: failure() && github.event_name == 'pull_request'
+ uses: actions/github-script@v7
+ with:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ script: |
+ const fs = require('fs');
+ let report = '⚠️ **Secret Scanning Failed**\n\n';
+ report += '**Potential secrets or API keys were detected in your changes.**\n\n';
+ report += 'Please review and remove any sensitive data before merging.\n\n';
+ report += '### What to do:\n';
+ report += '1. Remove the secret from your code\n';
+ report += '2. Use environment variables or GitHub Secrets instead\n';
+ report += '3. If the secret was already committed, you must:\n';
+ report += ' - Rotate/invalidate the exposed secret\n';
+ report += ' - Remove it from git history using `git filter-branch` or BFG Repo-Cleaner\n\n';
+ report += '### Common secret patterns detected:\n';
+ report += '- AWS Access Keys (AKIA...)\n';
+ report += '- API Keys\n';
+ report += '- Private Keys\n';
+ report += '- Passwords or tokens in code\n\n';
+ report += '**This PR cannot be merged until all secrets are removed.**';
+
+ github.rest.issues.createComment({
+ issue_number: context.issue.number,
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ body: report
+ });
+
+ block-merge:
+ name: Block Merge if Secrets Found
+ runs-on: ubuntu-latest
+ needs: [gitleaks, trufflehog, custom-pattern-check]
+ if: always()
+ steps:
+ - name: Check scan results
+ run: |
+ if [ "${{ needs.gitleaks.result }}" = "failure" ] || \
+ [ "${{ needs.trufflehog.result }}" = "failure" ] || \
+ [ "${{ needs.custom-pattern-check.result }}" = "failure" ]; then
+ echo "β Secret scanning failed. Blocking merge."
+ exit 1
+ else
+ echo "β
All secret scans passed. Safe to merge."
+ fi
+
+ # Optional: Auto-revert commits with secrets on main branch
+ auto-revert:
+ name: Auto-revert Commits with Secrets
+ runs-on: ubuntu-latest
+ needs: [gitleaks, trufflehog, custom-pattern-check]
+ if: |
+ failure() &&
+ github.event_name == 'push' &&
+ github.ref == 'refs/heads/main'
+ permissions:
+ contents: write
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ token: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Configure git
+ run: |
+ git config user.name "github-actions[bot]"
+ git config user.email "github-actions[bot]@users.noreply.github.com"
+
+ - name: Revert last commit
+ run: |
+ COMMIT_SHA="${{ github.sha }}"
+ COMMIT_MSG=$(git log -1 --pretty=%B $COMMIT_SHA)
+
+ echo "β οΈ Reverting commit: $COMMIT_SHA"
+ echo "Commit message: $COMMIT_MSG"
+
+ git revert --no-edit $COMMIT_SHA
+ git push origin main
+
+ - name: Create issue for manual review
+ uses: actions/github-script@v7
+ with:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ script: |
+ const issue = await github.rest.issues.create({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ title: '🚨 Secrets Detected - Commit Automatically Reverted',
+ body: `## Security Alert: Secrets Detected
+
+ **Commit**: \`${{ github.sha }}\`
+ **Author**: @${{ github.actor }}
+ **Branch**: main
+
+ ### What happened?
+ Secret scanning detected potential secrets or API keys in a commit to the main branch.
+ The commit has been automatically reverted to prevent exposure.
+
+ ### Required Actions:
+
+ 1. **⚠️ ROTATE ALL EXPOSED SECRETS IMMEDIATELY**
+ - If the secret was an API key, revoke it
+ - If it was an AWS key, disable it in IAM
+ - Generate new credentials
+
+ 2. **Clean up your local branch**:
+ \`\`\`bash
+ git fetch origin
+ git reset --hard origin/main
+ \`\`\`
+
+ 3. **Remove the secret properly**:
+ - Use environment variables
+ - Use GitHub Secrets
+ - Use AWS Secrets Manager / Parameter Store
+ - Add pattern to .gitignore
+
+ 4. **Re-commit without secrets**:
+ - Make your changes again
+ - Ensure no secrets are in the code
+ - Submit a new PR
+
+ ### Preventing Future Incidents:
+
+ - Always use \`.tfvars\` files for sensitive values (they're gitignored)
+ - Use \`backend.tf\` for backend config (also gitignored)
+ - Store secrets in GitHub Secrets or AWS Secrets Manager
+ - Run \`git diff\` before committing to review changes
+ - Enable pre-commit hooks for local secret scanning
+
+ **This issue will remain open until confirmed that exposed secrets have been rotated.**`,
+ labels: ['security', 'urgent', 'secrets-detected']
+ });
+
+ console.log('Created issue:', issue.data.number);
+
+ - name: Send alert notification
+ if: always()
+ run: |
+ echo "π¨ SECURITY ALERT: Secrets detected in commit ${{ github.sha }}"
+ echo "Commit has been reverted and an issue has been created."
+ echo "Please rotate any exposed credentials immediately."
diff --git a/.github/workflows/terraform-apply.yml b/.github/workflows/terraform-apply.yml
new file mode 100644
index 0000000..52eda40
--- /dev/null
+++ b/.github/workflows/terraform-apply.yml
@@ -0,0 +1,111 @@
+name: Terraform Apply
+
+on:
+ push:
+ branches:
+ - main
+ paths:
+ - "infra/aws/**/*.tf"
+ - "infra/aws/**/*.tfvars"
+ - ".github/workflows/terraform-*.yml"
+ workflow_dispatch:
+ inputs:
+ module:
+ description: "Specific module to apply (leave empty for all changed)"
+ required: false
+ type: string
+
+permissions:
+ contents: read
+ id-token: write
+
+jobs:
+ detect-changes:
+ name: Detect Changed Modules
+ runs-on: ubuntu-latest
+ outputs:
+ modules: ${{ steps.detect.outputs.modules }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 2
+
+ - name: Detect changed Terraform modules
+ id: detect
+ run: |
+ if [ "${{ github.event_name }}" == "workflow_dispatch" ] && [ -n "${{ inputs.module }}" ]; then
+ # Manual trigger with specific module
+ MODULES='["${{ inputs.module }}"]'
+ echo "Manual module specified: $MODULES"
+ echo "modules=$MODULES" >> $GITHUB_OUTPUT
+ exit 0
+ fi
+
+ # Get changed files from the last commit
+ CHANGED_FILES=$(git diff --name-only HEAD~1 HEAD | grep -E '^infra/aws/.*\.tf(vars)?$' || true)
+
+ if [ -z "$CHANGED_FILES" ]; then
+ echo "No Terraform files changed"
+ echo "modules=[]" >> $GITHUB_OUTPUT
+ exit 0
+ fi
+
+ # Extract unique module directories
+ MODULES=$(echo "$CHANGED_FILES" | xargs -n1 dirname | sort -u | jq -R -s -c 'split("\n")[:-1]')
+ echo "Changed modules: $MODULES"
+ echo "modules=$MODULES" >> $GITHUB_OUTPUT
+
+ terraform-apply:
+ name: Apply - ${{ matrix.module }}
+ runs-on: ubuntu-latest
+ needs: detect-changes
+ if: needs.detect-changes.outputs.modules != '[]'
+ strategy:
+ matrix:
+ module: ${{ fromJson(needs.detect-changes.outputs.modules) }}
+ fail-fast: false
+ max-parallel: 1 # Apply modules one at a time to avoid conflicts
+ defaults:
+ run:
+ working-directory: ${{ matrix.module }}
+ environment:
+ name: production-demo
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Configure AWS Credentials
+ uses: aws-actions/configure-aws-credentials@v4
+ with:
+ role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
+ aws-region: us-east-2
+ role-session-name: GitHubActions-TerraformApply
+
+ - name: Setup Terraform
+ uses: hashicorp/setup-terraform@v3
+ with:
+ terraform_version: "~1.6"
+
+ - name: Terraform Init
+ env:
+ TF_CLI_ARGS_init: >-
+ -backend-config="bucket=${{ secrets.TF_STATE_BUCKET }}"
+ -backend-config="dynamodb_table=${{ secrets.TF_STATE_LOCK_TABLE }}"
+ -backend-config="region=us-east-2"
+ -backend-config="encrypt=true"
+ run: terraform init -input=false
+
+ - name: Terraform Plan
+ run: terraform plan -no-color -input=false -out=tfplan
+
+ - name: Terraform Apply
+ run: terraform apply -no-color -input=false tfplan
+
+ - name: Upload Terraform Working Directory (debug artifact)
+ uses: actions/upload-artifact@v4
+ if: always()
+ with:
+ name: terraform-workdir-${{ hashFiles(format('{0}/**', matrix.module)) }}
+ path: ${{ matrix.module }}/.terraform/
+ retention-days: 7
diff --git a/.github/workflows/terraform-destroy.yml b/.github/workflows/terraform-destroy.yml
new file mode 100644
index 0000000..590c354
--- /dev/null
+++ b/.github/workflows/terraform-destroy.yml
@@ -0,0 +1,68 @@
+name: Terraform Destroy
+
+on:
+ workflow_dispatch:
+ inputs:
+ module:
+ description: "Module to destroy (e.g., infra/aws/us-east-2/eks)"
+ required: true
+ type: string
+ confirm:
+ description: 'Type "destroy" to confirm'
+ required: true
+ type: string
+
+permissions:
+ contents: read
+ id-token: write
+
+jobs:
+ terraform-destroy:
+ name: Destroy - ${{ inputs.module }}
+ runs-on: ubuntu-latest
+ if: inputs.confirm == 'destroy'
+ defaults:
+ run:
+ working-directory: ${{ inputs.module }}
+ environment:
+ name: production-demo
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Configure AWS Credentials
+ uses: aws-actions/configure-aws-credentials@v4
+ with:
+ role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
+ aws-region: us-east-2
+ role-session-name: GitHubActions-TerraformDestroy
+
+ - name: Setup Terraform
+ uses: hashicorp/setup-terraform@v3
+ with:
+ terraform_version: "~1.6"
+
+ - name: Terraform Init
+ env:
+ TF_CLI_ARGS_init: >-
+ -backend-config="bucket=${{ secrets.TF_STATE_BUCKET }}"
+ -backend-config="dynamodb_table=${{ secrets.TF_STATE_LOCK_TABLE }}"
+ -backend-config="region=us-east-2"
+ -backend-config="encrypt=true"
+ run: terraform init -input=false
+
+ - name: Terraform Plan Destroy
+ run: terraform plan -destroy -no-color -input=false -out=tfplan
+
+ - name: Terraform Destroy
+ run: terraform apply -no-color -input=false tfplan
+
+ validation-failed:
+ name: Validation Failed
+ runs-on: ubuntu-latest
+ if: inputs.confirm != 'destroy'
+ steps:
+ - name: Confirmation not provided
+ run: |
+ echo "::error::Destroy confirmation not provided. You must type 'destroy' to confirm."
+ exit 1
diff --git a/.github/workflows/terraform-plan.yml b/.github/workflows/terraform-plan.yml
new file mode 100644
index 0000000..0da766e
--- /dev/null
+++ b/.github/workflows/terraform-plan.yml
@@ -0,0 +1,140 @@
+name: Terraform Plan
+
+on:
+ pull_request:
+ branches:
+ - main
+ paths:
+ - "infra/aws/**/*.tf"
+ - "infra/aws/**/*.tfvars"
+ - ".github/workflows/terraform-*.yml"
+
+permissions:
+ contents: read
+ pull-requests: write
+ id-token: write
+
+jobs:
+ detect-changes:
+ name: Detect Changed Modules
+ runs-on: ubuntu-latest
+ outputs:
+ modules: ${{ steps.detect.outputs.modules }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+
+ - name: Detect changed Terraform modules
+ id: detect
+ run: |
+ # Get changed files
+ CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD | grep -E '^infra/aws/.*\.tf(vars)?$' || true)
+
+ if [ -z "$CHANGED_FILES" ]; then
+ echo "No Terraform files changed"
+ echo "modules=[]" >> $GITHUB_OUTPUT
+ exit 0
+ fi
+
+ # Extract unique module directories
+ MODULES=$(echo "$CHANGED_FILES" | xargs -n1 dirname | sort -u | jq -R -s -c 'split("\n")[:-1]')
+ echo "Changed modules: $MODULES"
+ echo "modules=$MODULES" >> $GITHUB_OUTPUT
+
+ terraform-plan:
+ name: Plan - ${{ matrix.module }}
+ runs-on: ubuntu-latest
+ needs: detect-changes
+ if: needs.detect-changes.outputs.modules != '[]'
+ strategy:
+ matrix:
+ module: ${{ fromJson(needs.detect-changes.outputs.modules) }}
+ fail-fast: false
+ defaults:
+ run:
+ working-directory: ${{ matrix.module }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Configure AWS Credentials
+ uses: aws-actions/configure-aws-credentials@v4
+ with:
+ role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
+ aws-region: us-east-2
+ role-session-name: GitHubActions-TerraformPlan
+
+ - name: Setup Terraform
+ uses: hashicorp/setup-terraform@v3
+ with:
+ terraform_version: "~1.6"
+
+ - name: Terraform Format Check
+ id: fmt
+ run: terraform fmt -check -recursive
+ continue-on-error: true
+
+ - name: Terraform Init
+ id: init
+ env:
+ TF_CLI_ARGS_init: >-
+ -backend-config="bucket=${{ secrets.TF_STATE_BUCKET }}"
+ -backend-config="dynamodb_table=${{ secrets.TF_STATE_LOCK_TABLE }}"
+ -backend-config="region=us-east-2"
+ -backend-config="encrypt=true"
+ run: terraform init -input=false
+
+ - name: Terraform Validate
+ id: validate
+ run: terraform validate -no-color
+
+ - name: Terraform Plan
+ id: plan
+ run: |
+ terraform plan -no-color -input=false -out=tfplan
+ terraform show -no-color tfplan > plan.txt
+ continue-on-error: true
+
+ - name: Comment PR with Plan
+ uses: actions/github-script@v7
+ if: github.event_name == 'pull_request'
+ with:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ script: |
+ const fs = require('fs');
+ const module = '${{ matrix.module }}';
+ const plan = fs.existsSync('${{ matrix.module }}/plan.txt')
+ ? fs.readFileSync('${{ matrix.module }}/plan.txt', 'utf8')
+ : 'Plan output not available';
+
+ const output = `### Terraform Plan: \`${module}\`
+
+ #### Format and Style 🖌 \`${{ steps.fmt.outcome }}\`
+ #### Initialization ⚙️ \`${{ steps.init.outcome }}\`
+ #### Validation 🤖 \`${{ steps.validate.outcome }}\`
+ #### Plan 📖 \`${{ steps.plan.outcome }}\`
+
+ <details><summary>Show Plan</summary>
+
+ \`\`\`terraform
+ ${plan.slice(0, 65000)}
+ \`\`\`
+
+ </details>
+
+ *Pusher: @${{ github.actor }}, Action: \`${{ github.event_name }}\`, Workflow: \`${{ github.workflow }}\`*`;
+
+ github.rest.issues.createComment({
+ issue_number: context.issue.number,
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ body: output
+ });
+
+ - name: Fail if plan failed
+ if: steps.plan.outcome == 'failure'
+ run: exit 1
diff --git a/.gitignore b/.gitignore
index 839afa9..e15c52f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,22 @@
.terraform/
.terraform.lock.hcl
terraform.tfstate*
-**.tfvars**
tf.plan
-
+tfplan
+*.tfplan
+*.log
+
+# Backend configuration (contains sensitive IDs)
+backend.tf
+backend.tfvars
+*.backend.tfvars
+backend.hcl
+*.backend.hcl
+
+# Terraform variable files (may contain sensitive IDs, ARNs, domains)
+*.tfvars
+!*.tfvars.example
+!terraform.tfvars.example
# Helm + Kubernetes
infra/aws/us-east-2/apps/coder-ws/experiment/prometheus.yaml
infra/aws/us-east-2/apps/coder-devel/build-and-push
diff --git a/.gitleaks.toml b/.gitleaks.toml
new file mode 100644
index 0000000..f1ef882
--- /dev/null
+++ b/.gitleaks.toml
@@ -0,0 +1,107 @@
+# Gitleaks configuration file
+# https://github.com/gitleaks/gitleaks
+
+title = "Gitleaks Configuration for Coder Infrastructure"
+
+[extend]
+# useDefault will extend the base configuration with all default gitleaks rules
+useDefault = true
+
+[allowlist]
+description = "Allowlist for non-sensitive patterns"
+
+# Ignore test/example values
+regexes = [
+ '''test[_-]?(token|key|secret|password)''', # Test credentials
+ '''example[_-]?(token|key|secret)''',
+ '''dummy[_-]?(token|key|secret)''',
+ '''fake[_-]?(token|key|secret)''',
+ '''YOUR[_-]''', # Placeholder values like YOUR_API_KEY
+ '''REPLACE[_-]''',
+ '''CHANGEME''',
+ '''TODO''',
+]
+
+# Ignore certain file paths
+paths = [
+ '''\.git/''',
+ '''\.terraform/''',
+ '''node_modules/''',
+ '''vendor/''',
+ '''\.(tfstate|tfstate\.backup)$''',
+ '''\.example$''', # Example configuration files
+ '''\.md$''', # Documentation files (review these manually)
+ '''go\.sum$''',
+ '''package-lock\.json$''',
+]
+
+# Ignore certain commits (if needed, add commit SHAs here)
+commits = []
+
+# Custom rules for infrastructure-specific secrets
+[[rules]]
+id = "terraform-sensitive-variable"
+description = "Terraform sensitive variable not marked as sensitive"
+regex = '''variable\s+"([^"]+)"\s+\{[^}]*default\s+=\s+["']([^"']{8,})["'][^}]*\}'''
+tags = ["terraform", "sensitive"]
+
+[[rules]]
+id = "aws-account-id"
+description = "AWS Account ID"
+regex = '''\d{12}'''
+tags = ["aws", "account-id"]
+# Note: Account IDs aren't secrets, but good to track
+[rules.allowlist]
+regexes = [
+ '''(region|zone|ami|snapshot|volume)-\d{12}''', # Not account IDs
+]
+
+[[rules]]
+id = "coder-access-url"
+description = "Coder access URL with potential secrets"
+regex = '''coder_access_url\s*=\s*["\']https?://[^"\']*:[^"\'@]*@'''
+tags = ["coder", "url", "credentials"]
+
+[[rules]]
+id = "database-connection-string"
+description = "Database connection string with credentials"
+regex = '''postgres://([^:]+):([^@]+)@'''
+tags = ["database", "credentials"]
+[rules.allowlist]
+regexes = [
+ '''postgres://\w+@localhost''', # Local connections without password
+ '''mode=memory''', # In-memory databases
+]
+
+[[rules]]
+id = "route53-zone-id"
+description = "Route53 Hosted Zone ID"
+regex = '''Z[A-Z0-9]{12,}'''
+tags = ["aws", "route53"]
+# These are semi-sensitive; track but don't necessarily block
+
+[[rules]]
+id = "oidc-provider-arn"
+description = "OIDC Provider ARN containing account ID"
+regex = '''arn:aws:iam::\d{12}:oidc-provider'''
+tags = ["aws", "oidc", "arn"]
+
+[[rules]]
+id = "kubernetes-secret-value"
+description = "Kubernetes secret value in manifest"
+regex = '''(apiVersion:\s*v1\s+kind:\s*Secret.*data:.*\n\s+\w+:\s+)([A-Za-z0-9+/=]{16,})'''
+tags = ["kubernetes", "secret", "base64"]
+
+# Entropy-based detection for high-entropy strings (likely secrets)
+[[rules]]
+id = "high-entropy-string"
+description = "High entropy string (possible secret)"
+regex = '''['\"]([A-Za-z0-9+/=]{32,})['\"]'''
+entropy = 4.5 # Minimum entropy threshold
+tags = ["entropy", "generic"]
+[rules.allowlist]
+paths = [
+ '''\.lock$''',
+ '''\.sum$''',
+ '''\.json$''',
+]
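+
+# Local usage (a sketch; assumes the gitleaks v8 CLI is installed):
+#   gitleaks detect  --config .gitleaks.toml --verbose   # scan the repo and its history
+#   gitleaks protect --config .gitleaks.toml --staged    # scan staged changes before commit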
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..d49d3f8
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,55 @@
+# Pre-commit hooks configuration
+# Install: pip install pre-commit && pre-commit install
+# Run manually: pre-commit run --all-files
+
+repos:
+ # Gitleaks - Secret detection
+ - repo: https://github.com/gitleaks/gitleaks
+ rev: v8.18.4
+ hooks:
+ - id: gitleaks
+
+ # General checks
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.5.0
+ hooks:
+ - id: trailing-whitespace
+ exclude: '\.md$'
+ - id: end-of-file-fixer
+ - id: check-yaml
+ args: ["--unsafe"] # Allow custom YAML tags
+ - id: check-added-large-files
+ args: ["--maxkb=1000"]
+ - id: check-merge-conflict
+ - id: detect-private-key
+ - id: detect-aws-credentials
+ args: ["--allow-missing-credentials"]
+
+ # Terraform
+ - repo: https://github.com/antonbabenko/pre-commit-terraform
+ rev: v1.88.4
+ hooks:
+ - id: terraform_fmt
+ - id: terraform_validate
+ args:
+ - --hook-config=--retry-once-with-cleanup=true
+ - id: terraform_tflint
+ args:
+ - --args=--config=__GIT_WORKING_DIR__/.tflint.hcl
+ - id: terraform_docs
+ args:
+ - --hook-config=--path-to-file=README.md
+ - --hook-config=--add-to-existing-file=true
+ - --hook-config=--create-file-if-not-exist=true
+
+ # Prevent commits to main
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.5.0
+ hooks:
+ - id: no-commit-to-branch
+ args: ["--branch", "main", "--branch", "master"]
+ stages: [commit]
+
+# Global settings
+default_language_version:
+ python: python3.11
diff --git a/GITHUB_APP_SETUP.md b/GITHUB_APP_SETUP.md
new file mode 100644
index 0000000..adc457c
--- /dev/null
+++ b/GITHUB_APP_SETUP.md
@@ -0,0 +1,56 @@
+# GitHub App Setup for Coder
+
+## Correct Callback URLs
+
+When configuring your GitHub App for Coder, use these **exact** callback URLs:
+
+### Primary OAuth (User Authentication)
+
+```
+https://coderdemo.io/api/v2/users/oauth2/github/callback
+```
+
+### External Auth (Git Operations in Workspaces)
+
+```
+https://coderdemo.io/api/v2/external-auth/primary-github/callback
+```
+
+## Important Settings
+
+1. **Request user authorization (OAuth) during installation**: ✅ **MUST be checked**
+ - This allows users to log into Coder with their GitHub identity
+
+2. **Permissions Required**:
+ - **Account permissions**:
+ - Email addresses: Read-only
+ - **Repository permissions**:
+ - Contents: Read and write
+ - Metadata: Read-only (auto-required)
+ - Pull requests: Read and write (optional, for PR creation)
+ - Issues: Read and write (optional, for issue management)
+
+3. **Installation**:
+ - Install the app to your account/organization
+ - Grant access to "All repositories" or specific repos
+
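+These values then map onto Coder's server configuration. A minimal sketch using Coder's documented environment variables (the placeholder values are illustrative; the `primary-github` ID must match the external-auth callback path above):
+
+```bash
+# OAuth login ("Sign in with GitHub")
+export CODER_OAUTH2_GITHUB_ALLOW_SIGNUPS=true
+export CODER_OAUTH2_GITHUB_CLIENT_ID="<github-app-client-id>"      # placeholder
+export CODER_OAUTH2_GITHUB_CLIENT_SECRET="<github-app-secret>"     # placeholder
+
+# External auth for git operations inside workspaces
+export CODER_EXTERNAL_AUTH_0_TYPE=github
+export CODER_EXTERNAL_AUTH_0_ID=primary-github                     # matches /external-auth/primary-github/callback
+export CODER_EXTERNAL_AUTH_0_CLIENT_ID="<github-app-client-id>"    # placeholder
+export CODER_EXTERNAL_AUTH_0_CLIENT_SECRET="<github-app-secret>"   # placeholder
+```
+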
+## Common Issues
+
+### "redirect_uri is not associated with this application"
+
+- **Cause**: Callback URLs don't match what Coder is sending
+- **Solution**: Verify the URLs above are **exactly** correct (including `/api/v2/users/` and `/api/v2/`)
+
+### "Not HTTPS Secure" warning
+
+- **Cause**: Accessing `http://coderdemo.io` instead of `https://coderdemo.io`
+- **Solution**: Always use `https://` when accessing Coder
+
+## After Setup
+
+Once configured, users can:
+
+- Log into Coder using GitHub authentication
+- Clone repositories in their workspaces
+- Push/pull code
+- Create pull requests (if permissions granted)
diff --git a/README.md b/README.md
index 2d11dd7..4278a73 100644
--- a/README.md
+++ b/README.md
@@ -1,407 +1,557 @@
-# AI Demo Environment (ai.coder.com)
+# Coder Demo Environment (coderdemo.io)
-Welcome to the AI Demo Environment's Github repository!
+Welcome to the Coder Demo Environment's Github repository!
-This project is used by ["ai.coder.com"](https://ai.coder.com), allowing users to experiment with the latest AI features in Coder and create demoes for them.
+This project powers ["coderdemo.io"](https://coderdemo.io), a production-grade, multi-region demonstration environment showcasing Coder's cloud development capabilities, workspace proxies, and global deployment patterns.
----
+> [!IMPORTANT]
+> **This infrastructure is HEAVILY AWS-opinionated.**
+>
+> This repository uses AWS-specific services and patterns throughout (EKS, Aurora Serverless v2, VPC, Route53, ACM, etc.). While Coder itself is cloud-agnostic, this particular deployment is designed exclusively for AWS. If you're deploying on GCP, Azure, or other cloud providers, you'll need to significantly adapt the infrastructure code.
-## Getting Hand's On
+---
-> [!IMPORTANT] Before accessing the deployment, make sure you've been invited to our "coder-contrib" Github organization. If not, reach out to `jullian@coder.com` and send your Github handle to be added in. Otherwise, if you're an internal user, you should already have access to to the environment.
+## Getting Started
### Accessing the Deployment:
-Get Started Here π [https://ai.coder.com](https://ai.coder.com)
+Get Started Here 👉 [https://coderdemo.io](https://coderdemo.io)
**Login Flow**
-- Non-Coder Employee
+1. Click "Sign in with GitHub"
+2. Authorize the Coder Demo GitHub App
+3. Start creating workspaces in your preferred region!
+
+**Available Regions:**
-1. Select "GitHub"
+- 🇺🇸 **US East (Ohio)** - Primary deployment with database
+- 🇺🇸 **US West (Oregon)** - Secondary server + workspace proxy
+- 🇪🇺 **EU West (London)** - Workspace proxy
+
+> [!NOTE]
+> This is a demo environment. For production Coder deployments, refer to the [official Coder documentation](https://coder.com/docs).
+
+---
-2. Login with your Github account (that has access to the coder-contrib Github Organization).
+## Architecture Overview
-- Coder Employee
+This deployment implements a **hub-and-spoke architecture** across three AWS regions:
-1. Select "Okta"
+### Hub Region: us-east-2 (Ohio)
-2. Login with your Github account (that has access to the coder-contrib Github Organization).
+The primary region containing foundational, non-repeatable infrastructure:
+
+- **Central Database**: Aurora Serverless v2 PostgreSQL cluster (shared by all regions)
+- **Terraform Backend**: S3 bucket and DynamoDB table for state management
+- **Container Registry**: ECR for custom images
+- **Primary VPC**: Custom VPC with peering to spoke regions
+- **Primary Coder Server**: Main deployment handling authentication and control plane
+- **Additional Services**: Redis, LiteLLM, and custom applications
+
+### Spoke Regions: us-west-2 (Oregon) & eu-west-2 (London)
+
+Repeatable regional infrastructure for workspace proxies:
+
+- **Workspace Proxies**: Low-latency access to workspaces
+- **EKS Clusters**: Regional Kubernetes clusters with Karpenter autoscaling
+- **Route53**: Regional DNS records for proxy endpoints
+- **AWS ACM**: Regional SSL/TLS certificates
+
+```
+                   ┌─────────────────────────────────┐
+                   │     us-east-2 (Primary Hub)     │
+                   │                                 │
+                   │   ┌─────────────────────────┐   │
+                   │   │  Coder Server           │   │
+                   │   │  Aurora Serverless v2   │   │
+                   │   │  Redis / ECR            │   │
+                   │   └─────────────────────────┘   │
+                   │                                 │
+                   └───────────────┬─────────────────┘
+                                   │
+                      ┌────────────┴────────────┐
+                      │                         │
+           ┌──────────▼──────────┐   ┌─────────▼──────────┐
+           │  us-west-2 (Spoke)  │   │  eu-west-2 (Spoke) │
+           │                     │   │                    │
+           │  ┌───────────────┐  │   │  ┌──────────────┐  │
+           │  │ Coder Proxy   │  │   │  │ Coder Proxy  │  │
+           │  │ Coder Server  │  │   │  │ Workspaces   │  │
+           │  │ Workspaces    │  │   │  └──────────────┘  │
+           │  └───────────────┘  │   │                    │
+           └─────────────────────┘   └────────────────────┘
+```
+
+For detailed architecture documentation, see:
+
+- [Multi-Region Deployment Guide](./docs/MULTI_REGION_DEPLOYMENT.md)
+- [Infrastructure Best Practices](./docs/INFRASTRUCTURE_BEST_PRACTICES.md)
+- [Architecture Diagram](./docs/ARCHITECTURE_DIAGRAM.md)
---
## How-To-Deploy
-> [!WARNING] The following environment is heavily opinionated towards: AWS. Make sure to pull the modules and modify according to your use-case. Additionally, the [`infra/aws/us-east-2`](./infra/aws/us-east-2) project is not repeatable. For repeatable references, check out [`infra/aws/us-west-2`](./infra/aws/us-west-2) and [`infra/aws/eu-west-2`](./infra/aws/eu-west-2)
+> [!WARNING]
+> **Infrastructure Repeatability Notice**
+>
+> This environment is heavily opinionated towards AWS and uses a hub-and-spoke architecture:
+>
+> - **[`infra/aws/us-east-2`](./infra/aws/us-east-2)** - Primary hub region with foundational infrastructure (database, terraform backend, VPC, etc.). **This is NOT repeatable** - it's meant to be deployed once as your control plane.
+> - **[`infra/aws/eu-west-2`](./infra/aws/eu-west-2)** - Clean spoke region example with workspace proxy only. **This IS repeatable** for adding new regions.
+> - **[`infra/aws/us-west-2`](./infra/aws/us-west-2)** - Hybrid spoke region with both server and proxy deployments. Use this as a reference for redundant server deployments.
+>
+> When deploying to new regions, use `eu-west-2` as your template for workspace proxies.
+
+### Deployment Overview
+
+The infrastructure is deployed in layers:
+
+1. **Foundation Layer** (us-east-2 only - deploy once)
+ - Terraform backend (S3 + DynamoDB)
+ - VPC with custom networking
+ - Aurora Serverless v2 PostgreSQL database
+ - ECR for container images
+ - Redis for caching
+
+2. **Compute Layer** (all regions)
+ - EKS clusters with managed node groups
+ - Karpenter for workspace autoscaling
+ - VPC peering (for spoke regions to hub)
+
+3. **Certificate & DNS Layer** (all regions)
+ - AWS Certificate Manager (ACM) for SSL/TLS
+ - Route53 for DNS management
+ - Regional subdomains (e.g., `us-west-2.coderdemo.io`)
+
+4. **Kubernetes Applications Layer** (all regions)
+ - AWS Load Balancer Controller
+ - AWS EBS CSI Driver
+ - Karpenter node provisioner
+ - Metrics Server
+ - Cert Manager
+
+5. **Coder Layer**
+ - **Primary (us-east-2)**: Coder Server with database connection
+ - **Spoke regions**: Coder Workspace Proxies connected to primary
+
+### About the Infrastructure Modules
+
+This repository provides reusable Terraform modules for deploying Coder on AWS:
+
+#### Network Module: [`eks-vpc`](./modules/network/eks-vpc)
+
+Creates an opinionated VPC designed for EKS and Coder workloads:
+
+- Customizable public and private subnets across multiple AZs
+- Internet Gateway for public access
+- Cost-optimized NAT Gateway using [fck-nat](https://github.com/RaJiska/terraform-aws-fck-nat)
+- Automatic routing configuration
+- Subnet tagging for EKS and Karpenter integration
+
+#### Compute Module: [`eks-cluster`](./modules/compute/cluster)
+
+Creates a production-ready EKS cluster similar to [EKS Auto Mode](https://docs.aws.amazon.com/eks/latest/userguide/automode.html):
+
+- Leverages the [AWS Managed Terraform EKS module](https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master)
+- Pre-configured IAM roles and policies for:
+ - [Karpenter](https://karpenter.sh/) - Node autoscaling
+ - [AWS EBS CSI Driver](https://github.com/kubernetes-sigs/aws-ebs-csi-driver) - Persistent volumes
+ - [AWS Load Balancer Controller](https://github.com/kubernetes-sigs/aws-load-balancer-controller) - Ingress management
+ - [Coder External Provisioner](https://coder.com/docs/admin/provisioners) - Workspace provisioning
+ - [Amazon Bedrock](https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html) - AI capabilities
+- IRSA (IAM Roles for Service Accounts) configuration
+- Node group with custom launch templates
+
+#### Kubernetes Bootstrap Modules: [`modules/k8s/bootstrap/`](./modules/k8s/bootstrap/)
+
+Helm-based Kubernetes application deployments:
+
+- **[`lb-controller`](./modules/k8s/bootstrap/lb-controller)** - AWS Load Balancer Controller
+- **[`ebs-controller`](./modules/k8s/bootstrap/ebs-controller)** - AWS EBS CSI Driver
+- **[`metrics-server`](./modules/k8s/bootstrap/metrics-server)** - Kubernetes Metrics Server
+- **[`karpenter`](./modules/k8s/bootstrap/karpenter)** - Karpenter autoscaler with NodePools
+- **[`cert-manager`](./modules/k8s/bootstrap/cert-manager)** - Certificate management
+- **[`coder-server`](./modules/k8s/bootstrap/coder-server)** - Primary Coder deployment
+- **[`coder-proxy`](./modules/k8s/bootstrap/coder-proxy)** - Workspace proxy deployments
-In this repository, we deploy the infrastructure separately from the K8s applications which includes Coder.
+---
-To make things easy, we generate K8s app manifests from any `k8s/` project subfolders which reference the main `eks/` application indirectly which auto-populates any infrastructure dependent resource names.
+## Deployment Guide
+
+### Prerequisites
+
+- AWS CLI configured with appropriate credentials
+- Terraform >= 1.9.0
+- kubectl
+- Helm 3.x
+- GitHub OAuth App credentials (for authentication)
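+
+A quick sanity check before deploying (a sketch; expected versions per the list above):
+
+```bash
+terraform version              # expect >= 1.9.0
+aws sts get-caller-identity    # confirms credentials and the target account
+kubectl version --client
+helm version                   # expect a 3.x release
+```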
+
+### Step 1: Deploy Foundation Infrastructure (us-east-2 only)
+
+> [!IMPORTANT]
+> Only deploy this once for your entire multi-region setup.
+
+```bash
+cd infra/aws/us-east-2
+
+# 1. Create Terraform backend
+cd terraform-backend
+terraform init
+terraform apply
+cd ..
+
+# 2. Create VPC
+cd vpc
+terraform init -backend-config=backend.hcl
+terraform apply
+cd ..
+
+# 3. Deploy EKS cluster
+cd eks
+terraform init -backend-config=backend.hcl
+terraform apply
+cd ..
+
+# 4. Deploy Aurora Serverless v2 database
+cd rds
+terraform init -backend-config=backend.hcl
+terraform apply
+cd ..
+
+# 5. Set up Route53 and ACM for primary domain
+cd route53
+terraform init -backend-config=backend.hcl
+terraform apply
+cd ..
+
+cd acm
+terraform init -backend-config=backend.hcl
+terraform apply
+cd ..
+```
-### About the Infrastructure
+### Step 2: Deploy Kubernetes Applications (us-east-2)
-The deployment currently has 2 repeatable components: [`eks-vpc` module](./modules/network/eks-vpc) and [`eks-cluster` module](./modules/compute/cluster).
+```bash
+cd infra/aws/us-east-2/k8s
-#### [`eks-vpc`](./modules/network/eks-vpc)
+# Update kubeconfig
+aws eks update-kubeconfig --region us-east-2 --name coderdemo
-The following module creates an opinionated VPC that let's you granularly define individual subnets. This includes unevenly defining public and private subnets.
+# Deploy in order (each depends on previous)
+cd lb-controller && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd ebs-controller && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd metrics-server && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd karpenter && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd cert-manager && terraform init -backend-config=backend.hcl && terraform apply && cd ..
-This will come with an Internet Gateway and a Custom NAT Gateway (using [RaJiska/terraform-aws-fck-nat](github.com/RaJiska/terraform-aws-fck-nat)).
+# Deploy Coder Server
+cd coder-server && terraform init -backend-config=backend.hcl && terraform apply && cd ..
-The public subnets will have automatic routes to the IGW and private subnets with routes to the NAT.
+# Deploy Coder Workspace Provisioner
+cd coder-ws && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+```
-#### [`eks-cluster`](./modules/compute/cluster).
+### Step 3: Deploy Spoke Regions (repeatable)
-The following module creates an opinionated cluster, similar to [EKS Auto Mode](https://docs.aws.amazon.com/eks/latest/userguide/automode.html), that creates both the EKS Cluster (using the [AWS Managed Terraform EKS module](https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master)), and resources needed by:
+For each additional region (use `eu-west-2` as template):
-- [Karpenter](https://karpenter.sh/)
-- [Amazon Bedrock](https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html)
-- [AWS EBS Controller](https://github.com/kubernetes-sigs/aws-ebs-csi-driver)
-- [AWS Load Balancer Controller](https://github.com/kubernetes-sigs/aws-load-balancer-controller)
-- [Coder External Provisioner](https://coder.com/docs/admin/provisioners)
+```bash
+# Example: Deploy to eu-west-2
+cd infra/aws/eu-west-2
-##### Karpenter
+# 1. Deploy EKS cluster
+cd eks
+terraform init -backend-config=backend.hcl
+terraform apply
+cd ..
-We use the the [AWS Managed Terraform EKS Module for Karpenter in the background](https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master/modules/karpenter).
+# 2. Deploy Kubernetes applications (same order as us-east-2)
+cd k8s
+aws eks update-kubeconfig --region eu-west-2 --name coderdemo-euw2
-This automatically creates:
+cd lb-controller && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd ebs-controller && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd metrics-server && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd karpenter && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd cert-manager && terraform init -backend-config=backend.hcl && terraform apply && cd ..
-- SQS Queue
-- IAM Roles
-- Event Bridge
+# 3. Deploy Coder Workspace Proxy
+cd coder-proxy && terraform init -backend-config=backend.hcl && terraform apply && cd ..
-##### Amazon Bedrock
+# 4. Deploy Coder Workspace Provisioner
+cd coder-ws && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+```
-Auto-Creates
+### Step 4: Configure DNS and Certificates
-- IAM Role
+Each region requires:
-##### AWS EBS Controller
+1. Route53 DNS records pointing to the regional load balancer
+2. ACM certificate for the regional subdomain
+3. TLS certificate configuration in Coder proxy/server
-Auto-Creates
+See the region-specific configurations in:
-- IAM Role
+- `infra/aws/us-east-2/route53/`
+- `infra/aws/us-west-2/route53/`
+- `infra/aws/us-west-2/acm/`
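+
+For reference, the same steps expressed with the AWS CLI (a sketch; the subdomain is an example, and the hosted zone ID and record file are placeholders):
+
+```bash
+# Request a DNS-validated certificate for a regional subdomain
+aws acm request-certificate \
+  --domain-name us-west-2.coderdemo.io \
+  --subject-alternative-names "*.us-west-2.coderdemo.io" \
+  --validation-method DNS \
+  --region us-west-2
+
+# Point the subdomain at the regional load balancer (zone ID is hypothetical)
+aws route53 change-resource-record-sets \
+  --hosted-zone-id Z0000000000EXAMPLE \
+  --change-batch file://record.json
+```
+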
-##### AWS Load Balancer Controller
+---
-Auto-Creates
+## Configuration
-- IAM Role
+### Terraform Variables
-##### Coder External Provisioner
+Each deployment requires a `terraform.tfvars` file (gitignored for security). Key variables include:
-Auto-Creates
+#### EKS Variables
-- IAM Role
+```hcl
+cluster_name = "coderdemo"
+cluster_region = "us-east-2"
+cluster_profile = "your-aws-profile"
+```
-### Creating the Infrastructure (on AWS)
+#### Coder Variables
-To deploy the base infrastructure, you can get started with referencing our [modules directory](./modules).
+```hcl
+coder_access_url = "https://coderdemo.io"
+coder_wildcard_access_url = "*.coderdemo.io"
+addon_version = "2.27.1" # Coder version
+```
-If you don't have an existing network infrastructure, then you can start with deploying the [`eks-vpc` module](./modules/network/eks-vpc).
+#### Database (us-east-2 only)
-Additionally, if you don't have an existing cluster infrastructure, then you can start with deploying the [`eks-cluster` module](./modules/compute/cluster).
+```hcl
+coder_db_secret_url = "postgres://user:pass@host:5432/coder?sslmode=require"
+```
-Lastly, for Coder's backend database, you can refer to our deployment in [`./aidev/infra/aws/us-east-2/rds`](./aidev/infra/aws/us-east-2/rds) to see how to deploy it.
+#### Authentication
-We just an [`aws_db_instance`](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/db_instance) that uses Postgres.
+```hcl
+# GitHub OAuth
+coder_oauth_secret_client_id = "your-github-oauth-client-id"
+coder_oauth_secret_client_secret = "your-github-oauth-secret"
-Refer to the example below to see how this would look like put together:
+# GitHub External Auth (for workspace git operations)
+coder_github_external_auth_secret_client_id = "your-github-app-id"
+coder_github_external_auth_secret_client_secret = "your-github-app-secret"
+```
-```terraform
+#### SSL/TLS Configuration
-terraform {
- required_version = ">= 1.0"
- required_providers {
- aws = {
- source = "hashicorp/aws"
- version = ">= 5.100.0"
- }
- }
-}
+```hcl
+# Using AWS ACM (recommended)
+kubernetes_create_ssl_secret = false
+kubernetes_ssl_secret_name = "coder-tls"
+acme_registration_email = "admin@coderdemo.io"
+```
-variable "name" {
- description = "The resource name."
- type = string
-}
+### Backend Configuration
-variable "region" {
- description = "The aws region to deploy eks cluster"
- type = string
-}
+Each region uses S3 for Terraform state. Create a `backend.hcl` file:
-variable "cluster_version" {
- description = "The EKS Version"
- type = string
-}
-
-variable "cluster_instance_type" {
- description = "EKS Instance Size/Type."
- default = "t3.xlarge"
- type = string
-}
-
-variable "coder_ws_volume_size" {
- description = "Coder Workspace K8s Node Volume Size."
- default = 50
- type = number
-}
-
-variable "coder_ws_instance_type" {
- description = "Coder Workspace K8s Node Instance Size/Type."
- default = "t3.xlarge"
- type = string
-}
-
-variable "network_cidr_block" {
- description = "VPC CIDR Block"
- type = string
- default = "10.0.0.0/16"
-}
-
-variable "db_instance_class" {
- description = "RDS DB Instance Class"
- type = string
- default = "db.m5.large"
-}
-
-variable "db_allocated_storage" {
- description = "RDS DB Allocated Storage Amount"
- type = string
- default = "40"
-}
-
-variable "db_master_username" {
- description = "RDS DB Master Username"
- type = string
- sensitive = true
-}
-
-variable "db_master_password" {
- description = "RDS DB Master Password"
- type = string
- sensitive = true
-}
-
-module "eks-network" {
- source = "../../../../modules/network/eks-vpc"
-
- name = var.name
- vpc_cidr_block = var.network_cidr_block
- public_subnets = {
- # System subnets requiring public access (e.g. NAT Gateways, Load Balancers, IGW, etc.)
- "system0" = {
- cidr_block = "10.0.10.0/24"
- availability_zone = "${data.aws_region.this.name}a"
- map_public_ip_on_launch = true
- private_dns_hostname_type_on_launch = "ip-name"
- }
- "system1" = {
- cidr_block = "10.0.11.0/24"
- availability_zone = "${data.aws_region.this.name}b"
- map_public_ip_on_launch = true
- private_dns_hostname_type_on_launch = "ip-name"
- }
- }
- private_subnets = {
- # System subnets that don't need to be exposed publically (e.g. K8s Worker Nodes, Database, etc.)
- "system0" = {
- cidr_block = "10.0.20.0/24"
- availability_zone = "${data.aws_region.this.name}a"
- private_dns_hostname_type_on_launch = "ip-name"
- tags = local.system_subnet_tags
- }
- "system1" = {
- cidr_block = "10.0.21.0/24"
- availability_zone = "${data.aws_region.this.name}b"
- private_dns_hostname_type_on_launch = "ip-name"
- tags = local.system_subnet_tags
- }
- "provisioner" = {
- cidr_block = "10.0.22.0/24"
- availability_zone = "${data.aws_region.this.name}a"
- map_public_ip_on_launch = true
- private_dns_hostname_type_on_launch = "ip-name"
- tags = local.provisioner_subnet_tags
- }
- "ws-all" = {
- cidr_block = "10.0.16.0/22"
- availability_zone = "${data.aws_region.this.name}b"
- map_public_ip_on_launch = true
- private_dns_hostname_type_on_launch = "ip-name"
- tags = local.ws_all_subnet_tags
- }
- }
-}
-
-data "aws_iam_policy_document" "sts" {
- statement {
- effect = "Allow"
- actions = ["sts:*"]
- resources = ["*"]
- }
-}
-
-resource "aws_iam_policy" "sts" {
- name_prefix = "sts"
- path = "/"
- description = "Assume Role Policy"
- policy = data.aws_iam_policy_document.sts.json
-}
-
-module "eks-cluster" {
- source = "../../../../modules/compute/cluster"
-
- vpc_id = module.eks-network.vpc_id
- cluster_public_subnet_ids = module.eks-network.public_subnet_ids
- cluster_private_subnet_ids = module.eks-network.private_subnet_ids
- cluster_intra_subnet_ids = module.eks-network.intra_subnet_ids
- cluster_instance_type = var.cluster_instance_type
-
- cluster_name = var.name
- cluster_version = var.cluster_version
- cluster_asg_additional_policies = {
- AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
- STSAssumeRole = aws_iam_policy.sts.arn
- }
- cluster_node_security_group_tags = merge(
- local.system_sg_tags,
- merge(local.provisioner_sg_tags, local.ws_all_sg_tags)
- )
- cluster_asg_node_labels = local.cluster_asg_node_labels
- cluster_addons = {
- coredns = {
- most_recent = true
- }
- kube-proxy = {
- most_recent = true
- }
- vpc-cni = {
- most_recent = true
- }
- }
-
- karpenter_controller_policy_statements = [{
- effect = "Allow",
- actions = toset(["iam:PassRole"]),
- resources = toset(["*"]),
- }]
-
- karpenter_node_role_policies = {
- AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
- STSAssumeRole = aws_iam_policy.sts.arn
- }
-
- coder_ws_instance_type = var.coder_ws_instance_type
- coder_ws_volume_size = var.coder_ws_volume_size
-}
-
-###
-# Only deploy the database if you're creating the central Coder infrastructure.
-# Otherwise, if you're deploying separate clusters for Coder proxies + provisioners in a different network, then there's no need for another database.
-###
-
-resource "aws_db_subnet_group" "db_subnet_group" {
- name = "${var.name}-db-subnet-group"
- subnet_ids = module.eks-network.private_subnet_ids
-
- tags = {
- Name = "${var.name}-db-subnet-group"
- }
-}
-
-resource "aws_db_instance" "db" {
- identifier = "${var.name}-db"
- instance_class = var.instance_class
- allocated_storage = var.allocated_storage
- engine = "postgres"
- engine_version = "15.12"
- username = var.master_username
- password = var.master_password
- db_name = "coder"
- db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
- vpc_security_group_ids = [ aws_security_group.postgres.id ]
- publicly_accessible = false
- skip_final_snapshot = false
-
- tags = {
- Name = "${var.name}-rds-db"
- }
- lifecycle {
- ignore_changes = [
- snapshot_identifier
- ]
- }
-}
-
-resource "aws_vpc_security_group_ingress_rule" "postgres" {
- security_group_id = aws_security_group.postgres.id
- cidr_ipv4 = var.network_cidr_block
- ip_protocol = "tcp"
- from_port = 5432
- to_port = 5432
-}
-
-resource "aws_vpc_security_group_egress_rule" "all" {
- security_group_id = aws_security_group.postgres.id
- cidr_ipv4 = "0.0.0.0/0"
- ip_protocol = -1
-}
-
-resource "aws_security_group" "postgres" {
- vpc_id = module.eks-network.vpc_id
- name = "${var.name}-postgres"
- description = "Security Group for Postgres traffic"
- tags = {
- Name = "${var.name}-postgres"
- }
-}
+```hcl
+bucket = "your-terraform-state-bucket"
+key = "path/to/state/terraform.tfstate"
+region = "us-east-2"
+dynamodb_table = "your-terraform-locks-table"
+encrypt = true
+profile = "your-aws-profile"
```
-The deployment may take a while (~20 minutes or more). In the meantime, you can then get started with creating other dependencies.
+---
+
+## Multi-Region Architecture Details
+
+### Database Strategy
+
+This deployment uses a **centralized database** approach:
+
+- Aurora Serverless v2 PostgreSQL in us-east-2
+- All regions connect to the same database over VPC peering
+- Benefits: Simplified data consistency, no replication complexity
+- Trade-offs: All regions depend on us-east-2 availability
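+
+A quick way to verify connectivity from a spoke cluster (a sketch; the credentials and Aurora endpoint are placeholders):
+
+```bash
+# Run a throwaway psql pod inside the spoke EKS cluster
+kubectl run psql-test --rm -it --image=postgres:15 -- \
+  psql "postgres://coder:<password>@<aurora-endpoint>:5432/coder?sslmode=require" -c 'SELECT 1;'
+```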
+
+For production high-availability requirements, consider:
+
+- Aurora Global Database for multi-region read replicas
+- Active-active deployments with database replication
+- Regional database failover strategies
+
+See [Multi-Region Deployment Guide](./docs/MULTI_REGION_DEPLOYMENT.md) for more details.
+
+### Workspace Proxy Strategy
+
+Workspace proxies provide:
+
+- **Low-latency connections** to workspaces in remote regions
+- **Reduced bandwidth costs** by keeping traffic regional
+- **Improved user experience** for global teams
+
+Each proxy:
+
+1. Registers with the primary Coder server (us-east-2)
+2. Receives a session token for authentication
+3. Proxies workspace connections without database access
+4. Can run workspace provisioners locally
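+
+Registration is driven from the primary server. A sketch using the Coder CLI (the proxy name is illustrative, and the token plumbing depends on your Helm values):
+
+```bash
+# Against the primary server (us-east-2), create the proxy and note the session token it prints
+coder wsproxy create --name us-west-2 --display-name "US West (Oregon)"
+
+# Supply that token to the proxy deployment, e.g. via a Kubernetes secret
+kubectl -n coder-proxy create secret generic coder-proxy-token \
+  --from-literal=token="<session-token>"   # placeholder
+```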
+
+### Network Architecture
+
+- **VPC Peering**: Spoke regions peer with hub region for database access
+- **NAT Strategy**: Cost-optimized fck-nat for outbound internet access
+- **Load Balancers**: NLB for Coder, ALB for other services
+- **DNS**: Regional subdomains route to closest workspace proxy
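+
+Cross-region database access rides on VPC peering from each spoke to the hub. A sketch with the AWS CLI (the VPC and peering IDs are placeholders):
+
+```bash
+# From the spoke region, request a peering connection to the hub VPC
+aws ec2 create-vpc-peering-connection \
+  --vpc-id vpc-0spoke0000000000 \
+  --peer-vpc-id vpc-0hub000000000000 \
+  --peer-region us-east-2 \
+  --region us-west-2
+
+# Accept it from the hub side, then add routes to both VPCs' route tables
+aws ec2 accept-vpc-peering-connection \
+  --vpc-peering-connection-id pcx-00000000example \
+  --region us-east-2
+```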
+
+---
+
+## Monitoring and Observability
+
+> [!NOTE]
+> Observability stack configuration is in progress.
+
+Planned integrations:
+
+- Prometheus for metrics collection
+- Grafana for visualization
+- CloudWatch for AWS resource monitoring
+- Coder built-in metrics and health endpoints
+
+---
+
+## Security Considerations
+
+### Secrets Management
+
+- **Database credentials**: Stored in terraform.tfvars (gitignored)
+- **OAuth credentials**: Stored in terraform.tfvars (gitignored)
+- **TLS certificates**: Managed by AWS ACM
+- **Kubernetes secrets**: Created by Terraform, stored in etcd
+
+For production, consider:
-### Deploying Required Apps
+- AWS Secrets Manager for credential rotation
+- External Secrets Operator for Kubernetes
+- HashiCorp Vault for centralized secret management
-Once the K8s (and maybe the Database) infrastructure is deployed, the next step is to deploy the K8s apps.
+### Network Security
-Before getting to Coder, we should first deploy:
+- Private subnets for all compute resources
+- Security groups restricting traffic between tiers
+- VPC peering for controlled cross-region access
+- TLS encryption for all external endpoints
-- [`AWS Load Balancer Controller`](https://github.com/kubernetes-sigs/aws-load-balancer-controller)
-- [`AWS EBS Controller`](https://github.com/kubernetes-sigs/aws-ebs-csi-driver)
-- [`K8s Metrics Server`](github.com/kubernetes-sigs/metrics-server)
-- [`Karpenter`](https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/#4-install-karpenter)
-- [`Cert-Manager`](https://cert-manager.io/docs/installation/helm/)
+### IAM Best Practices
-Afterwards, you can then deploy
+- IRSA (IAM Roles for Service Accounts) for pod-level permissions
+- Least privilege principle for all IAM policies
+- No long-lived credentials in pods
+- Regular IAM policy audits
+
+---
+
+## Cost Optimization
+
+Key strategies used in this deployment:
+
+1. **Karpenter Autoscaling**: Scales nodes to zero when workspaces are idle
+2. **Aurora Serverless v2**: Scales database capacity based on load
+3. **fck-nat**: Open-source NAT solution (90% cheaper than AWS NAT Gateway)
+4. **Spot Instances**: Karpenter uses spot for workspace nodes where appropriate
+5. **Regional Resources**: Only deploy proxies in regions with active users
+
+Estimated monthly costs:
+
+- Hub region (us-east-2): $200-400/month base + per-workspace costs
+- Spoke regions: $100-200/month base + per-workspace costs
+
+See [Infrastructure Best Practices](./docs/INFRASTRUCTURE_BEST_PRACTICES.md) for detailed cost analysis.
+
+---
-- [`Coder Server`](https://artifacthub.io/packages/helm/coder-v2/coder)
-- [`Coder Proxy` (uses same chart as the Coder Server)](https://artifacthub.io/packages/helm/coder-v2/coder)
-- [`Coder Workspace`](https://artifacthub.io/packages/helm/coder-v2/coder-provisioner)
+## Troubleshooting
-You can deploy the above manually yourself following your own preferred methods.
+### Common Issues
-Otherwise, you can leverage our K8s app TF modules to automatically generate the manifests:
+**EKS cluster creation fails**
-#### [`lb-controller`](./modules/k8s/apps/lb-controller)
+- Verify IAM permissions for EKS and VPC operations
+- Check VPC CIDR doesn't conflict with existing networks
+- Ensure sufficient EIPs available in the region
-#### [`ebs-controller`](./modules/k8s/apps/ebs-controller)
+**Karpenter not scaling nodes**
-#### [`metrics-server`](./modules/k8s/apps/metrics-server)
+- Verify Karpenter controller has IRSA permissions
+- Check NodePool configurations in `k8s/karpenter/`
+- Review Karpenter logs: `kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter`
-#### [`karpenter`](./modules/k8s/apps/karpenter)
+**Coder proxy not connecting**
-#### [`cert-manager`](./modules/k8s/apps/cert-manager)
+- Verify proxy token is correctly configured
+- Check network connectivity from proxy to primary server
+- Review NLB health checks and target group status
-#### [`coder-server`](./modules/k8s/apps/coder-server)
+**Database connection failures**
-#### [`coder-proxy`](./modules/k8s/apps/coder-proxy)
+- Verify security group allows traffic from EKS nodes
+- Check VPC peering routes are configured
+- Confirm database URL includes `?sslmode=require`
-#### [`coder-ws`](./modules/k8s/apps/coder-ws)
+### Useful Commands
-## How-It-Works
+```bash
+# Check EKS cluster status
+aws eks describe-cluster --name coderdemo --region us-east-2
->
+# Get kubeconfig
+aws eks update-kubeconfig --name coderdemo --region us-east-2
-### Coder Tasks
+# View Karpenter logs
+kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter -f
+
+# Check Coder server logs
+kubectl logs -n coder -l app.kubernetes.io/name=coder -f
+
+# List all Karpenter nodes
+kubectl get nodes -l karpenter.sh/initialized=true
+
+# Check workspace proxy status
+kubectl get pods -n coder-proxy
+```
+
+---
+
+## Contributing
+
+This repository represents a production demo environment. For general Coder questions or contributions, please visit:
+
+- [Coder GitHub](https://github.com/coder/coder)
+- [Coder Documentation](https://coder.com/docs)
+- [Coder Community Discord](https://coder.com/chat)
+
+---
+
+## License
+
+This infrastructure code is provided as-is for reference purposes. Refer to individual component licenses:
+
+- [Coder License](https://github.com/coder/coder/blob/main/LICENSE)
+- [Terraform License](https://github.com/hashicorp/terraform/blob/main/LICENSE)
+- [AWS Provider License](https://github.com/hashicorp/terraform-provider-aws/blob/main/LICENSE)
+
+---
+
+## Additional Resources
+
+- [Coder Documentation](https://coder.com/docs)
+- [Coder Template Examples](https://github.com/coder/coder/tree/main/examples/templates)
+- [EKS Best Practices Guide](https://aws.github.io/aws-eks-best-practices/)
+- [Karpenter Documentation](https://karpenter.sh/docs/)
+- [Multi-Region Deployment Guide](./docs/MULTI_REGION_DEPLOYMENT.md)
+- [Infrastructure Best Practices](./docs/INFRASTRUCTURE_BEST_PRACTICES.md)
+
+---
->
+**Built with ❤️ by the Coder team**
diff --git a/docs/ARCHITECTURE_DIAGRAM.md b/docs/ARCHITECTURE_DIAGRAM.md
new file mode 100644
index 0000000..864f173
--- /dev/null
+++ b/docs/ARCHITECTURE_DIAGRAM.md
@@ -0,0 +1,814 @@
+# Coder Demo Environment Architecture Diagram
+
+This document provides a comprehensive visual representation of the **coderdemo.io** infrastructure architecture.
+
+---
+
+## Table of Contents
+
+1. [Overview Diagram](#overview-diagram)
+2. [Component Details](#component-details)
+3. [Traffic Flow](#traffic-flow)
+4. [Key Architecture Decisions](#key-architecture-decisions)
+
+---
+
+## Overview Diagram
+
+```
+INTERNET / USERS
+   │
+   │ HTTPS
+   ▼
+AWS ROUTE 53 (coderdemo.io)
+   ├── LATENCY-BASED ROUTING (Automatic)
+   │     • coderdemo.io   → Nearest region (health check monitored)
+   │     • *.coderdemo.io → Workspace apps (latency-routed)
+   └── REGION-SPECIFIC ROUTING (Manual Override)
+         • us-east-2.coderdemo.io   → Force Ohio region
+         • us-west-2.coderdemo.io   → Force Oregon region
+         • *.us-east-2.coderdemo.io → Ohio workspace apps
+         • *.us-west-2.coderdemo.io → Oregon workspace apps
+
+           │                                  │
+           ▼                                  ▼
+   US-EAST-2 (Ohio)                  US-WEST-2 (Oregon)
+   PRIMARY REGION                    SECONDARY REGION
+
+US-EAST-2 REGION (PRIMARY)
+│
+├── NETWORK LOAD BALANCER (NLB)
+│     • TLS Termination (ACM Certificate)
+│     • Static IP Addresses (per AZ)
+│     • Layer 4 (TCP) - Low latency
+│     • Source IP Preservation
+│     • HTTPS:443 → HTTP:8080 (backend)
+│
+├── VPC (10.0.0.0/16)
+│   ├── PUBLIC SUBNETS (system0, system1)
+│   │     • Internet Gateway (IGW)
+│   │     • NAT Gateway (fck-nat - cost optimized)
+│   │     • Network Load Balancers
+│   │     • Multi-AZ (us-east-2a, us-east-2b)
+│   └── PRIVATE SUBNETS
+│       ├── SYSTEM SUBNETS (system0, system1)
+│       │     • EKS Control Plane
+│       │     • EKS Managed Node Groups
+│       │     • Graviton ARM instances (t4g.xlarge)
+│       │     • ON_DEMAND capacity (stable)
+│       ├── PROVISIONER SUBNET
+│       │     • Coder External Provisioner pods
+│       │     • Workspace orchestration
+│       ├── WORKSPACE SUBNET (ws-all)
+│       │     • Coder Workspace pods
+│       │     • Karpenter auto-scaled nodes
+│       │     • User development environments
+│       ├── RDS SUBNET (Database)
+│       │     • Aurora PostgreSQL 15.8 (Serverless v2)
+│       │     • Auto-scaling: 0.5-16 ACU (1-32 GB RAM)
+│       │     • Multi-AZ: Writer + Reader instances
+│       │     • Private only (no public access)
+│       │     • Shared across regions
+│       └── VPC ENDPOINTS (Cost Optimization)
+│             • S3 Gateway Endpoint
+│             • ECR API Interface Endpoint
+│             • ECR DKR Interface Endpoint
+│             • Reduces NAT Gateway data transfer costs
+│
+└── EKS CLUSTER (Kubernetes 1.x)
+    ├── CODER NAMESPACE
+    │     • Coder Server (Deployment)
+    │         - CODER_TLS_ENABLE = false (NLB handles TLS)
+    │         - CODER_SECURE_AUTH_COOKIE = true
+    │         - CODER_REDIRECT_TO_ACCESS_URL = false
+    │         - GitHub OAuth integration
+    │         - PostgreSQL RDS connection
+    │     • Service Type: LoadBalancer (creates NLB)
+    │     • ACM Certificate for TLS termination
+    ├── CODER-WS NAMESPACE (Workspaces)
+    │     • Coder External Provisioner (Deployment)
+    │     • Workspace pods (dynamically created)
+    │     • EBS volumes for persistent storage
+    │     • IRSA for AWS permissions
+    └── INFRASTRUCTURE SERVICES (kube-system, etc.)
+          • AWS Load Balancer Controller
+              - Creates and manages NLBs
+              - Service annotations for TLS termination
+          • Karpenter
+              - Auto-scaling for workspace nodes
+              - SQS queue + EventBridge
+              - Cost-optimized instance selection
+          • EBS CSI Driver
+              - Dynamic volume provisioning
+          • Cert-Manager
+              - Certificate management
+          • Metrics Server
+              - Resource metrics collection
+          • CoreDNS, kube-proxy, vpc-cni (EKS addons)
+
+US-WEST-2 REGION (SECONDARY)
+      • Similar architecture to us-east-2
+      • Infrastructure code exists (acm/, k8s/coder-server/, route53/)
+      • NOT YET DEPLOYED (pending deployment)
+      • Would share the same RDS database for unified accounts
+      • Independent EKS cluster with own NLB
+
+SECURITY LAYER
+      • IAM Roles (IRSA - IAM Roles for Service Accounts)
+          - Coder Server             → RDS access
+          - Coder Provisioner        → EC2/EKS permissions
+          - EBS Controller           → EBS volume management
+          - Load Balancer Controller → ELB management
+          - Karpenter                → EC2 instance launching
+      • Security Groups
+          - EKS cluster security group
+          - Node security group
+          - RDS security group (port 5432 from VPC CIDR)
+          - VPC endpoints security group (port 443)
+      • Network ACLs
+      • TLS Certificates (ACM)
+          - Auto-renewal enabled
+          - Dynamically fetched (not hardcoded)
+```
+
+---
+
+## Component Details
+
+### DNS Layer (Route 53)
+
+**Hosted Zone:** `coderdemo.io`
+
+**Routing Policies:**
+
+1. **Latency-Based Routing (Primary)**
+ - Automatically routes users to the nearest AWS region
+ - Health checks monitor regional availability
+ - Automatic failover if a region becomes unhealthy
+ - Records: `coderdemo.io` and `*.coderdemo.io`
+
+2. **Region-Specific Routing (Manual Override)**
+ - Allows explicit region selection
+ - Useful for demos, testing, and regional preferences
+ - Records:
+ - `us-east-2.coderdemo.io` (Ohio)
+ - `us-west-2.coderdemo.io` (Oregon)
+ - Wildcards for workspace apps
+
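+Behind the latency records, the per-region health checks are ordinary Route53 HTTPS checks. A minimal sketch, assuming a regional subdomain and Coder's `/healthz` path (both illustrative here, not copied from the repo):
+
+```hcl
+# Sketch only: one health check per region, referenced by the latency records.
+resource "aws_route53_health_check" "coder" {
+  fqdn              = "us-east-2.coderdemo.io" # assumed regional endpoint
+  port              = 443
+  type              = "HTTPS"
+  resource_path     = "/healthz" # assumed health endpoint
+  failure_threshold = 3
+  request_interval  = 30
+}
+```
+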
+### Network Architecture
+
+**VPC Configuration:**
+
+- CIDR Block: `10.0.0.0/16`
+- Multi-AZ deployment (2 availability zones per region)
+
+**Subnet Types:**
+
+1. **Public Subnets** (`system0`, `system1`)
+ - Internet Gateway for outbound internet access
+ - NAT Gateway (fck-nat for cost optimization)
+ - Network Load Balancers
+ - CIDR: `10.0.10.0/24`, `10.0.11.0/24`
+
+2. **Private Subnets**
+ - **System Subnets** (`system0`, `system1`)
+ - EKS managed node groups
+ - Core infrastructure services
+ - CIDR: `10.0.20.0/24`, `10.0.21.0/24`
+
+ - **Provisioner Subnet**
+ - Coder external provisioner pods
+ - Workspace orchestration
+ - CIDR: `10.0.22.0/24`
+
+ - **Workspace Subnet** (`ws-all`)
+ - User workspace pods
+ - Karpenter-managed nodes
+ - CIDR: `10.0.16.0/22` (larger range for scalability)
+
+ - **RDS Subnet**
+ - PostgreSQL database
+ - Multi-AZ for high availability
+ - No public access
+
+**VPC Endpoints (Cost Optimization):**
+
+- S3 Gateway Endpoint
+- ECR API Interface Endpoint
+- ECR DKR Interface Endpoint
+- Reduces NAT Gateway data transfer costs
+
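+Each endpoint is a small Terraform resource; the S3 Gateway endpoint, for instance, attaches to the private route tables so image and state traffic bypasses NAT. A sketch with placeholder variables:
+
+```hcl
+# Sketch: S3 Gateway endpoint; vpc_id and route_table_ids are placeholders.
+resource "aws_vpc_endpoint" "s3" {
+  vpc_id            = var.vpc_id
+  service_name      = "com.amazonaws.us-east-2.s3"
+  vpc_endpoint_type = "Gateway"
+  route_table_ids   = var.private_route_table_ids
+}
+```
+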
+### Load Balancing
+
+**Network Load Balancer (NLB):**
+
+- **Type:** Layer 4 (TCP/TLS)
+- **TLS Termination:** Yes (via ACM certificates)
+- **Benefits:**
+ - Low latency for WebSocket connections
+ - Source IP preservation for audit logs
+ - Static IP addresses per availability zone
+ - Better for long-lived connections
+- **Configuration:**
+  - Listener: HTTPS:443 → HTTP:8080 (Coder backend)
+ - Health checks enabled
+ - Cross-zone load balancing enabled
+
+### Compute Layer
+
+**EKS Cluster:**
+
+- Kubernetes version: Latest stable
+- Control plane: Fully managed by AWS
+- Public and private endpoint access enabled
+
+**Node Groups:**
+
+1. **System Managed Node Group**
+ - Instance type: `t4g.xlarge` (Graviton ARM)
+ - Capacity: ON_DEMAND (stable, no interruptions)
+ - Auto-scaling: 0-10 nodes
+ - Volume: 20GB gp3 (cost-optimized)
+ - Purpose: Core Kubernetes services
+
+2. **Workspace Nodes (Karpenter-managed)**
+ - Dynamic provisioning based on workspace requirements
+ - Cost-optimized instance selection
+ - Automatic scaling and termination
+ - Spot instances supported for cost savings
+
+**Karpenter Configuration:**
+
+- SQS queue for event handling
+- EventBridge for EC2 spot interruption notifications
+- IAM role for instance launching
+- Custom node classes for different workspace types
+
+### Storage Layer
+
+**Aurora Serverless v2 (PostgreSQL):**
+
+- Engine: Aurora PostgreSQL 15.8
+- Instance class: `db.serverless` (auto-scaling)
+- Scaling: 0.5-16 ACU (Coder), 0.5-8 ACU (LiteLLM)
+- Multi-AZ: Writer + Reader instances
+- Encryption: At rest and in transit
+- Backup: Automated daily backups (7-day retention)
+- Access: Private only (from VPC CIDR)
+- Cost: Pay-per-ACU-hour (~$9-$400/month depending on load)
+
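+In Terraform, the serverless behavior reduces to the `db.serverless` instance class plus a scaling window. A minimal sketch of the pattern, not the repo's actual `rds/main.tf` (identifiers and credential handling are illustrative):
+
+```hcl
+# Sketch: Aurora Serverless v2 cluster with an ACU scaling window.
+resource "aws_rds_cluster" "coder" {
+  cluster_identifier = "coder"
+  engine             = "aurora-postgresql"
+  engine_version     = "15.8"
+  master_username    = var.db_username
+  master_password    = var.db_password
+  storage_encrypted  = true
+
+  serverlessv2_scaling_configuration {
+    min_capacity = 0.5 # ~$9/month floor while idle
+    max_capacity = 16  # 16 ACU ~= 32 GB RAM at peak
+  }
+}
+
+# Instances in a Serverless v2 cluster use the special db.serverless class.
+resource "aws_rds_cluster_instance" "writer" {
+  cluster_identifier = aws_rds_cluster.coder.id
+  instance_class     = "db.serverless"
+  engine             = aws_rds_cluster.coder.engine
+  engine_version     = aws_rds_cluster.coder.engine_version
+}
+```
+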
+**Amazon EBS:**
+
+- CSI Driver: Installed via Helm
+- Volume type: gp3 (general purpose SSD)
+- Dynamic provisioning for workspace persistent storage
+- Encryption: Enabled
+
+### Kubernetes Services
+
+**Core Services:**
+
+1. **Coder Server** (Namespace: `coder`)
+ - Deployment with multiple replicas
+ - Service type: LoadBalancer (creates NLB)
+ - Environment variables:
+ - `CODER_TLS_ENABLE=false` (NLB handles TLS)
+ - `CODER_SECURE_AUTH_COOKIE=true`
+ - `CODER_REDIRECT_TO_ACCESS_URL=false`
+ - Connected to PostgreSQL RDS
+ - GitHub OAuth integration
+
+2. **Coder External Provisioner** (Namespace: `coder-ws`)
+ - Manages workspace lifecycle
+ - Creates and destroys workspace pods
+ - IRSA for AWS permissions
+
+3. **AWS Load Balancer Controller**
+ - Reconciles Kubernetes Service resources
+ - Creates and manages NLBs
+ - Handles TLS certificate attachment
+ - Service annotations for configuration
+
+4. **Karpenter**
+ - Node auto-scaling
+ - Instance type selection
+ - Spot instance management
+ - Cost optimization
+
+5. **EBS CSI Driver**
+ - Dynamic volume provisioning
+ - Volume snapshots
+ - Volume resizing
+
+6. **Cert-Manager**
+ - SSL/TLS certificate management
+ - Automatic renewal
+ - Integration with Let's Encrypt or ACM
+
+7. **Metrics Server**
+ - Resource metrics collection
+ - HPA (Horizontal Pod Autoscaler) support
+
+**EKS Addons:**
+
+- CoreDNS (DNS resolution)
+- kube-proxy (network proxy)
+- vpc-cni (VPC networking)
+
+### Security
+
+**IAM Roles (IRSA):**
+
+- Coder Server: RDS access, Secrets Manager
+- Coder Provisioner: EC2, EKS permissions
+- EBS Controller: EBS volume operations
+- Load Balancer Controller: ELB operations
+- Karpenter: EC2 instance launching
+
+**Security Groups:**
+
+- EKS cluster security group
+- Node security group
+- RDS security group (port 5432 from VPC)
+- VPC endpoints security group (port 443)
+
+**TLS Certificates:**
+
+- Managed by ACM
+- Automatic renewal
+- Attached to NLB via Load Balancer Controller
+
+---
+
+## Traffic Flow
+
+### User Authentication Flow
+
+```
+User Browser
+     │
+     │ HTTPS
+     ▼
+Route 53 (coderdemo.io)
+     │
+     │ Latency-based routing
+     ▼
+Network Load Balancer (TLS termination)
+     │
+     │ HTTP:8080
+     ▼
+Coder Server Pod
+     │
+     ├── GitHub OAuth (authentication)
+     │
+     └── PostgreSQL RDS (user data)
+```
+
+### Workspace Creation Flow
+
+```
+User (via Coder UI)
+     │
+     ▼
+Coder Server
+     │
+     │ Creates workspace resource
+     ▼
+Coder External Provisioner
+     │
+     ├── Checks node capacity
+     │
+     ├── Karpenter provisions new node (if needed)
+     │     │
+     │     └── EC2 API (launches instance)
+     │
+     ├── Schedules workspace pod on node
+     │
+     ├── EBS CSI creates persistent volume
+     │
+     ├── Workspace pod starts
+     │
+     └── User can access workspace
+```
+
+### Workspace Application Access Flow
+
+```
+User Browser
+     │
+     │ HTTPS (workspace-123.coderdemo.io)
+     ▼
+Route 53 (*.coderdemo.io wildcard)
+     │
+     │ Latency-based routing
+     ▼
+Network Load Balancer
+     │
+     │ HTTP
+     ▼
+Coder Server (proxy)
+     │
+     │ Proxies to workspace
+     ▼
+Workspace Pod (port 8000, 3000, etc.)
+```
+
+---
+
+## Key Architecture Decisions
+
+### 1. Network Load Balancer (NLB) over Application Load Balancer (ALB)
+
+**Why NLB:**
+
+- **Lower latency:** Layer 4 (TCP) vs Layer 7 (HTTP)
+- **Source IP preservation:** Essential for Coder audit logs
+- **Static IPs:** Easier for enterprise firewall rules
+- **Long-lived connections:** Better for WebSocket connections (terminals, live updates)
+- **Cost efficiency:** Lower cost at high volume
+
+**TLS Termination at NLB:**
+
+- NLBs DO support TLS termination when configured with ACM certificates
+- Configured via AWS Load Balancer Controller service annotations
+- Traffic flow: User (HTTPS:443) → NLB (terminates TLS) → Coder (HTTP:8080)
+
+### 2. Multi-Region with Latency-Based Routing
+
+**Benefits:**
+
+- **Automatic performance optimization:** Users connect to nearest region
+- **Built-in failover:** Route53 health checks automatically remove unhealthy regions
+- **Manual override available:** Region-specific URLs for demos and testing
+- **Global reach:** Serves users worldwide with low latency
+
+**Implementation:**
+
+- Route53 latency routing policy
+- Health checks per region
+- Shared RDS database across regions (for unified accounts)
+
+### 3. Cost Optimizations
+
+**Implemented:**
+
+- **Graviton ARM instances:** t4g.xlarge (lower cost than x86)
+- **VPC Endpoints:** S3, ECR API/DKR (reduces NAT Gateway costs)
+- **fck-nat:** Custom NAT solution instead of AWS NAT Gateway
+- **Karpenter:** Right-sized workspace nodes, automatic termination
+- **gp3 volumes:** Better performance than gp2 at same cost
+- **Spot instances:** For workspace nodes (when interruption-tolerant)
+
+### 4. Security Best Practices
+
+**IRSA (IAM Roles for Service Accounts):**
+
+- No AWS credentials stored in Kubernetes secrets
+- Least-privilege access per service
+- Automatic credential rotation
+
+**Network Segmentation:**
+
+- Separate subnets for system, provisioner, and workspaces
+- RDS in private subnet with no public access
+- Security groups restrict traffic by source/destination
+
+**TLS Everywhere:**
+
+- ACM certificates with auto-renewal
+- TLS termination at load balancer
+- Secure cookies enabled
+
+### 5. Helm Chart Management
+
+**Decision: `upgrade_install = true`**
+
+- Idempotent Terraform applies
+- No "already exists" errors in CI/CD
+- Declarative version management
+- Re-added in Helm provider version 3.1.1
+
+### 6. Aurora Serverless v2 for Cost Optimization
+
+**Configuration:**
+
+- Engine: Aurora PostgreSQL 15.8 (Serverless v2)
+- Scaling: 0.5-16 ACU for Coder, 0.5-8 ACU for LiteLLM
+- Multi-AZ: Writer + Reader instances
+
+**Benefits:**
+
+- **Cost savings:** Scales down to 0.5 ACU (~$9/month) during idle periods
+- **Auto-scaling:** Automatically scales up to handle load (up to 16 ACU = 32 GB RAM)
+- **No manual intervention:** Seamless scaling based on demand
+- **Pay-per-use:** Only pay for ACU-hours consumed vs 24/7 provisioned instance
+
+**Trade-off:**
+
+- **Cold start delay:** 5-10 second initial response after idle period (>30 minutes)
+- **Acceptable for demo environment** where cost optimization outweighs instant response
+
+---
+
+## Known Behaviors (Demo Environment)
+
+This section documents expected behaviors in the demo environment that optimize for cost over instant response time.
+
+### 1. Aurora Serverless v2 Cold Start (5-10 seconds)
+
+**When it happens:**
+
+- After 30+ minutes of no database activity
+- First visitor after idle period
+
+**What you'll see:**
+
+- Site takes 5-10 seconds to load initially
+- Subsequent requests are instant (<100ms)
+- Aurora scales from 0.5 ACU → 1-2 ACU automatically
+
+**Why it's acceptable:**
+
+- Demo environment prioritizes cost savings
+- Saves ~$120/month vs provisioned RDS
+- No errors, just slower initial load
+- Perfect for sporadic demo usage
+
+**To eliminate (if needed):**
+
+- Increase `min_capacity = 2` in `infra/aws/us-east-2/rds/main.tf`
+- Trade-off: ~$35/month baseline vs $9/month
+
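+That change is a one-line edit to the cluster's scaling block, sketched here in isolation:
+
+```hcl
+serverlessv2_scaling_configuration {
+  min_capacity = 2  # keep 2 ACU warm: ~$35/month baseline instead of ~$9
+  max_capacity = 16
+}
+```
+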
+### 2. HTTP→HTTPS Redirect Delay ("Not Secure" Warning)
+
+**When it happens:**
+
+- User types `coderdemo.io` without `https://`
+- Browser tries HTTP:80 first (standard behavior)
+
+**What you'll see:**
+
+1. Browser shows "Connecting..." or spinning
+2. Brief "Site is not secure" warning (2-3 seconds)
+3. Warning disappears, site loads normally with HTTPS
+
+**Root cause:**
+
+- NLB only has port 443 (HTTPS) listener configured
+- No port 80 (HTTP) listener to redirect to HTTPS
+- NLBs don't support HTTP→HTTPS redirects (ALB feature only)
+- Browser timeout on port 80, then retries port 443
+
+**Why it's acceptable:**
+
+- Demo environment, not production
+- Site works perfectly once HTTPS connects
+- No security risk (just UX delay)
+- Users who bookmark or click links use HTTPS directly
+
+**Why HSTS is NOT configured:**
+
+HSTS (HTTP Strict Transport Security) headers would help eliminate the "not secure" warning by making browsers automatically use HTTPS after the first visit. However, **Coder's HSTS feature does not work when behind a reverse proxy.**
+
+**Investigation findings:**
+
+- Coder supports HSTS via `CODER_STRICT_TRANSPORT_SECURITY` environment variable
+- However, Coder only sends HSTS headers when it directly terminates TLS (`CODER_TLS_ENABLE=true`)
+- When behind an NLB/reverse proxy with `CODER_TLS_ENABLE=false`, Coder sees incoming HTTP traffic
+- Coder's help states: "This header should only be set if the server is accessed via HTTPS"
+- Since Coder doesn't detect it's behind an HTTPS proxy, it won't send HSTS headers
+
+**Workaround not possible without:**
+
+- Switching to ALB (which can do HTTP→HTTPS redirect at load balancer level)
+- Having Coder terminate TLS directly (loses NLB benefits)
+- Waiting for Coder to add reverse-proxy awareness for HSTS feature
+- Using CloudFront in front of NLB for HTTP→HTTPS redirect
+
+**Alternative mitigation options:**
+
+- Option A: Add CloudFront with HTTP→HTTPS redirect (adds complexity and cost)
+- Option B: Switch to ALB (loses NLB benefits: lower latency, source IP preservation)
+- Option C: Configure port 80 forwarding in Coder service (complex, not standard)
+- Option D: Accept current behavior (recommended for demo environment)
+
+### Summary of Expected Load Times
+
+| Scenario | Load Time | Behavior |
+| ------------------------- | --------------- | -------------------------------------------------- |
+| **First visit (HTTP)** | 7-13 seconds | HTTP:80 timeout (2-3s) + Aurora cold start (5-10s) |
+| **First visit (HTTPS)** | 5-10 seconds | Aurora cold start only |
+| **Return visit (HTTP)** | 7-13 seconds | HTTP:80 timeout (2-3s) + Aurora cold start (5-10s) |
+| **After warm-up (HTTPS)** | <100ms | Instant, everything cached |
+| **Bookmarked/HTTPS link** | <100ms or 5-10s | Instant if warm, cold start if idle |
+
+**Note:** Always share URLs as `https://coderdemo.io` to avoid the 2-3 second HTTP:80 timeout delay.
+
+---
+
+## Infrastructure as Code
+
+All infrastructure is managed via Terraform:
+
+**Directory Structure:**
+
+```
+infra/aws/
+├── us-east-2/                 # Primary region (deployed)
+│   ├── eks/                   # EKS cluster
+│   ├── rds/                   # PostgreSQL database
+│   ├── route53/               # DNS records
+│   └── k8s/                   # Kubernetes applications
+│       ├── coder-server/
+│       ├── karpenter/
+│       ├── lb-controller/
+│       └── ...
+├── us-west-2/                 # Secondary region (code exists, not deployed)
+│   ├── acm/
+│   ├── eks/
+│   ├── route53/
+│   └── k8s/
+└── eu-west-2/                 # Tertiary region (partial code)
+
+modules/
+├── compute/
+│   └── cluster/               # Reusable EKS cluster module
+├── network/
+│   └── eks-vpc/               # Reusable VPC module
+└── k8s/
+    └── bootstrap/             # Reusable K8s app modules
+
+**Terraform State:**
+
+- Stored in S3 backend
+- State locking via DynamoDB
+- Separate state files per region/component
+
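+A hypothetical `backend.hcl` for one component illustrates the layout (bucket and table names are placeholders, not the real ones):
+
+```hcl
+# Passed to `terraform init -backend-config=backend.hcl` per component.
+bucket         = "coderdemo-terraform-state"
+key            = "us-east-2/eks/terraform.tfstate"
+region         = "us-east-2"
+dynamodb_table = "coderdemo-terraform-locks"
+encrypt        = true
+```
+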
+---
+
+## Deployment Status
+
+### US-EAST-2 (Ohio) - PRIMARY
+
+✅ **DEPLOYED**
+
+- EKS cluster
+- RDS PostgreSQL
+- Route53 DNS records
+- All Kubernetes services
+- Coder server operational
+
+### US-WEST-2 (Oregon) - SECONDARY
+
+⏳ **PENDING DEPLOYMENT**
+
+- Infrastructure code exists
+- ACM certificates ready to deploy
+- Coder server configuration ready
+- Route53 DNS records ready
+- Needs deployment to become active
+
+### EU-WEST-2 (London) - TERTIARY
+
+🚧 **PARTIAL CODE**
+
+- Some infrastructure modules present
+- Not fully configured
+
+---
+
+## Monitoring and Observability
+
+**Currently Configured:**
+
+- Route53 health checks
+- EKS control plane logs
+- Kubernetes metrics server
+- Load balancer metrics (CloudWatch)
+
+**Recommended Additions:**
+
+- Prometheus for metrics collection
+- Grafana for visualization
+- AWS X-Ray for distributed tracing
+- CloudWatch Container Insights
+- Coder audit logs to CloudWatch/S3
+
+---
+
+## Disaster Recovery
+
+**Current Strategy:**
+
+- Multi-AZ RDS deployment (automatic failover)
+- Multi-region infrastructure code (can deploy us-west-2 rapidly)
+- Route53 health checks and automatic failover
+- Automated daily RDS backups
+
+**RTO/RPO:**
+
+- **RTO (Recovery Time Objective):** ~20 minutes (deploy us-west-2)
+- **RPO (Recovery Point Objective):** <1 minute (RDS Multi-AZ synchronous replication)
+
+---
+
+## Scaling Considerations
+
+**Horizontal Scaling:**
+
+- Coder server: Increase replica count in Helm values
+- Workspace nodes: Karpenter automatically scales based on demand
+- System nodes: Adjust EKS managed node group size
+
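+For example, the Coder replica count can be raised through the Helm release; a sketch, assuming the chart's `coder.replicaCount` value:
+
+```hcl
+resource "helm_release" "coder" {
+  name       = "coder"
+  namespace  = "coder"
+  repository = "https://helm.coder.com/v2"
+  chart      = "coder"
+
+  set {
+    name  = "coder.replicaCount"
+    value = "3" # scale the control plane horizontally
+  }
+}
+```
+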
+**Vertical Scaling:**
+
+- RDS: Change instance class (requires downtime or blue/green deployment)
+- Workspace resources: Update Coder template resource requests/limits
+- Node instance types: Modify Karpenter NodePool configuration
+
+**Regional Expansion:**
+
+- Deploy us-west-2 for West Coast users
+- Deploy eu-west-2 for European users
+- Consider VPC peering or Transit Gateway for inter-region communication
+
+---
+
+## Related Documentation
+
+- [Infrastructure Best Practices](./INFRASTRUCTURE_BEST_PRACTICES.md)
+- [README](../README.md)
+
+---
+
+## Changelog
+
+- **2025-11-26**:
+ - Updated to reflect Aurora Serverless v2 configuration
+ - Added "Known Behaviors" section documenting cold start and HTTP redirect behavior
+ - Investigated and documented why HSTS cannot be configured when Coder is behind reverse proxy
+  - Documented alternative mitigation options for HTTP→HTTPS redirect delay
+- **2025-11-25**: Initial architecture diagram created
+
+---
+
+## Questions or Feedback
+
+For technical questions about this architecture, contact the infrastructure team.
diff --git a/docs/INFRASTRUCTURE_BEST_PRACTICES.md b/docs/INFRASTRUCTURE_BEST_PRACTICES.md
new file mode 100644
index 0000000..2a80306
--- /dev/null
+++ b/docs/INFRASTRUCTURE_BEST_PRACTICES.md
@@ -0,0 +1,505 @@
+# Infrastructure Best Practices for Coder Deployment
+
+This document outlines the architectural decisions, best practices, and rationale behind the Coder infrastructure deployment on AWS EKS. Use this as a reference when discussing technical implementation with customers and prospects.
+
+---
+
+## Table of Contents
+
+1. [Load Balancer Architecture](#load-balancer-architecture)
+2. [DNS and Multi-Region Setup](#dns-and-multi-region-setup)
+3. [LiteLLM Integration Architecture](#litellm-integration-architecture)
+4. [Helm Chart Management](#helm-chart-management)
+5. [Security Considerations](#security-considerations)
+
+---
+
+## Load Balancer Architecture
+
+### Decision: Network Load Balancer (NLB) with TLS Termination
+
+**What We Did:**
+
+- Deployed NLB with TLS termination using ACM certificates
+- Configured `CODER_TLS_ENABLE = "false"` on Coder server
+- NLB terminates TLS and forwards plain HTTP to backend
+
+**Why This Approach:**
+
+#### NLB Advantages for Coder
+
+1. **Lower Latency** - Layer 4 (TCP) vs Layer 7 (HTTP)
+ - Less protocol overhead
+ - Direct connection forwarding
+ - Critical for long-lived WebSocket connections (terminals, live updates)
+
+2. **Source IP Preservation**
+ - NLB preserves client source IP addresses
+ - Essential for Coder's audit logs and security monitoring
+ - No need to parse `X-Forwarded-For` headers
+
+3. **Static IP Addresses**
+ - NLB provides static IPs per availability zone
+ - Easier for enterprise firewall rules and allowlists
+ - ALB uses dynamic IPs (requires DNS-based allowlisting)
+
+4. **Connection Handling**
+ - Better for long-lived persistent connections
+ - Coder workspaces maintain extended connections
+ - Lower overhead per connection
+
+5. **Cost Efficiency**
+ - NLB: $0.0225/hour + $0.006/GB processed
+ - ALB: $0.0225/hour + $0.008/GB processed + per-rule charges
+ - Lower cost at high volume
+
+#### TLS Termination at NLB
+
+**Common Misconception:**
+
+> "NLBs don't terminate TLS - they're Layer 4 pass-through only"
+
+**Reality:**
+NLBs **DO support TLS termination** when configured with ACM certificates via the AWS Load Balancer Controller.
+
+**Configuration:**
+
+```hcl
+service_annotations = {
+ "service.beta.kubernetes.io/aws-load-balancer-ssl-cert" = data.aws_acm_certificate.coder.arn
+ "service.beta.kubernetes.io/aws-load-balancer-ssl-ports" = "443"
+}
+```
+
+**Traffic Flow:**
+
+```
+User (HTTPS:443) → NLB (terminates TLS) → Coder Backend (HTTP:8080)
+```
+
+**Coder Configuration:**
+
+```hcl
+env_vars = {
+ CODER_REDIRECT_TO_ACCESS_URL = "false" # Prevent redirect loops
+ CODER_TLS_ENABLE = "false" # NLB handles TLS
+ CODER_SECURE_AUTH_COOKIE = "true" # Users connect via HTTPS
+}
+```
+
+**Official Documentation:**
+
+- [AWS: Create TLS Listener for NLB](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/create-tls-listener.html)
+- [AWS: NLB TLS Termination Announcement](https://aws.amazon.com/blogs/aws/new-tls-termination-for-network-load-balancers/)
+- [AWS Load Balancer Controller: NLB TLS Termination](https://kubernetes-sigs.github.io/aws-load-balancer-controller/latest/guide/use_cases/nlb_tls_termination/)
+
+#### When to Use ALB Instead
+
+Consider ALB only if you need:
+
+- Path-based routing (`/api` β service A, `/web` β service B)
+- Host-based routing (multiple domains to different backends)
+- HTTP-level features (redirects, header manipulation, authentication)
+- WAF (Web Application Firewall) integration
+- More detailed HTTP metrics
+
+**For Coder:** These features are not needed - it's a single application without complex routing requirements.
+
+---
+
+## DNS and Multi-Region Setup
+
+### Architecture Overview
+
+**Root Domain:** `coderdemo.io` (Route53 hosted zone)
+
+**DNS Records:**
+
+#### 1. Latency-Based Routing (Automatic)
+
+```
+coderdemo.io   → Routes to nearest region (us-east-2 or us-west-2)
+*.coderdemo.io → Wildcard for workspace apps (latency-routed)
+```
+
+**Configuration:**
+
+```hcl
+resource "aws_route53_record" "coder_latency" {
+ zone_id = var.hosted_zone_id
+ name = var.domain_name
+ type = "A"
+ set_identifier = var.set_identifier # e.g., "us-east-2"
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+
+ latency_routing_policy {
+ region = var.cluster_region
+ }
+
+ health_check_id = aws_route53_health_check.coder[0].id
+}
+```
+
+#### 2. Region-Specific Subdomains (Manual Selection)
+
+```
+us-east-2.coderdemo.io   → Force Ohio region
+us-west-2.coderdemo.io   → Force Oregon region
+*.us-east-2.coderdemo.io → Ohio workspace apps
+*.us-west-2.coderdemo.io → Oregon workspace apps
+```
+
+**Use Case:**
+An instructor on the East Coast can join a West Coast customer demo by using `us-west-2.coderdemo.io` instead of relying on latency-based routing.
+
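+The override records are the same alias records as the latency-based example, minus the routing policy and health check. A sketch:
+
+```hcl
+resource "aws_route53_record" "coder_regional" {
+  zone_id = var.hosted_zone_id
+  name    = "us-west-2.${var.domain_name}" # us-west-2.coderdemo.io
+  type    = "A"
+
+  alias {
+    name                   = local.nlb_hostname
+    zone_id                = data.aws_lb.coder_nlb.zone_id
+    evaluate_target_health = true
+  }
+}
+```
+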
+### Benefits
+
+1. **Automatic Failover**
+ - Route53 health checks monitor each region
+ - Unhealthy regions automatically removed from rotation
+ - Users transparently routed to healthy region
+
+2. **Performance Optimization**
+ - Users connect to geographically nearest region
+ - Lower latency for all interactions
+ - Better experience for global teams
+
+3. **Manual Override**
+ - Region-specific URLs allow explicit region selection
+ - Useful for demos, testing, or specific customer requirements
+ - No code changes needed - just use different URL
+
+### Multi-Region Coder Visibility
+
+**Current State:**
+
+- Only `us-east-2` appears in Coder's region dropdown
+- `us-west-2` infrastructure code exists but not deployed
+
+**For us-west-2 to Appear:**
+
+1. Deploy ACM certificates (`infra/aws/us-west-2/acm/`)
+2. Deploy Coder server (`infra/aws/us-west-2/k8s/coder-server/`)
+3. Deploy Route53 records (`infra/aws/us-west-2/route53/`)
+4. Ensure shared RDS database or database replication
+
+**Important:** Both regions must use the same database for unified user accounts and workspace state.
+
+---
+
+## LiteLLM Integration Architecture
+
+### Decision: Separate Service with Subdomain
+
+**Architecture:**
+
+```
+coderdemo.io     → Coder (latency-routed)
+llm.coderdemo.io → LiteLLM (separate NLB)
+```
+
+**Deployment:**
+
+- LiteLLM: Separate Kubernetes deployment with own NLB
+- Each Coder workspace namespace gets LiteLLM API keys via secret rotation
+- Keys automatically rotated from AWS Secrets Manager
+
+**Why This Approach:**
+
+#### Option 1: Separate Subdomain ✅ (Implemented)
+
+**Advantages:**
+
+- Keep NLB for both services (no ALB needed)
+- Clean separation of concerns
+- Independent scaling and monitoring
+- No path rewriting complexity
+
+#### Option 2: Path-Based Routing (Not Recommended)
+
+```
+coderdemo.io/     → Coder
+coderdemo.io/v1/* → LiteLLM
+```
+
+**Disadvantages:**
+
+- Requires switching to ALB
+- More complex configuration
+- Potential URL rewriting issues
+- No clear benefit for this use case
+
+#### Option 3: Internal Only (Alternative)
+
+**For Maximum Security:**
+
+- Don't expose LiteLLM externally at all
+- Coder communicates via internal Kubernetes service DNS
+- Only Coder → LiteLLM traffic allowed
+- No additional load balancer needed
+
+### Current Implementation
+
+**LiteLLM Service:** `infra/aws/us-east-2/k8s/litellm/main.tf`
+
+- 4 replicas with 2 CPU / 4Gi memory each
+- Own ACM certificate for TLS termination
+- Connected to PostgreSQL (RDS) and Redis
+- Automatic key generation and rotation
+
+**Workspace Integration:** `infra/aws/us-east-2/k8s/coder-ws/main.tf`
+
+```hcl
+module "default-ws-litellm-rotate-key" {
+ source = "../../../../../modules/k8s/bootstrap/litellm-rotate-key"
+ namespace = "coder-ws"
+ secret_id = var.aws_secret_id
+ secret_region = var.aws_secret_region
+}
+```
+
+**Key Rotation:**
+
+- Keys fetched from AWS Secrets Manager
+- Injected as Kubernetes secrets into workspace namespaces
+- Workspaces use keys to make LLM API calls through LiteLLM
+- Rotation happens automatically without workspace downtime
+
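+Mechanically, this amounts to reading the secret and mirroring it into each workspace namespace. A minimal sketch of what the `litellm-rotate-key` module plausibly does (resource and key names are assumptions):
+
+```hcl
+# Read the current LiteLLM key from Secrets Manager...
+data "aws_secretsmanager_secret_version" "litellm" {
+  secret_id = var.secret_id
+}
+
+# ...and mirror it into the workspace namespace as a Kubernetes secret.
+resource "kubernetes_secret" "litellm_api_key" {
+  metadata {
+    name      = "litellm-api-key"
+    namespace = var.namespace # e.g. "coder-ws"
+  }
+
+  data = {
+    LITELLM_API_KEY = data.aws_secretsmanager_secret_version.litellm.secret_string
+  }
+}
+```
+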
+---
+
+## Helm Chart Management
+
+### Decision: Enable `upgrade_install` on All Helm Releases
+
+**What We Did:**
+Added `upgrade_install = true` to all `helm_release` resources across the codebase.
+
+**Files Updated:**
+
+- `modules/k8s/bootstrap/karpenter/main.tf`
+- `modules/k8s/bootstrap/ebs-controller/main.tf`
+- `modules/k8s/bootstrap/lb-controller/main.tf`
+- `modules/k8s/bootstrap/cert-manager/main.tf`
+- `modules/k8s/bootstrap/coder-server/main.tf`
+- `modules/k8s/bootstrap/coder-proxy/main.tf`
+- `modules/k8s/bootstrap/metrics-server/main.tf`
+
+**Configuration:**
+
+```hcl
+resource "helm_release" "example" {
+ name = "example"
+ namespace = var.namespace
+ chart = "example"
+ repository = "https://charts.example.com"
+ create_namespace = true
+ upgrade_install = true # ← Critical for idempotent deployments
+ skip_crds = false
+ wait = true
+ wait_for_jobs = true
+ version = var.chart_version
+}
+```
+
+**Why This Matters:**
+
+1. **Idempotent Terraform Applies**
+ - Without `upgrade_install`: Terraform fails if release already exists
+ - With `upgrade_install`: Terraform upgrades existing release or installs new one
+ - Essential for repeatable deployments
+
+2. **Version Management**
+ - Allows Terraform to manage chart version upgrades
+ - No manual `helm upgrade` commands needed
+ - Declarative infrastructure-as-code
+
+3. **CI/CD Integration**
+ - Pipelines can safely re-run Terraform apply
+ - No "already exists" errors in automation
+ - Cleaner error handling
+
+**Helm Provider Version:**
+
+```hcl
+helm = {
+ source = "hashicorp/helm"
+ version = "3.1.1" # upgrade_install re-added in this version
+}
+```
+
+**Historical Context:**
+The `upgrade_install` parameter was temporarily removed from the Helm provider in earlier versions, leading to comments in code saying it was "invalid". It was re-added in version 3.1.1 and should now be used as a best practice.
+
+---
+
+## Security Considerations
+
+### TLS/SSL Certificate Management
+
+**ACM Certificates:**
+
+```hcl
+data "aws_acm_certificate" "coder" {
+ domain = trimsuffix(trimprefix(var.coder_access_url, "https://"), "/")
+ statuses = ["ISSUED"]
+ most_recent = true
+}
+```
+
+**Best Practices:**
+
+1. Use ACM for automatic certificate renewal
+2. Fetch certificates dynamically (don't hardcode ARNs)
+3. Filter by `ISSUED` status to avoid revoked certs
+4. Use `most_recent` for automatic updates
+
+### Service Account Permissions
+
+**Principle of Least Privilege:**
+
+```hcl
+oidc_principals = {
+ "${var.cluster_oidc_provider_arn}" = [
+ "system:serviceaccount:${var.namespace}:coder"
+ ]
+}
+```
+
+**Why:**
+
+- Restrict IAM role assumption to specific service accounts
+- Prevents any pod from assuming sensitive roles
+- Scoped to specific namespace and service account name
+
+### Source IP Preservation
+
+**NLB Advantage:**
+
+- Client source IP preserved in connection
+- Available in Coder's audit logs
+- No header parsing needed
+- Better security monitoring and rate limiting
+
+**With ALB:**
+
+- Source IP only available in `X-Forwarded-For` header
+- Application must parse headers
+- Less reliable (headers can be spoofed)
+
+---
+
+## Key Takeaways for Sales Engineers
+
+### When Discussing Load Balancers
+
+1. **NLB is the right choice for Coder**
+ - Optimized for long-lived WebSocket connections
+ - Lower latency than ALB
+ - Source IP preservation for audit logs
+ - Static IPs for enterprise firewalls
+
+2. **NLB DOES support TLS termination**
+ - Common misconception that it doesn't
+ - Fully supported via ACM certificates
+ - Show AWS documentation if questioned
+
+3. **ALB only needed if:**
+ - Path-based routing required
+ - WAF integration needed
+ - HTTP-specific features required
+ - None of these apply to standard Coder deployments
+
+### When Discussing Multi-Region
+
+1. **Latency-based routing provides:**
+ - Automatic performance optimization
+ - Built-in failover
+ - No user action required
+
+2. **Region-specific URLs allow:**
+ - Manual region override
+ - Demo flexibility
+ - Testing and troubleshooting
+
+3. **Shared database is critical:**
+ - Users need unified accounts across regions
+ - Workspace state must be accessible everywhere
+ - Consider RDS read replicas for performance
+
+### When Discussing LiteLLM
+
+1. **Separate subdomain approach:**
+ - Keeps architecture simple
+ - No ALB needed
+ - Independent scaling
+ - Clear separation of concerns
+
+2. **Automatic key rotation:**
+ - Security best practice
+ - No manual key management
+ - Zero downtime rotation
+ - AWS Secrets Manager integration
+
+3. **Internal-only option available:**
+ - Maximum security
+ - No external exposure
+ - Simpler architecture
+ - Recommended if no external access needed
+
+### When Discussing Infrastructure as Code
+
+1. **`upgrade_install = true` is critical:**
+ - Enables idempotent Terraform applies
+ - Required for CI/CD pipelines
+ - Prevents deployment failures
+ - Standard best practice
+
+2. **Terraform module structure:**
+ - Reusable across regions
+ - Consistent configuration
+ - Easy to add new regions
+ - Clear separation of concerns
+
+---
+
+## Additional Resources
+
+### AWS Documentation
+
+- [NLB TLS Termination](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/create-tls-listener.html)
+- [Route53 Latency-Based Routing](https://docs.aws.amazon.com/Route53/latest/DeveloperGuide/routing-policy-latency.html)
+- [ACM Certificate Management](https://docs.aws.amazon.com/acm/latest/userguide/acm-overview.html)
+
+### Kubernetes Documentation
+
+- [AWS Load Balancer Controller](https://kubernetes-sigs.github.io/aws-load-balancer-controller/latest/)
+- [Service Annotations](https://kubernetes-sigs.github.io/aws-load-balancer-controller/latest/guide/service/annotations/)
+
+### Coder Documentation
+
+- [Coder Configuration](https://coder.com/docs/admin/configure)
+- [External Authentication](https://coder.com/docs/admin/external-auth)
+- [Enterprise Features](https://coder.com/docs/admin/enterprise)
+
+---
+
+## Version History
+
+- **2025-11-25**: Initial documentation of best practices
+- Added NLB vs ALB comparison and rationale
+- Documented DNS multi-region architecture
+- Explained LiteLLM integration approach
+- Covered Helm `upgrade_install` best practice
+- Included security considerations
+
+---
+
+## Questions or Feedback
+
+For technical questions about this architecture, contact the infrastructure team.
+For customer-specific discussions, work with your Solutions Architect.
diff --git a/docs/MULTI_REGION_DEPLOYMENT.md b/docs/MULTI_REGION_DEPLOYMENT.md
new file mode 100644
index 0000000..81f93d6
--- /dev/null
+++ b/docs/MULTI_REGION_DEPLOYMENT.md
@@ -0,0 +1,324 @@
+# Multi-Region Deployment Progress
+
+**Date:** 2025-12-02
+**Status:** Pending Enterprise License
+
+## Overview
+
+This document tracks the progress of deploying multi-region Coder infrastructure to enable:
+
+- **A) Automatic routing** to the nearest region based on user latency
+- **B) Manual region selection** in the Coder UI for users to choose their preferred region
+
+## Current Status
+
+### ✅ Completed Today
+
+#### 1. Cost Optimization - Aurora Serverless v2
+
+- **Problem:** RDS Aurora Serverless v2 costing $130/month for both writer and reader instances
+- **Solution:** Removed reader instance from `infra/aws/us-east-2/rds/main.tf`
+- **Result:** Reduced cost by ~$44/month to ~$86/month (1.0 ACU total)
+- **File:** `infra/aws/us-east-2/rds/main.tf`
+
+#### 2. Cross-Region Replica Communication
+
+- **Problem:** Coder replicas in us-east-2 and us-west-2 could detect each other but couldn't communicate (timeout errors)
+- **Root Cause:** Security groups blocking port 8080 traffic between VPCs
+- **Solution:**
+ - Added security group rules to allow TCP port 8080 between VPC CIDRs
+ - Codified rules in Terraform for reproducibility
+- **Files:**
+ - `infra/aws/us-east-2/vpc-peering/main.tf`
+ - `infra/aws/us-east-2/vpc-peering/terraform.tfvars`
+
+```terraform
+# Security group rule to allow Coder replica communication from us-west-2 to us-east-2
+resource "aws_security_group_rule" "use2_allow_coder_from_usw2" {
+ provider = aws.use2
+ type = "ingress"
+ from_port = 8080
+ to_port = 8080
+ protocol = "tcp"
+ cidr_blocks = [var.accepter_vpc_cidr]
+ security_group_id = var.requester_node_security_group_id
+ description = "Allow Coder replica communication from us-west-2"
+}
+```
+
+#### 3. DERP Server Configuration
+
+- **Problem:** `/derp/latency-check` endpoint timing out, replicas couldn't sync properly
+- **Root Cause:** `CODER_DERP_SERVER_ENABLE` environment variable not set
+- **Solution:** Added `CODER_DERP_SERVER_ENABLE = "true"` to both regions' Coder deployments
+- **Result:** Replicas now communicate successfully, no more timeout errors
+- **Files:**
+ - `infra/aws/us-east-2/k8s/coder-server/main.tf`
+ - `infra/aws/us-west-2/k8s/coder-server/main.tf`
+
+```terraform
+env_vars = {
+ CODER_REDIRECT_TO_ACCESS_URL = "false"
+ CODER_TLS_ENABLE = "false"
+ CODER_SECURE_AUTH_COOKIE = "true"
+ # Enable DERP server for multi-region replica communication
+ CODER_DERP_SERVER_ENABLE = "true"
+}
+```
+
+#### 4. Latency Improvement
+
+- **Before:** 111ms
+- **After:** 34ms
+- Achieved through proper VPC peering, security group rules, and DERP server configuration
+
+#### 5. Workspace Proxy Configuration (Ready for Deployment)
+
+- Created complete Terraform configuration for us-west-2 workspace proxy
+- **Files:**
+ - `infra/aws/us-west-2/k8s/coder-proxy/main.tf`
+ - `infra/aws/us-west-2/k8s/coder-proxy/terraform.tfvars`
+ - `infra/aws/us-west-2/k8s/coder-proxy/backend.hcl`
+
+### ⏸️ Blocked - Awaiting Enterprise License
+
+#### Workspace Proxy Deployment
+
+- **Problem:** "Your license is not entitled to create workspace proxies."
+- **Requirement:** Coder Enterprise license required for Workspace Proxy feature
+- **Impact:** Manual region selection (requirement B) cannot be completed without Enterprise license
+
+**Error from Terraform:**
+
+```
+Error: Feature not enabled
+
+ with module.coder-proxy.coderd_workspace_proxy.this,
+ on ../../../../../modules/k8s/bootstrap/coder-proxy/main.tf line 259, in resource "coderd_workspace_proxy" "this":
+ 259: resource "coderd_workspace_proxy" "this" {
+
+Your license is not entitled to create workspace proxies.
+```
+
+**Error from API:**
+
+```json
+{
+ "message": "Workspace Proxy is a Premium feature. Contact sales!"
+}
+```
+
+## Key Technical Concepts
+
+### Coder Replicas vs Workspace Proxies
+
+#### Replicas (Currently Deployed)
+
+- **Purpose:** High availability and automatic failover
+- **Behavior:** Multiple Coder instances share same database, automatic failover if one fails
+- **User Experience:** Users see single "default" region, automatic routing based on DNS
+- **License:** Available in all Coder editions
+- **Status:** ✅ Deployed and working in us-east-2 and us-west-2
+
+#### Workspace Proxies (Blocked by License)
+
+- **Purpose:** User-selectable regions for manual region switching
+- **Behavior:** Users can see and manually switch between regions in Coder UI
+- **User Experience:** "Region" tab in UI with latency display and manual selection
+- **License:** ⚠️ Requires Coder Enterprise license
+- **Status:** ❌ Configuration ready but deployment blocked
+
+## Infrastructure State
+
+### us-east-2 (Ohio) - Primary Region
+
+- **EKS Cluster:** `coderdemo-use2` ✅ Running
+- **Coder Server:** ✅ Deployed and operational
+- **Database:** Aurora Serverless v2 (1.0 ACU writer only) ✅
+- **VPC CIDR:** 10.0.0.0/16
+- **Node Security Group:** ``
+- **DERP Server:** ✅ Enabled
+- **URL:** https://coderdemo.io
+
+### us-west-2 (Oregon) - Secondary Region
+
+- **EKS Cluster:** `coderdemo-usw2` ✅ Running
+- **Coder Server:** ✅ Deployed as replica
+- **Coder Proxy:** ❌ Blocked by license (configuration ready)
+- **VPC CIDR:** 10.1.0.0/16
+- **Node Security Group:** ``
+- **DERP Server:** ✅ Enabled
+- **Planned URL:** https://us-west-2.coderdemo.io
+
+### Networking
+
+- **VPC Peering:** ✅ Established between us-east-2 and us-west-2
+- **Security Group Rules:** ✅ Port 8080 allowed between regions
+- **Route Tables:** ✅ Configured for cross-region routing
+- **Replica Communication:** ✅ Working (34ms latency)
+
+## Next Steps - Once Enterprise License is Obtained
+
+### 1. Apply Enterprise License to Coder Deployment
+
+The license needs to be applied to the primary Coder deployment at https://coderdemo.io. This is typically done through the Coder admin UI or by setting the `CODER_LICENSE` environment variable.
+
+### 2. Deploy Workspace Proxy to us-west-2
+
+Run from `infra/aws/us-west-2/k8s/coder-proxy`:
+
+```bash
+terraform apply -var-file=terraform.tfvars -auto-approve
+```
+
+This will:
+
+1. Create the workspace proxy "Oregon" in Coder API
+2. Deploy proxy pods to us-west-2 EKS cluster
+3. Create namespace and secrets
+4. Configure NLB with ACM certificate
+5. Enable manual region selection in Coder UI
+
+### 3. Verify Workspace Proxy Registration
+
+Check that the proxy appears in Coder:
+
+```bash
+curl -H "Coder-Session-Token: " https://coderdemo.io/api/v2/workspaceproxies
+```
+
+Expected response:
+
+```json
+{
+ "proxies": [
+ {
+ "id": "...",
+ "name": "us-west-2",
+ "display_name": "Oregon",
+ "icon": "/emojis/1f1fa-1f1f8.png",
+ "url": "https://us-west-2.coderdemo.io",
+ "healthy": true
+ }
+ ]
+}
+```
+
+### 4. Configure Route53 (If Not Already Done)
+
+Ensure latency-based routing is configured for automatic region selection:
+
+- A record for `coderdemo.io` → us-east-2 NLB (latency-based)
+- A record for `coderdemo.io` → us-west-2 NLB (latency-based)
+- CNAME for `*.coderdemo.io` → coderdemo.io
+- A record for `us-west-2.coderdemo.io` → us-west-2 NLB (simple routing)
+
+### 5. Test User Experience
+
+1. Navigate to https://coderdemo.io
+2. Verify latency-based routing connects to nearest region
+3. Look for "Region" selector in Coder UI
+4. Click "Refresh latency" to see both regions
+5. Manually select "Oregon" region
+6. Verify connection switches to us-west-2
+
+## Configuration Files
+
+### Workspace Proxy Configuration
+
+`infra/aws/us-west-2/k8s/coder-proxy/terraform.tfvars`:
+
+```terraform
+cluster_name = "coderdemo-usw2"
+cluster_region = "us-west-2"
+cluster_profile = "noah@coder.com"
+
+coder_proxy_name = "us-west-2"
+coder_proxy_display_name = "Oregon"
+coder_proxy_icon = "/emojis/1f1fa-1f1f8.png"
+
+coder_access_url = "https://coderdemo.io"
+coder_proxy_url = "https://us-west-2.coderdemo.io"
+coder_proxy_wildcard_url = "*.us-west-2.coderdemo.io"
+
+coder_token = ""
+
+addon_version = "2.27.1"
+image_repo = "ghcr.io/coder/coder"
+image_tag = "v2.27.1"
+
+acme_registration_email = "admin@coderdemo.io"
+cloudflare_api_token = "placeholder"
+kubernetes_ssl_secret_name = "coder-proxy-tls"
+kubernetes_create_ssl_secret = false
+```
+
+### VPC Peering Configuration
+
+`infra/aws/us-east-2/vpc-peering/terraform.tfvars`:
+
+```terraform
+profile = "noah@coder.com"
+requester_vpc_id = ""
+accepter_vpc_id = ""
+requester_vpc_cidr = "10.0.0.0/16"
+accepter_vpc_cidr = "10.1.0.0/16"
+requester_node_security_group_id = ""
+accepter_node_security_group_id = ""
+```
+
+## Reference Links
+
+- [Coder Enterprise Licensing](https://coder.com/docs/coder-oss/latest/admin/licensing)
+- [Workspace Proxies Documentation](https://coder.com/docs/coder-oss/latest/admin/workspace-proxies)
+- [Multi-Region Deployment Guide](https://coder.com/docs/coder-oss/latest/admin/multi-region)
+
+## Important Notes
+
+1. **Token Security:** The Coder API token is stored in terraform.tfvars. Consider using AWS Secrets Manager for production.
+
+2. **S3 Backend:** All Terraform state is stored in S3 bucket in us-east-2. See backend.hcl files for configuration.
+
+3. **Replica Communication:** Replicas use DERP protocol on port 8080 for coordination. Ensure security groups allow this traffic.
+
+4. **DNS Propagation:** After deploying workspace proxy, DNS changes may take 5-60 minutes to propagate globally.
+
+5. **Certificate Management:** ACM certificates are managed separately. Ensure `*.us-west-2.coderdemo.io` certificate is issued in us-west-2.
+
+## Troubleshooting
+
+### If Workspace Proxy Deployment Fails
+
+1. Verify Enterprise license is applied: Check Coder admin UI → Deployment → License
+2. Check Coder API token has admin permissions
+3. Verify network connectivity from us-west-2 to primary deployment
+4. Check pod logs: `kubectl logs -n coder-proxy -l app.kubernetes.io/name=coder`
+
+### If Users Don't See Region Selector
+
+1. Ensure workspace proxy status is "healthy" in API
+2. Hard refresh browser (Cmd+Shift+R / Ctrl+Shift+R)
+3. Verify user has permission to see workspace proxies
+4. Check Coder version supports workspace proxies (v2.0+)
+
+## Summary
+
+**What Works Now:**
+
+- ✅ Multi-region Coder replicas (us-east-2, us-west-2)
+- ✅ Automatic failover between replicas
+- ✅ Cross-region communication via DERP
+- ✅ 34ms inter-region latency
+- ✅ Cost-optimized Aurora database
+
+**What's Pending:**
+
+- ⏸️ Manual region selection in UI (blocked by Enterprise license)
+- ⏸️ Workspace proxy deployment (configuration ready)
+
+**Action Required:**
+
+1. Obtain Coder Enterprise license
+2. Apply license to deployment
+3. Run `terraform apply` for workspace proxy
+4. Verify region selector appears in UI
diff --git a/docs/cost-optimization-strategy.md b/docs/cost-optimization-strategy.md
new file mode 100644
index 0000000..12da3ff
--- /dev/null
+++ b/docs/cost-optimization-strategy.md
@@ -0,0 +1,130 @@
+# Cost Optimization Strategy for Coder Demo
+
+## Mixed Capacity Approach
+
+### Node Group Strategy
+
+**System Nodes (ON_DEMAND)**
+
+- **Purpose**: Run critical Kubernetes infrastructure
+- **Workloads**: CoreDNS, kube-proxy, metrics-server, cert-manager, AWS LB Controller
+- **Size**: t4g.medium (ARM Graviton)
+- **Count**: 1-2 nodes minimum
+- **Cost**: ~$24/month (1 node) to $48/month (2 nodes)
+
+**Application Nodes (MIXED: 20% On-Demand, 80% Spot via Karpenter)**
+
+- **Purpose**: Run Coder server and workspaces
+- **Spot Savings**: 70-90% cost reduction
+- **Interruption Risk**: Mitigated by:
+ - Multiple instance types (diversified Spot pools)
+ - Karpenter auto-rebalancing
+ - Pod Disruption Budgets
+
+### Karpenter NodePool Configuration
+
+#### 1. Coder Server NodePool (ON_DEMAND Priority)
+
+```yaml
+# Conceptual shorthand, not literal Karpenter NodePool syntax: in practice each
+# tier is a separate NodePool (see the sketch after these examples)
+capacity_type: ["on-demand", "spot"] # Prefer On-Demand, fall back to Spot
+weight:
+  on-demand: 100 # Higher priority
+  spot: 10
+```
+
+#### 2. Coder Workspace NodePool (SPOT Priority)
+
+```yaml
+capacity_type: ["spot", "on-demand"] # Prefer Spot, fall back to On-Demand
+weight:
+ spot: 100 # Higher priority
+ on-demand: 10
+```
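+
+In actual Karpenter terms, each priority tier is its own NodePool: `spec.weight` orders the pools and a `karpenter.sh/capacity-type` requirement pins the tier. A hedged sketch of the Spot-first workspace pool as a Terraform manifest (names and the referenced EC2NodeClass are placeholders):
+
+```hcl
+resource "kubernetes_manifest" "ws_spot_nodepool" {
+  manifest = {
+    apiVersion = "karpenter.sh/v1"
+    kind       = "NodePool"
+    metadata   = { name = "coder-ws-spot" }
+    spec = {
+      weight = 100 # tried before the on-demand fallback pool (weight 10)
+      template = {
+        spec = {
+          requirements = [
+            {
+              key      = "karpenter.sh/capacity-type"
+              operator = "In"
+              values   = ["spot"]
+            },
+          ]
+          nodeClassRef = {
+            group = "karpenter.k8s.aws"
+            kind  = "EC2NodeClass"
+            name  = "default" # placeholder EC2NodeClass
+          }
+        }
+      }
+    }
+  }
+}
+```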
+
+### Risk Mitigation
+
+**Spot Interruption Handling:**
+
+1. **2-minute warning** → Karpenter automatically provisions a replacement
+2. **Multiple instance types** → 15+ types reduce the interruption rate to <1%
+3. **Pod Disruption Budgets** → Ensure minimum replicas always running (sketched below)
+4. **Karpenter Consolidation** → Automatically moves pods before termination
+
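+The Pod Disruption Budget from step 3 is a one-resource guardrail. A sketch for the Coder server Deployment, assuming its standard `app.kubernetes.io/name=coder` label:
+
+```hcl
+resource "kubernetes_pod_disruption_budget_v1" "coder" {
+  metadata {
+    name      = "coder"
+    namespace = "coder"
+  }
+
+  spec {
+    min_available = "1" # keep at least one replica through Spot churn
+
+    selector {
+      match_labels = {
+        "app.kubernetes.io/name" = "coder"
+      }
+    }
+  }
+}
+```
+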
+**Example Instance Type Diversity:**
+
+```
+Spot Pool: t4g.medium, t4g.large, t3a.medium, t3a.large,
+ m6g.medium, m6g.large, m6a.medium, m6a.large
+```
+
+### Cost Breakdown
+
+| Component | Instance Type | Capacity | Monthly Cost |
+| ------------------ | ------------- | --------- | ------------- |
+| System Nodes (2) | t4g.medium | ON_DEMAND | $48 |
+| Coder Server (2) | t4g.large | 80% SPOT | $28 (vs $140) |
+| Workspaces (avg 5) | t4g.xlarge | 90% SPOT | $75 (vs $750) |
+| **Total** | | **Mixed** | **$151/mo** |
+
+**vs All On-Demand:** $938/month β **84% savings**
+
+### Dynamic Scaling
+
+**Low Usage (nights/weekends):**
+
+- Scale to zero workspaces
+- Keep 1 system node + 1 Coder server node
+- Cost: ~$48/month during idle
+
+**High Usage (business hours):**
+
+- Auto-scale workspaces on Spot
+- Karpenter provisions nodes in <60 seconds
+- Cost: ~$150-200/month during peak
+
+### Monitoring & Alerts
+
+**CloudWatch Alarms:**
+
+- Spot interruption rate > 5%
+- Available On-Demand capacity < 20%
+- Karpenter provisioning failures
+
+**Response:**
+
+- Automatic fallback to On-Demand
+- Email alerts to ops team
+- Karpenter adjusts instance type mix
+
+## Implementation Timeline
+
+1. ✅ Deploy EKS with ON_DEMAND system nodes
+2. ⏳ Deploy Karpenter
+3. ⏳ Configure mixed-capacity NodePools
+4. ⏳ Deploy Coder with node affinity rules
+5. ⏳ Test Spot interruption handling
+6. ⏳ Enable auto-scaling policies
+
+## Fallback Plan
+
+If Spot becomes unreliable (rare):
+
+1. Update Karpenter NodePool to 100% On-Demand
+2. `kubectl apply -f nodepool-ondemand.yaml`
+3. Karpenter gracefully migrates pods
+4. Takes ~5 minutes, zero downtime
+
+## Best Practices
+
+✅ **DO:**
+
+- Use multiple Spot instance types (10+)
+- Set Pod Disruption Budgets
+- Monitor Spot interruption rates
+- Test failover regularly
+
+❌ **DON'T:**
+
+- Run databases on Spot (use RDS)
+- Use Spot for single-replica critical services
+- Rely on single instance type for Spot
diff --git a/infra/aws/eu-west-2/eks/main.tf b/infra/aws/eu-west-2/eks/main.tf
index 2bffa33..bed6bd1 100644
--- a/infra/aws/eu-west-2/eks/main.tf
+++ b/infra/aws/eu-west-2/eks/main.tf
@@ -30,7 +30,7 @@ variable "cluster_version" {
variable "cluster_instance_type" {
description = "EKS Instance Size/Type"
- default = "t3.xlarge"
+ default = "t4g.medium" # ARM Graviton for cost optimization
type = string
}
@@ -179,7 +179,7 @@ module "cluster" {
system = {
min_size = 0
max_size = 10
- desired_size = 0 # Cant be modified after creation. Override from AWS Console
+ desired_size = 1 # Can't be modified after creation. Override from AWS Console
labels = local.cluster_asg_node_labels
instance_types = [var.cluster_instance_type]
diff --git a/infra/aws/eu-west-2/k8s/cert-manager/main.tf b/infra/aws/eu-west-2/k8s/cert-manager/main.tf
index ab12c5d..d0de2cf 100644
--- a/infra/aws/eu-west-2/k8s/cert-manager/main.tf
+++ b/infra/aws/eu-west-2/k8s/cert-manager/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -41,11 +41,6 @@ variable "addon_version" {
default = "v1.18.2"
}
-variable "cloudflare_api_token" {
- type = string
- sensitive = true
-}
-
provider "aws" {
region = var.cluster_region
profile = var.cluster_profile
@@ -60,7 +55,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -78,7 +73,6 @@ module "cert-manager" {
cluster_name = var.cluster_name
cluster_oidc_provider_arn = var.cluster_oidc_provider_arn
- namespace = var.addon_namespace
- helm_version = var.addon_version
- cloudflare_token_secret = var.cloudflare_api_token
+ namespace = var.addon_namespace
+ helm_version = var.addon_version
}
\ No newline at end of file
diff --git a/infra/aws/eu-west-2/k8s/coder-proxy/main.tf b/infra/aws/eu-west-2/k8s/coder-proxy/main.tf
index b9704ed..06b5c6b 100644
--- a/infra/aws/eu-west-2/k8s/coder-proxy/main.tf
+++ b/infra/aws/eu-west-2/k8s/coder-proxy/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -101,11 +101,6 @@ variable "kubernetes_create_ssl_secret" {
default = true
}
-variable "cloudflare_api_token" {
- type = string
- sensitive = true
-}
-
provider "aws" {
region = var.cluster_region
profile = var.cluster_profile
@@ -120,7 +115,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -161,7 +156,6 @@ module "coder-proxy" {
proxy_token_config = {
name = "coder-proxy"
}
- cloudflare_api_token = var.cloudflare_api_token
ssl_cert_config = {
name = var.kubernetes_ssl_secret_name
create_secret = var.kubernetes_create_ssl_secret
diff --git a/infra/aws/eu-west-2/k8s/coder-ws/main.tf b/infra/aws/eu-west-2/k8s/coder-ws/main.tf
index 451a056..6c9140b 100644
--- a/infra/aws/eu-west-2/k8s/coder-ws/main.tf
+++ b/infra/aws/eu-west-2/k8s/coder-ws/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -98,7 +98,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/eu-west-2/k8s/ebs-controller/main.tf b/infra/aws/eu-west-2/k8s/ebs-controller/main.tf
index d7f1f56..5194ec7 100644
--- a/infra/aws/eu-west-2/k8s/ebs-controller/main.tf
+++ b/infra/aws/eu-west-2/k8s/ebs-controller/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -55,7 +55,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/eu-west-2/k8s/karpenter/main.tf b/infra/aws/eu-west-2/k8s/karpenter/main.tf
index f5b34f8..4adb718 100644
--- a/infra/aws/eu-west-2/k8s/karpenter/main.tf
+++ b/infra/aws/eu-west-2/k8s/karpenter/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -54,7 +54,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -181,7 +181,7 @@ module "karpenter-addon" {
block_device_mappings = [{
device_name = "/dev/xvda"
ebs = {
- volume_size = "1400Gi"
+ volume_size = "500Gi"
volume_type = "gp3"
}
}, {
diff --git a/infra/aws/eu-west-2/k8s/lb-controller/main.tf b/infra/aws/eu-west-2/k8s/lb-controller/main.tf
index 1f6a0fa..479e9a1 100644
--- a/infra/aws/eu-west-2/k8s/lb-controller/main.tf
+++ b/infra/aws/eu-west-2/k8s/lb-controller/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -60,7 +60,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/eu-west-2/k8s/metrics-server/main.tf b/infra/aws/eu-west-2/k8s/metrics-server/main.tf
index d808c74..cce9447 100644
--- a/infra/aws/eu-west-2/k8s/metrics-server/main.tf
+++ b/infra/aws/eu-west-2/k8s/metrics-server/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
}
backend "s3" {}
@@ -48,7 +48,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-east-2/README.md b/infra/aws/us-east-2/README.md
new file mode 100644
index 0000000..5ff4543
--- /dev/null
+++ b/infra/aws/us-east-2/README.md
@@ -0,0 +1,140 @@
+# Terraform Backend Configuration
+
+## Security Notice
+
+This directory uses a remote S3 backend for state management, but **backend configuration files are gitignored** to prevent leaking AWS account IDs and other sensitive information.
+
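+The relevant `.gitignore` entries look roughly like this (exact patterns in this repo may differ):
+
+```
+backend.tf
+backend.conf
+*.tfvars
+!*.tfvars.example
+```
+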
+## Local Setup
+
+1. **Get backend configuration from a teammate** or **retrieve it from AWS**:
+
+ ```bash
+ # Get S3 bucket name (it contains the account ID)
+ aws s3 ls | grep terraform-state
+
+ # Get DynamoDB table name
+ aws dynamodb list-tables --query 'TableNames[?contains(@, `terraform-lock`)]'
+ ```
+
+2. **Create backend configuration** for each module:
+
+ Each Terraform module needs a `backend.tf` file (this file is gitignored). Create it manually:
+
+ ```bash
+ cd infra/aws/us-east-2/vpc # or any other module
+ ```
+
+ Create `backend.tf`:
+
+ ```hcl
+ terraform {
+ backend "s3" {
+ bucket = "YOUR-BUCKET-NAME-HERE"
+ key = "us-east-2/vpc/terraform.tfstate" # Update path per module
+ region = "us-east-2"
+ dynamodb_table = "YOUR-TABLE-NAME-HERE"
+ encrypt = true
+ }
+ }
+ ```
+
+ **Important**: Update the `key` path for each module:
+ - VPC: `us-east-2/vpc/terraform.tfstate`
+ - EKS: `us-east-2/eks/terraform.tfstate`
+ - ACM: `us-east-2/acm/terraform.tfstate`
+ - etc.
+
+3. **Initialize Terraform**:
+ ```bash
+ terraform init
+ ```
+
+## GitHub Actions Setup
+
+GitHub Actions uses secrets to configure the backend securely. Required secrets:
+
+1. `TF_STATE_BUCKET` - S3 bucket name
+2. `TF_STATE_LOCK_TABLE` - DynamoDB table name
+3. `AWS_ROLE_ARN` - IAM role ARN for OIDC authentication
+
+These are configured in: Repository Settings > Secrets and variables > Actions
+
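+If you use the GitHub CLI, the same secrets can be set from a terminal (values are placeholders):
+
+```bash
+gh secret set TF_STATE_BUCKET --body "YOUR-BUCKET-NAME"
+gh secret set TF_STATE_LOCK_TABLE --body "YOUR-TABLE-NAME"
+gh secret set AWS_ROLE_ARN --body "arn:aws:iam::ACCOUNT_ID:role/YOUR-ROLE"
+```
+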
+## Alternative: Using Backend Config File
+
+Instead of creating `backend.tf`, you can use a config file:
+
+1. Create `backend.conf` (gitignored):
+
+ ```
+ bucket = "YOUR-BUCKET-NAME"
+ dynamodb_table = "YOUR-TABLE-NAME"
+ region = "us-east-2"
+ encrypt = true
+ ```
+
+2. Initialize with:
+ ```bash
+ terraform init -backend-config=backend.conf -backend-config="key=us-east-2/vpc/terraform.tfstate"
+ ```
+
+## Why This Approach?
+
+- **Security**: Account IDs and resource names aren't committed to Git
+- **Flexibility**: Each developer/environment can use different backends
+- **Compliance**: Prevents accidental exposure of infrastructure details
+- **Best Practice**: Follows AWS security recommendations
+
+## Secret Scanning Protection
+
+This repository has automated secret scanning to prevent accidental exposure of credentials:
+
+### GitHub Actions (Automated)
+
+- **Gitleaks** - Scans every PR and push for secrets
+- **TruffleHog** - Additional verification layer
+- **Custom Pattern Matching** - Catches common secret patterns
+- **Auto-Revert** - Automatically reverts commits to main that contain secrets
+
+### Pre-commit Hooks (Local)
+
+Catch secrets before they reach GitHub:
+
+```bash
+# Install pre-commit
+pip install pre-commit
+
+# Install git hooks
+pre-commit install
+
+# Test on all files
+pre-commit run --all-files
+```
+
+### What Gets Detected
+
+- AWS Access Keys (AKIA...)
+- API Keys and Tokens
+- Private Keys (RSA, SSH, etc.)
+- Database connection strings with passwords
+- GitHub Personal Access Tokens
+- Stripe API keys
+- High-entropy strings (likely secrets)
+
+### If Secrets Are Detected
+
+1. **PR is blocked** - Cannot merge until secrets are removed
+2. **Automatic notification** - PR comment explains the issue
+3. **Required actions**:
+ - Remove the secret from code
+ - Use GitHub Secrets or environment variables
+ - Rotate/invalidate the exposed credential
+
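+For example, rotating a leaked AWS access key and scrubbing it from history might look like this (a sketch using the separate `git-filter-repo` tool; the user name, key ID, and remote URL are placeholders):
+
+```bash
+# 1. Rotate: create a replacement key, then delete the exposed one
+aws iam create-access-key --user-name YOUR_USER
+aws iam delete-access-key --user-name YOUR_USER --access-key-id AKIAEXAMPLEKEYID
+
+# 2. Scrub history (rewrites commits - coordinate with your team first)
+pip install git-filter-repo
+echo 'AKIAEXAMPLEKEYID==>REDACTED' > /tmp/replacements.txt
+git filter-repo --replace-text /tmp/replacements.txt
+
+# filter-repo removes remotes as a safety measure; re-add yours before pushing
+git remote add origin git@github.com:YOUR_ORG/YOUR_REPO.git
+git push --force-with-lease origin main
+```
+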
+## Migrating Existing State
+
+If you have local state to migrate:
+
+```bash
+terraform init -migrate-state
+```
+
+Terraform will prompt to copy existing state to the remote backend.
diff --git a/infra/aws/us-east-2/acm/main.tf b/infra/aws/us-east-2/acm/main.tf
new file mode 100644
index 0000000..e37c97e
--- /dev/null
+++ b/infra/aws/us-east-2/acm/main.tf
@@ -0,0 +1,107 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.0"
+ }
+ }
+}
+
+variable "cluster_region" {
+ description = "AWS region for ACM certificate"
+ type = string
+ default = "us-east-2"
+}
+
+variable "cluster_profile" {
+ description = "AWS profile"
+ type = string
+ default = "default"
+}
+
+variable "domain_name" {
+ description = "Domain name for Coder"
+ type = string
+ default = "coderdemo.io"
+}
+
+variable "hosted_zone_id" {
+ description = "Route 53 Hosted Zone ID"
+ type = string
+}
+
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+ alias = "acm"
+}
+
+# Provider for Route 53 (may be in different account)
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+ alias = "route53"
+}
+
+# ACM Certificate for Coder with wildcard
+resource "aws_acm_certificate" "coder" {
+ provider = aws.acm
+ domain_name = var.domain_name
+ validation_method = "DNS"
+
+ subject_alternative_names = [
+ "*.${var.domain_name}"
+ ]
+
+ lifecycle {
+ create_before_destroy = true
+ }
+
+ tags = {
+ Name = "coder-certificate"
+ Environment = "test"
+ ManagedBy = "terraform"
+ }
+}
+
+# Route 53 validation records
+resource "aws_route53_record" "cert_validation" {
+ provider = aws.route53
+ for_each = {
+ for dvo in aws_acm_certificate.coder.domain_validation_options : dvo.domain_name => {
+ name = dvo.resource_record_name
+ record = dvo.resource_record_value
+ type = dvo.resource_record_type
+ }
+ }
+
+ allow_overwrite = true
+ name = each.value.name
+ records = [each.value.record]
+ ttl = 60
+ type = each.value.type
+ zone_id = var.hosted_zone_id
+}
+
+# Wait for certificate validation
+resource "aws_acm_certificate_validation" "coder" {
+ provider = aws.acm
+ certificate_arn = aws_acm_certificate.coder.arn
+ validation_record_fqdns = [for record in aws_route53_record.cert_validation : record.fqdn]
+}
+
+# Outputs
+output "certificate_arn" {
+ description = "ARN of the validated ACM certificate"
+ value = aws_acm_certificate_validation.coder.certificate_arn
+}
+
+output "domain_name" {
+ description = "Domain name for Coder"
+ value = var.domain_name
+}
+
+output "validation_status" {
+ description = "Certificate validation status"
+ value = "Certificate validated and ready to use"
+}
diff --git a/infra/aws/us-east-2/acm/terraform.tfvars.example b/infra/aws/us-east-2/acm/terraform.tfvars.example
new file mode 100644
index 0000000..d9adc60
--- /dev/null
+++ b/infra/aws/us-east-2/acm/terraform.tfvars.example
@@ -0,0 +1,7 @@
+# ACM Certificate configuration for Coder
+# Copy this to terraform.tfvars and fill in your values
+
+cluster_region = "us-east-2"
+cluster_profile = "YOUR_AWS_PROFILE"
+domain_name = "YOUR_DOMAIN.com"
+hosted_zone_id = "YOUR_ROUTE53_ZONE_ID"
diff --git a/infra/aws/us-east-2/eks/main.tf b/infra/aws/us-east-2/eks/main.tf
index 80c15aa..6f59178 100644
--- a/infra/aws/us-east-2/eks/main.tf
+++ b/infra/aws/us-east-2/eks/main.tf
@@ -30,7 +30,7 @@ variable "cluster_version" {
variable "cluster_instance_type" {
description = "EKS Instance Size/Type"
- default = "t3.xlarge"
+ default = "t4g.xlarge"
type = string
}
@@ -141,17 +141,115 @@ module "eks" {
desired_size = 0 # Cant be modified after creation. Override from AWS Console
labels = local.cluster_asg_node_labels
- instance_types = [var.cluster_instance_type]
- capacity_type = "ON_DEMAND"
+ # Cost optimization: Graviton ARM instances
+ # IMPORTANT: ON_DEMAND for system nodes - production demo cannot break!
+ instance_types = [var.cluster_instance_type, "t4g.small", "t4g.large"] # ARM only
+ ami_type = "AL2023_ARM_64_STANDARD" # ARM-based AMI
+ capacity_type = "ON_DEMAND" # System infrastructure must be stable
+
iam_role_additional_policies = {
AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
STSAssumeRole = aws_iam_policy.sts.arn
}
+ # Cost optimization: gp3 volumes with smaller size
+ block_device_mappings = [{
+ device_name = "/dev/xvda"
+ ebs = {
+ volume_type = "gp3" # Better performance, same cost as gp2
+ volume_size = 20 # Reduced from default 50GB
+ delete_on_termination = true
+ encrypted = true
+ }
+ }]
+
# System Nodes should not be public
subnet_ids = var.private_subnet_ids
}
}
tags = local.tags
-}
\ No newline at end of file
+}
+# VPC Endpoints for cost optimization (reduce NAT Gateway usage)
+resource "aws_vpc_endpoint" "s3" {
+ vpc_id = var.vpc_id
+ service_name = "com.amazonaws.${var.region}.s3"
+ route_table_ids = flatten([
+ data.aws_route_tables.private.ids
+ ])
+ tags = merge(local.tags, {
+ Name = "${var.name}-s3-endpoint"
+ })
+}
+
+resource "aws_vpc_endpoint" "ecr_api" {
+ vpc_id = var.vpc_id
+ service_name = "com.amazonaws.${var.region}.ecr.api"
+ vpc_endpoint_type = "Interface"
+ subnet_ids = var.private_subnet_ids
+ security_group_ids = [aws_security_group.vpc_endpoints.id]
+ private_dns_enabled = true
+ tags = merge(local.tags, {
+ Name = "${var.name}-ecr-api-endpoint"
+ })
+}
+
+resource "aws_vpc_endpoint" "ecr_dkr" {
+ vpc_id = var.vpc_id
+ service_name = "com.amazonaws.${var.region}.ecr.dkr"
+ vpc_endpoint_type = "Interface"
+ subnet_ids = var.private_subnet_ids
+ security_group_ids = [aws_security_group.vpc_endpoints.id]
+ private_dns_enabled = true
+ tags = merge(local.tags, {
+ Name = "${var.name}-ecr-dkr-endpoint"
+ })
+}
+
+# Security group for VPC endpoints
+resource "aws_security_group" "vpc_endpoints" {
+ name_prefix = "${var.name}-vpc-endpoints"
+ description = "Security group for VPC endpoints"
+ vpc_id = var.vpc_id
+
+ ingress {
+ from_port = 443
+ to_port = 443
+ protocol = "tcp"
+ cidr_blocks = ["10.0.0.0/16"]
+ }
+
+ egress {
+ from_port = 0
+ to_port = 0
+ protocol = "-1"
+ cidr_blocks = ["0.0.0.0/0"]
+ }
+
+ tags = merge(local.tags, {
+ Name = "${var.name}-vpc-endpoints-sg"
+ })
+}
+
+# Data source for route tables
+data "aws_route_tables" "private" {
+ vpc_id = var.vpc_id
+ filter {
+ name = "tag:Name"
+ values = ["*private*"]
+ }
+}
+
+# Outputs
+output "vpc_endpoint_s3_id" {
+ description = "S3 VPC Endpoint ID"
+ value = aws_vpc_endpoint.s3.id
+}
+
+output "vpc_endpoint_ecr_ids" {
+ description = "ECR VPC Endpoint IDs"
+ value = {
+ api = aws_vpc_endpoint.ecr_api.id
+ dkr = aws_vpc_endpoint.ecr_dkr.id
+ }
+}
diff --git a/infra/aws/us-east-2/k8s/cert-manager/main.tf b/infra/aws/us-east-2/k8s/cert-manager/main.tf
index ab12c5d..d0de2cf 100644
--- a/infra/aws/us-east-2/k8s/cert-manager/main.tf
+++ b/infra/aws/us-east-2/k8s/cert-manager/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -41,11 +41,6 @@ variable "addon_version" {
default = "v1.18.2"
}
-variable "cloudflare_api_token" {
- type = string
- sensitive = true
-}
-
provider "aws" {
region = var.cluster_region
profile = var.cluster_profile
@@ -60,7 +55,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -78,7 +73,6 @@ module "cert-manager" {
cluster_name = var.cluster_name
cluster_oidc_provider_arn = var.cluster_oidc_provider_arn
- namespace = var.addon_namespace
- helm_version = var.addon_version
- cloudflare_token_secret = var.cloudflare_api_token
+ namespace = var.addon_namespace
+ helm_version = var.addon_version
}
\ No newline at end of file
diff --git a/infra/aws/us-east-2/k8s/coder-server/main.tf b/infra/aws/us-east-2/k8s/coder-server/main.tf
index 79a8fd2..fb2a908 100644
--- a/infra/aws/us-east-2/k8s/coder-server/main.tf
+++ b/infra/aws/us-east-2/k8s/coder-server/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -141,11 +141,6 @@ variable "kubernetes_create_ssl_secret" {
default = true
}
-variable "cloudflare_api_token" {
- type = string
- sensitive = true
-}
-
variable "oidc_sign_in_text" {
type = string
}
@@ -176,7 +171,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -198,6 +193,13 @@ provider "acme" {
server_url = var.acme_server_url
}
+# Fetch ACM certificate dynamically by domain to avoid hardcoding sensitive ARNs
+data "aws_acm_certificate" "coder" {
+ domain = trimsuffix(trimprefix(var.coder_access_url, "https://"), "/")
+ statuses = ["ISSUED"]
+ most_recent = true
+}
+
module "coder-server" {
source = "../../../../../modules/k8s/bootstrap/coder-server"
@@ -208,13 +210,12 @@ module "coder-server" {
namespace = "coder"
acme_registration_email = var.acme_registration_email
acme_days_until_renewal = 90
- replica_count = 2
+ replica_count = 1 # HA requires Enterprise license
helm_version = var.addon_version
image_repo = var.image_repo
image_tag = var.image_tag
primary_access_url = var.coder_access_url
wildcard_access_url = var.coder_wildcard_access_url
- cloudflare_api_token = var.cloudflare_api_token
coder_experiments = var.coder_experiments
coder_builtin_provisioner_count = var.coder_builtin_provisioner_count
coder_github_allowed_orgs = var.coder_github_allowed_orgs
@@ -237,10 +238,25 @@ module "coder-server" {
github_external_auth_secret_client_id = var.coder_github_external_auth_secret_client_id
github_external_auth_secret_client_secret = var.coder_github_external_auth_secret_client_secret
tags = {}
+ env_vars = {
+ # Disable redirect since NLB terminates TLS and forwards plain HTTP to backend
+ # Without this, Coder sees HTTP and redirects to HTTPS, causing infinite redirect loop
+ CODER_REDIRECT_TO_ACCESS_URL = "false"
+ # Disable TLS on Coder itself since NLB terminates TLS
+ CODER_TLS_ENABLE = "false"
+ # Mark auth cookies as secure since users access via HTTPS
+ CODER_SECURE_AUTH_COOKIE = "true"
+ # Enable DERP server for multi-region replica communication
+ CODER_DERP_SERVER_ENABLE = "true"
+ }
service_annotations = {
- "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type" = "instance"
- "service.beta.kubernetes.io/aws-load-balancer-scheme" = "internet-facing"
- "service.beta.kubernetes.io/aws-load-balancer-attributes" = "deletion_protection.enabled=true"
+ "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type" = "instance"
+ "service.beta.kubernetes.io/aws-load-balancer-scheme" = "internet-facing"
+ "service.beta.kubernetes.io/aws-load-balancer-attributes" = "deletion_protection.enabled=false,load_balancing.cross_zone.enabled=true"
+ "service.beta.kubernetes.io/aws-load-balancer-ssl-cert" = data.aws_acm_certificate.coder.arn
+ "service.beta.kubernetes.io/aws-load-balancer-ssl-ports" = "443"
+ "service.beta.kubernetes.io/aws-load-balancer-backend-protocol" = "tcp"
+ # Subnets will be auto-detected by Load Balancer Controller using kubernetes.io/role/elb=1 tag
}
node_selector = {
"node.coder.io/managed-by" = "karpenter"
@@ -279,4 +295,24 @@ module "coder-server" {
topology_key = "kubernetes.io/hostname"
}
}]
+}
+
+# Fix service HTTPS port to forward to HTTP backend (port 8080)
+# since Coder has TLS disabled and only listens on HTTP
+resource "null_resource" "patch_coder_service" {
+ depends_on = [module.coder-server]
+
+ triggers = {
+ # Re-run patch whenever Coder configuration changes
+ always_run = timestamp()
+ }
+
+ provisioner "local-exec" {
+ command = <<-EOT
+ sleep 10
+ kubectl patch svc coder -n coder --type='json' \
+ -p='[{"op": "replace", "path": "/spec/ports/1/targetPort", "value": "http"}]' \
+ 2>/dev/null || true
+ EOT
+ }
}
\ No newline at end of file
diff --git a/infra/aws/us-east-2/k8s/coder-ws/main.tf b/infra/aws/us-east-2/k8s/coder-ws/main.tf
index 451a056..6c9140b 100644
--- a/infra/aws/us-east-2/k8s/coder-ws/main.tf
+++ b/infra/aws/us-east-2/k8s/coder-ws/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -98,7 +98,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-east-2/k8s/ebs-controller/main.tf b/infra/aws/us-east-2/k8s/ebs-controller/main.tf
index ed4efef..0c8e7a3 100644
--- a/infra/aws/us-east-2/k8s/ebs-controller/main.tf
+++ b/infra/aws/us-east-2/k8s/ebs-controller/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -60,7 +60,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-east-2/k8s/karpenter/main.tf b/infra/aws/us-east-2/k8s/karpenter/main.tf
index a01280e..cc263f5 100644
--- a/infra/aws/us-east-2/k8s/karpenter/main.tf
+++ b/infra/aws/us-east-2/k8s/karpenter/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -53,20 +53,29 @@ data "aws_eks_cluster_auth" "this" {
name = var.cluster_name
}
-provider "helm" {
- kubernetes {
- host = data.aws_eks_cluster.this.endpoint
- cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
- token = data.aws_eks_cluster_auth.this.token
- }
-}
-
provider "kubernetes" {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
}
+provider "helm" {
+ kubernetes = {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ exec = {
+ api_version = "client.authentication.k8s.io/v1beta1"
+ command = "aws"
+ args = [
+ "eks",
+ "get-token",
+ "--cluster-name", var.cluster_name,
+ "--region", var.cluster_region
+ ]
+ }
+ }
+}
+
locals {
global_node_labels = {
"node.coder.io/instance" = "coder-v2"
@@ -153,7 +162,15 @@ locals {
node_requirements = concat(local.global_node_reqs, [{
key = "node.kubernetes.io/instance-type"
operator = "In"
- values = ["c6a.32xlarge", "c5a.32xlarge"]
+ values = [
+ # Small demos (5-10 users) - Most cost-effective
+ "c6a.4xlarge", "c5a.4xlarge", # 16 vCPU / 32 GB - ~$0.18/hr spot
+ "c6a.8xlarge", "c5a.8xlarge", # 32 vCPU / 64 GB - ~$0.37/hr spot
+ # Medium demos (10-20 users)
+ "c6a.16xlarge", "c5a.16xlarge", # 64 vCPU / 128 GB - ~$0.74/hr spot
+ # Large demos (20-40 users)
+ "c6a.32xlarge", "c5a.32xlarge" # 128 vCPU / 256 GB - ~$1.47/hr spot
+ ]
}])
node_class_ref_name = "coder-ws-class"
disruption_consolidate_after = "30m"
@@ -183,7 +200,7 @@ module "karpenter-addon" {
block_device_mappings = [{
device_name = "/dev/xvda"
ebs = {
- volume_size = "1400Gi"
+ volume_size = "500Gi" // Decreased from 1400Gi to save costs; felt overkill for coder-server nodes
volume_type = "gp3"
}
}, {
@@ -198,6 +215,7 @@ module "karpenter-addon" {
subnet_selector_tags = local.provisioner_subnet_tags
sg_selector_tags = local.provisioner_sg_tags
}]
+ nodepool_configs = local.nodepool_configs
}
# import {
diff --git a/infra/aws/us-east-2/k8s/lb-controller/main.tf b/infra/aws/us-east-2/k8s/lb-controller/main.tf
index 2bf1d2c..07ed13c 100644
--- a/infra/aws/us-east-2/k8s/lb-controller/main.tf
+++ b/infra/aws/us-east-2/k8s/lb-controller/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -60,7 +60,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-east-2/k8s/litellm/main.tf b/infra/aws/us-east-2/k8s/litellm/main.tf
index 3e99231..709707a 100644
--- a/infra/aws/us-east-2/k8s/litellm/main.tf
+++ b/infra/aws/us-east-2/k8s/litellm/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
}
backend "s3" {}
diff --git a/infra/aws/us-east-2/k8s/metrics-server/main.tf b/infra/aws/us-east-2/k8s/metrics-server/main.tf
index d808c74..cce9447 100644
--- a/infra/aws/us-east-2/k8s/metrics-server/main.tf
+++ b/infra/aws/us-east-2/k8s/metrics-server/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
}
backend "s3" {}
@@ -48,7 +48,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-east-2/rds/main.tf b/infra/aws/us-east-2/rds/main.tf
index ad0e620..2adaa05 100644
--- a/infra/aws/us-east-2/rds/main.tf
+++ b/infra/aws/us-east-2/rds/main.tf
@@ -5,6 +5,10 @@ terraform {
source = "hashicorp/aws"
version = ">= 5.46"
}
+ random = {
+ source = "hashicorp/random"
+ version = "~> 3.6"
+ }
}
backend "s3" {}
}
@@ -19,20 +23,10 @@ variable "master_username" {
type = string
}
-variable "master_password" {
- description = "Database root password"
- type = string
-}
-
variable "litellm_username" {
type = string
}
-variable "litellm_password" {
- type = string
- sensitive = true
-}
-
variable "name" {
description = "Name of resource and tag prefix"
type = string
@@ -80,6 +74,17 @@ provider "aws" {
profile = var.profile
}
+# Generate secure random passwords
+resource "random_password" "coder_master_password" {
+ length = 32
+ special = true
+}
+
+resource "random_password" "litellm_password" {
+ length = 32
+ special = true
+}
+
# https://developer.hashicorp.com/terraform/tutorials/aws/aws-rds
resource "aws_db_subnet_group" "db_subnet_group" {
name = "${var.name}-db-subnet-group"
@@ -90,52 +95,85 @@ resource "aws_db_subnet_group" "db_subnet_group" {
}
}
-resource "aws_db_instance" "db" {
- identifier = "${var.name}-db"
- instance_class = var.instance_class
- allocated_storage = var.allocated_storage
- engine = "postgres"
- engine_version = "15.12"
- # backup_retention_period = 7
- username = var.master_username
- password = var.master_password
- db_name = "coder"
- db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
- vpc_security_group_ids = [aws_security_group.allow-port-5432.id]
- publicly_accessible = false
- skip_final_snapshot = false
+# Aurora Serverless v2 Cluster for Coder
+resource "aws_rds_cluster" "coder" {
+ cluster_identifier = "${var.name}-aurora-cluster"
+ engine = "aurora-postgresql"
+ engine_mode = "provisioned"
+ engine_version = "15.8"
+ database_name = "coder"
+ master_username = var.master_username
+ master_password = random_password.coder_master_password.result
+ db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
+ vpc_security_group_ids = [aws_security_group.allow-port-5432.id]
+ backup_retention_period = 7
+ preferred_backup_window = "03:00-04:00"
+ skip_final_snapshot = false
+ storage_encrypted = true
+
+ serverlessv2_scaling_configuration {
+ min_capacity = 0.5 # 0.5 ACU = 1 GB RAM (idle state)
+ max_capacity = 16 # 16 ACU = 32 GB RAM (handles 5K-10K users)
+ }
tags = {
- Name = "${var.name}-rds-db"
+ Name = "${var.name}-aurora-coder"
}
- lifecycle {
- ignore_changes = [
- snapshot_identifier
- ]
+}
+
+# Aurora Serverless v2 Instance for Coder (Single writer instance)
+resource "aws_rds_cluster_instance" "coder_writer" {
+ identifier = "${var.name}-aurora-coder-writer"
+ cluster_identifier = aws_rds_cluster.coder.id
+ instance_class = "db.serverless"
+ engine = aws_rds_cluster.coder.engine
+ engine_version = "15.8"
+ publicly_accessible = false
+ db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
+
+ tags = {
+ Name = "${var.name}-aurora-coder-writer"
}
}
-resource "aws_db_instance" "litellm" {
- identifier = "litellm"
- instance_class = "db.m5.large"
- allocated_storage = 50
- engine = "postgres"
- engine_version = "15.12"
- username = var.litellm_username
- password = var.litellm_password
- db_name = "litellm"
- db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
- vpc_security_group_ids = [aws_security_group.allow-port-5432.id]
- publicly_accessible = false
- skip_final_snapshot = false
+# Aurora Serverless v2 Cluster for LiteLLM
+resource "aws_rds_cluster" "litellm" {
+ cluster_identifier = "litellm-aurora-cluster"
+ engine = "aurora-postgresql"
+ engine_mode = "provisioned"
+ engine_version = "15.8"
+ database_name = "litellm"
+ master_username = var.litellm_username
+ master_password = random_password.litellm_password.result
+ db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
+ vpc_security_group_ids = [aws_security_group.allow-port-5432.id]
+ backup_retention_period = 7
+ preferred_backup_window = "04:00-05:00"
+ skip_final_snapshot = false
+ storage_encrypted = true
+
+ serverlessv2_scaling_configuration {
+ min_capacity = 0.5 # 0.5 ACU = 1 GB RAM (idle state)
+ max_capacity = 8 # 8 ACU = 16 GB RAM (handles moderate usage)
+ }
tags = {
- Name = "litellm"
+ Name = "litellm-aurora"
}
- lifecycle {
- ignore_changes = [
- snapshot_identifier
- ]
+}
+
+# Aurora Serverless v2 Instance for LiteLLM
+resource "aws_rds_cluster_instance" "litellm_writer" {
+ identifier = "litellm-aurora-writer"
+ cluster_identifier = aws_rds_cluster.litellm.id
+ instance_class = "db.serverless"
+ engine = aws_rds_cluster.litellm.engine
+ engine_version = "15.8"
+ publicly_accessible = false
+ db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
+
+ tags = {
+ Name = "litellm-aurora-writer"
}
}
@@ -151,12 +189,18 @@ resource "aws_vpc_security_group_ingress_rule" "postgres" {
to_port = 5432
}
-resource "aws_vpc_security_group_egress_rule" "all" {
+# Allow access from us-west-2 VPC for multi-region deployment
+resource "aws_vpc_security_group_ingress_rule" "postgres_usw2" {
security_group_id = aws_security_group.allow-port-5432.id
- cidr_ipv4 = "0.0.0.0/0"
- ip_protocol = -1
+ cidr_ipv4 = "10.1.0.0/16"
+ ip_protocol = "tcp"
+ from_port = 5432
+ to_port = 5432
}
+# No egress rules needed - RDS only responds to inbound connections
+# This follows security best practice of least privilege
+
resource "aws_security_group" "allow-port-5432" {
vpc_id = var.vpc_id
name = "${var.name}-all-port-5432"
@@ -166,23 +210,95 @@ resource "aws_security_group" "allow-port-5432" {
}
}
-output "rds_port" {
- description = "Database instance port"
- value = aws_db_instance.db.port
+# Store Coder DB credentials in Secrets Manager
+resource "aws_secretsmanager_secret" "coder_db" {
+ name_prefix = "${var.name}-coder-db-"
+ description = "Coder PostgreSQL database credentials"
+ recovery_window_in_days = 7
+
+ tags = {
+ Name = "${var.name}-coder-db-secret"
+ }
+}
+
+resource "aws_secretsmanager_secret_version" "coder_db" {
+ secret_id = aws_secretsmanager_secret.coder_db.id
+ secret_string = jsonencode({
+ username = var.master_username
+ password = random_password.coder_master_password.result
+ host = aws_rds_cluster.coder.endpoint
+ reader_host = aws_rds_cluster.coder.reader_endpoint
+ port = aws_rds_cluster.coder.port
+ dbname = aws_rds_cluster.coder.database_name
+ url = "postgres://${var.master_username}:${random_password.coder_master_password.result}@${aws_rds_cluster.coder.endpoint}:${aws_rds_cluster.coder.port}/${aws_rds_cluster.coder.database_name}?sslmode=require"
+ reader_url = "postgres://${var.master_username}:${random_password.coder_master_password.result}@${aws_rds_cluster.coder.reader_endpoint}:${aws_rds_cluster.coder.port}/${aws_rds_cluster.coder.database_name}?sslmode=require"
+ cluster_id = aws_rds_cluster.coder.id
+ engine_version = aws_rds_cluster.coder.engine_version
+ })
+}
+
+# Store LiteLLM DB credentials in Secrets Manager
+resource "aws_secretsmanager_secret" "litellm_db" {
+ name_prefix = "litellm-db-"
+ description = "LiteLLM PostgreSQL database credentials"
+ recovery_window_in_days = 7
+
+ tags = {
+ Name = "litellm-db-secret"
+ }
+}
+
+resource "aws_secretsmanager_secret_version" "litellm_db" {
+ secret_id = aws_secretsmanager_secret.litellm_db.id
+ secret_string = jsonencode({
+ username = var.litellm_username
+ password = random_password.litellm_password.result
+ host = aws_rds_cluster.litellm.endpoint
+ reader_host = aws_rds_cluster.litellm.reader_endpoint
+ port = aws_rds_cluster.litellm.port
+ dbname = aws_rds_cluster.litellm.database_name
+ url = "postgres://${var.litellm_username}:${random_password.litellm_password.result}@${aws_rds_cluster.litellm.endpoint}:${aws_rds_cluster.litellm.port}/${aws_rds_cluster.litellm.database_name}?sslmode=require"
+ cluster_id = aws_rds_cluster.litellm.id
+ engine_version = aws_rds_cluster.litellm.engine_version
+ })
+}
+
+output "coder_cluster_endpoint" {
+ description = "Aurora cluster writer endpoint for Coder"
+ value = aws_rds_cluster.coder.endpoint
+}
+
+output "coder_cluster_reader_endpoint" {
+ description = "Aurora cluster reader endpoint for Coder"
+ value = aws_rds_cluster.coder.reader_endpoint
+}
+
+output "coder_cluster_port" {
+ description = "Aurora cluster port for Coder"
+ value = aws_rds_cluster.coder.port
+}
+
+output "coder_db_secret_arn" {
+ description = "ARN of Secrets Manager secret containing Coder DB credentials"
+ value = aws_secretsmanager_secret.coder_db.arn
+}
+
+output "litellm_cluster_endpoint" {
+ description = "Aurora cluster writer endpoint for LiteLLM"
+ value = aws_rds_cluster.litellm.endpoint
}
-output "rds_username" {
- description = "Database instance root username"
- value = aws_db_instance.db.username
+output "litellm_cluster_reader_endpoint" {
+ description = "Aurora cluster reader endpoint for LiteLLM"
+ value = aws_rds_cluster.litellm.reader_endpoint
}
-output "rds_address" {
- description = "Database instance address"
- value = aws_db_instance.db.address
+output "litellm_cluster_port" {
+ description = "Aurora cluster port for LiteLLM"
+ value = aws_rds_cluster.litellm.port
}
-output "rds_password" {
- description = "Database instance root password"
- value = aws_db_instance.db.password
- sensitive = true
+output "litellm_db_secret_arn" {
+ description = "ARN of Secrets Manager secret containing LiteLLM DB credentials"
+ value = aws_secretsmanager_secret.litellm_db.arn
}
diff --git a/infra/aws/us-east-2/route53/README.md b/infra/aws/us-east-2/route53/README.md
new file mode 100644
index 0000000..e52ef05
--- /dev/null
+++ b/infra/aws/us-east-2/route53/README.md
@@ -0,0 +1,69 @@
+# Route 53 Latency-Based Routing for Coder
+
+This Terraform configuration sets up Route 53 latency-based routing for the Coder deployment in us-east-2.
+
+## Overview
+
+Latency-based routing automatically directs users to the AWS region that provides the lowest latency, improving the user experience by connecting them to the nearest deployment.
+
+## Features
+
+- **Latency-based routing**: Routes users to the closest region automatically
+- **Health checks**: Monitors endpoint health and routes around failures
+- **Wildcard DNS**: Supports workspace application subdomains
+- **Automatic NLB discovery**: Retrieves NLB hostname from Kubernetes service
+
+## Prerequisites
+
+1. Hosted Zone ID for coderdemo.io (already configured: Z080884039133KJPAGA3S)
+2. Running EKS cluster with Coder deployed
+3. Network Load Balancer created via Kubernetes service
+
+## Deployment
+
+1. Create terraform.tfvars from the example:
+
+```bash
+cp terraform.tfvars.example terraform.tfvars
+```
+
+2. Update terraform.tfvars with your cluster name and hosted zone ID (the `hosted_zone_id` variable has no default):
+
+```hcl
+cluster_name   = "your-cluster-name"
+hosted_zone_id = "YOUR_ROUTE53_ZONE_ID"
+```
+
+3. Initialize and apply:
+
+```bash
+terraform init
+terraform plan
+terraform apply
+```
+
+## How It Works
+
+1. The configuration queries the Kubernetes service to get the NLB hostname
+2. Creates Route 53 A records with latency-based routing policy
+3. Sets up health checks to monitor endpoint availability
+4. Configures both main domain and wildcard records
+
+## Health Checks
+
+Health checks monitor the `/api/v2/buildinfo` endpoint on port 443 (HTTPS):
+
+- **Interval**: 30 seconds
+- **Failure threshold**: 3 consecutive failures
+- **Latency measurement**: Enabled for monitoring
+
+## Records Created
+
+- `coderdemo.io` - Main domain with latency routing
+- `*.coderdemo.io` - Wildcard for workspace applications
+
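+You can ask Route 53 directly what it would answer for a client in a given network (the zone ID and client subnet below are placeholders):
+
+```bash
+aws route53 test-dns-answer \
+  --hosted-zone-id YOUR_ZONE_ID \
+  --record-name coderdemo.io \
+  --record-type A \
+  --edns0-client-subnet-ip 203.0.113.0 \
+  --edns0-client-subnet-mask 24
+
+# Or simply resolve the record from your own location
+dig +short coderdemo.io
+```
+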
+## Important Notes
+
+- Deploy this configuration in **both** us-east-2 and us-west-2 with different set_identifiers
+- Each region's configuration points to its local NLB
+- Route 53 automatically routes based on measured latency
+- Health checks ensure failover if one region becomes unhealthy
diff --git a/infra/aws/us-east-2/route53/main.tf b/infra/aws/us-east-2/route53/main.tf
new file mode 100644
index 0000000..3f0e191
--- /dev/null
+++ b/infra/aws/us-east-2/route53/main.tf
@@ -0,0 +1,217 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.0"
+ }
+ kubernetes = {
+ source = "hashicorp/kubernetes"
+ version = ">= 2.0"
+ }
+ }
+}
+
+variable "cluster_region" {
+ description = "AWS region"
+ type = string
+ default = "us-east-2"
+}
+
+variable "cluster_profile" {
+ description = "AWS profile"
+ type = string
+ default = "default"
+}
+
+variable "cluster_name" {
+ description = "EKS cluster name"
+ type = string
+}
+
+variable "domain_name" {
+ description = "Domain name for Coder"
+ type = string
+ default = ""
+}
+
+variable "hosted_zone_id" {
+ description = "Route 53 Hosted Zone ID (provide via tfvars)"
+ type = string
+}
+
+variable "coder_service_name" {
+ description = "Coder service name in Kubernetes"
+ type = string
+ default = "coder"
+}
+
+variable "coder_namespace" {
+ description = "Coder namespace in Kubernetes"
+ type = string
+ default = "coder"
+}
+
+variable "set_identifier" {
+ description = "Unique identifier for this routing policy record"
+ type = string
+ default = "us-east-2"
+}
+
+variable "health_check_enabled" {
+ description = "Enable Route 53 health checks"
+ type = bool
+ default = true
+}
+
+variable "health_check_path" {
+ description = "Path for health checks"
+ type = string
+ default = "/api/v2/buildinfo"
+}
+
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+}
+
+data "aws_eks_cluster" "this" {
+ name = var.cluster_name
+}
+
+data "aws_eks_cluster_auth" "this" {
+ name = var.cluster_name
+}
+
+provider "kubernetes" {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.this.token
+}
+
+# Get the NLB hostname from the Kubernetes service
+data "kubernetes_service" "coder" {
+ metadata {
+ name = var.coder_service_name
+ namespace = var.coder_namespace
+ }
+}
+
+# Extract the NLB details
+locals {
+ nlb_hostname = try(data.kubernetes_service.coder.status[0].load_balancer[0].ingress[0].hostname, "")
+}
+
+# Get NLB by tags (AWS Load Balancer Controller tags the NLB)
+data "aws_lb" "coder_nlb" {
+ tags = {
+ "service.k8s.aws/stack" = "${var.coder_namespace}/${var.coder_service_name}"
+ }
+}
+
+# Health check for the NLB endpoint
+resource "aws_route53_health_check" "coder" {
+ count = var.health_check_enabled ? 1 : 0
+ type = "HTTPS"
+ resource_path = var.health_check_path
+ fqdn = var.domain_name
+ port = 443
+ request_interval = 30
+ failure_threshold = 3
+ measure_latency = true
+
+ tags = {
+ Name = "coder-${var.set_identifier}"
+ Region = var.cluster_region
+ Environment = "production"
+ ManagedBy = "terraform"
+ }
+}
+
+# Latency-based routing record for the main domain
+resource "aws_route53_record" "coder_latency" {
+ zone_id = var.hosted_zone_id
+ name = var.domain_name
+ type = "A"
+ set_identifier = var.set_identifier
+ allow_overwrite = true
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+
+ latency_routing_policy {
+ region = var.cluster_region
+ }
+
+ health_check_id = var.health_check_enabled ? aws_route53_health_check.coder[0].id : null
+}
+
+# Latency-based routing record for wildcard subdomains
+resource "aws_route53_record" "coder_wildcard_latency" {
+ zone_id = var.hosted_zone_id
+ name = "*.${var.domain_name}"
+ type = "A"
+ set_identifier = var.set_identifier
+ allow_overwrite = true
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+
+ latency_routing_policy {
+ region = var.cluster_region
+ }
+
+ health_check_id = var.health_check_enabled ? aws_route53_health_check.coder[0].id : null
+}
+
+# Region-specific subdomain for manual region selection
+resource "aws_route53_record" "coder_region_specific" {
+ zone_id = var.hosted_zone_id
+ name = "${var.set_identifier}.${var.domain_name}"
+ type = "A"
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+}
+
+# Wildcard for region-specific subdomain (for workspace apps)
+resource "aws_route53_record" "coder_region_specific_wildcard" {
+ zone_id = var.hosted_zone_id
+ name = "*.${var.set_identifier}.${var.domain_name}"
+ type = "A"
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+}
+
+# Outputs
+output "nlb_hostname" {
+ description = "Network Load Balancer hostname"
+ value = local.nlb_hostname
+}
+
+output "nlb_zone_id" {
+ description = "Network Load Balancer Route 53 zone ID"
+ value = data.aws_lb.coder_nlb.zone_id
+}
+
+output "health_check_id" {
+ description = "Route 53 health check ID"
+ value = var.health_check_enabled ? aws_route53_health_check.coder[0].id : null
+}
+
+output "route53_record_fqdn" {
+ description = "Fully qualified domain name of the Route 53 record"
+ value = aws_route53_record.coder_latency.fqdn
+}
diff --git a/infra/aws/us-east-2/terraform-backend/main.tf b/infra/aws/us-east-2/terraform-backend/main.tf
new file mode 100644
index 0000000..5be0f2d
--- /dev/null
+++ b/infra/aws/us-east-2/terraform-backend/main.tf
@@ -0,0 +1,144 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.0"
+ }
+ }
+}
+
+variable "region" {
+ description = "AWS region for backend resources"
+ type = string
+ default = "us-east-2"
+}
+
+variable "profile" {
+ description = "AWS profile"
+ type = string
+ default = "noah@coder.com"
+}
+
+variable "project_name" {
+ description = "Project name for resource naming"
+ type = string
+ default = "coder-demo"
+}
+
+provider "aws" {
+ region = var.region
+ profile = var.profile
+}
+
+# S3 bucket for Terraform state
+resource "aws_s3_bucket" "terraform_state" {
+ bucket = "${var.project_name}-terraform-state-${data.aws_caller_identity.current.account_id}"
+
+ tags = {
+ Name = "Terraform State Bucket"
+ Environment = "production-demo"
+ ManagedBy = "terraform"
+ Purpose = "terraform-backend"
+ }
+}
+
+# Enable versioning for state file history
+resource "aws_s3_bucket_versioning" "terraform_state" {
+ bucket = aws_s3_bucket.terraform_state.id
+
+ versioning_configuration {
+ status = "Enabled"
+ }
+}
+
+# Enable server-side encryption
+resource "aws_s3_bucket_server_side_encryption_configuration" "terraform_state" {
+ bucket = aws_s3_bucket.terraform_state.id
+
+ rule {
+ apply_server_side_encryption_by_default {
+ sse_algorithm = "AES256"
+ }
+ }
+}
+
+# Block public access
+resource "aws_s3_bucket_public_access_block" "terraform_state" {
+ bucket = aws_s3_bucket.terraform_state.id
+
+ block_public_acls = true
+ block_public_policy = true
+ ignore_public_acls = true
+ restrict_public_buckets = true
+}
+
+# Lifecycle policy to delete old state versions after 90 days
+resource "aws_s3_bucket_lifecycle_configuration" "terraform_state" {
+ bucket = aws_s3_bucket.terraform_state.id
+
+ rule {
+ id = "delete-old-versions"
+ status = "Enabled"
+
+ noncurrent_version_expiration {
+ noncurrent_days = 90
+ }
+ }
+
+ rule {
+ id = "abort-incomplete-uploads"
+ status = "Enabled"
+
+ abort_incomplete_multipart_upload {
+ days_after_initiation = 7
+ }
+ }
+}
+
+# DynamoDB table for state locking
+resource "aws_dynamodb_table" "terraform_locks" {
+ name = "${var.project_name}-terraform-locks"
+ billing_mode = "PAY_PER_REQUEST"
+ hash_key = "LockID"
+
+ attribute {
+ name = "LockID"
+ type = "S"
+ }
+
+ tags = {
+ Name = "Terraform State Lock Table"
+ Environment = "production-demo"
+ ManagedBy = "terraform"
+ Purpose = "terraform-backend"
+ }
+}
+
+# Get current AWS account ID
+data "aws_caller_identity" "current" {}
+
+# Outputs
+output "state_bucket_name" {
+ description = "S3 bucket name for Terraform state"
+ value = aws_s3_bucket.terraform_state.id
+}
+
+output "state_bucket_arn" {
+ description = "S3 bucket ARN"
+ value = aws_s3_bucket.terraform_state.arn
+}
+
+output "dynamodb_table_name" {
+ description = "DynamoDB table name for state locking"
+ value = aws_dynamodb_table.terraform_locks.id
+}
+
+output "backend_config" {
+ description = "Backend configuration to use in other modules"
+ value = {
+ bucket = aws_s3_bucket.terraform_state.id
+ region = var.region
+ dynamodb_table = aws_dynamodb_table.terraform_locks.id
+ encrypt = true
+ }
+}
diff --git a/infra/aws/us-east-2/terraform-backend/terraform.tfvars.example b/infra/aws/us-east-2/terraform-backend/terraform.tfvars.example
new file mode 100644
index 0000000..f62ce73
--- /dev/null
+++ b/infra/aws/us-east-2/terraform-backend/terraform.tfvars.example
@@ -0,0 +1,6 @@
+# Backend configuration for Coder demo environment
+# Copy this to terraform.tfvars and fill in your values
+
+region = "us-east-2"
+profile = "YOUR_AWS_PROFILE"
+project_name = "YOUR_PROJECT_NAME"
diff --git a/infra/aws/us-east-2/vpc-peering/main.tf b/infra/aws/us-east-2/vpc-peering/main.tf
new file mode 100644
index 0000000..ebfe054
--- /dev/null
+++ b/infra/aws/us-east-2/vpc-peering/main.tf
@@ -0,0 +1,164 @@
+terraform {
+ required_version = ">= 1.0"
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.100.0"
+ }
+ }
+ backend "s3" {}
+}
+
+variable "profile" {
+ type = string
+ default = "default"
+}
+
+variable "requester_vpc_id" {
+ description = "VPC ID in us-east-2 (requester)"
+ type = string
+}
+
+variable "accepter_vpc_id" {
+ description = "VPC ID in us-west-2 (accepter)"
+ type = string
+}
+
+variable "requester_vpc_cidr" {
+ description = "CIDR block for us-east-2 VPC"
+ type = string
+ default = "10.0.0.0/16"
+}
+
+variable "accepter_vpc_cidr" {
+ description = "CIDR block for us-west-2 VPC"
+ type = string
+ default = "10.1.0.0/16"
+}
+
+variable "requester_node_security_group_id" {
+ description = "Security group ID for EKS nodes in us-east-2"
+ type = string
+}
+
+variable "accepter_node_security_group_id" {
+ description = "Security group ID for EKS nodes in us-west-2"
+ type = string
+}
+
+# Provider for us-east-2 (requester)
+provider "aws" {
+ alias = "use2"
+ region = "us-east-2"
+ profile = var.profile
+}
+
+# Provider for us-west-2 (accepter)
+provider "aws" {
+ alias = "usw2"
+ region = "us-west-2"
+ profile = var.profile
+}
+
+# Create VPC peering connection from us-east-2
+resource "aws_vpc_peering_connection" "use2_to_usw2" {
+ provider = aws.use2
+
+ vpc_id = var.requester_vpc_id
+ peer_vpc_id = var.accepter_vpc_id
+ peer_region = "us-west-2"
+ auto_accept = false
+
+ tags = {
+ Name = "coderdemo-use2-usw2-peering"
+ ManagedBy = "terraform"
+ Side = "Requester"
+ }
+}
+
+# Accept the peering connection in us-west-2
+resource "aws_vpc_peering_connection_accepter" "usw2_accepter" {
+ provider = aws.usw2
+
+ vpc_peering_connection_id = aws_vpc_peering_connection.use2_to_usw2.id
+ auto_accept = true
+
+ tags = {
+ Name = "coderdemo-use2-usw2-peering"
+ ManagedBy = "terraform"
+ Side = "Accepter"
+ }
+}
+
+# Get route tables in us-east-2
+data "aws_route_tables" "use2" {
+ provider = aws.use2
+ vpc_id = var.requester_vpc_id
+}
+
+# Get route tables in us-west-2
+data "aws_route_tables" "usw2" {
+ provider = aws.usw2
+ vpc_id = var.accepter_vpc_id
+}
+
+# Add routes in us-east-2 route tables to us-west-2 CIDR
+resource "aws_route" "use2_to_usw2" {
+ provider = aws.use2
+ for_each = toset(data.aws_route_tables.use2.ids)
+
+ route_table_id = each.value
+ destination_cidr_block = var.accepter_vpc_cidr
+ vpc_peering_connection_id = aws_vpc_peering_connection.use2_to_usw2.id
+
+ depends_on = [aws_vpc_peering_connection_accepter.usw2_accepter]
+}
+
+# Add routes in us-west-2 route tables to us-east-2 CIDR
+resource "aws_route" "usw2_to_use2" {
+ provider = aws.usw2
+ for_each = toset(data.aws_route_tables.usw2.ids)
+
+ route_table_id = each.value
+ destination_cidr_block = var.requester_vpc_cidr
+ vpc_peering_connection_id = aws_vpc_peering_connection.use2_to_usw2.id
+
+ depends_on = [aws_vpc_peering_connection_accepter.usw2_accepter]
+}
+
+# Security group rule to allow Coder replica communication from us-west-2 to us-east-2
+resource "aws_security_group_rule" "use2_allow_coder_from_usw2" {
+ provider = aws.use2
+
+ type = "ingress"
+ from_port = 8080
+ to_port = 8080
+ protocol = "tcp"
+ cidr_blocks = [var.accepter_vpc_cidr]
+ security_group_id = var.requester_node_security_group_id
+ description = "Allow Coder replica communication from us-west-2"
+}
+
+# Security group rule to allow Coder replica communication from us-east-2 to us-west-2
+resource "aws_security_group_rule" "usw2_allow_coder_from_use2" {
+ provider = aws.usw2
+
+ type = "ingress"
+ from_port = 8080
+ to_port = 8080
+ protocol = "tcp"
+ cidr_blocks = [var.requester_vpc_cidr]
+ security_group_id = var.accepter_node_security_group_id
+ description = "Allow Coder replica communication from us-east-2"
+}
+
+# Outputs
+output "peering_connection_id" {
+ description = "VPC Peering Connection ID"
+ value = aws_vpc_peering_connection.use2_to_usw2.id
+}
+
+output "peering_status" {
+ description = "VPC Peering Connection Status"
+ value = aws_vpc_peering_connection.use2_to_usw2.accept_status
+}
diff --git a/infra/aws/us-west-2/acm/main.tf b/infra/aws/us-west-2/acm/main.tf
new file mode 100644
index 0000000..89122ca
--- /dev/null
+++ b/infra/aws/us-west-2/acm/main.tf
@@ -0,0 +1,108 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.0"
+ }
+ }
+}
+
+variable "cluster_region" {
+ description = "AWS region for ACM certificate"
+ type = string
+ default = "us-west-2"
+}
+
+variable "cluster_profile" {
+ description = "AWS profile"
+ type = string
+ default = "default"
+}
+
+variable "domain_name" {
+ description = "Domain name for Coder"
+ type = string
+ default = "coderdemo.io"
+}
+
+variable "hosted_zone_id" {
+ description = "Route 53 Hosted Zone ID"
+ type = string
+}
+
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+ alias = "acm"
+}
+
+# Provider for Route 53 (may be in different account)
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+ alias = "route53"
+}
+
+# ACM Certificate for Coder with wildcard
+resource "aws_acm_certificate" "coder" {
+ provider = aws.acm
+ domain_name = var.domain_name
+ validation_method = "DNS"
+
+ subject_alternative_names = [
+ "*.${var.domain_name}"
+ ]
+
+ lifecycle {
+ create_before_destroy = true
+ }
+
+ tags = {
+ Name = "coder-certificate"
+ Environment = "production"
+ ManagedBy = "terraform"
+ Region = "us-west-2"
+ }
+}
+
+# Route 53 validation records
+resource "aws_route53_record" "cert_validation" {
+ provider = aws.route53
+ for_each = {
+ for dvo in aws_acm_certificate.coder.domain_validation_options : dvo.domain_name => {
+ name = dvo.resource_record_name
+ record = dvo.resource_record_value
+ type = dvo.resource_record_type
+ }
+ }
+
+ allow_overwrite = true
+ name = each.value.name
+ records = [each.value.record]
+ ttl = 60
+ type = each.value.type
+ zone_id = var.hosted_zone_id
+}
+
+# Wait for certificate validation
+resource "aws_acm_certificate_validation" "coder" {
+ provider = aws.acm
+ certificate_arn = aws_acm_certificate.coder.arn
+ validation_record_fqdns = [for record in aws_route53_record.cert_validation : record.fqdn]
+}
+
+# Outputs
+output "certificate_arn" {
+ description = "ARN of the validated ACM certificate"
+ value = aws_acm_certificate_validation.coder.certificate_arn
+}
+
+output "domain_name" {
+ description = "Domain name for Coder"
+ value = var.domain_name
+}
+
+output "validation_status" {
+ description = "Certificate validation status"
+ value = "Certificate validated and ready to use"
+}
diff --git a/infra/aws/us-west-2/eks/main.tf b/infra/aws/us-west-2/eks/main.tf
index 2bffa33..3140818 100644
--- a/infra/aws/us-west-2/eks/main.tf
+++ b/infra/aws/us-west-2/eks/main.tf
@@ -30,10 +30,16 @@ variable "cluster_version" {
variable "cluster_instance_type" {
description = "EKS Instance Size/Type"
- default = "t3.xlarge"
+ default = "t4g.medium" # ARM Graviton for cost optimization
type = string
}
+variable "allowed_cidrs" {
+ description = "CIDR blocks allowed to access EKS API endpoint"
+ type = list(string)
+ default = ["0.0.0.0/0"] # Open by default, restrict in tfvars
+}
+
provider "aws" {
region = var.region
profile = var.profile
@@ -73,16 +79,16 @@ module "eks-network" {
source = "../../../../modules/network/eks-vpc"
name = var.name
- vpc_cidr_block = "10.0.0.0/16"
+ vpc_cidr_block = "10.1.0.0/16"
public_subnets = {
"system0" = {
- cidr_block = "10.0.10.0/24"
+ cidr_block = "10.1.10.0/24"
availability_zone = "${data.aws_region.this.name}a"
map_public_ip_on_launch = true
private_dns_hostname_type_on_launch = "ip-name"
}
"system1" = {
- cidr_block = "10.0.11.0/24"
+ cidr_block = "10.1.11.0/24"
availability_zone = "${data.aws_region.this.name}b"
map_public_ip_on_launch = true
private_dns_hostname_type_on_launch = "ip-name"
@@ -90,26 +96,26 @@ module "eks-network" {
}
private_subnets = {
"system0" = {
- cidr_block = "10.0.20.0/24"
+ cidr_block = "10.1.20.0/24"
availability_zone = "${data.aws_region.this.name}a"
private_dns_hostname_type_on_launch = "ip-name"
tags = local.system_subnet_tags
}
"system1" = {
- cidr_block = "10.0.21.0/24"
+ cidr_block = "10.1.21.0/24"
availability_zone = "${data.aws_region.this.name}b"
private_dns_hostname_type_on_launch = "ip-name"
tags = local.system_subnet_tags
}
"provisioner" = {
- cidr_block = "10.0.22.0/24"
+ cidr_block = "10.1.22.0/24"
availability_zone = "${data.aws_region.this.name}a"
map_public_ip_on_launch = true
private_dns_hostname_type_on_launch = "ip-name"
tags = local.provisioner_subnet_tags
}
"ws-all" = {
- cidr_block = "10.0.16.0/22"
+ cidr_block = "10.1.16.0/22"
availability_zone = "${data.aws_region.this.name}b"
map_public_ip_on_launch = true
private_dns_hostname_type_on_launch = "ip-name"
@@ -144,10 +150,11 @@ module "cluster" {
module.eks-network.intra_subnet_ids
))
- cluster_name = var.name
- cluster_version = var.cluster_version
- cluster_endpoint_public_access = true
- cluster_endpoint_private_access = true
+ cluster_name = var.name
+ cluster_version = var.cluster_version
+ cluster_endpoint_public_access = true
+ cluster_endpoint_private_access = true
+ cluster_endpoint_public_access_cidrs = var.allowed_cidrs
create_cluster_security_group = true
create_node_security_group = true
@@ -179,11 +186,12 @@ module "cluster" {
system = {
min_size = 0
max_size = 10
- desired_size = 0 # Cant be modified after creation. Override from AWS Console
+ desired_size = 1 # Scale to 1 node for cluster functionality
labels = local.cluster_asg_node_labels
instance_types = [var.cluster_instance_type]
capacity_type = "ON_DEMAND"
+ ami_type = "AL2023_ARM_64_STANDARD" # ARM AMI for Graviton instances
iam_role_additional_policies = {
AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
STSAssumeRole = aws_iam_policy.sts.arn
diff --git a/infra/aws/us-west-2/k8s/cert-manager/main.tf b/infra/aws/us-west-2/k8s/cert-manager/main.tf
index c2869b5..8a423e6 100644
--- a/infra/aws/us-west-2/k8s/cert-manager/main.tf
+++ b/infra/aws/us-west-2/k8s/cert-manager/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -41,11 +41,6 @@ variable "addon_version" {
default = "1.13.3"
}
-variable "cloudflare_api_token" {
- type = string
- sensitive = true
-}
-
provider "aws" {
region = var.cluster_region
profile = var.cluster_profile
@@ -60,7 +55,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -78,7 +73,6 @@ module "cert-manager" {
cluster_name = var.cluster_name
cluster_oidc_provider_arn = var.cluster_oidc_provider_arn
- namespace = var.addon_namespace
- helm_version = var.addon_version
- cloudflare_token_secret = var.cloudflare_api_token
+ namespace = var.addon_namespace
+ helm_version = var.addon_version
}
\ No newline at end of file
diff --git a/infra/aws/us-west-2/k8s/coder-proxy/main.tf b/infra/aws/us-west-2/k8s/coder-proxy/main.tf
index fc46036..06b5c6b 100644
--- a/infra/aws/us-west-2/k8s/coder-proxy/main.tf
+++ b/infra/aws/us-west-2/k8s/coder-proxy/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -101,11 +101,6 @@ variable "kubernetes_create_ssl_secret" {
default = true
}
-variable "cloudflare_api_token" {
- type = string
- sensitive = true
-}
-
provider "aws" {
region = var.cluster_region
profile = var.cluster_profile
@@ -120,7 +115,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -161,7 +156,6 @@ module "coder-proxy" {
proxy_token_config = {
name = "coder-proxy"
}
- cloudflare_api_token = var.cloudflare_api_token
ssl_cert_config = {
name = var.kubernetes_ssl_secret_name
create_secret = var.kubernetes_create_ssl_secret
@@ -208,9 +202,4 @@ module "coder-proxy" {
topology_key = "kubernetes.io/hostname"
}
}]
-}
-
-import {
- id = "coder-proxy"
- to = module.coder-proxy.kubernetes_namespace.this
}
\ No newline at end of file
diff --git a/infra/aws/us-west-2/k8s/coder-server/main.tf b/infra/aws/us-west-2/k8s/coder-server/main.tf
new file mode 100644
index 0000000..c66b01f
--- /dev/null
+++ b/infra/aws/us-west-2/k8s/coder-server/main.tf
@@ -0,0 +1,318 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ }
+ helm = {
+ source = "hashicorp/helm"
+ version = "3.1.1"
+ }
+ kubernetes = {
+ source = "hashicorp/kubernetes"
+ }
+ coderd = {
+ source = "coder/coderd"
+ }
+ acme = {
+ source = "vancluever/acme"
+ }
+ tls = {
+ source = "hashicorp/tls"
+ }
+ }
+ backend "s3" {}
+}
+
+variable "cluster_name" {
+ type = string
+}
+
+variable "cluster_region" {
+ type = string
+}
+
+variable "cluster_profile" {
+ type = string
+ default = "default"
+}
+
+variable "cluster_oidc_provider_arn" {
+ type = string
+}
+
+variable "acme_server_url" {
+ type = string
+ default = "https://acme-v02.api.letsencrypt.org/directory"
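+  # For testing, the Let's Encrypt staging directory avoids production rate limits:
+  #   https://acme-staging-v02.api.letsencrypt.org/directory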
+}
+
+variable "acme_registration_email" {
+ type = string
+}
+
+variable "addon_version" {
+ type = string
+ default = "2.25.1"
+}
+
+variable "coder_access_url" {
+ type = string
+}
+
+variable "coder_wildcard_access_url" {
+ type = string
+}
+
+variable "coder_experiments" {
+ type = list(string)
+ default = []
+}
+
+variable "coder_github_allowed_orgs" {
+ type = list(string)
+ default = []
+}
+
+variable "coder_builtin_provisioner_count" {
+ type = number
+ default = 0
+}
+
+variable "coder_github_external_auth_secret_client_secret" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_github_external_auth_secret_client_id" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_oauth_secret_client_secret" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_oauth_secret_client_id" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_oidc_secret_client_secret" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_oidc_secret_client_id" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_oidc_secret_issuer_url" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_db_secret_url" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_token" {
+ type = string
+ sensitive = true
+}
+
+variable "image_repo" {
+ type = string
+ sensitive = true
+}
+
+variable "image_tag" {
+ type = string
+ default = "latest"
+}
+
+variable "kubernetes_ssl_secret_name" {
+ type = string
+}
+
+variable "kubernetes_create_ssl_secret" {
+ type = bool
+ default = true
+}
+
+variable "oidc_sign_in_text" {
+ type = string
+}
+
+variable "oidc_icon_url" {
+ type = string
+}
+
+variable "oidc_scopes" {
+ type = list(string)
+}
+
+variable "oidc_email_domain" {
+ type = string
+}
+
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+}
+
+data "aws_eks_cluster" "this" {
+ name = var.cluster_name
+}
+
+data "aws_eks_cluster_auth" "this" {
+ name = var.cluster_name
+}
+
+provider "helm" {
+ kubernetes = {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.this.token
+ }
+}
+
+provider "kubernetes" {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.this.token
+}
+
+provider "coderd" {
+ url = var.coder_access_url
+ token = var.coder_token
+}
+
+provider "acme" {
+ server_url = var.acme_server_url
+}
+
+# Fetch ACM certificate dynamically by domain to avoid hardcoding sensitive ARNs
+data "aws_acm_certificate" "coder" {
+ domain = trimsuffix(trimprefix(var.coder_access_url, "https://"), "/")
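+  # e.g. a hypothetical access URL "https://coder.example.com/" becomes the domain "coder.example.com"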
+ statuses = ["ISSUED"]
+ most_recent = true
+}
+
+module "coder-server" {
+ source = "../../../../../modules/k8s/bootstrap/coder-server"
+
+ cluster_name = var.cluster_name
+ cluster_oidc_provider_arn = var.cluster_oidc_provider_arn
+
+ namespace = "coder"
+ acme_registration_email = var.acme_registration_email
+ acme_days_until_renewal = 90
+ replica_count = 1 # HA requires Enterprise license
+ helm_version = var.addon_version
+ image_repo = var.image_repo
+ image_tag = var.image_tag
+ primary_access_url = var.coder_access_url
+ wildcard_access_url = var.coder_wildcard_access_url
+ coder_experiments = var.coder_experiments
+ coder_builtin_provisioner_count = var.coder_builtin_provisioner_count
+ coder_github_allowed_orgs = var.coder_github_allowed_orgs
+ ssl_cert_config = {
+ name = var.kubernetes_ssl_secret_name
+ create_secret = var.kubernetes_create_ssl_secret
+ }
+ oidc_config = {
+ sign_in_text = var.oidc_sign_in_text
+ icon_url = var.oidc_icon_url
+ scopes = var.oidc_scopes
+ email_domain = var.oidc_email_domain
+ }
+ db_secret_url = var.coder_db_secret_url
+ oidc_secret_issuer_url = var.coder_oidc_secret_issuer_url
+ oidc_secret_client_id = var.coder_oidc_secret_client_id
+ oidc_secret_client_secret = var.coder_oidc_secret_client_secret
+ oauth_secret_client_id = var.coder_oauth_secret_client_id
+ oauth_secret_client_secret = var.coder_oauth_secret_client_secret
+ github_external_auth_secret_client_id = var.coder_github_external_auth_secret_client_id
+ github_external_auth_secret_client_secret = var.coder_github_external_auth_secret_client_secret
+ tags = {}
+ env_vars = {
+    # Disable the redirect since the NLB terminates TLS and forwards plain HTTP to the backend.
+    # Without this, Coder sees HTTP and redirects to HTTPS, causing an infinite redirect loop.
+ CODER_REDIRECT_TO_ACCESS_URL = "false"
+ # Disable TLS on Coder itself since NLB terminates TLS
+ CODER_TLS_ENABLE = "false"
+ # Mark auth cookies as secure since users access via HTTPS
+ CODER_SECURE_AUTH_COOKIE = "true"
+ # Enable DERP server for multi-region replica communication
+ CODER_DERP_SERVER_ENABLE = "true"
+ }
+ service_annotations = {
+ "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type" = "instance"
+ "service.beta.kubernetes.io/aws-load-balancer-scheme" = "internet-facing"
+ "service.beta.kubernetes.io/aws-load-balancer-attributes" = "deletion_protection.enabled=false,load_balancing.cross_zone.enabled=true"
+ "service.beta.kubernetes.io/aws-load-balancer-ssl-cert" = data.aws_acm_certificate.coder.arn
+ "service.beta.kubernetes.io/aws-load-balancer-ssl-ports" = "443"
+ "service.beta.kubernetes.io/aws-load-balancer-backend-protocol" = "tcp"
+    # Subnets are auto-detected by the AWS Load Balancer Controller via the kubernetes.io/role/elb=1 tag
+ }
+ node_selector = {
+ "node.coder.io/managed-by" = "karpenter"
+ "node.coder.io/used-for" = "coder-server"
+ }
+ tolerations = [{
+ key = "dedicated"
+ operator = "Equal"
+ value = "coder-server"
+ effect = "NoSchedule"
+ }]
+ topology_spread_constraints = [{
+ max_skew = 1
+ topology_key = "kubernetes.io/hostname"
+ when_unsatisfiable = "ScheduleAnyway"
+ label_selector = {
+ match_labels = {
+ "app.kubernetes.io/name" = "coder"
+ "app.kubernetes.io/part-of" = "coder"
+ }
+ }
+ match_label_keys = [
+ "app.kubernetes.io/instance"
+ ]
+ }]
+ pod_anti_affinity_preferred_during_scheduling_ignored_during_execution = [{
+ weight = 100
+ pod_affinity_term = {
+ label_selector = {
+ match_labels = {
+ "app.kubernetes.io/instance" = "coder-v2"
+ "app.kubernetes.io/name" = "coder"
+ "app.kubernetes.io/part-of" = "coder"
+ }
+ }
+ topology_key = "kubernetes.io/hostname"
+ }
+ }]
+}
+
+# Fix service HTTPS port to forward to HTTP backend (port 8080)
+# since Coder has TLS disabled and only listens on HTTP
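+# Assumes kubectl on the machine running terraform apply is already authenticated against this cluster.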
+resource "null_resource" "patch_coder_service" {
+ depends_on = [module.coder-server]
+
+ triggers = {
+    # timestamp() changes on every run, so this patch re-applies on each terraform apply
+ always_run = timestamp()
+ }
+
+ provisioner "local-exec" {
+ command = <<-EOT
+ sleep 10
+ kubectl patch svc coder -n coder --type='json' \
+ -p='[{"op": "replace", "path": "/spec/ports/1/targetPort", "value": "http"}]' \
+ 2>/dev/null || true
+ EOT
+ }
+}
diff --git a/infra/aws/us-west-2/k8s/coder-ws/main.tf b/infra/aws/us-west-2/k8s/coder-ws/main.tf
index 451a056..6c9140b 100644
--- a/infra/aws/us-west-2/k8s/coder-ws/main.tf
+++ b/infra/aws/us-west-2/k8s/coder-ws/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -98,7 +98,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-west-2/k8s/ebs-controller/main.tf b/infra/aws/us-west-2/k8s/ebs-controller/main.tf
index d7f1f56..5194ec7 100644
--- a/infra/aws/us-west-2/k8s/ebs-controller/main.tf
+++ b/infra/aws/us-west-2/k8s/ebs-controller/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -55,7 +55,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-west-2/k8s/karpenter/main.tf b/infra/aws/us-west-2/k8s/karpenter/main.tf
index f5b34f8..2e9426a 100644
--- a/infra/aws/us-west-2/k8s/karpenter/main.tf
+++ b/infra/aws/us-west-2/k8s/karpenter/main.tf
@@ -5,11 +5,14 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
}
+ null = {
+ source = "hashicorp/null"
+ }
}
backend "s3" {}
}
@@ -40,6 +43,16 @@ variable "addon_namespace" {
default = "default"
}
+variable "karpenter_queue_name" {
+ type = string
+ default = ""
+}
+
+variable "karpenter_queue_rule_name" {
+ type = string
+ default = ""
+}
+
provider "aws" {
region = var.cluster_region
profile = var.cluster_profile
@@ -54,7 +67,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -101,6 +114,24 @@ locals {
locals {
nodepool_configs = [{
+ name = "coder-server"
+ node_labels = merge(local.global_node_labels, {
+ "node.coder.io/name" = "coder"
+ "node.coder.io/part-of" = "coder"
+ "node.coder.io/used-for" = "coder-server"
+ })
+ node_taints = [{
+ key = "dedicated"
+ value = "coder-server"
+ effect = "NoSchedule"
+ }]
+ node_requirements = concat(local.global_node_reqs, [{
+ key = "node.kubernetes.io/instance-type"
+ operator = "In"
+ values = ["t3.xlarge", "t3a.xlarge", "t3.2xlarge", "t3a.2xlarge"]
+ }])
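+    # Reuses the coder-proxy EC2NodeClass below; no dedicated coder-server node class is defined.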
+ node_class_ref_name = "coder-proxy-class"
+ }, {
name = "coder-proxy"
node_labels = merge(local.global_node_labels, {
"node.coder.io/name" = "coder"
@@ -115,7 +146,7 @@ locals {
node_requirements = concat(local.global_node_reqs, [{
key = "node.kubernetes.io/instance-type"
operator = "In"
- values = ["m5a.xlarge", "m6a.xlarge"]
+ values = ["m5a.xlarge", "m6a.xlarge", "t3.xlarge", "t3a.xlarge"]
}])
node_class_ref_name = "coder-proxy-class"
}, {
@@ -133,7 +164,7 @@ locals {
node_requirements = concat(local.global_node_reqs, [{
key = "node.kubernetes.io/instance-type"
operator = "In"
- values = ["m5a.4xlarge", "m6a.4xlarge"]
+ values = ["m5a.4xlarge", "m6a.4xlarge", "m5a.2xlarge", "m6a.2xlarge"]
}])
node_class_ref_name = "coder-provisioner-class"
}, {
@@ -151,7 +182,15 @@ locals {
node_requirements = concat(local.global_node_reqs, [{
key = "node.kubernetes.io/instance-type"
operator = "In"
- values = ["c6a.32xlarge", "c5a.32xlarge"]
+ values = [
+ # Small demos (5-10 users) - Most cost-effective
+ "c6a.4xlarge", "c5a.4xlarge", # 16 vCPU / 32 GB - ~$0.18/hr spot
+ "c6a.8xlarge", "c5a.8xlarge", # 32 vCPU / 64 GB - ~$0.37/hr spot
+ # Medium demos (10-20 users)
+ "c6a.16xlarge", "c5a.16xlarge", # 64 vCPU / 128 GB - ~$0.74/hr spot
+ # Large demos (20-40 users)
+ "c6a.32xlarge", "c5a.32xlarge" # 128 vCPU / 256 GB - ~$1.47/hr spot
+ ]
}])
node_class_ref_name = "coder-ws-class"
disruption_consolidate_after = "30m"
@@ -168,6 +207,9 @@ module "karpenter-addon" {
node_selector = {
"node.amazonaws.io/managed-by" : "asg"
}
+
+ karpenter_queue_name = var.karpenter_queue_name
+ karpenter_queue_rule_name = var.karpenter_queue_rule_name
ec2nodeclass_configs = [{
name = "coder-proxy-class"
subnet_selector_tags = local.provisioner_subnet_tags
@@ -181,13 +223,13 @@ module "karpenter-addon" {
block_device_mappings = [{
device_name = "/dev/xvda"
ebs = {
- volume_size = "1400Gi"
+        volume_size = "500Gi" # Kubernetes-style Gi unit, per the module's documented convention
volume_type = "gp3"
}
}, {
device_name = "/dev/xvdb"
ebs = {
- volume_size = "50Gi"
+        volume_size = "50Gi"
volume_type = "gp3"
}
}]
@@ -196,4 +238,31 @@ module "karpenter-addon" {
subnet_selector_tags = local.provisioner_subnet_tags
sg_selector_tags = local.provisioner_sg_tags
}]
+}
+
+# Create NodePools for each configuration
+module "nodepools" {
+ for_each = { for np in local.nodepool_configs : np.name => np }
+ source = "../../../../../modules/k8s/objects/nodepool"
+
+ name = each.value.name
+ node_labels = each.value.node_labels
+ node_taints = each.value.node_taints
+ node_requirements = each.value.node_requirements
+ node_class_ref_name = each.value.node_class_ref_name
+ disruption_consolidate_after = lookup(each.value, "disruption_consolidate_after", "1m")
+ disruption_consolidation_policy = lookup(each.value, "disruption_consolidation_policy", "WhenEmpty")
+
+ depends_on = [module.karpenter-addon]
+}
+
+# Apply the NodePool manifests
+resource "null_resource" "apply_nodepools" {
+ for_each = module.nodepools
+
+ provisioner "local-exec" {
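+    # NOTE: the single-quote wrapping assumes the rendered manifest contains no single quotes,
+    # and kubectl must already target this cluster.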
+ command = "echo '${each.value.manifest}' | kubectl apply -f -"
+ }
+
+ depends_on = [module.karpenter-addon]
}
\ No newline at end of file
diff --git a/infra/aws/us-west-2/k8s/lb-controller/main.tf b/infra/aws/us-west-2/k8s/lb-controller/main.tf
index 1f6a0fa..63d0c6b 100644
--- a/infra/aws/us-west-2/k8s/lb-controller/main.tf
+++ b/infra/aws/us-west-2/k8s/lb-controller/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -60,13 +60,19 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
}
}
+provider "kubernetes" {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.this.token
+}
+
module "lb-controller" {
source = "../../../../../modules/k8s/bootstrap/lb-controller"
cluster_name = data.aws_eks_cluster.this.name
diff --git a/infra/aws/us-west-2/k8s/metrics-server/main.tf b/infra/aws/us-west-2/k8s/metrics-server/main.tf
index d808c74..cce9447 100644
--- a/infra/aws/us-west-2/k8s/metrics-server/main.tf
+++ b/infra/aws/us-west-2/k8s/metrics-server/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
}
backend "s3" {}
@@ -48,7 +48,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-west-2/k8s/nodepools/main.tf b/infra/aws/us-west-2/k8s/nodepools/main.tf
new file mode 100644
index 0000000..74d63c5
--- /dev/null
+++ b/infra/aws/us-west-2/k8s/nodepools/main.tf
@@ -0,0 +1,356 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.0"
+ }
+ kubernetes = {
+ source = "hashicorp/kubernetes"
+ version = ">= 2.20"
+ }
+ }
+}
+
+variable "cluster_name" {
+ description = "EKS cluster name"
+ type = string
+}
+
+variable "cluster_region" {
+ description = "AWS region"
+ type = string
+}
+
+variable "cluster_profile" {
+ description = "AWS profile"
+ type = string
+ default = "default"
+}
+
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+}
+
+data "aws_eks_cluster" "this" {
+ name = var.cluster_name
+}
+
+data "aws_eks_cluster_auth" "this" {
+ name = var.cluster_name
+}
+
+provider "kubernetes" {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.this.token
+}
+
+# NodePool for Coder Server
+resource "kubernetes_manifest" "coder_server_nodepool" {
+ manifest = {
+ apiVersion = "karpenter.sh/v1"
+ kind = "NodePool"
+ metadata = {
+ name = "coder-server"
+ }
+ spec = {
+ template = {
+ metadata = {
+ labels = {
+ "node.coder.io/instance" = "coder-v2"
+ "node.coder.io/managed-by" = "karpenter"
+ "node.coder.io/name" = "coder"
+ "node.coder.io/part-of" = "coder"
+ "node.coder.io/used-for" = "coder-server"
+ }
+ }
+ spec = {
+ expireAfter = "480h"
+ nodeClassRef = {
+ group = "eks.amazonaws.com"
+ kind = "NodeClass"
+ name = "default"
+ }
+ requirements = [
+ {
+ key = "karpenter.sh/capacity-type"
+ operator = "In"
+ values = ["on-demand"]
+ },
+ {
+ key = "kubernetes.io/arch"
+ operator = "In"
+ values = ["amd64"]
+ },
+ {
+ key = "kubernetes.io/os"
+ operator = "In"
+ values = ["linux"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-category"
+ operator = "In"
+ values = ["t", "m"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-generation"
+ operator = "Gt"
+            values   = ["2"] # Gt 2 keeps the generation-3 t3/t3a types listed below eligible
+ },
+ {
+ key = "node.kubernetes.io/instance-type"
+ operator = "In"
+ values = ["t3.xlarge", "t3.2xlarge", "t3a.xlarge", "t3a.2xlarge", "m5.xlarge", "m5.2xlarge"]
+ }
+ ]
+ taints = [
+ {
+ key = "dedicated"
+ value = "coder-server"
+ effect = "NoSchedule"
+ }
+ ]
+ terminationGracePeriod = "1h"
+ }
+ }
+ disruption = {
+ consolidationPolicy = "WhenEmpty"
+ consolidateAfter = "5m"
+ }
+ }
+ }
+}
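+# After apply, the pools can be inspected with `kubectl get nodepools`
+# (assumes the Karpenter v1 CRDs are installed in the cluster).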
+
+# NodePool for Coder Proxy
+resource "kubernetes_manifest" "coder_proxy_nodepool" {
+ manifest = {
+ apiVersion = "karpenter.sh/v1"
+ kind = "NodePool"
+ metadata = {
+ name = "coder-proxy"
+ }
+ spec = {
+ template = {
+ metadata = {
+ labels = {
+ "node.coder.io/instance" = "coder-v2"
+ "node.coder.io/managed-by" = "karpenter"
+ "node.coder.io/name" = "coder"
+ "node.coder.io/part-of" = "coder"
+ "node.coder.io/used-for" = "coder-proxy"
+ }
+ }
+ spec = {
+ expireAfter = "480h"
+ nodeClassRef = {
+ group = "eks.amazonaws.com"
+ kind = "NodeClass"
+ name = "default"
+ }
+ requirements = [
+ {
+ key = "karpenter.sh/capacity-type"
+ operator = "In"
+ values = ["on-demand", "spot"]
+ },
+ {
+ key = "kubernetes.io/arch"
+ operator = "In"
+ values = ["amd64"]
+ },
+ {
+ key = "kubernetes.io/os"
+ operator = "In"
+ values = ["linux"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-category"
+ operator = "In"
+ values = ["m", "c", "t"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-generation"
+ operator = "Gt"
+ values = ["4"]
+ }
+ ]
+ taints = [
+ {
+ key = "dedicated"
+ value = "coder-proxy"
+ effect = "NoSchedule"
+ }
+ ]
+ terminationGracePeriod = "30m"
+ }
+ }
+ disruption = {
+ consolidationPolicy = "WhenEmpty"
+ consolidateAfter = "5m"
+ }
+ }
+ }
+}
+
+# NodePool for Coder Provisioner
+resource "kubernetes_manifest" "coder_provisioner_nodepool" {
+ manifest = {
+ apiVersion = "karpenter.sh/v1"
+ kind = "NodePool"
+ metadata = {
+ name = "coder-provisioner"
+ }
+ spec = {
+ template = {
+ metadata = {
+ labels = {
+ "node.coder.io/instance" = "coder-v2"
+ "node.coder.io/managed-by" = "karpenter"
+ "node.coder.io/name" = "coder"
+ "node.coder.io/part-of" = "coder"
+ "node.coder.io/used-for" = "coder-provisioner"
+ }
+ }
+ spec = {
+ expireAfter = "480h"
+ nodeClassRef = {
+ group = "eks.amazonaws.com"
+ kind = "NodeClass"
+ name = "default"
+ }
+ requirements = [
+ {
+ key = "karpenter.sh/capacity-type"
+ operator = "In"
+ values = ["on-demand", "spot"]
+ },
+ {
+ key = "kubernetes.io/arch"
+ operator = "In"
+ values = ["amd64"]
+ },
+ {
+ key = "kubernetes.io/os"
+ operator = "In"
+ values = ["linux"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-category"
+ operator = "In"
+ values = ["m", "c"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-generation"
+ operator = "Gt"
+            values   = ["4"] # Gt 4 keeps the generation-5 m5/c5 types listed below eligible
+ },
+ {
+ key = "node.kubernetes.io/instance-type"
+ operator = "In"
+ values = ["m5.2xlarge", "m5.4xlarge", "m6a.2xlarge", "m6a.4xlarge", "c5.2xlarge", "c5.4xlarge"]
+ }
+ ]
+ taints = [
+ {
+ key = "dedicated"
+ value = "coder-provisioner"
+ effect = "NoSchedule"
+ }
+ ]
+ terminationGracePeriod = "30m"
+ }
+ }
+ disruption = {
+ consolidationPolicy = "WhenEmpty"
+ consolidateAfter = "10m"
+ }
+ }
+ }
+}
+
+# NodePool for Coder Workspaces
+resource "kubernetes_manifest" "coder_workspaces_nodepool" {
+ manifest = {
+ apiVersion = "karpenter.sh/v1"
+ kind = "NodePool"
+ metadata = {
+ name = "coder-workspaces"
+ }
+ spec = {
+ template = {
+ metadata = {
+ labels = {
+ "node.coder.io/instance" = "coder-v2"
+ "node.coder.io/managed-by" = "karpenter"
+ "node.coder.io/name" = "coder"
+ "node.coder.io/part-of" = "coder"
+ "node.coder.io/used-for" = "coder-workspaces"
+ }
+ }
+ spec = {
+ expireAfter = "336h" # 14 days for workspace nodes
+ nodeClassRef = {
+ group = "eks.amazonaws.com"
+ kind = "NodeClass"
+ name = "default"
+ }
+ requirements = [
+ {
+ key = "karpenter.sh/capacity-type"
+ operator = "In"
+ values = ["on-demand", "spot"]
+ },
+ {
+ key = "kubernetes.io/arch"
+ operator = "In"
+ values = ["amd64"]
+ },
+ {
+ key = "kubernetes.io/os"
+ operator = "In"
+ values = ["linux"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-category"
+ operator = "In"
+ values = ["c", "m", "r"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-generation"
+ operator = "Gt"
+ values = ["5"]
+ }
+ ]
+ taints = [
+ {
+ key = "dedicated"
+ value = "coder-workspaces"
+ effect = "NoSchedule"
+ }
+ ]
+ terminationGracePeriod = "30m"
+ }
+ }
+ disruption = {
+ consolidationPolicy = "WhenEmptyOrUnderutilized"
+ consolidateAfter = "30m"
+ budgets = [
+ {
+ nodes = "10%"
+ }
+ ]
+ }
+ }
+ }
+}
+
+output "nodepools_created" {
+ description = "List of NodePools created"
+ value = [
+ "coder-server",
+ "coder-proxy",
+ "coder-provisioner",
+ "coder-workspaces"
+ ]
+}
diff --git a/infra/aws/us-west-2/route53/main.tf b/infra/aws/us-west-2/route53/main.tf
new file mode 100644
index 0000000..5b0221d
--- /dev/null
+++ b/infra/aws/us-west-2/route53/main.tf
@@ -0,0 +1,218 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.0"
+ }
+ kubernetes = {
+ source = "hashicorp/kubernetes"
+ version = ">= 2.0"
+ }
+ }
+}
+
+variable "cluster_region" {
+ description = "AWS region"
+ type = string
+ default = "us-west-2"
+}
+
+variable "cluster_profile" {
+ description = "AWS profile"
+ type = string
+ default = "default"
+}
+
+variable "cluster_name" {
+ description = "EKS cluster name"
+ type = string
+}
+
+variable "domain_name" {
+ description = "Domain name for Coder"
+ type = string
+ default = "coderdemo.io"
+}
+
+variable "hosted_zone_id" {
+ description = "Route 53 Hosted Zone ID"
+ type = string
+ default = "Z080884039133KJPAGA3S"
+}
+
+variable "coder_service_name" {
+ description = "Coder service name in Kubernetes"
+ type = string
+ default = "coder"
+}
+
+variable "coder_namespace" {
+ description = "Coder namespace in Kubernetes"
+ type = string
+ default = "coder-proxy"
+}
+
+variable "set_identifier" {
+ description = "Unique identifier for this routing policy record"
+ type = string
+ default = "us-west-2"
+}
+
+variable "health_check_enabled" {
+ description = "Enable Route 53 health checks"
+ type = bool
+ default = true
+}
+
+variable "health_check_path" {
+ description = "Path for health checks"
+ type = string
+ default = "/api/v2/buildinfo"
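+  # Coder's unauthenticated build-info endpoint; returns 200 whenever the server is up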
+}
+
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+}
+
+data "aws_eks_cluster" "this" {
+ name = var.cluster_name
+}
+
+data "aws_eks_cluster_auth" "this" {
+ name = var.cluster_name
+}
+
+provider "kubernetes" {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.this.token
+}
+
+# Get the NLB hostname from the Kubernetes service
+data "kubernetes_service" "coder" {
+ metadata {
+ name = var.coder_service_name
+ namespace = var.coder_namespace
+ }
+}
+
+# Extract the NLB details
+locals {
+ nlb_hostname = try(data.kubernetes_service.coder.status[0].load_balancer[0].ingress[0].hostname, "")
+}
+
+# Get NLB by tags (AWS Load Balancer Controller tags the NLB)
+data "aws_lb" "coder_nlb" {
+ tags = {
+ "service.k8s.aws/stack" = "${var.coder_namespace}/${var.coder_service_name}"
+ }
+}
+
+# Health check against the Coder endpoint (resolves var.domain_name, the latency-routed
+# FQDN, rather than this region's NLB hostname directly)
+resource "aws_route53_health_check" "coder" {
+ count = var.health_check_enabled ? 1 : 0
+ type = "HTTPS"
+ resource_path = var.health_check_path
+ fqdn = var.domain_name
+ port = 443
+ request_interval = 30
+ failure_threshold = 3
+ measure_latency = true
+
+ tags = {
+ Name = "coder-${var.set_identifier}"
+ Region = var.cluster_region
+ Environment = "production"
+ ManagedBy = "terraform"
+ }
+}
+
+# Latency-based routing record for the main domain
+resource "aws_route53_record" "coder_latency" {
+ zone_id = var.hosted_zone_id
+ name = var.domain_name
+ type = "A"
+ set_identifier = var.set_identifier
+ allow_overwrite = true
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+
+ latency_routing_policy {
+ region = var.cluster_region
+ }
+
+ health_check_id = var.health_check_enabled ? aws_route53_health_check.coder[0].id : null
+}
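+# A second region would be added by deploying this same stack there with a different
+# set_identifier (e.g. set_identifier = "eu-west-1"); Route 53 then answers each query
+# with the lowest-latency healthy record.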
+
+# Latency-based routing record for wildcard subdomains
+resource "aws_route53_record" "coder_wildcard_latency" {
+ zone_id = var.hosted_zone_id
+ name = "*.${var.domain_name}"
+ type = "A"
+ set_identifier = var.set_identifier
+ allow_overwrite = true
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+
+ latency_routing_policy {
+ region = var.cluster_region
+ }
+
+ health_check_id = var.health_check_enabled ? aws_route53_health_check.coder[0].id : null
+}
+
+# Region-specific subdomain for manual region selection
+resource "aws_route53_record" "coder_region_specific" {
+ zone_id = var.hosted_zone_id
+ name = "${var.set_identifier}.${var.domain_name}"
+ type = "A"
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+}
+
+# Wildcard for region-specific subdomain (for workspace apps)
+resource "aws_route53_record" "coder_region_specific_wildcard" {
+ zone_id = var.hosted_zone_id
+ name = "*.${var.set_identifier}.${var.domain_name}"
+ type = "A"
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+}
+
+# Outputs
+output "nlb_hostname" {
+ description = "Network Load Balancer hostname"
+ value = local.nlb_hostname
+}
+
+output "nlb_zone_id" {
+ description = "Network Load Balancer Route 53 zone ID"
+ value = data.aws_lb.coder_nlb.zone_id
+}
+
+output "health_check_id" {
+ description = "Route 53 health check ID"
+ value = var.health_check_enabled ? aws_route53_health_check.coder[0].id : null
+}
+
+output "route53_record_fqdn" {
+ description = "Fully qualified domain name of the Route 53 record"
+ value = aws_route53_record.coder_latency.fqdn
+}
diff --git a/modules/k8s/bootstrap/cert-manager/main.tf b/modules/k8s/bootstrap/cert-manager/main.tf
index 8183719..6f90bb0 100644
--- a/modules/k8s/bootstrap/cert-manager/main.tf
+++ b/modules/k8s/bootstrap/cert-manager/main.tf
@@ -7,7 +7,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -132,12 +132,12 @@ resource "helm_release" "cert-manager" {
chart = "cert-manager"
repository = "oci://quay.io/jetstack/charts"
create_namespace = false
- # Removed invalid upgrade_install attribute for proper error handling
- skip_crds = false
- wait = true
- wait_for_jobs = true
- version = var.helm_version
- timeout = var.helm_timeout
+ upgrade_install = true
+ skip_crds = false
+ wait = true
+ wait_for_jobs = true
+ version = var.helm_version
+ timeout = var.helm_timeout
values = [yamlencode({
crds = {
diff --git a/modules/k8s/bootstrap/coder-provisioner/main.tf b/modules/k8s/bootstrap/coder-provisioner/main.tf
index 24f22f3..3840721 100644
--- a/modules/k8s/bootstrap/coder-provisioner/main.tf
+++ b/modules/k8s/bootstrap/coder-provisioner/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
diff --git a/modules/k8s/bootstrap/coder-proxy/main.tf b/modules/k8s/bootstrap/coder-proxy/main.tf
index 72c857e..579ecec 100644
--- a/modules/k8s/bootstrap/coder-proxy/main.tf
+++ b/modules/k8s/bootstrap/coder-proxy/main.tf
@@ -7,7 +7,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -346,6 +346,7 @@ resource "helm_release" "coder-proxy" {
chart = "coder"
repository = "https://helm.coder.com/v2"
create_namespace = false
+ upgrade_install = true
skip_crds = false
wait = true
wait_for_jobs = true
diff --git a/modules/k8s/bootstrap/coder-server/main.tf b/modules/k8s/bootstrap/coder-server/main.tf
index a27723a..48d0c5b 100644
--- a/modules/k8s/bootstrap/coder-server/main.tf
+++ b/modules/k8s/bootstrap/coder-server/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -114,8 +114,8 @@ variable "image_pull_secrets" {
variable "replica_count" {
type = number
- # Changed from 0 to 1 because zero replicas results in no running server pods
- default = 1
+  # Reverted to 0: this is a demo deployment, so no server pods run by default
+ default = 0
}
variable "env_vars" {
@@ -577,6 +577,7 @@ resource "helm_release" "coder-server" {
chart = "coder"
repository = "https://helm.coder.com/v2"
create_namespace = false
+ upgrade_install = true
skip_crds = false
wait = true
wait_for_jobs = true
diff --git a/modules/k8s/bootstrap/coder-server/policy.tf b/modules/k8s/bootstrap/coder-server/policy.tf
index 9a76b7d..828f8bc 100644
--- a/modules/k8s/bootstrap/coder-server/policy.tf
+++ b/modules/k8s/bootstrap/coder-server/policy.tf
@@ -33,11 +33,30 @@ data "aws_iam_policy_document" "provisioner-policy" {
"ec2:ReleaseHosts"
]
resources = [
- "arn:aws:ec2:${local.region}:${local.account_id}:*",
- "arn:aws:ec2:${local.region}:${local.account_id}:*/*",
- "arn:aws:ec2:${local.region}:${local.account_id}:*:*",
- "arn:aws:ec2:${local.region}::image/*"
+ "arn:aws:ec2:${local.region}:${local.account_id}:dedicated-host/*"
+ ]
+ condition {
+ test = "StringEquals"
+ variable = "aws:RequestTag/ManagedBy"
+ values = ["coder"]
+ }
+ }
+
+ statement {
+ sid = "EC2ManageHostLifecycleExisting"
+ effect = "Allow"
+ actions = [
+ "ec2:ModifyHosts",
+ "ec2:ReleaseHosts"
]
+ resources = [
+ "arn:aws:ec2:${local.region}:${local.account_id}:dedicated-host/*"
+ ]
+ condition {
+ test = "StringEquals"
+ variable = "aws:ResourceTag/ManagedBy"
+ values = ["coder"]
+ }
}
statement {
diff --git a/modules/k8s/bootstrap/ebs-controller/main.tf b/modules/k8s/bootstrap/ebs-controller/main.tf
index b2a438f..b6dd29a 100644
--- a/modules/k8s/bootstrap/ebs-controller/main.tf
+++ b/modules/k8s/bootstrap/ebs-controller/main.tf
@@ -7,7 +7,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -86,13 +86,13 @@ resource "helm_release" "ebs-controller" {
chart = "aws-ebs-csi-driver"
repository = "https://kubernetes-sigs.github.io/aws-ebs-csi-driver"
create_namespace = true
- # Removed upgrade_install because it's not a valid helm_release attribute
- skip_crds = false
- replace = var.replace
- wait = true
- wait_for_jobs = true
- version = var.chart_version
- timeout = 120 # in seconds
+ upgrade_install = true
+ skip_crds = false
+ replace = var.replace
+ wait = true
+ wait_for_jobs = true
+ version = var.chart_version
+ timeout = 120 # in seconds
values = [yamlencode({
controller = {
diff --git a/modules/k8s/bootstrap/karpenter/main.tf b/modules/k8s/bootstrap/karpenter/main.tf
index 55781aa..78b15c2 100644
--- a/modules/k8s/bootstrap/karpenter/main.tf
+++ b/modules/k8s/bootstrap/karpenter/main.tf
@@ -2,7 +2,7 @@ terraform {
required_providers {
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -103,7 +103,7 @@ variable "ec2nodeclass_configs" {
block_device_mappings = optional(list(object({
device_name = string
ebs = object({
- volume_size = string
+ volume_size = string # Kubernetes-style size with unit (e.g. "1400Gi", "50Gi")
volume_type = string
encrypted = optional(bool, false)
delete_on_termination = optional(bool, true)
@@ -220,12 +220,12 @@ resource "helm_release" "karpenter" {
chart = "karpenter"
repository = "oci://public.ecr.aws/karpenter"
create_namespace = true
- # Removed invalid upgrade_install attribute
- skip_crds = false
- wait = true
- wait_for_jobs = true
- version = var.chart_version
- timeout = 120 # in seconds
+ upgrade_install = true
+ skip_crds = false
+ wait = true
+ wait_for_jobs = true
+ version = var.chart_version
+ timeout = 120 # in seconds
# Added lifecycle management for proper upgrade handling
lifecycle {
@@ -256,7 +256,13 @@ resource "helm_release" "karpenter" {
settings = {
clusterName = var.cluster_name
featureGates = {
+ # Cost optimization - consolidate workloads to better-priced spot instances
spotToSpotConsolidation = true
+ # Future features - currently disabled
+        staticCapacity        = false # Static (fixed-count) node provisioning
+        reservedCapacity      = false # For On-Demand Capacity Reservation support
+        nodeRepair            = false # Experimental - automatic node repair
+        nodeOverlay           = false # Experimental - price/capacity overlays for scheduling simulation
}
interruptionQueue = module.karpenter.queue_name
}
@@ -280,16 +286,22 @@ resource "kubernetes_manifest" "ec2nodeclass" {
manifest = yamldecode(module.ec2nodeclass[count.index].manifest)
}
-# module "nodepool" {
-# count = length(local.nodepool_configs)
-# source = "../objects/nodepool"
-# name = local.nodepool_configs[count.index].name
-# node_labels = local.nodepool_configs[count.index].node_labels
-# node_taints = local.nodepool_configs[count.index].node_taints
-# node_requirements = local.nodepool_configs[count.index].node_requirements
-# node_class_ref_name = local.nodepool_configs[count.index].node_class_ref_name
-# node_expires_after = local.nodepool_configs[count.index].node_expires_after
-# disruption_consolidation_policy = local.nodepool_configs[count.index].disruption_consolidation_policy
-# disruption_consolidate_after = local.nodepool_configs[count.index].disruption_consolidate_after
-# }
+module "nodepool" {
+ count = length(var.nodepool_configs)
+ source = "../../objects/nodepool"
+ name = var.nodepool_configs[count.index].name
+ node_labels = var.nodepool_configs[count.index].node_labels
+ node_taints = var.nodepool_configs[count.index].node_taints
+ node_requirements = var.nodepool_configs[count.index].node_requirements
+ node_class_ref_name = var.nodepool_configs[count.index].node_class_ref_name
+ node_expires_after = var.nodepool_configs[count.index].node_expires_after
+ disruption_consolidation_policy = var.nodepool_configs[count.index].disruption_consolidation_policy
+ disruption_consolidate_after = var.nodepool_configs[count.index].disruption_consolidate_after
+}
+
+resource "kubernetes_manifest" "nodepool" {
+ depends_on = [helm_release.karpenter]
+ count = length(var.nodepool_configs)
+ manifest = yamldecode(module.nodepool[count.index].manifest)
+}
diff --git a/modules/k8s/bootstrap/lb-controller/main.tf b/modules/k8s/bootstrap/lb-controller/main.tf
index a5a32ec..45c6392 100644
--- a/modules/k8s/bootstrap/lb-controller/main.tf
+++ b/modules/k8s/bootstrap/lb-controller/main.tf
@@ -7,7 +7,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -138,12 +138,12 @@ resource "helm_release" "lb-controller" {
chart = "aws-load-balancer-controller"
repository = "https://aws.github.io/eks-charts"
create_namespace = true
- # Removed invalid upgrade_install attribute - Terraform handles upgrades automatically
- skip_crds = false
- wait = true
- wait_for_jobs = true
- version = var.chart_version
- timeout = 120 # in seconds
+ upgrade_install = true
+ skip_crds = false
+ wait = true
+ wait_for_jobs = true
+ version = var.chart_version
+ timeout = 120 # in seconds
values = [yamlencode({
clusterName = var.cluster_name
diff --git a/modules/k8s/bootstrap/metrics-server/main.tf b/modules/k8s/bootstrap/metrics-server/main.tf
index e588554..7940b5f 100644
--- a/modules/k8s/bootstrap/metrics-server/main.tf
+++ b/modules/k8s/bootstrap/metrics-server/main.tf
@@ -2,7 +2,7 @@ terraform {
required_providers {
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
}
}
@@ -31,6 +31,7 @@ resource "helm_release" "metrics-server" {
chart = "metrics-server"
repository = "https://kubernetes-sigs.github.io/metrics-server/"
create_namespace = true
+ upgrade_install = true
skip_crds = false
wait = true
wait_for_jobs = true
diff --git a/modules/k8s/objects/ec2nodeclass/main.tf b/modules/k8s/objects/ec2nodeclass/main.tf
index 64c5015..7062bc0 100644
--- a/modules/k8s/objects/ec2nodeclass/main.tf
+++ b/modules/k8s/objects/ec2nodeclass/main.tf
@@ -27,7 +27,7 @@ variable "block_device_mappings" {
type = list(object({
device_name = string
ebs = object({
- volume_size = number # Changed from string to number because AWS EBS volume sizes are numeric GiB values
+ volume_size = string # Kubernetes-style size with unit (e.g. "1400Gi", "50Gi")
volume_type = string
encrypted = optional(bool, false)
delete_on_termination = optional(bool, true)