diff --git a/.github/workflows/pre-commit-hooks.yml b/.github/workflows/pre-commit-hooks.yml
new file mode 100644
index 0000000..7949f86
--- /dev/null
+++ b/.github/workflows/pre-commit-hooks.yml
@@ -0,0 +1,56 @@
+# Optional: Pre-commit hooks workflow
+# This provides guidance for setting up local pre-commit hooks
+
+name: Pre-commit Validation
+
+on:
+ pull_request:
+ paths:
+ - ".pre-commit-config.yaml"
+ - ".github/workflows/pre-commit-hooks.yml"
+
+jobs:
+ validate-pre-commit:
+ name: Validate Pre-commit Configuration
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.11"
+
+ - name: Install pre-commit
+ run: |
+ pip install pre-commit
+ pre-commit --version
+
+ - name: Run pre-commit on all files
+ run: pre-commit run --all-files
+ continue-on-error: true
+
+ - name: Show pre-commit setup instructions
+ if: always()
+ run: |
+ echo "## π Setting up Pre-commit Hooks Locally"
+ echo ""
+ echo "Pre-commit hooks help catch secrets BEFORE they reach GitHub."
+ echo ""
+ echo "### Installation:"
+ echo "\`\`\`bash"
+ echo "# Install pre-commit"
+ echo "pip install pre-commit"
+ echo ""
+ echo "# Install the git hooks"
+ echo "pre-commit install"
+ echo ""
+ echo "# (Optional) Run against all files"
+ echo "pre-commit run --all-files"
+ echo "\`\`\`"
+ echo ""
+ echo "### What it does:"
+ echo "- Scans for secrets before each commit"
+ echo "- Validates Terraform formatting"
+ echo "- Checks for merge conflicts"
+ echo "- Prevents large files from being committed"
diff --git a/.github/workflows/secret-scanning.yml b/.github/workflows/secret-scanning.yml
new file mode 100644
index 0000000..95a986e
--- /dev/null
+++ b/.github/workflows/secret-scanning.yml
@@ -0,0 +1,282 @@
+name: Secret Scanning
+
+on:
+ pull_request:
+ branches:
+ - main
+ push:
+ branches:
+ - main
+ - "feature/**"
+ - "fix/**"
+
+permissions:
+ contents: write
+ pull-requests: write
+ issues: write
+
+jobs:
+ gitleaks:
+ name: Gitleaks Secret Scanning
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0 # Fetch all history for accurate scanning
+
+ - name: Run Gitleaks
+ uses: gitleaks/gitleaks-action@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ GITLEAKS_ENABLE_COMMENTS: true
+
+ - name: Upload Gitleaks Report
+ if: failure()
+ uses: actions/upload-artifact@v4
+ with:
+ name: gitleaks-report
+ path: results.sarif
+ retention-days: 7
+
+ trufflehog:
+ name: TruffleHog Secret Scanning
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+
+ - name: TruffleHog OSS
+ uses: trufflesecurity/trufflehog@main
+ with:
+ path: ./
+ base: ${{ github.event.repository.default_branch }}
+ head: HEAD
+ extra_args: --debug --only-verified
+
+ custom-pattern-check:
+ name: Custom Pattern Detection
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+
+ - name: Check for common secret patterns
+ id: secret_check
+ run: |
+ echo "Scanning for common secret patterns..."
+
+ # Define patterns to search for
+ PATTERNS=(
+ "aws_access_key_id"
+ "aws_secret_access_key"
+ "AKIA[0-9A-Z]{16}" # AWS Access Key
+ "(?i)api[_-]?key.*['\"][0-9a-zA-Z]{32,}['\"]" # Generic API keys
+ "(?i)password.*['\"][^'\"]{8,}['\"]" # Passwords in quotes
+ "(?i)secret.*['\"][0-9a-zA-Z]{32,}['\"]" # Generic secrets
+ "(?i)token.*['\"][0-9a-zA-Z]{32,}['\"]" # Tokens
+ "private[_-]?key"
+ "-----BEGIN (RSA|OPENSSH|DSA|EC) PRIVATE KEY-----" # Private keys
+ "ghp_[0-9a-zA-Z]{36}" # GitHub Personal Access Token
+ "ghs_[0-9a-zA-Z]{36}" # GitHub OAuth Secret
+ "sk_live_[0-9a-zA-Z]{24,}" # Stripe Live Secret Key
+ "pk_live_[0-9a-zA-Z]{24,}" # Stripe Live Public Key
+ )
+
+ FOUND_SECRETS=0
+ REPORT_FILE="secret_scan_report.txt"
+
+ echo "=== Secret Scanning Report ===" > $REPORT_FILE
+ echo "Timestamp: $(date)" >> $REPORT_FILE
+ echo "" >> $REPORT_FILE
+
+ # Get list of changed files
+ if [ "${{ github.event_name }}" = "pull_request" ]; then
+ FILES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD)
+ else
+ FILES=$(git diff --name-only HEAD~1 HEAD)
+ fi
+
+ # Skip certain file types and directories
+ FILES=$(echo "$FILES" | grep -v ".terraform/" | grep -v ".git/" | grep -v "node_modules/" || true)
+
+ for FILE in $FILES; do
+ if [ -f "$FILE" ]; then
+ echo "Scanning: $FILE" >> $REPORT_FILE
+
+ for PATTERN in "${PATTERNS[@]}"; do
+ MATCHES=$(grep -niE "$PATTERN" "$FILE" 2>/dev/null || true)
+ if [ ! -z "$MATCHES" ]; then
+ FOUND_SECRETS=1
+ echo " β FOUND POTENTIAL SECRET:" >> $REPORT_FILE
+ echo " Pattern: $PATTERN" >> $REPORT_FILE
+ echo "$MATCHES" | while IFS= read -r line; do
+ # Redact the actual secret value
+ REDACTED=$(echo "$line" | sed -E 's/['\''"][0-9a-zA-Z]{8,}['\''"]/***REDACTED***/g')
+ echo " $REDACTED" >> $REPORT_FILE
+ done
+ echo "" >> $REPORT_FILE
+ fi
+ done
+ fi
+ done
+
+ if [ $FOUND_SECRETS -eq 1 ]; then
+ echo "status=failed" >> $GITHUB_OUTPUT
+ cat $REPORT_FILE
+ echo ""
+ echo "β SECRETS DETECTED! Please remove sensitive data before committing."
+ exit 1
+ else
+ echo "status=passed" >> $GITHUB_OUTPUT
+ echo "β
No secrets detected"
+ fi
+
+ - name: Comment on PR with findings
+ if: failure() && github.event_name == 'pull_request'
+ uses: actions/github-script@v7
+ with:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ script: |
+ const fs = require('fs');
+ let report = '⚠️ **Secret Scanning Failed**\n\n';
+ report += '**Potential secrets or API keys were detected in your changes.**\n\n';
+ report += 'Please review and remove any sensitive data before merging.\n\n';
+ report += '### What to do:\n';
+ report += '1. Remove the secret from your code\n';
+ report += '2. Use environment variables or GitHub Secrets instead\n';
+ report += '3. If the secret was already committed, you must:\n';
+ report += ' - Rotate/invalidate the exposed secret\n';
+ report += ' - Remove it from git history using `git filter-branch` or BFG Repo-Cleaner\n\n';
+ report += '### Common secret patterns detected:\n';
+ report += '- AWS Access Keys (AKIA...)\n';
+ report += '- API Keys\n';
+ report += '- Private Keys\n';
+ report += '- Passwords or tokens in code\n\n';
+ report += '**This PR cannot be merged until all secrets are removed.**';
+
+ github.rest.issues.createComment({
+ issue_number: context.issue.number,
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ body: report
+ });
+
+ block-merge:
+ name: Block Merge if Secrets Found
+ runs-on: ubuntu-latest
+ needs: [gitleaks, trufflehog, custom-pattern-check]
+ if: always()
+ steps:
+ - name: Check scan results
+ run: |
+ if [ "${{ needs.gitleaks.result }}" = "failure" ] || \
+ [ "${{ needs.trufflehog.result }}" = "failure" ] || \
+ [ "${{ needs.custom-pattern-check.result }}" = "failure" ]; then
+ echo "β Secret scanning failed. Blocking merge."
+ exit 1
+ else
+ echo "β
All secret scans passed. Safe to merge."
+ fi
+
+ # Optional: Auto-revert commits with secrets on main branch
+ auto-revert:
+ name: Auto-revert Commits with Secrets
+ runs-on: ubuntu-latest
+ needs: [gitleaks, trufflehog, custom-pattern-check]
+ if: |
+ failure() &&
+ github.event_name == 'push' &&
+ github.ref == 'refs/heads/main'
+ permissions:
+ contents: write
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ token: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Configure git
+ run: |
+ git config user.name "github-actions[bot]"
+ git config user.email "github-actions[bot]@users.noreply.github.com"
+
+ - name: Revert last commit
+ run: |
+ COMMIT_SHA="${{ github.sha }}"
+ COMMIT_MSG=$(git log -1 --pretty=%B $COMMIT_SHA)
+
+ echo "β οΈ Reverting commit: $COMMIT_SHA"
+ echo "Commit message: $COMMIT_MSG"
+
+ git revert --no-edit $COMMIT_SHA
+ git push origin main
+
+ - name: Create issue for manual review
+ uses: actions/github-script@v7
+ with:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ script: |
+ const issue = await github.rest.issues.create({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ title: '🚨 Secrets Detected - Commit Automatically Reverted',
+ body: `## Security Alert: Secrets Detected
+
+ **Commit**: \`${{ github.sha }}\`
+ **Author**: @${{ github.actor }}
+ **Branch**: main
+
+ ### What happened?
+ Secret scanning detected potential secrets or API keys in a commit to the main branch.
+ The commit has been automatically reverted to prevent exposure.
+
+ ### Required Actions:
+
+ 1. **⚠️ ROTATE ALL EXPOSED SECRETS IMMEDIATELY**
+ - If the secret was an API key, revoke it
+ - If it was an AWS key, disable it in IAM
+ - Generate new credentials
+
+ 2. **Clean up your local branch**:
+ \`\`\`bash
+ git fetch origin
+ git reset --hard origin/main
+ \`\`\`
+
+ 3. **Remove the secret properly**:
+ - Use environment variables
+ - Use GitHub Secrets
+ - Use AWS Secrets Manager / Parameter Store
+ - Add pattern to .gitignore
+
+ 4. **Re-commit without secrets**:
+ - Make your changes again
+ - Ensure no secrets are in the code
+ - Submit a new PR
+
+ ### Preventing Future Incidents:
+
+ - Always use \`.tfvars\` files for sensitive values (they're gitignored)
+ - Use \`backend.tf\` for backend config (also gitignored)
+ - Store secrets in GitHub Secrets or AWS Secrets Manager
+ - Run \`git diff\` before committing to review changes
+ - Enable pre-commit hooks for local secret scanning
+
+ **This issue will remain open until confirmed that exposed secrets have been rotated.**`,
+ labels: ['security', 'urgent', 'secrets-detected']
+ });
+
+ console.log('Created issue:', issue.data.number);
+
+ - name: Send alert notification
+ if: always()
+ run: |
+ echo "π¨ SECURITY ALERT: Secrets detected in commit ${{ github.sha }}"
+ echo "Commit has been reverted and an issue has been created."
+ echo "Please rotate any exposed credentials immediately."
diff --git a/.github/workflows/terraform-apply.yml b/.github/workflows/terraform-apply.yml
new file mode 100644
index 0000000..52eda40
--- /dev/null
+++ b/.github/workflows/terraform-apply.yml
@@ -0,0 +1,111 @@
+name: Terraform Apply
+
+on:
+ push:
+ branches:
+ - main
+ paths:
+ - "infra/aws/**/*.tf"
+ - "infra/aws/**/*.tfvars"
+ - ".github/workflows/terraform-*.yml"
+ workflow_dispatch:
+ inputs:
+ module:
+ description: "Specific module to apply (leave empty for all changed)"
+ required: false
+ type: string
+
+permissions:
+ contents: read
+ id-token: write
+
+jobs:
+ detect-changes:
+ name: Detect Changed Modules
+ runs-on: ubuntu-latest
+ outputs:
+ modules: ${{ steps.detect.outputs.modules }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 2
+
+ - name: Detect changed Terraform modules
+ id: detect
+ run: |
+ if [ "${{ github.event_name }}" == "workflow_dispatch" ] && [ -n "${{ inputs.module }}" ]; then
+ # Manual trigger with specific module
+ MODULES='["${{ inputs.module }}"]'
+ echo "Manual module specified: $MODULES"
+ echo "modules=$MODULES" >> $GITHUB_OUTPUT
+ exit 0
+ fi
+
+ # Get changed files from the last commit
+ CHANGED_FILES=$(git diff --name-only HEAD~1 HEAD | grep -E '^infra/aws/.*\.tf(vars)?$' || true)
+
+ if [ -z "$CHANGED_FILES" ]; then
+ echo "No Terraform files changed"
+ echo "modules=[]" >> $GITHUB_OUTPUT
+ exit 0
+ fi
+
+ # Extract unique module directories
+ MODULES=$(echo "$CHANGED_FILES" | xargs -n1 dirname | sort -u | jq -R -s -c 'split("\n")[:-1]')
+ echo "Changed modules: $MODULES"
+ echo "modules=$MODULES" >> $GITHUB_OUTPUT
+
+ terraform-apply:
+ name: Apply - ${{ matrix.module }}
+ runs-on: ubuntu-latest
+ needs: detect-changes
+ if: needs.detect-changes.outputs.modules != '[]'
+ strategy:
+ matrix:
+ module: ${{ fromJson(needs.detect-changes.outputs.modules) }}
+ fail-fast: false
+ max-parallel: 1 # Apply modules one at a time to avoid conflicts
+ defaults:
+ run:
+ working-directory: ${{ matrix.module }}
+ environment:
+ name: production-demo
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Configure AWS Credentials
+ uses: aws-actions/configure-aws-credentials@v4
+ with:
+ role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
+ aws-region: us-east-2
+ role-session-name: GitHubActions-TerraformApply
+
+ - name: Setup Terraform
+ uses: hashicorp/setup-terraform@v3
+ with:
+ terraform_version: "~1.6"
+
+ - name: Terraform Init
+ env:
+ TF_CLI_ARGS_init: >-
+ -backend-config="bucket=${{ secrets.TF_STATE_BUCKET }}"
+ -backend-config="dynamodb_table=${{ secrets.TF_STATE_LOCK_TABLE }}"
+ -backend-config="region=us-east-2"
+ -backend-config="encrypt=true"
+ run: terraform init -input=false
+
+ - name: Terraform Plan
+ run: terraform plan -no-color -input=false -out=tfplan
+
+ - name: Terraform Apply
+ run: terraform apply -no-color -input=false tfplan
+
+ - name: Upload Terraform Working Directory (debug artifact)
+ uses: actions/upload-artifact@v4
+ if: always()
+ with:
+ name: terraform-workdir-${{ hashFiles(format('{0}/**', matrix.module)) }}
+ path: ${{ matrix.module }}/.terraform/
+ retention-days: 7
diff --git a/.github/workflows/terraform-destroy.yml b/.github/workflows/terraform-destroy.yml
new file mode 100644
index 0000000..590c354
--- /dev/null
+++ b/.github/workflows/terraform-destroy.yml
@@ -0,0 +1,68 @@
+name: Terraform Destroy
+
+on:
+ workflow_dispatch:
+ inputs:
+ module:
+ description: "Module to destroy (e.g., infra/aws/us-east-2/eks)"
+ required: true
+ type: string
+ confirm:
+ description: 'Type "destroy" to confirm'
+ required: true
+ type: string
+
+permissions:
+ contents: read
+ id-token: write
+
+jobs:
+ terraform-destroy:
+ name: Destroy - ${{ inputs.module }}
+ runs-on: ubuntu-latest
+ if: inputs.confirm == 'destroy'
+ defaults:
+ run:
+ working-directory: ${{ inputs.module }}
+ environment:
+ name: production-demo
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Configure AWS Credentials
+ uses: aws-actions/configure-aws-credentials@v4
+ with:
+ role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
+ aws-region: us-east-2
+ role-session-name: GitHubActions-TerraformDestroy
+
+ - name: Setup Terraform
+ uses: hashicorp/setup-terraform@v3
+ with:
+ terraform_version: "~1.6"
+
+ - name: Terraform Init
+ env:
+ TF_CLI_ARGS_init: >-
+ -backend-config="bucket=${{ secrets.TF_STATE_BUCKET }}"
+ -backend-config="dynamodb_table=${{ secrets.TF_STATE_LOCK_TABLE }}"
+ -backend-config="region=us-east-2"
+ -backend-config="encrypt=true"
+ run: terraform init -input=false
+
+ - name: Terraform Plan Destroy
+ run: terraform plan -destroy -no-color -input=false -out=tfplan
+
+ - name: Terraform Destroy
+ run: terraform apply -no-color -input=false tfplan
+
+ validation-failed:
+ name: Validation Failed
+ runs-on: ubuntu-latest
+ if: inputs.confirm != 'destroy'
+ steps:
+ - name: Confirmation not provided
+ run: |
+ echo "::error::Destroy confirmation not provided. You must type 'destroy' to confirm."
+ exit 1
diff --git a/.github/workflows/terraform-plan.yml b/.github/workflows/terraform-plan.yml
new file mode 100644
index 0000000..0da766e
--- /dev/null
+++ b/.github/workflows/terraform-plan.yml
@@ -0,0 +1,140 @@
+name: Terraform Plan
+
+on:
+ pull_request:
+ branches:
+ - main
+ paths:
+ - "infra/aws/**/*.tf"
+ - "infra/aws/**/*.tfvars"
+ - ".github/workflows/terraform-*.yml"
+
+permissions:
+ contents: read
+ pull-requests: write
+ id-token: write
+
+jobs:
+ detect-changes:
+ name: Detect Changed Modules
+ runs-on: ubuntu-latest
+ outputs:
+ modules: ${{ steps.detect.outputs.modules }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+
+ - name: Detect changed Terraform modules
+ id: detect
+ run: |
+ # Get changed files
+ CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD | grep -E '^infra/aws/.*\.tf(vars)?$' || true)
+
+ if [ -z "$CHANGED_FILES" ]; then
+ echo "No Terraform files changed"
+ echo "modules=[]" >> $GITHUB_OUTPUT
+ exit 0
+ fi
+
+ # Extract unique module directories
+ MODULES=$(echo "$CHANGED_FILES" | xargs -n1 dirname | sort -u | jq -R -s -c 'split("\n")[:-1]')
+ echo "Changed modules: $MODULES"
+ echo "modules=$MODULES" >> $GITHUB_OUTPUT
+
+ terraform-plan:
+ name: Plan - ${{ matrix.module }}
+ runs-on: ubuntu-latest
+ needs: detect-changes
+ if: needs.detect-changes.outputs.modules != '[]'
+ strategy:
+ matrix:
+ module: ${{ fromJson(needs.detect-changes.outputs.modules) }}
+ fail-fast: false
+ defaults:
+ run:
+ working-directory: ${{ matrix.module }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Configure AWS Credentials
+ uses: aws-actions/configure-aws-credentials@v4
+ with:
+ role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
+ aws-region: us-east-2
+ role-session-name: GitHubActions-TerraformPlan
+
+ - name: Setup Terraform
+ uses: hashicorp/setup-terraform@v3
+ with:
+ terraform_version: "~1.6"
+
+ - name: Terraform Format Check
+ id: fmt
+ run: terraform fmt -check -recursive
+ continue-on-error: true
+
+ - name: Terraform Init
+ id: init
+ env:
+ TF_CLI_ARGS_init: >-
+ -backend-config="bucket=${{ secrets.TF_STATE_BUCKET }}"
+ -backend-config="dynamodb_table=${{ secrets.TF_STATE_LOCK_TABLE }}"
+ -backend-config="region=us-east-2"
+ -backend-config="encrypt=true"
+ run: terraform init -input=false
+
+ - name: Terraform Validate
+ id: validate
+ run: terraform validate -no-color
+
+ - name: Terraform Plan
+ id: plan
+ run: |
+ terraform plan -no-color -input=false -out=tfplan
+ terraform show -no-color tfplan > plan.txt
+ continue-on-error: true
+
+ - name: Comment PR with Plan
+ uses: actions/github-script@v7
+ if: github.event_name == 'pull_request'
+ with:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ script: |
+ const fs = require('fs');
+ const module = '${{ matrix.module }}';
+ const plan = fs.existsSync('${{ matrix.module }}/plan.txt')
+ ? fs.readFileSync('${{ matrix.module }}/plan.txt', 'utf8')
+ : 'Plan output not available';
+
+ const output = `### Terraform Plan: \`${module}\`
+
+ #### Format and Style 🖌 \`${{ steps.fmt.outcome }}\`
+ #### Initialization ⚙️ \`${{ steps.init.outcome }}\`
+ #### Validation 🤖 \`${{ steps.validate.outcome }}\`
+ #### Plan 📖 \`${{ steps.plan.outcome }}\`
+
+ <details><summary>Show Plan</summary>
+
+ \`\`\`terraform
+ ${plan.slice(0, 65000)}
+ \`\`\`
+
+ </details>
+
+ *Pusher: @${{ github.actor }}, Action: \`${{ github.event_name }}\`, Workflow: \`${{ github.workflow }}\`*`;
+
+ github.rest.issues.createComment({
+ issue_number: context.issue.number,
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ body: output
+ });
+
+ - name: Fail if plan failed
+ if: steps.plan.outcome == 'failure'
+ run: exit 1
diff --git a/.gitignore b/.gitignore
index 839afa9..e15c52f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,22 @@
.terraform/
.terraform.lock.hcl
terraform.tfstate*
-**.tfvars**
tf.plan
-
+tfplan
+*.tfplan
+*.log
+
+# Backend configuration (contains sensitive IDs)
+backend.tf
+backend.tfvars
+*.backend.tfvars
+backend.hcl
+*.backend.hcl
+
+# Terraform variable files (may contain sensitive IDs, ARNs, domains)
+*.tfvars
+!*.tfvars.example
+!terraform.tfvars.example
# Helm + Kubernetes
infra/aws/us-east-2/apps/coder-ws/experiment/prometheus.yaml
infra/aws/us-east-2/apps/coder-devel/build-and-push
diff --git a/.gitleaks.toml b/.gitleaks.toml
new file mode 100644
index 0000000..f1ef882
--- /dev/null
+++ b/.gitleaks.toml
@@ -0,0 +1,107 @@
+# Gitleaks configuration file
+# https://github.com/gitleaks/gitleaks
+
+title = "Gitleaks Configuration for Coder Infrastructure"
+
+[extend]
+# useDefault will extend the base configuration with all default gitleaks rules
+useDefault = true
+
+[allowlist]
+description = "Allowlist for non-sensitive patterns"
+
+# Ignore test/example values
+regexes = [
+ '''test[_-]?(token|key|secret|password)''', # Test credentials
+ '''example[_-]?(token|key|secret)''',
+ '''dummy[_-]?(token|key|secret)''',
+ '''fake[_-]?(token|key|secret)''',
+ '''YOUR[_-]''', # Placeholder values like YOUR_API_KEY
+ '''REPLACE[_-]''',
+ '''CHANGEME''',
+ '''TODO''',
+]
+
+# Ignore certain file paths
+paths = [
+ '''\.git/''',
+ '''\.terraform/''',
+ '''node_modules/''',
+ '''vendor/''',
+ '''\.(tfstate|tfstate\.backup)$''',
+ '''\.example$''', # Example configuration files
+ '''\.md$''', # Documentation files (review these manually)
+ '''go\.sum$''',
+ '''package-lock\.json$''',
+]
+
+# Ignore certain commits (if needed, add commit SHAs here)
+commits = []
+
+# Custom rules for infrastructure-specific secrets
+[[rules]]
+id = "terraform-sensitive-variable"
+description = "Terraform sensitive variable not marked as sensitive"
+regex = '''variable\s+"([^"]+)"\s+\{[^}]*default\s+=\s+["']([^"']{8,})["'][^}]*\}'''
+tags = ["terraform", "sensitive"]
+
+[[rules]]
+id = "aws-account-id"
+description = "AWS Account ID"
+regex = '''\d{12}'''
+tags = ["aws", "account-id"]
+# Note: Account IDs aren't secrets, but good to track
+[rules.allowlist]
+regexes = [
+ '''(region|zone|ami|snapshot|volume)-\d{12}''', # Not account IDs
+]
+
+[[rules]]
+id = "coder-access-url"
+description = "Coder access URL with potential secrets"
+regex = '''coder_access_url\s*=\s*["\']https?://[^"\']*:[^"\'@]*@'''
+tags = ["coder", "url", "credentials"]
+
+[[rules]]
+id = "database-connection-string"
+description = "Database connection string with credentials"
+regex = '''postgres://([^:]+):([^@]+)@'''
+tags = ["database", "credentials"]
+[rules.allowlist]
+regexes = [
+ '''postgres://\w+@localhost''', # Local connections without password
+ '''mode=memory''', # In-memory databases
+]
+
+[[rules]]
+id = "route53-zone-id"
+description = "Route53 Hosted Zone ID"
+regex = '''Z[A-Z0-9]{12,}'''
+tags = ["aws", "route53"]
+# These are semi-sensitive; track but don't necessarily block
+
+[[rules]]
+id = "oidc-provider-arn"
+description = "OIDC Provider ARN containing account ID"
+regex = '''arn:aws:iam::\d{12}:oidc-provider'''
+tags = ["aws", "oidc", "arn"]
+
+[[rules]]
+id = "kubernetes-secret-value"
+description = "Kubernetes secret value in manifest"
+regex = '''(apiVersion:\s*v1\s+kind:\s*Secret.*data:.*\n\s+\w+:\s+)([A-Za-z0-9+/=]{16,})'''
+tags = ["kubernetes", "secret", "base64"]
+
+# Entropy-based detection for high-entropy strings (likely secrets)
+[[rules]]
+id = "high-entropy-string"
+description = "High entropy string (possible secret)"
+regex = '''['\"]([A-Za-z0-9+/=]{32,})['\"]'''
+entropy = 4.5 # Minimum entropy threshold
+tags = ["entropy", "generic"]
+[rules.allowlist]
+paths = [
+ '''\.lock$''',
+ '''\.sum$''',
+ '''\.json$''',
+]
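+
+# Local usage (a sketch; assumes the gitleaks v8 CLI is installed):
+#   gitleaks detect  --config .gitleaks.toml --verbose   # scan the repo and its history
+#   gitleaks protect --config .gitleaks.toml --staged    # scan staged changes before commit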
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..d49d3f8
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,55 @@
+# Pre-commit hooks configuration
+# Install: pip install pre-commit && pre-commit install
+# Run manually: pre-commit run --all-files
+
+repos:
+ # Gitleaks - Secret detection
+ - repo: https://github.com/gitleaks/gitleaks
+ rev: v8.18.4
+ hooks:
+ - id: gitleaks
+
+ # General checks
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.5.0
+ hooks:
+ - id: trailing-whitespace
+ exclude: '\.md$'
+ - id: end-of-file-fixer
+ - id: check-yaml
+ args: ["--unsafe"] # Allow custom YAML tags
+ - id: check-added-large-files
+ args: ["--maxkb=1000"]
+ - id: check-merge-conflict
+ - id: detect-private-key
+ - id: detect-aws-credentials
+ args: ["--allow-missing-credentials"]
+
+ # Terraform
+ - repo: https://github.com/antonbabenko/pre-commit-terraform
+ rev: v1.88.4
+ hooks:
+ - id: terraform_fmt
+ - id: terraform_validate
+ args:
+ - --hook-config=--retry-once-with-cleanup=true
+ - id: terraform_tflint
+ args:
+ - --args=--config=__GIT_WORKING_DIR__/.tflint.hcl
+ - id: terraform_docs
+ args:
+ - --hook-config=--path-to-file=README.md
+ - --hook-config=--add-to-existing-file=true
+ - --hook-config=--create-file-if-not-exist=true
+
+ # Prevent commits to main
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.5.0
+ hooks:
+ - id: no-commit-to-branch
+ args: ["--branch", "main", "--branch", "master"]
+ stages: [commit]
+
+# Global settings
+default_language_version:
+ python: python3.11
diff --git a/GITHUB_APP_SETUP.md b/GITHUB_APP_SETUP.md
new file mode 100644
index 0000000..adc457c
--- /dev/null
+++ b/GITHUB_APP_SETUP.md
@@ -0,0 +1,56 @@
+# GitHub App Setup for Coder
+
+## Correct Callback URLs
+
+When configuring your GitHub App for Coder, use these **exact** callback URLs:
+
+### Primary OAuth (User Authentication)
+
+```
+https://coderdemo.io/api/v2/users/oauth2/github/callback
+```
+
+### External Auth (Git Operations in Workspaces)
+
+```
+https://coderdemo.io/api/v2/external-auth/primary-github/callback
+```
+
+## Important Settings
+
+1. **Request user authorization (OAuth) during installation**: ✅ **MUST be checked**
+ - This allows users to log into Coder with their GitHub identity
+
+2. **Permissions Required**:
+ - **Account permissions**:
+ - Email addresses: Read-only
+ - **Repository permissions**:
+ - Contents: Read and write
+ - Metadata: Read-only (auto-required)
+ - Pull requests: Read and write (optional, for PR creation)
+ - Issues: Read and write (optional, for issue management)
+
+3. **Installation**:
+ - Install the app to your account/organization
+ - Grant access to "All repositories" or specific repos
+
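+These values then map onto Coder's server configuration. A minimal sketch using Coder's documented environment variables (the placeholder values are illustrative; the `primary-github` ID must match the external-auth callback path above):
+
+```bash
+# OAuth login ("Sign in with GitHub")
+export CODER_OAUTH2_GITHUB_ALLOW_SIGNUPS=true
+export CODER_OAUTH2_GITHUB_CLIENT_ID="<github-app-client-id>"      # placeholder
+export CODER_OAUTH2_GITHUB_CLIENT_SECRET="<github-app-secret>"     # placeholder
+
+# External auth for git operations inside workspaces
+export CODER_EXTERNAL_AUTH_0_TYPE=github
+export CODER_EXTERNAL_AUTH_0_ID=primary-github                     # matches /external-auth/primary-github/callback
+export CODER_EXTERNAL_AUTH_0_CLIENT_ID="<github-app-client-id>"    # placeholder
+export CODER_EXTERNAL_AUTH_0_CLIENT_SECRET="<github-app-secret>"   # placeholder
+```
+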
+## Common Issues
+
+### "redirect_uri is not associated with this application"
+
+- **Cause**: Callback URLs don't match what Coder is sending
+- **Solution**: Verify the URLs above are **exactly** correct (including `/api/v2/users/` and `/api/v2/`)
+
+### "Not HTTPS Secure" warning
+
+- **Cause**: Accessing `http://coderdemo.io` instead of `https://coderdemo.io`
+- **Solution**: Always use `https://` when accessing Coder
+
+## After Setup
+
+Once configured, users can:
+
+- Log into Coder using GitHub authentication
+- Clone repositories in their workspaces
+- Push/pull code
+- Create pull requests (if permissions granted)
diff --git a/README.md b/README.md
index 2d11dd7..4278a73 100644
--- a/README.md
+++ b/README.md
@@ -1,407 +1,557 @@
-# AI Demo Environment (ai.coder.com)
+# Coder Demo Environment (coderdemo.io)
-Welcome to the AI Demo Environment's Github repository!
+Welcome to the Coder Demo Environment's Github repository!
-This project is used by ["ai.coder.com"](https://ai.coder.com), allowing users to experiment with the latest AI features in Coder and create demoes for them.
+This project powers ["coderdemo.io"](https://coderdemo.io), a production-grade, multi-region demonstration environment showcasing Coder's cloud development capabilities, workspace proxies, and global deployment patterns.
----
+> [!IMPORTANT]
+> **This infrastructure is HEAVILY AWS-opinionated.**
+>
+> This repository uses AWS-specific services and patterns throughout (EKS, Aurora Serverless v2, VPC, Route53, ACM, etc.). While Coder itself is cloud-agnostic, this particular deployment is designed exclusively for AWS. If you're deploying on GCP, Azure, or other cloud providers, you'll need to significantly adapt the infrastructure code.
-## Getting Hand's On
+---
-> [!IMPORTANT] Before accessing the deployment, make sure you've been invited to our "coder-contrib" Github organization. If not, reach out to `jullian@coder.com` and send your Github handle to be added in. Otherwise, if you're an internal user, you should already have access to to the environment.
+## Getting Started
### Accessing the Deployment:
-Get Started Here π [https://ai.coder.com](https://ai.coder.com)
+Get Started Here 👉 [https://coderdemo.io](https://coderdemo.io)
**Login Flow**
-- Non-Coder Employee
+1. Click "Sign in with GitHub"
+2. Authorize the Coder Demo GitHub App
+3. Start creating workspaces in your preferred region!
+
+**Available Regions:**
-1. Select "GitHub"
+- 🇺🇸 **US East (Ohio)** - Primary deployment with database
+- 🇺🇸 **US West (Oregon)** - Secondary server + workspace proxy
+- 🇪🇺 **EU West (London)** - Workspace proxy
+
+> [!NOTE]
+> This is a demo environment. For production Coder deployments, refer to the [official Coder documentation](https://coder.com/docs).
+
+---
-2. Login with your Github account (that has access to the coder-contrib Github Organization).
+## Architecture Overview
-- Coder Employee
+This deployment implements a **hub-and-spoke architecture** across three AWS regions:
-1. Select "Okta"
+### Hub Region: us-east-2 (Ohio)
-2. Login with your Github account (that has access to the coder-contrib Github Organization).
+The primary region containing foundational, non-repeatable infrastructure:
+
+- **Central Database**: Aurora Serverless v2 PostgreSQL cluster (shared by all regions)
+- **Terraform Backend**: S3 bucket and DynamoDB table for state management
+- **Container Registry**: ECR for custom images
+- **Primary VPC**: Custom VPC with peering to spoke regions
+- **Primary Coder Server**: Main deployment handling authentication and control plane
+- **Additional Services**: Redis, LiteLLM, and custom applications
+
+### Spoke Regions: us-west-2 (Oregon) & eu-west-2 (London)
+
+Repeatable regional infrastructure for workspace proxies:
+
+- **Workspace Proxies**: Low-latency access to workspaces
+- **EKS Clusters**: Regional Kubernetes clusters with Karpenter autoscaling
+- **Route53**: Regional DNS records for proxy endpoints
+- **AWS ACM**: Regional SSL/TLS certificates
+
+```
+                   ┌─────────────────────────────────┐
+                   │     us-east-2 (Primary Hub)     │
+                   │                                 │
+                   │   ┌─────────────────────────┐   │
+                   │   │  Coder Server           │   │
+                   │   │  Aurora Serverless v2   │   │
+                   │   │  Redis / ECR            │   │
+                   │   └─────────────────────────┘   │
+                   │                                 │
+                   └───────────────┬─────────────────┘
+                                   │
+                      ┌────────────┴────────────┐
+                      │                         │
+           ┌──────────▼──────────┐   ┌─────────▼──────────┐
+           │  us-west-2 (Spoke)  │   │  eu-west-2 (Spoke) │
+           │                     │   │                    │
+           │  ┌───────────────┐  │   │  ┌──────────────┐  │
+           │  │ Coder Proxy   │  │   │  │ Coder Proxy  │  │
+           │  │ Coder Server  │  │   │  │ Workspaces   │  │
+           │  │ Workspaces    │  │   │  └──────────────┘  │
+           │  └───────────────┘  │   │                    │
+           └─────────────────────┘   └────────────────────┘
+```
+
+For detailed architecture documentation, see:
+
+- [Multi-Region Deployment Guide](./docs/MULTI_REGION_DEPLOYMENT.md)
+- [Infrastructure Best Practices](./docs/INFRASTRUCTURE_BEST_PRACTICES.md)
+- [Architecture Diagram](./docs/ARCHITECTURE_DIAGRAM.md)
---
## How-To-Deploy
-> [!WARNING] The following environment is heavily opinionated towards: AWS. Make sure to pull the modules and modify according to your use-case. Additionally, the [`infra/aws/us-east-2`](./infra/aws/us-east-2) project is not repeatable. For repeatable references, check out [`infra/aws/us-west-2`](./infra/aws/us-west-2) and [`infra/aws/eu-west-2`](./infra/aws/eu-west-2)
+> [!WARNING]
+> **Infrastructure Repeatability Notice**
+>
+> This environment is heavily opinionated towards AWS and uses a hub-and-spoke architecture:
+>
+> - **[`infra/aws/us-east-2`](./infra/aws/us-east-2)** - Primary hub region with foundational infrastructure (database, terraform backend, VPC, etc.). **This is NOT repeatable** - it's meant to be deployed once as your control plane.
+> - **[`infra/aws/eu-west-2`](./infra/aws/eu-west-2)** - Clean spoke region example with workspace proxy only. **This IS repeatable** for adding new regions.
+> - **[`infra/aws/us-west-2`](./infra/aws/us-west-2)** - Hybrid spoke region with both server and proxy deployments. Use this as a reference for redundant server deployments.
+>
+> When deploying to new regions, use `eu-west-2` as your template for workspace proxies.
+
+### Deployment Overview
+
+The infrastructure is deployed in layers:
+
+1. **Foundation Layer** (us-east-2 only - deploy once)
+ - Terraform backend (S3 + DynamoDB)
+ - VPC with custom networking
+ - Aurora Serverless v2 PostgreSQL database
+ - ECR for container images
+ - Redis for caching
+
+2. **Compute Layer** (all regions)
+ - EKS clusters with managed node groups
+ - Karpenter for workspace autoscaling
+ - VPC peering (for spoke regions to hub)
+
+3. **Certificate & DNS Layer** (all regions)
+ - AWS Certificate Manager (ACM) for SSL/TLS
+ - Route53 for DNS management
+ - Regional subdomains (e.g., `us-west-2.coderdemo.io`)
+
+4. **Kubernetes Applications Layer** (all regions)
+ - AWS Load Balancer Controller
+ - AWS EBS CSI Driver
+ - Karpenter node provisioner
+ - Metrics Server
+ - Cert Manager
+
+5. **Coder Layer**
+ - **Primary (us-east-2)**: Coder Server with database connection
+ - **Spoke regions**: Coder Workspace Proxies connected to primary
+
+### About the Infrastructure Modules
+
+This repository provides reusable Terraform modules for deploying Coder on AWS:
+
+#### Network Module: [`eks-vpc`](./modules/network/eks-vpc)
+
+Creates an opinionated VPC designed for EKS and Coder workloads:
+
+- Customizable public and private subnets across multiple AZs
+- Internet Gateway for public access
+- Cost-optimized NAT Gateway using [fck-nat](https://github.com/RaJiska/terraform-aws-fck-nat)
+- Automatic routing configuration
+- Subnet tagging for EKS and Karpenter integration
+
+#### Compute Module: [`eks-cluster`](./modules/compute/cluster)
+
+Creates a production-ready EKS cluster similar to [EKS Auto Mode](https://docs.aws.amazon.com/eks/latest/userguide/automode.html):
+
+- Leverages the [AWS Managed Terraform EKS module](https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master)
+- Pre-configured IAM roles and policies for:
+ - [Karpenter](https://karpenter.sh/) - Node autoscaling
+ - [AWS EBS CSI Driver](https://github.com/kubernetes-sigs/aws-ebs-csi-driver) - Persistent volumes
+ - [AWS Load Balancer Controller](https://github.com/kubernetes-sigs/aws-load-balancer-controller) - Ingress management
+ - [Coder External Provisioner](https://coder.com/docs/admin/provisioners) - Workspace provisioning
+ - [Amazon Bedrock](https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html) - AI capabilities
+- IRSA (IAM Roles for Service Accounts) configuration
+- Node group with custom launch templates
+
+#### Kubernetes Bootstrap Modules: [`modules/k8s/bootstrap/`](./modules/k8s/bootstrap/)
+
+Helm-based Kubernetes application deployments:
+
+- **[`lb-controller`](./modules/k8s/bootstrap/lb-controller)** - AWS Load Balancer Controller
+- **[`ebs-controller`](./modules/k8s/bootstrap/ebs-controller)** - AWS EBS CSI Driver
+- **[`metrics-server`](./modules/k8s/bootstrap/metrics-server)** - Kubernetes Metrics Server
+- **[`karpenter`](./modules/k8s/bootstrap/karpenter)** - Karpenter autoscaler with NodePools
+- **[`cert-manager`](./modules/k8s/bootstrap/cert-manager)** - Certificate management
+- **[`coder-server`](./modules/k8s/bootstrap/coder-server)** - Primary Coder deployment
+- **[`coder-proxy`](./modules/k8s/bootstrap/coder-proxy)** - Workspace proxy deployments
-In this repository, we deploy the infrastructure separately from the K8s applications which includes Coder.
+---
-To make things easy, we generate K8s app manifests from any `k8s/` project subfolders which reference the main `eks/` application indirectly which auto-populates any infrastructure dependent resource names.
+## Deployment Guide
+
+### Prerequisites
+
+- AWS CLI configured with appropriate credentials
+- Terraform >= 1.9.0
+- kubectl
+- Helm 3.x
+- GitHub OAuth App credentials (for authentication)
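+
+A quick sanity check before deploying (a sketch; expected versions per the list above):
+
+```bash
+terraform version              # expect >= 1.9.0
+aws sts get-caller-identity    # confirms credentials and the target account
+kubectl version --client
+helm version                   # expect a 3.x release
+```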
+
+### Step 1: Deploy Foundation Infrastructure (us-east-2 only)
+
+> [!IMPORTANT]
+> Only deploy this once for your entire multi-region setup.
+
+```bash
+cd infra/aws/us-east-2
+
+# 1. Create Terraform backend
+cd terraform-backend
+terraform init
+terraform apply
+cd ..
+
+# 2. Create VPC
+cd vpc
+terraform init -backend-config=backend.hcl
+terraform apply
+cd ..
+
+# 3. Deploy EKS cluster
+cd eks
+terraform init -backend-config=backend.hcl
+terraform apply
+cd ..
+
+# 4. Deploy Aurora Serverless v2 database
+cd rds
+terraform init -backend-config=backend.hcl
+terraform apply
+cd ..
+
+# 5. Set up Route53 and ACM for primary domain
+cd route53
+terraform init -backend-config=backend.hcl
+terraform apply
+cd ..
+
+cd acm
+terraform init -backend-config=backend.hcl
+terraform apply
+cd ..
+```
-### About the Infrastructure
+### Step 2: Deploy Kubernetes Applications (us-east-2)
-The deployment currently has 2 repeatable components: [`eks-vpc` module](./modules/network/eks-vpc) and [`eks-cluster` module](./modules/compute/cluster).
+```bash
+cd infra/aws/us-east-2/k8s
-#### [`eks-vpc`](./modules/network/eks-vpc)
+# Update kubeconfig
+aws eks update-kubeconfig --region us-east-2 --name coderdemo
-The following module creates an opinionated VPC that let's you granularly define individual subnets. This includes unevenly defining public and private subnets.
+# Deploy in order (each depends on previous)
+cd lb-controller && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd ebs-controller && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd metrics-server && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd karpenter && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd cert-manager && terraform init -backend-config=backend.hcl && terraform apply && cd ..
-This will come with an Internet Gateway and a Custom NAT Gateway (using [RaJiska/terraform-aws-fck-nat](github.com/RaJiska/terraform-aws-fck-nat)).
+# Deploy Coder Server
+cd coder-server && terraform init -backend-config=backend.hcl && terraform apply && cd ..
-The public subnets will have automatic routes to the IGW and private subnets with routes to the NAT.
+# Deploy Coder Workspace Provisioner
+cd coder-ws && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+```
-#### [`eks-cluster`](./modules/compute/cluster).
+### Step 3: Deploy Spoke Regions (repeatable)
-The following module creates an opinionated cluster, similar to [EKS Auto Mode](https://docs.aws.amazon.com/eks/latest/userguide/automode.html), that creates both the EKS Cluster (using the [AWS Managed Terraform EKS module](https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master)), and resources needed by:
+For each additional region (use `eu-west-2` as template):
-- [Karpenter](https://karpenter.sh/)
-- [Amazon Bedrock](https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html)
-- [AWS EBS Controller](https://github.com/kubernetes-sigs/aws-ebs-csi-driver)
-- [AWS Load Balancer Controller](https://github.com/kubernetes-sigs/aws-load-balancer-controller)
-- [Coder External Provisioner](https://coder.com/docs/admin/provisioners)
+```bash
+# Example: Deploy to eu-west-2
+cd infra/aws/eu-west-2
-##### Karpenter
+# 1. Deploy EKS cluster
+cd eks
+terraform init -backend-config=backend.hcl
+terraform apply
+cd ..
-We use the the [AWS Managed Terraform EKS Module for Karpenter in the background](https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master/modules/karpenter).
+# 2. Deploy Kubernetes applications (same order as us-east-2)
+cd k8s
+aws eks update-kubeconfig --region eu-west-2 --name coderdemo-euw2
-This automatically creates:
+cd lb-controller && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd ebs-controller && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd metrics-server && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd karpenter && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+cd cert-manager && terraform init -backend-config=backend.hcl && terraform apply && cd ..
-- SQS Queue
-- IAM Roles
-- Event Bridge
+# 3. Deploy Coder Workspace Proxy
+cd coder-proxy && terraform init -backend-config=backend.hcl && terraform apply && cd ..
-##### Amazon Bedrock
+# 4. Deploy Coder Workspace Provisioner
+cd coder-ws && terraform init -backend-config=backend.hcl && terraform apply && cd ..
+```
-Auto-Creates
+### Step 4: Configure DNS and Certificates
-- IAM Role
+Each region requires:
-##### AWS EBS Controller
+1. Route53 DNS records pointing to the regional load balancer
+2. ACM certificate for the regional subdomain
+3. TLS certificate configuration in Coder proxy/server
-Auto-Creates
+See the region-specific configurations in:
-- IAM Role
+- `infra/aws/us-east-2/route53/`
+- `infra/aws/us-west-2/route53/`
+- `infra/aws/us-west-2/acm/`
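+
+For reference, the same steps expressed with the AWS CLI (a sketch; the subdomain is an example, and the hosted zone ID and record file are placeholders):
+
+```bash
+# Request a DNS-validated certificate for a regional subdomain
+aws acm request-certificate \
+  --domain-name us-west-2.coderdemo.io \
+  --subject-alternative-names "*.us-west-2.coderdemo.io" \
+  --validation-method DNS \
+  --region us-west-2
+
+# Point the subdomain at the regional load balancer (zone ID is hypothetical)
+aws route53 change-resource-record-sets \
+  --hosted-zone-id Z0000000000EXAMPLE \
+  --change-batch file://record.json
+```
+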
-##### AWS Load Balancer Controller
+---
-Auto-Creates
+## Configuration
-- IAM Role
+### Terraform Variables
-##### Coder External Provisioner
+Each deployment requires a `terraform.tfvars` file (gitignored for security). Key variables include:
-Auto-Creates
+#### EKS Variables
-- IAM Role
+```hcl
+cluster_name = "coderdemo"
+cluster_region = "us-east-2"
+cluster_profile = "your-aws-profile"
+```
-### Creating the Infrastructure (on AWS)
+#### Coder Variables
-To deploy the base infrastructure, you can get started with referencing our [modules directory](./modules).
+```hcl
+coder_access_url = "https://coderdemo.io"
+coder_wildcard_access_url = "*.coderdemo.io"
+addon_version = "2.27.1" # Coder version
+```
-If you don't have an existing network infrastructure, then you can start with deploying the [`eks-vpc` module](./modules/network/eks-vpc).
+#### Database (us-east-2 only)
-Additionally, if you don't have an existing cluster infrastructure, then you can start with deploying the [`eks-cluster` module](./modules/compute/cluster).
+```hcl
+coder_db_secret_url = "postgres://user:pass@host:5432/coder?sslmode=require"
+```
-Lastly, for Coder's backend database, you can refer to our deployment in [`./aidev/infra/aws/us-east-2/rds`](./aidev/infra/aws/us-east-2/rds) to see how to deploy it.
+#### Authentication
-We just an [`aws_db_instance`](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/db_instance) that uses Postgres.
+```hcl
+# GitHub OAuth
+coder_oauth_secret_client_id = "your-github-oauth-client-id"
+coder_oauth_secret_client_secret = "your-github-oauth-secret"
-Refer to the example below to see how this would look like put together:
+# GitHub External Auth (for workspace git operations)
+coder_github_external_auth_secret_client_id = "your-github-app-id"
+coder_github_external_auth_secret_client_secret = "your-github-app-secret"
+```
-```terraform
+#### SSL/TLS Configuration
-terraform {
- required_version = ">= 1.0"
- required_providers {
- aws = {
- source = "hashicorp/aws"
- version = ">= 5.100.0"
- }
- }
-}
+```hcl
+# Using AWS ACM (recommended)
+kubernetes_create_ssl_secret = false
+kubernetes_ssl_secret_name = "coder-tls"
+acme_registration_email = "admin@coderdemo.io"
+```
-variable "name" {
- description = "The resource name."
- type = string
-}
+### Backend Configuration
-variable "region" {
- description = "The aws region to deploy eks cluster"
- type = string
-}
+Each region uses S3 for Terraform state. Create a `backend.hcl` file:
-variable "cluster_version" {
- description = "The EKS Version"
- type = string
-}
-
-variable "cluster_instance_type" {
- description = "EKS Instance Size/Type."
- default = "t3.xlarge"
- type = string
-}
-
-variable "coder_ws_volume_size" {
- description = "Coder Workspace K8s Node Volume Size."
- default = 50
- type = number
-}
-
-variable "coder_ws_instance_type" {
- description = "Coder Workspace K8s Node Instance Size/Type."
- default = "t3.xlarge"
- type = string
-}
-
-variable "network_cidr_block" {
- description = "VPC CIDR Block"
- type = string
- default = "10.0.0.0/16"
-}
-
-variable "db_instance_class" {
- description = "RDS DB Instance Class"
- type = string
- default = "db.m5.large"
-}
-
-variable "db_allocated_storage" {
- description = "RDS DB Allocated Storage Amount"
- type = string
- default = "40"
-}
-
-variable "db_master_username" {
- description = "RDS DB Master Username"
- type = string
- sensitive = true
-}
-
-variable "db_master_password" {
- description = "RDS DB Master Password"
- type = string
- sensitive = true
-}
-
-module "eks-network" {
- source = "../../../../modules/network/eks-vpc"
-
- name = var.name
- vpc_cidr_block = var.network_cidr_block
- public_subnets = {
- # System subnets requiring public access (e.g. NAT Gateways, Load Balancers, IGW, etc.)
- "system0" = {
- cidr_block = "10.0.10.0/24"
- availability_zone = "${data.aws_region.this.name}a"
- map_public_ip_on_launch = true
- private_dns_hostname_type_on_launch = "ip-name"
- }
- "system1" = {
- cidr_block = "10.0.11.0/24"
- availability_zone = "${data.aws_region.this.name}b"
- map_public_ip_on_launch = true
- private_dns_hostname_type_on_launch = "ip-name"
- }
- }
- private_subnets = {
- # System subnets that don't need to be exposed publically (e.g. K8s Worker Nodes, Database, etc.)
- "system0" = {
- cidr_block = "10.0.20.0/24"
- availability_zone = "${data.aws_region.this.name}a"
- private_dns_hostname_type_on_launch = "ip-name"
- tags = local.system_subnet_tags
- }
- "system1" = {
- cidr_block = "10.0.21.0/24"
- availability_zone = "${data.aws_region.this.name}b"
- private_dns_hostname_type_on_launch = "ip-name"
- tags = local.system_subnet_tags
- }
- "provisioner" = {
- cidr_block = "10.0.22.0/24"
- availability_zone = "${data.aws_region.this.name}a"
- map_public_ip_on_launch = true
- private_dns_hostname_type_on_launch = "ip-name"
- tags = local.provisioner_subnet_tags
- }
- "ws-all" = {
- cidr_block = "10.0.16.0/22"
- availability_zone = "${data.aws_region.this.name}b"
- map_public_ip_on_launch = true
- private_dns_hostname_type_on_launch = "ip-name"
- tags = local.ws_all_subnet_tags
- }
- }
-}
-
-data "aws_iam_policy_document" "sts" {
- statement {
- effect = "Allow"
- actions = ["sts:*"]
- resources = ["*"]
- }
-}
-
-resource "aws_iam_policy" "sts" {
- name_prefix = "sts"
- path = "/"
- description = "Assume Role Policy"
- policy = data.aws_iam_policy_document.sts.json
-}
-
-module "eks-cluster" {
- source = "../../../../modules/compute/cluster"
-
- vpc_id = module.eks-network.vpc_id
- cluster_public_subnet_ids = module.eks-network.public_subnet_ids
- cluster_private_subnet_ids = module.eks-network.private_subnet_ids
- cluster_intra_subnet_ids = module.eks-network.intra_subnet_ids
- cluster_instance_type = var.cluster_instance_type
-
- cluster_name = var.name
- cluster_version = var.cluster_version
- cluster_asg_additional_policies = {
- AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
- STSAssumeRole = aws_iam_policy.sts.arn
- }
- cluster_node_security_group_tags = merge(
- local.system_sg_tags,
- merge(local.provisioner_sg_tags, local.ws_all_sg_tags)
- )
- cluster_asg_node_labels = local.cluster_asg_node_labels
- cluster_addons = {
- coredns = {
- most_recent = true
- }
- kube-proxy = {
- most_recent = true
- }
- vpc-cni = {
- most_recent = true
- }
- }
-
- karpenter_controller_policy_statements = [{
- effect = "Allow",
- actions = toset(["iam:PassRole"]),
- resources = toset(["*"]),
- }]
-
- karpenter_node_role_policies = {
- AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
- STSAssumeRole = aws_iam_policy.sts.arn
- }
-
- coder_ws_instance_type = var.coder_ws_instance_type
- coder_ws_volume_size = var.coder_ws_volume_size
-}
-
-###
-# Only deploy the database if you're creating the central Coder infrastructure.
-# Otherwise, if you're deploying separate clusters for Coder proxies + provisioners in a different network, then there's no need for another database.
-###
-
-resource "aws_db_subnet_group" "db_subnet_group" {
- name = "${var.name}-db-subnet-group"
- subnet_ids = module.eks-network.private_subnet_ids
-
- tags = {
- Name = "${var.name}-db-subnet-group"
- }
-}
-
-resource "aws_db_instance" "db" {
- identifier = "${var.name}-db"
- instance_class = var.instance_class
- allocated_storage = var.allocated_storage
- engine = "postgres"
- engine_version = "15.12"
- username = var.master_username
- password = var.master_password
- db_name = "coder"
- db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
- vpc_security_group_ids = [ aws_security_group.postgres.id ]
- publicly_accessible = false
- skip_final_snapshot = false
-
- tags = {
- Name = "${var.name}-rds-db"
- }
- lifecycle {
- ignore_changes = [
- snapshot_identifier
- ]
- }
-}
-
-resource "aws_vpc_security_group_ingress_rule" "postgres" {
- security_group_id = aws_security_group.postgres.id
- cidr_ipv4 = var.network_cidr_block
- ip_protocol = "tcp"
- from_port = 5432
- to_port = 5432
-}
-
-resource "aws_vpc_security_group_egress_rule" "all" {
- security_group_id = aws_security_group.postgres.id
- cidr_ipv4 = "0.0.0.0/0"
- ip_protocol = -1
-}
-
-resource "aws_security_group" "postgres" {
- vpc_id = module.eks-network.vpc_id
- name = "${var.name}-postgres"
- description = "Security Group for Postgres traffic"
- tags = {
- Name = "${var.name}-postgres"
- }
-}
+```hcl
+bucket = "your-terraform-state-bucket"
+key = "path/to/state/terraform.tfstate"
+region = "us-east-2"
+dynamodb_table = "your-terraform-locks-table"
+encrypt = true
+profile = "your-aws-profile"
```
-The deployment may take a while (~20 minutes or more). In the meantime, you can then get started with creating other dependencies.
+---
+
+## Multi-Region Architecture Details
+
+### Database Strategy
+
+This deployment uses a **centralized database** approach:
+
+- Aurora Serverless v2 PostgreSQL in us-east-2
+- All regions connect to the same database over VPC peering
+- Benefits: Simplified data consistency, no replication complexity
+- Trade-offs: All regions depend on us-east-2 availability
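+
+A quick way to verify connectivity from a spoke cluster (a sketch; the credentials and Aurora endpoint are placeholders):
+
+```bash
+# Run a throwaway psql pod inside the spoke EKS cluster
+kubectl run psql-test --rm -it --image=postgres:15 -- \
+  psql "postgres://coder:<password>@<aurora-endpoint>:5432/coder?sslmode=require" -c 'SELECT 1;'
+```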
+
+For production high-availability requirements, consider:
+
+- Aurora Global Database for multi-region read replicas
+- Active-active deployments with database replication
+- Regional database failover strategies
+
+See [Multi-Region Deployment Guide](./docs/MULTI_REGION_DEPLOYMENT.md) for more details.
+
+### Workspace Proxy Strategy
+
+Workspace proxies provide:
+
+- **Low-latency connections** to workspaces in remote regions
+- **Reduced bandwidth costs** by keeping traffic regional
+- **Improved user experience** for global teams
+
+Each proxy:
+
+1. Registers with the primary Coder server (us-east-2)
+2. Receives a session token for authentication
+3. Proxies workspace connections without database access
+4. Can run workspace provisioners locally
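+
+Registration is driven from the primary server. A sketch using the Coder CLI (the proxy name is illustrative, and the token plumbing depends on your Helm values):
+
+```bash
+# Against the primary server (us-east-2), create the proxy and note the session token it prints
+coder wsproxy create --name us-west-2 --display-name "US West (Oregon)"
+
+# Supply that token to the proxy deployment, e.g. via a Kubernetes secret
+kubectl -n coder-proxy create secret generic coder-proxy-token \
+  --from-literal=token="<session-token>"   # placeholder
+```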
+
+### Network Architecture
+
+- **VPC Peering**: Spoke regions peer with hub region for database access
+- **NAT Strategy**: Cost-optimized fck-nat for outbound internet access
+- **Load Balancers**: NLB for Coder, ALB for other services
+- **DNS**: Regional subdomains route to closest workspace proxy
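+
+Cross-region database access rides on VPC peering from each spoke to the hub. A sketch with the AWS CLI (the VPC and peering IDs are placeholders):
+
+```bash
+# From the spoke region, request a peering connection to the hub VPC
+aws ec2 create-vpc-peering-connection \
+  --vpc-id vpc-0spoke0000000000 \
+  --peer-vpc-id vpc-0hub000000000000 \
+  --peer-region us-east-2 \
+  --region us-west-2
+
+# Accept it from the hub side, then add routes to both VPCs' route tables
+aws ec2 accept-vpc-peering-connection \
+  --vpc-peering-connection-id pcx-00000000example \
+  --region us-east-2
+```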
+
+---
+
+## Monitoring and Observability
+
+> [!NOTE]
+> Observability stack configuration is in progress.
+
+Planned integrations:
+
+- Prometheus for metrics collection
+- Grafana for visualization
+- CloudWatch for AWS resource monitoring
+- Coder built-in metrics and health endpoints
+
+---
+
+## Security Considerations
+
+### Secrets Management
+
+- **Database credentials**: Stored in terraform.tfvars (gitignored)
+- **OAuth credentials**: Stored in terraform.tfvars (gitignored)
+- **TLS certificates**: Managed by AWS ACM
+- **Kubernetes secrets**: Created by Terraform, stored in etcd
+
+For production, consider:
-### Deploying Required Apps
+- AWS Secrets Manager for credential rotation
+- External Secrets Operator for Kubernetes
+- HashiCorp Vault for centralized secret management
-Once the K8s (and maybe the Database) infrastructure is deployed, the next step is to deploy the K8s apps.
+### Network Security
-Before getting to Coder, we should first deploy:
+- Private subnets for all compute resources
+- Security groups restricting traffic between tiers
+- VPC peering for controlled cross-region access
+- TLS encryption for all external endpoints
-- [`AWS Load Balancer Controller`](https://github.com/kubernetes-sigs/aws-load-balancer-controller)
-- [`AWS EBS Controller`](https://github.com/kubernetes-sigs/aws-ebs-csi-driver)
-- [`K8s Metrics Server`](github.com/kubernetes-sigs/metrics-server)
-- [`Karpenter`](https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/#4-install-karpenter)
-- [`Cert-Manager`](https://cert-manager.io/docs/installation/helm/)
+### IAM Best Practices
-Afterwards, you can then deploy
+- IRSA (IAM Roles for Service Accounts) for pod-level permissions
+- Least privilege principle for all IAM policies
+- No long-lived credentials in pods
+- Regular IAM policy audits
+
+---
+
+## Cost Optimization
+
+Key strategies used in this deployment:
+
+1. **Karpenter Autoscaling**: Scales nodes to zero when workspaces are idle
+2. **Aurora Serverless v2**: Scales database capacity based on load
+3. **fck-nat**: Open-source NAT solution (90% cheaper than AWS NAT Gateway)
+4. **Spot Instances**: Karpenter uses spot for workspace nodes where appropriate
+5. **Regional Resources**: Only deploy proxies in regions with active users
+
+Estimated monthly costs:
+
+- Hub region (us-east-2): $200-400/month base + per-workspace costs
+- Spoke regions: $100-200/month base + per-workspace costs
+
+See [Infrastructure Best Practices](./docs/INFRASTRUCTURE_BEST_PRACTICES.md) for detailed cost analysis.
+
+---
-- [`Coder Server`](https://artifacthub.io/packages/helm/coder-v2/coder)
-- [`Coder Proxy` (uses same chart as the Coder Server)](https://artifacthub.io/packages/helm/coder-v2/coder)
-- [`Coder Workspace`](https://artifacthub.io/packages/helm/coder-v2/coder-provisioner)
+## Troubleshooting
-You can deploy the above manually yourself following your own preferred methods.
+### Common Issues
-Otherwise, you can leverage our K8s app TF modules to automatically generate the manifests:
+**EKS cluster creation fails**
-#### [`lb-controller`](./modules/k8s/apps/lb-controller)
+- Verify IAM permissions for EKS and VPC operations
+- Check VPC CIDR doesn't conflict with existing networks
+- Ensure sufficient EIPs available in the region
-#### [`ebs-controller`](./modules/k8s/apps/ebs-controller)
+**Karpenter not scaling nodes**
-#### [`metrics-server`](./modules/k8s/apps/metrics-server)
+- Verify Karpenter controller has IRSA permissions
+- Check NodePool configurations in `k8s/karpenter/`
+- Review Karpenter logs: `kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter`
-#### [`karpenter`](./modules/k8s/apps/karpenter)
+**Coder proxy not connecting**
-#### [`cert-manager`](./modules/k8s/apps/cert-manager)
+- Verify proxy token is correctly configured
+- Check network connectivity from proxy to primary server
+- Review NLB health checks and target group status
-#### [`coder-server`](./modules/k8s/apps/coder-server)
+**Database connection failures**
-#### [`coder-proxy`](./modules/k8s/apps/coder-proxy)
+- Verify security group allows traffic from EKS nodes
+- Check VPC peering routes are configured
+- Confirm database URL includes `?sslmode=require`
-#### [`coder-ws`](./modules/k8s/apps/coder-ws)
+### Useful Commands
-## How-It-Works
+```bash
+# Check EKS cluster status
+aws eks describe-cluster --name coderdemo --region us-east-2
->
+# Get kubeconfig
+aws eks update-kubeconfig --name coderdemo --region us-east-2
-### Coder Tasks
+# View Karpenter logs
+kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter -f
+
+# Check Coder server logs
+kubectl logs -n coder -l app.kubernetes.io/name=coder -f
+
+# List all Karpenter nodes
+kubectl get nodes -l karpenter.sh/initialized=true
+
+# Check workspace proxy status
+kubectl get pods -n coder-proxy
+```
+
+---
+
+## Contributing
+
+This repository represents a production demo environment. For general Coder questions or contributions, please visit:
+
+- [Coder GitHub](https://github.com/coder/coder)
+- [Coder Documentation](https://coder.com/docs)
+- [Coder Community Discord](https://coder.com/chat)
+
+---
+
+## License
+
+This infrastructure code is provided as-is for reference purposes. Refer to individual component licenses:
+
+- [Coder License](https://github.com/coder/coder/blob/main/LICENSE)
+- [Terraform License](https://github.com/hashicorp/terraform/blob/main/LICENSE)
+- [AWS Provider License](https://github.com/hashicorp/terraform-provider-aws/blob/main/LICENSE)
+
+---
+
+## Additional Resources
+
+- [Coder Documentation](https://coder.com/docs)
+- [Coder Template Examples](https://github.com/coder/coder/tree/main/examples/templates)
+- [EKS Best Practices Guide](https://aws.github.io/aws-eks-best-practices/)
+- [Karpenter Documentation](https://karpenter.sh/docs/)
+- [Multi-Region Deployment Guide](./docs/MULTI_REGION_DEPLOYMENT.md)
+- [Infrastructure Best Practices](./docs/INFRASTRUCTURE_BEST_PRACTICES.md)
+
+---
->
+**Built with ❤️ by the Coder team**
diff --git a/docs/ARCHITECTURE_DIAGRAM.md b/docs/ARCHITECTURE_DIAGRAM.md
new file mode 100644
index 0000000..864f173
--- /dev/null
+++ b/docs/ARCHITECTURE_DIAGRAM.md
@@ -0,0 +1,814 @@
+# Coder Demo Environment Architecture Diagram
+
+This document provides a comprehensive visual representation of the **coderdemo.io** infrastructure architecture.
+
+---
+
+## Table of Contents
+
+1. [Overview Diagram](#overview-diagram)
+2. [Component Details](#component-details)
+3. [Traffic Flow](#traffic-flow)
+4. [Key Architecture Decisions](#key-architecture-decisions)
+
+---
+
+## Overview Diagram
+
+```
+INTERNET / USERS
+   │
+   │ HTTPS
+   ▼
+AWS ROUTE 53 (coderdemo.io)
+   ├── LATENCY-BASED ROUTING (Automatic)
+   │     • coderdemo.io   → Nearest region (health check monitored)
+   │     • *.coderdemo.io → Workspace apps (latency-routed)
+   └── REGION-SPECIFIC ROUTING (Manual Override)
+         • us-east-2.coderdemo.io   → Force Ohio region
+         • us-west-2.coderdemo.io   → Force Oregon region
+         • *.us-east-2.coderdemo.io → Ohio workspace apps
+         • *.us-west-2.coderdemo.io → Oregon workspace apps
+
+           │                                  │
+           ▼                                  ▼
+   US-EAST-2 (Ohio)                  US-WEST-2 (Oregon)
+   PRIMARY REGION                    SECONDARY REGION
+
+US-EAST-2 REGION (PRIMARY)
+│
+├── NETWORK LOAD BALANCER (NLB)
+│     • TLS Termination (ACM Certificate)
+│     • Static IP Addresses (per AZ)
+│     • Layer 4 (TCP) - Low latency
+│     • Source IP Preservation
+│     • HTTPS:443 → HTTP:8080 (backend)
+│
+├── VPC (10.0.0.0/16)
+│   ├── PUBLIC SUBNETS (system0, system1)
+│   │     • Internet Gateway (IGW)
+│   │     • NAT Gateway (fck-nat - cost optimized)
+│   │     • Network Load Balancers
+│   │     • Multi-AZ (us-east-2a, us-east-2b)
+│   └── PRIVATE SUBNETS
+│       ├── SYSTEM SUBNETS (system0, system1)
+│       │     • EKS Control Plane
+│       │     • EKS Managed Node Groups
+│       │     • Graviton ARM instances (t4g.xlarge)
+│       │     • ON_DEMAND capacity (stable)
+│       ├── PROVISIONER SUBNET
+│       │     • Coder External Provisioner pods
+│       │     • Workspace orchestration
+│       ├── WORKSPACE SUBNET (ws-all)
+│       │     • Coder Workspace pods
+│       │     • Karpenter auto-scaled nodes
+│       │     • User development environments
+│       ├── RDS SUBNET (Database)
+│       │     • Aurora PostgreSQL 15.8 (Serverless v2)
+│       │     • Auto-scaling: 0.5-16 ACU (1-32 GB RAM)
+│       │     • Multi-AZ: Writer + Reader instances
+│       │     • Private only (no public access)
+│       │     • Shared across regions
+│       └── VPC ENDPOINTS (Cost Optimization)
+│             • S3 Gateway Endpoint
+│             • ECR API Interface Endpoint
+│             • ECR DKR Interface Endpoint
+│             • Reduces NAT Gateway data transfer costs
+│
+└── EKS CLUSTER (Kubernetes 1.x)
+    ├── CODER NAMESPACE
+    │     • Coder Server (Deployment)
+    │         - CODER_TLS_ENABLE = false (NLB handles TLS)
+    │         - CODER_SECURE_AUTH_COOKIE = true
+    │         - CODER_REDIRECT_TO_ACCESS_URL = false
+    │         - GitHub OAuth integration
+    │         - PostgreSQL RDS connection
+    │     • Service Type: LoadBalancer (creates NLB)
+    │     • ACM Certificate for TLS termination
+    ├── CODER-WS NAMESPACE (Workspaces)
+    │     • Coder External Provisioner (Deployment)
+    │     • Workspace pods (dynamically created)
+    │     • EBS volumes for persistent storage
+    │     • IRSA for AWS permissions
+    └── INFRASTRUCTURE SERVICES (kube-system, etc.)
+          • AWS Load Balancer Controller
+              - Creates and manages NLBs
+              - Service annotations for TLS termination
+          • Karpenter
+              - Auto-scaling for workspace nodes
+              - SQS queue + EventBridge
+              - Cost-optimized instance selection
+          • EBS CSI Driver
+              - Dynamic volume provisioning
+          • Cert-Manager
+              - Certificate management
+          • Metrics Server
+              - Resource metrics collection
+          • CoreDNS, kube-proxy, vpc-cni (EKS addons)
+
+US-WEST-2 REGION (SECONDARY)
+      • Similar architecture to us-east-2
+      • Infrastructure code exists (acm/, k8s/coder-server/, route53/)
+      • NOT YET DEPLOYED (pending deployment)
+      • Would share the same RDS database for unified accounts
+      • Independent EKS cluster with own NLB
+
+SECURITY LAYER
+      • IAM Roles (IRSA - IAM Roles for Service Accounts)
+          - Coder Server             → RDS access
+          - Coder Provisioner        → EC2/EKS permissions
+          - EBS Controller           → EBS volume management
+          - Load Balancer Controller → ELB management
+          - Karpenter                → EC2 instance launching
+      • Security Groups
+          - EKS cluster security group
+          - Node security group
+          - RDS security group (port 5432 from VPC CIDR)
+          - VPC endpoints security group (port 443)
+      • Network ACLs
+      • TLS Certificates (ACM)
+          - Auto-renewal enabled
+          - Dynamically fetched (not hardcoded)
+```
+
+---
+
+## Component Details
+
+### DNS Layer (Route 53)
+
+**Hosted Zone:** `coderdemo.io`
+
+**Routing Policies:**
+
+1. **Latency-Based Routing (Primary)**
+ - Automatically routes users to the nearest AWS region
+ - Health checks monitor regional availability
+ - Automatic failover if a region becomes unhealthy
+ - Records: `coderdemo.io` and `*.coderdemo.io`
+
+2. **Region-Specific Routing (Manual Override)**
+ - Allows explicit region selection
+ - Useful for demos, testing, and regional preferences
+ - Records:
+ - `us-east-2.coderdemo.io` (Ohio)
+ - `us-west-2.coderdemo.io` (Oregon)
+ - Wildcards for workspace apps
+
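+Behind the latency records, the per-region health checks are ordinary Route53 HTTPS checks. A minimal sketch, assuming a regional subdomain and Coder's `/healthz` path (both illustrative here, not copied from the repo):
+
+```hcl
+# Sketch only: one health check per region, referenced by the latency records.
+resource "aws_route53_health_check" "coder" {
+  fqdn              = "us-east-2.coderdemo.io" # assumed regional endpoint
+  port              = 443
+  type              = "HTTPS"
+  resource_path     = "/healthz" # assumed health endpoint
+  failure_threshold = 3
+  request_interval  = 30
+}
+```
+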
+### Network Architecture
+
+**VPC Configuration:**
+
+- CIDR Block: `10.0.0.0/16`
+- Multi-AZ deployment (2 availability zones per region)
+
+**Subnet Types:**
+
+1. **Public Subnets** (`system0`, `system1`)
+ - Internet Gateway for outbound internet access
+ - NAT Gateway (fck-nat for cost optimization)
+ - Network Load Balancers
+ - CIDR: `10.0.10.0/24`, `10.0.11.0/24`
+
+2. **Private Subnets**
+ - **System Subnets** (`system0`, `system1`)
+ - EKS managed node groups
+ - Core infrastructure services
+ - CIDR: `10.0.20.0/24`, `10.0.21.0/24`
+
+ - **Provisioner Subnet**
+ - Coder external provisioner pods
+ - Workspace orchestration
+ - CIDR: `10.0.22.0/24`
+
+ - **Workspace Subnet** (`ws-all`)
+ - User workspace pods
+ - Karpenter-managed nodes
+ - CIDR: `10.0.16.0/22` (larger range for scalability)
+
+ - **RDS Subnet**
+ - PostgreSQL database
+ - Multi-AZ for high availability
+ - No public access
+
+**VPC Endpoints (Cost Optimization):**
+
+- S3 Gateway Endpoint
+- ECR API Interface Endpoint
+- ECR DKR Interface Endpoint
+- Reduces NAT Gateway data transfer costs
+
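+Each endpoint is a small Terraform resource; the S3 Gateway endpoint, for instance, attaches to the private route tables so image and state traffic bypasses NAT. A sketch with placeholder variables:
+
+```hcl
+# Sketch: S3 Gateway endpoint; vpc_id and route_table_ids are placeholders.
+resource "aws_vpc_endpoint" "s3" {
+  vpc_id            = var.vpc_id
+  service_name      = "com.amazonaws.us-east-2.s3"
+  vpc_endpoint_type = "Gateway"
+  route_table_ids   = var.private_route_table_ids
+}
+```
+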
+### Load Balancing
+
+**Network Load Balancer (NLB):**
+
+- **Type:** Layer 4 (TCP/TLS)
+- **TLS Termination:** Yes (via ACM certificates)
+- **Benefits:**
+ - Low latency for WebSocket connections
+ - Source IP preservation for audit logs
+ - Static IP addresses per availability zone
+ - Better for long-lived connections
+- **Configuration:**
+  - Listener: HTTPS:443 → HTTP:8080 (Coder backend)
+ - Health checks enabled
+ - Cross-zone load balancing enabled
+
+### Compute Layer
+
+**EKS Cluster:**
+
+- Kubernetes version: Latest stable
+- Control plane: Fully managed by AWS
+- Public and private endpoint access enabled
+
+**Node Groups:**
+
+1. **System Managed Node Group**
+ - Instance type: `t4g.xlarge` (Graviton ARM)
+ - Capacity: ON_DEMAND (stable, no interruptions)
+ - Auto-scaling: 0-10 nodes
+ - Volume: 20GB gp3 (cost-optimized)
+ - Purpose: Core Kubernetes services
+
+2. **Workspace Nodes (Karpenter-managed)**
+ - Dynamic provisioning based on workspace requirements
+ - Cost-optimized instance selection
+ - Automatic scaling and termination
+ - Spot instances supported for cost savings
+
+**Karpenter Configuration:**
+
+- SQS queue for event handling
+- EventBridge for EC2 spot interruption notifications
+- IAM role for instance launching
+- Custom node classes for different workspace types
+
+### Storage Layer
+
+**Aurora Serverless v2 (PostgreSQL):**
+
+- Engine: Aurora PostgreSQL 15.8
+- Instance class: `db.serverless` (auto-scaling)
+- Scaling: 0.5-16 ACU (Coder), 0.5-8 ACU (LiteLLM)
+- Multi-AZ: Writer + Reader instances
+- Encryption: At rest and in transit
+- Backup: Automated daily backups (7-day retention)
+- Access: Private only (from VPC CIDR)
+- Cost: Pay-per-ACU-hour (~$9-$400/month depending on load)
+
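+In Terraform, the serverless behavior reduces to the `db.serverless` instance class plus a scaling window. A minimal sketch of the pattern, not the repo's actual `rds/main.tf` (identifiers and credential handling are illustrative):
+
+```hcl
+# Sketch: Aurora Serverless v2 cluster with an ACU scaling window.
+resource "aws_rds_cluster" "coder" {
+  cluster_identifier = "coder"
+  engine             = "aurora-postgresql"
+  engine_version     = "15.8"
+  master_username    = var.db_username
+  master_password    = var.db_password
+  storage_encrypted  = true
+
+  serverlessv2_scaling_configuration {
+    min_capacity = 0.5 # ~$9/month floor while idle
+    max_capacity = 16  # 16 ACU ~= 32 GB RAM at peak
+  }
+}
+
+# Instances in a Serverless v2 cluster use the special db.serverless class.
+resource "aws_rds_cluster_instance" "writer" {
+  cluster_identifier = aws_rds_cluster.coder.id
+  instance_class     = "db.serverless"
+  engine             = aws_rds_cluster.coder.engine
+  engine_version     = aws_rds_cluster.coder.engine_version
+}
+```
+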
+**Amazon EBS:**
+
+- CSI Driver: Installed via Helm
+- Volume type: gp3 (general purpose SSD)
+- Dynamic provisioning for workspace persistent storage
+- Encryption: Enabled
+
+### Kubernetes Services
+
+**Core Services:**
+
+1. **Coder Server** (Namespace: `coder`)
+ - Deployment with multiple replicas
+ - Service type: LoadBalancer (creates NLB)
+ - Environment variables:
+ - `CODER_TLS_ENABLE=false` (NLB handles TLS)
+ - `CODER_SECURE_AUTH_COOKIE=true`
+ - `CODER_REDIRECT_TO_ACCESS_URL=false`
+ - Connected to PostgreSQL RDS
+ - GitHub OAuth integration
+
+2. **Coder External Provisioner** (Namespace: `coder-ws`)
+ - Manages workspace lifecycle
+ - Creates and destroys workspace pods
+ - IRSA for AWS permissions
+
+3. **AWS Load Balancer Controller**
+ - Reconciles Kubernetes Service resources
+ - Creates and manages NLBs
+ - Handles TLS certificate attachment
+ - Service annotations for configuration
+
+4. **Karpenter**
+ - Node auto-scaling
+ - Instance type selection
+ - Spot instance management
+ - Cost optimization
+
+5. **EBS CSI Driver**
+ - Dynamic volume provisioning
+ - Volume snapshots
+ - Volume resizing
+
+6. **Cert-Manager**
+ - SSL/TLS certificate management
+ - Automatic renewal
+ - Integration with Let's Encrypt or ACM
+
+7. **Metrics Server**
+ - Resource metrics collection
+ - HPA (Horizontal Pod Autoscaler) support
+
+**EKS Addons:**
+
+- CoreDNS (DNS resolution)
+- kube-proxy (network proxy)
+- vpc-cni (VPC networking)
+
+### Security
+
+**IAM Roles (IRSA):**
+
+- Coder Server: RDS access, Secrets Manager
+- Coder Provisioner: EC2, EKS permissions
+- EBS Controller: EBS volume operations
+- Load Balancer Controller: ELB operations
+- Karpenter: EC2 instance launching
+
+**Security Groups:**
+
+- EKS cluster security group
+- Node security group
+- RDS security group (port 5432 from VPC)
+- VPC endpoints security group (port 443)
+
+**TLS Certificates:**
+
+- Managed by ACM
+- Automatic renewal
+- Attached to NLB via Load Balancer Controller
+
+---
+
+## Traffic Flow
+
+### User Authentication Flow
+
+```
+User Browser
+     │
+     │ HTTPS
+     ▼
+Route 53 (coderdemo.io)
+     │
+     │ Latency-based routing
+     ▼
+Network Load Balancer (TLS termination)
+     │
+     │ HTTP:8080
+     ▼
+Coder Server Pod
+     │
+     ├── GitHub OAuth (authentication)
+     │
+     └── PostgreSQL RDS (user data)
+```
+
+### Workspace Creation Flow
+
+```
+User (via Coder UI)
+     │
+     ▼
+Coder Server
+     │
+     │ Creates workspace resource
+     ▼
+Coder External Provisioner
+     │
+     ├── Checks node capacity
+     │
+     ├── Karpenter provisions new node (if needed)
+     │     │
+     │     └── EC2 API (launches instance)
+     │
+     ├── Schedules workspace pod on node
+     │
+     ├── EBS CSI creates persistent volume
+     │
+     ├── Workspace pod starts
+     │
+     └── User can access workspace
+```
+
+### Workspace Application Access Flow
+
+```
+User Browser
+     │
+     │ HTTPS (workspace-123.coderdemo.io)
+     ▼
+Route 53 (*.coderdemo.io wildcard)
+     │
+     │ Latency-based routing
+     ▼
+Network Load Balancer
+     │
+     │ HTTP
+     ▼
+Coder Server (proxy)
+     │
+     │ Proxies to workspace
+     ▼
+Workspace Pod (port 8000, 3000, etc.)
+```
+
+---
+
+## Key Architecture Decisions
+
+### 1. Network Load Balancer (NLB) over Application Load Balancer (ALB)
+
+**Why NLB:**
+
+- **Lower latency:** Layer 4 (TCP) vs Layer 7 (HTTP)
+- **Source IP preservation:** Essential for Coder audit logs
+- **Static IPs:** Easier for enterprise firewall rules
+- **Long-lived connections:** Better for WebSocket connections (terminals, live updates)
+- **Cost efficiency:** Lower cost at high volume
+
+**TLS Termination at NLB:**
+
+- NLBs DO support TLS termination when configured with ACM certificates
+- Configured via AWS Load Balancer Controller service annotations
+- Traffic flow: User (HTTPS:443) → NLB (terminates TLS) → Coder (HTTP:8080)
+
+### 2. Multi-Region with Latency-Based Routing
+
+**Benefits:**
+
+- **Automatic performance optimization:** Users connect to nearest region
+- **Built-in failover:** Route53 health checks automatically remove unhealthy regions
+- **Manual override available:** Region-specific URLs for demos and testing
+- **Global reach:** Serves users worldwide with low latency
+
+**Implementation:**
+
+- Route53 latency routing policy
+- Health checks per region
+- Shared RDS database across regions (for unified accounts)
+
+### 3. Cost Optimizations
+
+**Implemented:**
+
+- **Graviton ARM instances:** t4g.xlarge (lower cost than x86)
+- **VPC Endpoints:** S3, ECR API/DKR (reduces NAT Gateway costs)
+- **fck-nat:** Custom NAT solution instead of AWS NAT Gateway
+- **Karpenter:** Right-sized workspace nodes, automatic termination
+- **gp3 volumes:** Better performance than gp2 at same cost
+- **Spot instances:** For workspace nodes (when interruption-tolerant)
+
+### 4. Security Best Practices
+
+**IRSA (IAM Roles for Service Accounts):**
+
+- No AWS credentials stored in Kubernetes secrets
+- Least-privilege access per service
+- Automatic credential rotation
+
+**Network Segmentation:**
+
+- Separate subnets for system, provisioner, and workspaces
+- RDS in private subnet with no public access
+- Security groups restrict traffic by source/destination
+
+**TLS Everywhere:**
+
+- ACM certificates with auto-renewal
+- TLS termination at load balancer
+- Secure cookies enabled
+
+### 5. Helm Chart Management
+
+**Decision: `upgrade_install = true`**
+
+- Idempotent Terraform applies
+- No "already exists" errors in CI/CD
+- Declarative version management
+- Re-added in Helm provider version 3.1.1
+
+### 6. Aurora Serverless v2 for Cost Optimization
+
+**Configuration:**
+
+- Engine: Aurora PostgreSQL 15.8 (Serverless v2)
+- Scaling: 0.5-16 ACU for Coder, 0.5-8 ACU for LiteLLM
+- Multi-AZ: Writer + Reader instances
+
+**Benefits:**
+
+- **Cost savings:** Scales down to 0.5 ACU (~$9/month) during idle periods
+- **Auto-scaling:** Automatically scales up to handle load (up to 16 ACU = 32 GB RAM)
+- **No manual intervention:** Seamless scaling based on demand
+- **Pay-per-use:** Only pay for ACU-hours consumed vs 24/7 provisioned instance
+
+**Trade-off:**
+
+- **Cold start delay:** 5-10 second initial response after idle period (>30 minutes)
+- **Acceptable for demo environment** where cost optimization outweighs instant response
+
+---
+
+## Known Behaviors (Demo Environment)
+
+This section documents expected behaviors in the demo environment that optimize for cost over instant response time.
+
+### 1. Aurora Serverless v2 Cold Start (5-10 seconds)
+
+**When it happens:**
+
+- After 30+ minutes of no database activity
+- First visitor after idle period
+
+**What you'll see:**
+
+- Site takes 5-10 seconds to load initially
+- Subsequent requests are instant (<100ms)
+- Aurora scales from 0.5 ACU → 1-2 ACU automatically
+
+**Why it's acceptable:**
+
+- Demo environment prioritizes cost savings
+- Saves ~$120/month vs provisioned RDS
+- No errors, just slower initial load
+- Perfect for sporadic demo usage
+
+**To eliminate (if needed):**
+
+- Increase `min_capacity = 2` in `infra/aws/us-east-2/rds/main.tf`
+- Trade-off: ~$35/month baseline vs $9/month
+
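+That change is a one-line edit to the cluster's scaling block, sketched here in isolation:
+
+```hcl
+serverlessv2_scaling_configuration {
+  min_capacity = 2  # keep 2 ACU warm: ~$35/month baseline instead of ~$9
+  max_capacity = 16
+}
+```
+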
+### 2. HTTP→HTTPS Redirect Delay ("Not Secure" Warning)
+
+**When it happens:**
+
+- User types `coderdemo.io` without `https://`
+- Browser tries HTTP:80 first (standard behavior)
+
+**What you'll see:**
+
+1. Browser shows "Connecting..." or spinning
+2. Brief "Site is not secure" warning (2-3 seconds)
+3. Warning disappears, site loads normally with HTTPS
+
+**Root cause:**
+
+- NLB only has port 443 (HTTPS) listener configured
+- No port 80 (HTTP) listener to redirect to HTTPS
+- NLBs don't support HTTP→HTTPS redirects (ALB feature only)
+- Browser timeout on port 80, then retries port 443
+
+**Why it's acceptable:**
+
+- Demo environment, not production
+- Site works perfectly once HTTPS connects
+- No security risk (just UX delay)
+- Users who bookmark or click links use HTTPS directly
+
+**Why HSTS is NOT configured:**
+
+HSTS (HTTP Strict Transport Security) headers would help eliminate the "not secure" warning by making browsers automatically use HTTPS after the first visit. However, **Coder's HSTS feature does not work when behind a reverse proxy.**
+
+**Investigation findings:**
+
+- Coder supports HSTS via `CODER_STRICT_TRANSPORT_SECURITY` environment variable
+- However, Coder only sends HSTS headers when it directly terminates TLS (`CODER_TLS_ENABLE=true`)
+- When behind an NLB/reverse proxy with `CODER_TLS_ENABLE=false`, Coder sees incoming HTTP traffic
+- Coder's help states: "This header should only be set if the server is accessed via HTTPS"
+- Since Coder doesn't detect it's behind an HTTPS proxy, it won't send HSTS headers
+
+**Workaround not possible without:**
+
+- Switching to ALB (which can do HTTP→HTTPS redirect at load balancer level)
+- Having Coder terminate TLS directly (loses NLB benefits)
+- Waiting for Coder to add reverse-proxy awareness for HSTS feature
+- Using CloudFront in front of NLB for HTTP→HTTPS redirect
+
+**Alternative mitigation options:**
+
+- Option A: Add CloudFront with HTTP→HTTPS redirect (adds complexity and cost)
+- Option B: Switch to ALB (loses NLB benefits: lower latency, source IP preservation)
+- Option C: Configure port 80 forwarding in Coder service (complex, not standard)
+- Option D: Accept current behavior (recommended for demo environment)
+
+### Summary of Expected Load Times
+
+| Scenario | Load Time | Behavior |
+| ------------------------- | --------------- | -------------------------------------------------- |
+| **First visit (HTTP)** | 7-13 seconds | HTTP:80 timeout (2-3s) + Aurora cold start (5-10s) |
+| **First visit (HTTPS)** | 5-10 seconds | Aurora cold start only |
+| **Return visit (HTTP)** | 7-13 seconds | HTTP:80 timeout (2-3s) + Aurora cold start (5-10s) |
+| **After warm-up (HTTPS)** | <100ms | Instant, everything cached |
+| **Bookmarked/HTTPS link** | <100ms or 5-10s | Instant if warm, cold start if idle |
+
+**Note:** Always share URLs as `https://coderdemo.io` to avoid the 2-3 second HTTP:80 timeout delay.
+
+---
+
+## Infrastructure as Code
+
+All infrastructure is managed via Terraform:
+
+**Directory Structure:**
+
+```
+infra/aws/
+├── us-east-2/                 # Primary region (deployed)
+│   ├── eks/                   # EKS cluster
+│   ├── rds/                   # PostgreSQL database
+│   ├── route53/               # DNS records
+│   └── k8s/                   # Kubernetes applications
+│       ├── coder-server/
+│       ├── karpenter/
+│       ├── lb-controller/
+│       └── ...
+├── us-west-2/                 # Secondary region (code exists, not deployed)
+│   ├── acm/
+│   ├── eks/
+│   ├── route53/
+│   └── k8s/
+└── eu-west-2/                 # Tertiary region (partial code)
+
+modules/
+├── compute/
+│   └── cluster/               # Reusable EKS cluster module
+├── network/
+│   └── eks-vpc/               # Reusable VPC module
+└── k8s/
+    └── bootstrap/             # Reusable K8s app modules
+
+**Terraform State:**
+
+- Stored in S3 backend
+- State locking via DynamoDB
+- Separate state files per region/component
+
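+A hypothetical `backend.hcl` for one component illustrates the layout (bucket and table names are placeholders, not the real ones):
+
+```hcl
+# Passed to `terraform init -backend-config=backend.hcl` per component.
+bucket         = "coderdemo-terraform-state"
+key            = "us-east-2/eks/terraform.tfstate"
+region         = "us-east-2"
+dynamodb_table = "coderdemo-terraform-locks"
+encrypt        = true
+```
+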
+---
+
+## Deployment Status
+
+### US-EAST-2 (Ohio) - PRIMARY
+
+✅ **DEPLOYED**
+
+- EKS cluster
+- RDS PostgreSQL
+- Route53 DNS records
+- All Kubernetes services
+- Coder server operational
+
+### US-WEST-2 (Oregon) - SECONDARY
+
+⏳ **PENDING DEPLOYMENT**
+
+- Infrastructure code exists
+- ACM certificates ready to deploy
+- Coder server configuration ready
+- Route53 DNS records ready
+- Needs deployment to become active
+
+### EU-WEST-2 (London) - TERTIARY
+
+🚧 **PARTIAL CODE**
+
+- Some infrastructure modules present
+- Not fully configured
+
+---
+
+## Monitoring and Observability
+
+**Currently Configured:**
+
+- Route53 health checks
+- EKS control plane logs
+- Kubernetes metrics server
+- Load balancer metrics (CloudWatch)
+
+**Recommended Additions:**
+
+- Prometheus for metrics collection
+- Grafana for visualization
+- AWS X-Ray for distributed tracing
+- CloudWatch Container Insights
+- Coder audit logs to CloudWatch/S3
+
+---
+
+## Disaster Recovery
+
+**Current Strategy:**
+
+- Multi-AZ RDS deployment (automatic failover)
+- Multi-region infrastructure code (can deploy us-west-2 rapidly)
+- Route53 health checks and automatic failover
+- Automated daily RDS backups
+
+**RTO/RPO:**
+
+- **RTO (Recovery Time Objective):** ~20 minutes (deploy us-west-2)
+- **RPO (Recovery Point Objective):** <1 minute (RDS Multi-AZ synchronous replication)
+
+---
+
+## Scaling Considerations
+
+**Horizontal Scaling:**
+
+- Coder server: Increase replica count in Helm values
+- Workspace nodes: Karpenter automatically scales based on demand
+- System nodes: Adjust EKS managed node group size
+
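+For example, the Coder replica count can be raised through the Helm release; a sketch, assuming the chart's `coder.replicaCount` value:
+
+```hcl
+resource "helm_release" "coder" {
+  name       = "coder"
+  namespace  = "coder"
+  repository = "https://helm.coder.com/v2"
+  chart      = "coder"
+
+  set {
+    name  = "coder.replicaCount"
+    value = "3" # scale the control plane horizontally
+  }
+}
+```
+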
+**Vertical Scaling:**
+
+- RDS: Change instance class (requires downtime or blue/green deployment)
+- Workspace resources: Update Coder template resource requests/limits
+- Node instance types: Modify Karpenter NodePool configuration
+
+**Regional Expansion:**
+
+- Deploy us-west-2 for West Coast users
+- Deploy eu-west-2 for European users
+- Consider VPC peering or Transit Gateway for inter-region communication
+
+---
+
+## Related Documentation
+
+- [Infrastructure Best Practices](./INFRASTRUCTURE_BEST_PRACTICES.md)
+- [README](../README.md)
+
+---
+
+## Changelog
+
+- **2025-11-26**:
+ - Updated to reflect Aurora Serverless v2 configuration
+ - Added "Known Behaviors" section documenting cold start and HTTP redirect behavior
+ - Investigated and documented why HSTS cannot be configured when Coder is behind reverse proxy
+  - Documented alternative mitigation options for HTTP→HTTPS redirect delay
+- **2025-11-25**: Initial architecture diagram created
+
+---
+
+## Questions or Feedback
+
+For technical questions about this architecture, contact the infrastructure team.
diff --git a/docs/INFRASTRUCTURE_BEST_PRACTICES.md b/docs/INFRASTRUCTURE_BEST_PRACTICES.md
new file mode 100644
index 0000000..2a80306
--- /dev/null
+++ b/docs/INFRASTRUCTURE_BEST_PRACTICES.md
@@ -0,0 +1,505 @@
+# Infrastructure Best Practices for Coder Deployment
+
+This document outlines the architectural decisions, best practices, and rationale behind the Coder infrastructure deployment on AWS EKS. Use this as a reference when discussing technical implementation with customers and prospects.
+
+---
+
+## Table of Contents
+
+1. [Load Balancer Architecture](#load-balancer-architecture)
+2. [DNS and Multi-Region Setup](#dns-and-multi-region-setup)
+3. [LiteLLM Integration Architecture](#litellm-integration-architecture)
+4. [Helm Chart Management](#helm-chart-management)
+5. [Security Considerations](#security-considerations)
+
+---
+
+## Load Balancer Architecture
+
+### Decision: Network Load Balancer (NLB) with TLS Termination
+
+**What We Did:**
+
+- Deployed NLB with TLS termination using ACM certificates
+- Configured `CODER_TLS_ENABLE = "false"` on Coder server
+- NLB terminates TLS and forwards plain HTTP to backend
+
+**Why This Approach:**
+
+#### NLB Advantages for Coder
+
+1. **Lower Latency** - Layer 4 (TCP) vs Layer 7 (HTTP)
+ - Less protocol overhead
+ - Direct connection forwarding
+ - Critical for long-lived WebSocket connections (terminals, live updates)
+
+2. **Source IP Preservation**
+ - NLB preserves client source IP addresses
+ - Essential for Coder's audit logs and security monitoring
+ - No need to parse `X-Forwarded-For` headers
+
+3. **Static IP Addresses**
+ - NLB provides static IPs per availability zone
+ - Easier for enterprise firewall rules and allowlists
+ - ALB uses dynamic IPs (requires DNS-based allowlisting)
+
+4. **Connection Handling**
+ - Better for long-lived persistent connections
+ - Coder workspaces maintain extended connections
+ - Lower overhead per connection
+
+5. **Cost Efficiency**
+ - NLB: $0.0225/hour + $0.006/GB processed
+ - ALB: $0.0225/hour + $0.008/GB processed + per-rule charges
+ - Lower cost at high volume
+
+#### TLS Termination at NLB
+
+**Common Misconception:**
+
+> "NLBs don't terminate TLS - they're Layer 4 pass-through only"
+
+**Reality:**
+NLBs **DO support TLS termination** when configured with ACM certificates via the AWS Load Balancer Controller.
+
+**Configuration:**
+
+```hcl
+service_annotations = {
+ "service.beta.kubernetes.io/aws-load-balancer-ssl-cert" = data.aws_acm_certificate.coder.arn
+ "service.beta.kubernetes.io/aws-load-balancer-ssl-ports" = "443"
+}
+```
+
+**Traffic Flow:**
+
+```
+User (HTTPS:443) → NLB (terminates TLS) → Coder Backend (HTTP:8080)
+```
+
+**Coder Configuration:**
+
+```hcl
+env_vars = {
+ CODER_REDIRECT_TO_ACCESS_URL = "false" # Prevent redirect loops
+ CODER_TLS_ENABLE = "false" # NLB handles TLS
+ CODER_SECURE_AUTH_COOKIE = "true" # Users connect via HTTPS
+}
+```
+
+**Official Documentation:**
+
+- [AWS: Create TLS Listener for NLB](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/create-tls-listener.html)
+- [AWS: NLB TLS Termination Announcement](https://aws.amazon.com/blogs/aws/new-tls-termination-for-network-load-balancers/)
+- [AWS Load Balancer Controller: NLB TLS Termination](https://kubernetes-sigs.github.io/aws-load-balancer-controller/latest/guide/use_cases/nlb_tls_termination/)
+
+#### When to Use ALB Instead
+
+Consider ALB only if you need:
+
+- Path-based routing (`/api` β service A, `/web` β service B)
+- Host-based routing (multiple domains to different backends)
+- HTTP-level features (redirects, header manipulation, authentication)
+- WAF (Web Application Firewall) integration
+- More detailed HTTP metrics
+
+**For Coder:** These features are not needed - it's a single application without complex routing requirements.
+
+---
+
+## DNS and Multi-Region Setup
+
+### Architecture Overview
+
+**Root Domain:** `coderdemo.io` (Route53 hosted zone)
+
+**DNS Records:**
+
+#### 1. Latency-Based Routing (Automatic)
+
+```
+coderdemo.io   → Routes to nearest region (us-east-2 or us-west-2)
+*.coderdemo.io → Wildcard for workspace apps (latency-routed)
+```
+
+**Configuration:**
+
+```hcl
+resource "aws_route53_record" "coder_latency" {
+ zone_id = var.hosted_zone_id
+ name = var.domain_name
+ type = "A"
+ set_identifier = var.set_identifier # e.g., "us-east-2"
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+
+ latency_routing_policy {
+ region = var.cluster_region
+ }
+
+ health_check_id = aws_route53_health_check.coder[0].id
+}
+```
+
+#### 2. Region-Specific Subdomains (Manual Selection)
+
+```
+us-east-2.coderdemo.io   → Force Ohio region
+us-west-2.coderdemo.io   → Force Oregon region
+*.us-east-2.coderdemo.io → Ohio workspace apps
+*.us-west-2.coderdemo.io → Oregon workspace apps
+```
+
+**Use Case:**
+An instructor on the East Coast can join a West Coast customer demo by using `us-west-2.coderdemo.io` instead of relying on latency-based routing.
+
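+The override records are the same alias records as the latency-based example, minus the routing policy and health check. A sketch:
+
+```hcl
+resource "aws_route53_record" "coder_regional" {
+  zone_id = var.hosted_zone_id
+  name    = "us-west-2.${var.domain_name}" # us-west-2.coderdemo.io
+  type    = "A"
+
+  alias {
+    name                   = local.nlb_hostname
+    zone_id                = data.aws_lb.coder_nlb.zone_id
+    evaluate_target_health = true
+  }
+}
+```
+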
+### Benefits
+
+1. **Automatic Failover**
+ - Route53 health checks monitor each region
+ - Unhealthy regions automatically removed from rotation
+ - Users transparently routed to healthy region
+
+2. **Performance Optimization**
+ - Users connect to geographically nearest region
+ - Lower latency for all interactions
+ - Better experience for global teams
+
+3. **Manual Override**
+ - Region-specific URLs allow explicit region selection
+ - Useful for demos, testing, or specific customer requirements
+ - No code changes needed - just use different URL
+
+### Multi-Region Coder Visibility
+
+**Current State:**
+
+- Only `us-east-2` appears in Coder's region dropdown
+- `us-west-2` infrastructure code exists but not deployed
+
+**For us-west-2 to Appear:**
+
+1. Deploy ACM certificates (`infra/aws/us-west-2/acm/`)
+2. Deploy Coder server (`infra/aws/us-west-2/k8s/coder-server/`)
+3. Deploy Route53 records (`infra/aws/us-west-2/route53/`)
+4. Ensure shared RDS database or database replication
+
+**Important:** Both regions must use the same database for unified user accounts and workspace state.
+
+---
+
+## LiteLLM Integration Architecture
+
+### Decision: Separate Service with Subdomain
+
+**Architecture:**
+
+```
+coderdemo.io     → Coder (latency-routed)
+llm.coderdemo.io → LiteLLM (separate NLB)
+```
+
+**Deployment:**
+
+- LiteLLM: Separate Kubernetes deployment with own NLB
+- Each Coder workspace namespace gets LiteLLM API keys via secret rotation
+- Keys automatically rotated from AWS Secrets Manager
+
+**Why This Approach:**
+
+#### Option 1: Separate Subdomain ✅ (Implemented)
+
+**Advantages:**
+
+- Keep NLB for both services (no ALB needed)
+- Clean separation of concerns
+- Independent scaling and monitoring
+- No path rewriting complexity
+
+#### Option 2: Path-Based Routing (Not Recommended)
+
+```
+coderdemo.io/     → Coder
+coderdemo.io/v1/* → LiteLLM
+```
+
+**Disadvantages:**
+
+- Requires switching to ALB
+- More complex configuration
+- Potential URL rewriting issues
+- No clear benefit for this use case
+
+#### Option 3: Internal Only (Alternative)
+
+**For Maximum Security:**
+
+- Don't expose LiteLLM externally at all
+- Coder communicates via internal Kubernetes service DNS
+- Only Coder → LiteLLM traffic allowed
+- No additional load balancer needed
+
+### Current Implementation
+
+**LiteLLM Service:** `infra/aws/us-east-2/k8s/litellm/main.tf`
+
+- 4 replicas with 2 CPU / 4Gi memory each
+- Own ACM certificate for TLS termination
+- Connected to PostgreSQL (RDS) and Redis
+- Automatic key generation and rotation
+
+**Workspace Integration:** `infra/aws/us-east-2/k8s/coder-ws/main.tf`
+
+```hcl
+module "default-ws-litellm-rotate-key" {
+ source = "../../../../../modules/k8s/bootstrap/litellm-rotate-key"
+ namespace = "coder-ws"
+ secret_id = var.aws_secret_id
+ secret_region = var.aws_secret_region
+}
+```
+
+**Key Rotation:**
+
+- Keys fetched from AWS Secrets Manager
+- Injected as Kubernetes secrets into workspace namespaces
+- Workspaces use keys to make LLM API calls through LiteLLM
+- Rotation happens automatically without workspace downtime
+
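+Mechanically, this amounts to reading the secret and mirroring it into each workspace namespace. A minimal sketch of what the `litellm-rotate-key` module plausibly does (resource and key names are assumptions):
+
+```hcl
+# Read the current LiteLLM key from Secrets Manager...
+data "aws_secretsmanager_secret_version" "litellm" {
+  secret_id = var.secret_id
+}
+
+# ...and mirror it into the workspace namespace as a Kubernetes secret.
+resource "kubernetes_secret" "litellm_api_key" {
+  metadata {
+    name      = "litellm-api-key"
+    namespace = var.namespace # e.g. "coder-ws"
+  }
+
+  data = {
+    LITELLM_API_KEY = data.aws_secretsmanager_secret_version.litellm.secret_string
+  }
+}
+```
+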
+---
+
+## Helm Chart Management
+
+### Decision: Enable `upgrade_install` on All Helm Releases
+
+**What We Did:**
+Added `upgrade_install = true` to all `helm_release` resources across the codebase.
+
+**Files Updated:**
+
+- `modules/k8s/bootstrap/karpenter/main.tf`
+- `modules/k8s/bootstrap/ebs-controller/main.tf`
+- `modules/k8s/bootstrap/lb-controller/main.tf`
+- `modules/k8s/bootstrap/cert-manager/main.tf`
+- `modules/k8s/bootstrap/coder-server/main.tf`
+- `modules/k8s/bootstrap/coder-proxy/main.tf`
+- `modules/k8s/bootstrap/metrics-server/main.tf`
+
+**Configuration:**
+
+```hcl
+resource "helm_release" "example" {
+ name = "example"
+ namespace = var.namespace
+ chart = "example"
+ repository = "https://charts.example.com"
+ create_namespace = true
+ upgrade_install = true # ← Critical for idempotent deployments
+ skip_crds = false
+ wait = true
+ wait_for_jobs = true
+ version = var.chart_version
+}
+```
+
+**Why This Matters:**
+
+1. **Idempotent Terraform Applies**
+ - Without `upgrade_install`: Terraform fails if release already exists
+ - With `upgrade_install`: Terraform upgrades existing release or installs new one
+ - Essential for repeatable deployments
+
+2. **Version Management**
+ - Allows Terraform to manage chart version upgrades
+ - No manual `helm upgrade` commands needed
+ - Declarative infrastructure-as-code
+
+3. **CI/CD Integration**
+ - Pipelines can safely re-run Terraform apply
+ - No "already exists" errors in automation
+ - Cleaner error handling
+
+**Helm Provider Version:**
+
+```hcl
+helm = {
+ source = "hashicorp/helm"
+ version = "3.1.1" # upgrade_install re-added in this version
+}
+```
+
+**Historical Context:**
+The `upgrade_install` parameter was temporarily removed from the Helm provider in earlier versions, leading to comments in code saying it was "invalid". It was re-added in version 3.1.1 and should now be used as a best practice.
+
+---
+
+## Security Considerations
+
+### TLS/SSL Certificate Management
+
+**ACM Certificates:**
+
+```hcl
+data "aws_acm_certificate" "coder" {
+ domain = trimsuffix(trimprefix(var.coder_access_url, "https://"), "/")
+ statuses = ["ISSUED"]
+ most_recent = true
+}
+```
+
+**Best Practices:**
+
+1. Use ACM for automatic certificate renewal
+2. Fetch certificates dynamically (don't hardcode ARNs)
+3. Filter by `ISSUED` status to avoid revoked certs
+4. Use `most_recent` for automatic updates
+
+### Service Account Permissions
+
+**Principle of Least Privilege:**
+
+```hcl
+oidc_principals = {
+ "${var.cluster_oidc_provider_arn}" = [
+ "system:serviceaccount:${var.namespace}:coder"
+ ]
+}
+```
+
+**Why:**
+
+- Restrict IAM role assumption to specific service accounts
+- Prevents any pod from assuming sensitive roles
+- Scoped to specific namespace and service account name
+
+### Source IP Preservation
+
+**NLB Advantage:**
+
+- Client source IP preserved in connection
+- Available in Coder's audit logs
+- No header parsing needed
+- Better security monitoring and rate limiting
+
+**With ALB:**
+
+- Source IP only available in `X-Forwarded-For` header
+- Application must parse headers
+- Less reliable (headers can be spoofed)
+
+---
+
+## Key Takeaways for Sales Engineers
+
+### When Discussing Load Balancers
+
+1. **NLB is the right choice for Coder**
+ - Optimized for long-lived WebSocket connections
+ - Lower latency than ALB
+ - Source IP preservation for audit logs
+ - Static IPs for enterprise firewalls
+
+2. **NLB DOES support TLS termination**
+ - Common misconception that it doesn't
+ - Fully supported via ACM certificates
+ - Show AWS documentation if questioned
+
+3. **ALB only needed if:**
+ - Path-based routing required
+ - WAF integration needed
+ - HTTP-specific features required
+ - None of these apply to standard Coder deployments
+
+### When Discussing Multi-Region
+
+1. **Latency-based routing provides:**
+ - Automatic performance optimization
+ - Built-in failover
+ - No user action required
+
+2. **Region-specific URLs allow:**
+ - Manual region override
+ - Demo flexibility
+ - Testing and troubleshooting
+
+3. **Shared database is critical:**
+ - Users need unified accounts across regions
+ - Workspace state must be accessible everywhere
+ - Consider RDS read replicas for performance
+
+### When Discussing LiteLLM
+
+1. **Separate subdomain approach:**
+ - Keeps architecture simple
+ - No ALB needed
+ - Independent scaling
+ - Clear separation of concerns
+
+2. **Automatic key rotation:**
+ - Security best practice
+ - No manual key management
+ - Zero downtime rotation
+ - AWS Secrets Manager integration
+
+3. **Internal-only option available:**
+ - Maximum security
+ - No external exposure
+ - Simpler architecture
+ - Recommended if no external access needed
+
+### When Discussing Infrastructure as Code
+
+1. **`upgrade_install = true` is critical:**
+ - Enables idempotent Terraform applies
+ - Required for CI/CD pipelines
+ - Prevents deployment failures
+ - Standard best practice
+
+2. **Terraform module structure:**
+ - Reusable across regions
+ - Consistent configuration
+ - Easy to add new regions
+ - Clear separation of concerns
+
+---
+
+## Additional Resources
+
+### AWS Documentation
+
+- [NLB TLS Termination](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/create-tls-listener.html)
+- [Route53 Latency-Based Routing](https://docs.aws.amazon.com/Route53/latest/DeveloperGuide/routing-policy-latency.html)
+- [ACM Certificate Management](https://docs.aws.amazon.com/acm/latest/userguide/acm-overview.html)
+
+### Kubernetes Documentation
+
+- [AWS Load Balancer Controller](https://kubernetes-sigs.github.io/aws-load-balancer-controller/latest/)
+- [Service Annotations](https://kubernetes-sigs.github.io/aws-load-balancer-controller/latest/guide/service/annotations/)
+
+### Coder Documentation
+
+- [Coder Configuration](https://coder.com/docs/admin/configure)
+- [External Authentication](https://coder.com/docs/admin/external-auth)
+- [Enterprise Features](https://coder.com/docs/admin/enterprise)
+
+---
+
+## Version History
+
+- **2025-11-25**: Initial documentation of best practices
+- Added NLB vs ALB comparison and rationale
+- Documented DNS multi-region architecture
+- Explained LiteLLM integration approach
+- Covered Helm `upgrade_install` best practice
+- Included security considerations
+
+---
+
+## Questions or Feedback
+
+For technical questions about this architecture, contact the infrastructure team.
+For customer-specific discussions, work with your Solutions Architect.
diff --git a/docs/MULTI_REGION_DEPLOYMENT.md b/docs/MULTI_REGION_DEPLOYMENT.md
new file mode 100644
index 0000000..81f93d6
--- /dev/null
+++ b/docs/MULTI_REGION_DEPLOYMENT.md
@@ -0,0 +1,324 @@
+# Multi-Region Deployment Progress
+
+**Date:** 2025-12-02
+**Status:** Pending Enterprise License
+
+## Overview
+
+This document tracks the progress of deploying multi-region Coder infrastructure to enable:
+
+- **A) Automatic routing** to the nearest region based on user latency
+- **B) Manual region selection** in the Coder UI for users to choose their preferred region
+
+## Current Status
+
+### ✅ Completed Today
+
+#### 1. Cost Optimization - Aurora Serverless v2
+
+- **Problem:** RDS Aurora Serverless v2 costing $130/month for both writer and reader instances
+- **Solution:** Removed reader instance from `infra/aws/us-east-2/rds/main.tf`
+- **Result:** Reduced cost by ~$44/month to ~$86/month (1.0 ACU total)
+- **File:** `infra/aws/us-east-2/rds/main.tf`
+
+#### 2. Cross-Region Replica Communication
+
+- **Problem:** Coder replicas in us-east-2 and us-west-2 could detect each other but couldn't communicate (timeout errors)
+- **Root Cause:** Security groups blocking port 8080 traffic between VPCs
+- **Solution:**
+ - Added security group rules to allow TCP port 8080 between VPC CIDRs
+ - Codified rules in Terraform for reproducibility
+- **Files:**
+ - `infra/aws/us-east-2/vpc-peering/main.tf`
+ - `infra/aws/us-east-2/vpc-peering/terraform.tfvars`
+
+```terraform
+# Security group rule to allow Coder replica communication from us-west-2 to us-east-2
+resource "aws_security_group_rule" "use2_allow_coder_from_usw2" {
+ provider = aws.use2
+ type = "ingress"
+ from_port = 8080
+ to_port = 8080
+ protocol = "tcp"
+ cidr_blocks = [var.accepter_vpc_cidr]
+ security_group_id = var.requester_node_security_group_id
+ description = "Allow Coder replica communication from us-west-2"
+}
+```
+
+#### 3. DERP Server Configuration
+
+- **Problem:** `/derp/latency-check` endpoint timing out, replicas couldn't sync properly
+- **Root Cause:** `CODER_DERP_SERVER_ENABLE` environment variable not set
+- **Solution:** Added `CODER_DERP_SERVER_ENABLE = "true"` to both regions' Coder deployments
+- **Result:** Replicas now communicate successfully, no more timeout errors
+- **Files:**
+ - `infra/aws/us-east-2/k8s/coder-server/main.tf`
+ - `infra/aws/us-west-2/k8s/coder-server/main.tf`
+
+```terraform
+env_vars = {
+ CODER_REDIRECT_TO_ACCESS_URL = "false"
+ CODER_TLS_ENABLE = "false"
+ CODER_SECURE_AUTH_COOKIE = "true"
+ # Enable DERP server for multi-region replica communication
+ CODER_DERP_SERVER_ENABLE = "true"
+}
+```
+
+#### 4. Latency Improvement
+
+- **Before:** 111ms
+- **After:** 34ms
+- Achieved through proper VPC peering, security group rules, and DERP server configuration
+
+#### 5. Workspace Proxy Configuration (Ready for Deployment)
+
+- Created complete Terraform configuration for us-west-2 workspace proxy
+- **Files:**
+ - `infra/aws/us-west-2/k8s/coder-proxy/main.tf`
+ - `infra/aws/us-west-2/k8s/coder-proxy/terraform.tfvars`
+ - `infra/aws/us-west-2/k8s/coder-proxy/backend.hcl`
+
+### ⏸️ Blocked - Awaiting Enterprise License
+
+#### Workspace Proxy Deployment
+
+- **Problem:** "Your license is not entitled to create workspace proxies."
+- **Requirement:** Coder Enterprise license required for Workspace Proxy feature
+- **Impact:** Manual region selection (requirement B) cannot be completed without Enterprise license
+
+**Error from Terraform:**
+
+```
+Error: Feature not enabled
+
+ with module.coder-proxy.coderd_workspace_proxy.this,
+ on ../../../../../modules/k8s/bootstrap/coder-proxy/main.tf line 259, in resource "coderd_workspace_proxy" "this":
+ 259: resource "coderd_workspace_proxy" "this" {
+
+Your license is not entitled to create workspace proxies.
+```
+
+**Error from API:**
+
+```json
+{
+ "message": "Workspace Proxy is a Premium feature. Contact sales!"
+}
+```
+
+## Key Technical Concepts
+
+### Coder Replicas vs Workspace Proxies
+
+#### Replicas (Currently Deployed)
+
+- **Purpose:** High availability and automatic failover
+- **Behavior:** Multiple Coder instances share same database, automatic failover if one fails
+- **User Experience:** Users see single "default" region, automatic routing based on DNS
+- **License:** Available in all Coder editions
+- **Status:** ✅ Deployed and working in us-east-2 and us-west-2
+
+#### Workspace Proxies (Blocked by License)
+
+- **Purpose:** User-selectable regions for manual region switching
+- **Behavior:** Users can see and manually switch between regions in Coder UI
+- **User Experience:** "Region" tab in UI with latency display and manual selection
+- **License:** ⚠️ Requires Coder Enterprise license
+- **Status:** ❌ Configuration ready but deployment blocked
+
+## Infrastructure State
+
+### us-east-2 (Ohio) - Primary Region
+
+- **EKS Cluster:** `coderdemo-use2` ✅ Running
+- **Coder Server:** ✅ Deployed and operational
+- **Database:** Aurora Serverless v2 (1.0 ACU writer only) ✅
+- **VPC CIDR:** 10.0.0.0/16
+- **Node Security Group:** ``
+- **DERP Server:** ✅ Enabled
+- **URL:** https://coderdemo.io
+
+### us-west-2 (Oregon) - Secondary Region
+
+- **EKS Cluster:** `coderdemo-usw2` ✅ Running
+- **Coder Server:** ✅ Deployed as replica
+- **Coder Proxy:** ❌ Blocked by license (configuration ready)
+- **VPC CIDR:** 10.1.0.0/16
+- **Node Security Group:** ``
+- **DERP Server:** ✅ Enabled
+- **Planned URL:** https://us-west-2.coderdemo.io
+
+### Networking
+
+- **VPC Peering:** ✅ Established between us-east-2 and us-west-2
+- **Security Group Rules:** ✅ Port 8080 allowed between regions
+- **Route Tables:** ✅ Configured for cross-region routing
+- **Replica Communication:** ✅ Working (34ms latency)
+
+## Next Steps - Once Enterprise License is Obtained
+
+### 1. Apply Enterprise License to Coder Deployment
+
+The license needs to be applied to the primary Coder deployment at https://coderdemo.io. This is typically done through the Coder admin UI or by setting the `CODER_LICENSE` environment variable.
+
+### 2. Deploy Workspace Proxy to us-west-2
+
+Run from `infra/aws/us-west-2/k8s/coder-proxy`:
+
+```bash
+terraform apply -var-file=terraform.tfvars -auto-approve
+```
+
+This will:
+
+1. Create the workspace proxy "Oregon" in Coder API
+2. Deploy proxy pods to us-west-2 EKS cluster
+3. Create namespace and secrets
+4. Configure NLB with ACM certificate
+5. Enable manual region selection in Coder UI
+
+### 3. Verify Workspace Proxy Registration
+
+Check that the proxy appears in Coder:
+
+```bash
+curl -H "Coder-Session-Token: " https://coderdemo.io/api/v2/workspaceproxies
+```
+
+Expected response:
+
+```json
+{
+ "proxies": [
+ {
+ "id": "...",
+ "name": "us-west-2",
+ "display_name": "Oregon",
+ "icon": "/emojis/1f1fa-1f1f8.png",
+ "url": "https://us-west-2.coderdemo.io",
+ "healthy": true
+ }
+ ]
+}
+```
+
+### 4. Configure Route53 (If Not Already Done)
+
+Ensure latency-based routing is configured for automatic region selection:
+
+- A record for `coderdemo.io` → us-east-2 NLB (latency-based)
+- A record for `coderdemo.io` → us-west-2 NLB (latency-based)
+- CNAME for `*.coderdemo.io` → coderdemo.io
+- A record for `us-west-2.coderdemo.io` → us-west-2 NLB (simple routing)
+
+### 5. Test User Experience
+
+1. Navigate to https://coderdemo.io
+2. Verify latency-based routing connects to nearest region
+3. Look for "Region" selector in Coder UI
+4. Click "Refresh latency" to see both regions
+5. Manually select "Oregon" region
+6. Verify connection switches to us-west-2
+
+## Configuration Files
+
+### Workspace Proxy Configuration
+
+`infra/aws/us-west-2/k8s/coder-proxy/terraform.tfvars`:
+
+```terraform
+cluster_name = "coderdemo-usw2"
+cluster_region = "us-west-2"
+cluster_profile = "noah@coder.com"
+
+coder_proxy_name = "us-west-2"
+coder_proxy_display_name = "Oregon"
+coder_proxy_icon = "/emojis/1f1fa-1f1f8.png"
+
+coder_access_url = "https://coderdemo.io"
+coder_proxy_url = "https://us-west-2.coderdemo.io"
+coder_proxy_wildcard_url = "*.us-west-2.coderdemo.io"
+
+coder_token = ""
+
+addon_version = "2.27.1"
+image_repo = "ghcr.io/coder/coder"
+image_tag = "v2.27.1"
+
+acme_registration_email = "admin@coderdemo.io"
+cloudflare_api_token = "placeholder"
+kubernetes_ssl_secret_name = "coder-proxy-tls"
+kubernetes_create_ssl_secret = false
+```
+
+### VPC Peering Configuration
+
+`infra/aws/us-east-2/vpc-peering/terraform.tfvars`:
+
+```terraform
+profile = "noah@coder.com"
+requester_vpc_id = ""
+accepter_vpc_id = ""
+requester_vpc_cidr = "10.0.0.0/16"
+accepter_vpc_cidr = "10.1.0.0/16"
+requester_node_security_group_id = ""
+accepter_node_security_group_id = ""
+```
+
+## Reference Links
+
+- [Coder Enterprise Licensing](https://coder.com/docs/coder-oss/latest/admin/licensing)
+- [Workspace Proxies Documentation](https://coder.com/docs/coder-oss/latest/admin/workspace-proxies)
+- [Multi-Region Deployment Guide](https://coder.com/docs/coder-oss/latest/admin/multi-region)
+
+## Important Notes
+
+1. **Token Security:** The Coder API token is stored in terraform.tfvars. Consider using AWS Secrets Manager for production.
+
+2. **S3 Backend:** All Terraform state is stored in S3 bucket in us-east-2. See backend.hcl files for configuration.
+
+3. **Replica Communication:** Replicas use DERP protocol on port 8080 for coordination. Ensure security groups allow this traffic.
+
+4. **DNS Propagation:** After deploying workspace proxy, DNS changes may take 5-60 minutes to propagate globally.
+
+5. **Certificate Management:** ACM certificates are managed separately. Ensure `*.us-west-2.coderdemo.io` certificate is issued in us-west-2.
+
+## Troubleshooting
+
+### If Workspace Proxy Deployment Fails
+
+1. Verify Enterprise license is applied: Check Coder admin UI → Deployment → License
+2. Check Coder API token has admin permissions
+3. Verify network connectivity from us-west-2 to primary deployment
+4. Check pod logs: `kubectl logs -n coder-proxy -l app.kubernetes.io/name=coder`
+
+### If Users Don't See Region Selector
+
+1. Ensure workspace proxy status is "healthy" in API
+2. Hard refresh browser (Cmd+Shift+R / Ctrl+Shift+R)
+3. Verify user has permission to see workspace proxies
+4. Check Coder version supports workspace proxies (v2.0+)
+
+## Summary
+
+**What Works Now:**
+
+- ✅ Multi-region Coder replicas (us-east-2, us-west-2)
+- ✅ Automatic failover between replicas
+- ✅ Cross-region communication via DERP
+- ✅ 34ms inter-region latency
+- ✅ Cost-optimized Aurora database
+
+**What's Pending:**
+
+- ⏸️ Manual region selection in UI (blocked by Enterprise license)
+- ⏸️ Workspace proxy deployment (configuration ready)
+
+**Action Required:**
+
+1. Obtain Coder Enterprise license
+2. Apply license to deployment
+3. Run `terraform apply` for workspace proxy
+4. Verify region selector appears in UI
diff --git a/docs/cost-optimization-strategy.md b/docs/cost-optimization-strategy.md
new file mode 100644
index 0000000..12da3ff
--- /dev/null
+++ b/docs/cost-optimization-strategy.md
@@ -0,0 +1,130 @@
+# Cost Optimization Strategy for Coder Demo
+
+## Mixed Capacity Approach
+
+### Node Group Strategy
+
+**System Nodes (ON_DEMAND)**
+
+- **Purpose**: Run critical Kubernetes infrastructure
+- **Workloads**: CoreDNS, kube-proxy, metrics-server, cert-manager, AWS LB Controller
+- **Size**: t4g.medium (ARM Graviton)
+- **Count**: 1-2 nodes minimum
+- **Cost**: ~$24/month (1 node) to $48/month (2 nodes)
+
+**Application Nodes (MIXED: 20% On-Demand, 80% Spot via Karpenter)**
+
+- **Purpose**: Run Coder server and workspaces
+- **Spot Savings**: 70-90% cost reduction
+- **Interruption Risk**: Mitigated by:
+ - Multiple instance types (diversified Spot pools)
+ - Karpenter auto-rebalancing
+ - Pod Disruption Budgets
+
+### Karpenter NodePool Configuration
+
+#### 1. Coder Server NodePool (ON_DEMAND Priority)
+
+```yaml
+# Conceptual shorthand, not literal Karpenter NodePool syntax: in practice each
+# tier is a separate NodePool (see the sketch after these examples)
+capacity_type: ["on-demand", "spot"] # Prefer On-Demand, fall back to Spot
+weight:
+  on-demand: 100 # Higher priority
+  spot: 10
+```
+
+#### 2. Coder Workspace NodePool (SPOT Priority)
+
+```yaml
+capacity_type: ["spot", "on-demand"] # Prefer Spot, fall back to On-Demand
+weight:
+ spot: 100 # Higher priority
+ on-demand: 10
+```
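+
+In actual Karpenter terms, each priority tier is its own NodePool: `spec.weight` orders the pools and a `karpenter.sh/capacity-type` requirement pins the tier. A hedged sketch of the Spot-first workspace pool as a Terraform manifest (names and the referenced EC2NodeClass are placeholders):
+
+```hcl
+resource "kubernetes_manifest" "ws_spot_nodepool" {
+  manifest = {
+    apiVersion = "karpenter.sh/v1"
+    kind       = "NodePool"
+    metadata   = { name = "coder-ws-spot" }
+    spec = {
+      weight = 100 # tried before the on-demand fallback pool (weight 10)
+      template = {
+        spec = {
+          requirements = [
+            {
+              key      = "karpenter.sh/capacity-type"
+              operator = "In"
+              values   = ["spot"]
+            },
+          ]
+          nodeClassRef = {
+            group = "karpenter.k8s.aws"
+            kind  = "EC2NodeClass"
+            name  = "default" # placeholder EC2NodeClass
+          }
+        }
+      }
+    }
+  }
+}
+```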
+
+### Risk Mitigation
+
+**Spot Interruption Handling:**
+
+1. **2-minute warning** → Karpenter automatically provisions a replacement
+2. **Multiple instance types** → 15+ types reduce the interruption rate to <1%
+3. **Pod Disruption Budgets** → Ensure minimum replicas always running (sketched below)
+4. **Karpenter Consolidation** → Automatically moves pods before termination
+
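+The Pod Disruption Budget from step 3 is a one-resource guardrail. A sketch for the Coder server Deployment, assuming its standard `app.kubernetes.io/name=coder` label:
+
+```hcl
+resource "kubernetes_pod_disruption_budget_v1" "coder" {
+  metadata {
+    name      = "coder"
+    namespace = "coder"
+  }
+
+  spec {
+    min_available = "1" # keep at least one replica through Spot churn
+
+    selector {
+      match_labels = {
+        "app.kubernetes.io/name" = "coder"
+      }
+    }
+  }
+}
+```
+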
+**Example Instance Type Diversity:**
+
+```
+Spot Pool: t4g.medium, t4g.large, t3a.medium, t3a.large,
+ m6g.medium, m6g.large, m6a.medium, m6a.large
+```
+
+### Cost Breakdown
+
+| Component | Instance Type | Capacity | Monthly Cost |
+| ------------------ | ------------- | --------- | ------------- |
+| System Nodes (2) | t4g.medium | ON_DEMAND | $48 |
+| Coder Server (2) | t4g.large | 80% SPOT | $28 (vs $140) |
+| Workspaces (avg 5) | t4g.xlarge | 90% SPOT | $75 (vs $750) |
+| **Total** | | **Mixed** | **$151/mo** |
+
+**vs All On-Demand:** $938/month β **84% savings**
+
+### Dynamic Scaling
+
+**Low Usage (nights/weekends):**
+
+- Scale to zero workspaces
+- Keep 1 system node + 1 Coder server node
+- Cost: ~$48/month during idle
+
+**High Usage (business hours):**
+
+- Auto-scale workspaces on Spot
+- Karpenter provisions nodes in <60 seconds
+- Cost: ~$150-200/month during peak
+
+### Monitoring & Alerts
+
+**CloudWatch Alarms:**
+
+- Spot interruption rate > 5%
+- Available On-Demand capacity < 20%
+- Karpenter provisioning failures
+
+**Response:**
+
+- Automatic fallback to On-Demand
+- Email alerts to ops team
+- Karpenter adjusts instance type mix
+
+## Implementation Timeline
+
+1. ✅ Deploy EKS with ON_DEMAND system nodes
+2. ⏳ Deploy Karpenter
+3. ⏳ Configure mixed-capacity NodePools
+4. ⏳ Deploy Coder with node affinity rules
+5. ⏳ Test Spot interruption handling
+6. ⏳ Enable auto-scaling policies
+
+## Fallback Plan
+
+If Spot becomes unreliable (rare):
+
+1. Update Karpenter NodePool to 100% On-Demand
+2. `kubectl apply -f nodepool-ondemand.yaml`
+3. Karpenter gracefully migrates pods
+4. Takes ~5 minutes, zero downtime
+
+## Best Practices
+
+✅ **DO:**
+
+- Use multiple Spot instance types (10+)
+- Set Pod Disruption Budgets
+- Monitor Spot interruption rates
+- Test failover regularly
+
+❌ **DON'T:**
+
+- Run databases on Spot (use RDS)
+- Use Spot for single-replica critical services
+- Rely on single instance type for Spot
diff --git a/infra/aws/eu-west-2/eks/main.tf b/infra/aws/eu-west-2/eks/main.tf
index 2bffa33..bed6bd1 100644
--- a/infra/aws/eu-west-2/eks/main.tf
+++ b/infra/aws/eu-west-2/eks/main.tf
@@ -30,7 +30,7 @@ variable "cluster_version" {
variable "cluster_instance_type" {
description = "EKS Instance Size/Type"
- default = "t3.xlarge"
+ default = "t4g.medium" # ARM Graviton for cost optimization
type = string
}
@@ -179,7 +179,7 @@ module "cluster" {
system = {
min_size = 0
max_size = 10
- desired_size = 0 # Cant be modified after creation. Override from AWS Console
+ desired_size = 1 # Can't be modified after creation. Override from AWS Console
labels = local.cluster_asg_node_labels
instance_types = [var.cluster_instance_type]
diff --git a/infra/aws/eu-west-2/k8s/cert-manager/main.tf b/infra/aws/eu-west-2/k8s/cert-manager/main.tf
index ab12c5d..d0de2cf 100644
--- a/infra/aws/eu-west-2/k8s/cert-manager/main.tf
+++ b/infra/aws/eu-west-2/k8s/cert-manager/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -41,11 +41,6 @@ variable "addon_version" {
default = "v1.18.2"
}
-variable "cloudflare_api_token" {
- type = string
- sensitive = true
-}
-
provider "aws" {
region = var.cluster_region
profile = var.cluster_profile
@@ -60,7 +55,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -78,7 +73,6 @@ module "cert-manager" {
cluster_name = var.cluster_name
cluster_oidc_provider_arn = var.cluster_oidc_provider_arn
- namespace = var.addon_namespace
- helm_version = var.addon_version
- cloudflare_token_secret = var.cloudflare_api_token
+ namespace = var.addon_namespace
+ helm_version = var.addon_version
}
\ No newline at end of file
diff --git a/infra/aws/eu-west-2/k8s/coder-proxy/main.tf b/infra/aws/eu-west-2/k8s/coder-proxy/main.tf
index b9704ed..06b5c6b 100644
--- a/infra/aws/eu-west-2/k8s/coder-proxy/main.tf
+++ b/infra/aws/eu-west-2/k8s/coder-proxy/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -101,11 +101,6 @@ variable "kubernetes_create_ssl_secret" {
default = true
}
-variable "cloudflare_api_token" {
- type = string
- sensitive = true
-}
-
provider "aws" {
region = var.cluster_region
profile = var.cluster_profile
@@ -120,7 +115,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -161,7 +156,6 @@ module "coder-proxy" {
proxy_token_config = {
name = "coder-proxy"
}
- cloudflare_api_token = var.cloudflare_api_token
ssl_cert_config = {
name = var.kubernetes_ssl_secret_name
create_secret = var.kubernetes_create_ssl_secret
diff --git a/infra/aws/eu-west-2/k8s/coder-ws/main.tf b/infra/aws/eu-west-2/k8s/coder-ws/main.tf
index 451a056..6c9140b 100644
--- a/infra/aws/eu-west-2/k8s/coder-ws/main.tf
+++ b/infra/aws/eu-west-2/k8s/coder-ws/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -98,7 +98,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/eu-west-2/k8s/ebs-controller/main.tf b/infra/aws/eu-west-2/k8s/ebs-controller/main.tf
index d7f1f56..5194ec7 100644
--- a/infra/aws/eu-west-2/k8s/ebs-controller/main.tf
+++ b/infra/aws/eu-west-2/k8s/ebs-controller/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -55,7 +55,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/eu-west-2/k8s/karpenter/main.tf b/infra/aws/eu-west-2/k8s/karpenter/main.tf
index f5b34f8..4adb718 100644
--- a/infra/aws/eu-west-2/k8s/karpenter/main.tf
+++ b/infra/aws/eu-west-2/k8s/karpenter/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -54,7 +54,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -181,7 +181,7 @@ module "karpenter-addon" {
block_device_mappings = [{
device_name = "/dev/xvda"
ebs = {
- volume_size = "1400Gi"
+ volume_size = "500Gi"
volume_type = "gp3"
}
}, {
diff --git a/infra/aws/eu-west-2/k8s/lb-controller/main.tf b/infra/aws/eu-west-2/k8s/lb-controller/main.tf
index 1f6a0fa..479e9a1 100644
--- a/infra/aws/eu-west-2/k8s/lb-controller/main.tf
+++ b/infra/aws/eu-west-2/k8s/lb-controller/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -60,7 +60,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/eu-west-2/k8s/metrics-server/main.tf b/infra/aws/eu-west-2/k8s/metrics-server/main.tf
index d808c74..cce9447 100644
--- a/infra/aws/eu-west-2/k8s/metrics-server/main.tf
+++ b/infra/aws/eu-west-2/k8s/metrics-server/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
}
backend "s3" {}
@@ -48,7 +48,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-east-2/README.md b/infra/aws/us-east-2/README.md
new file mode 100644
index 0000000..5ff4543
--- /dev/null
+++ b/infra/aws/us-east-2/README.md
@@ -0,0 +1,140 @@
+# Terraform Backend Configuration
+
+## Security Notice
+
+This directory uses a remote S3 backend for state management, but **backend configuration files are gitignored** to prevent leaking AWS account IDs and other sensitive information.
+
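+The relevant `.gitignore` entries look roughly like this (exact patterns in this repo may differ):
+
+```
+backend.tf
+backend.conf
+*.tfvars
+!*.tfvars.example
+```
+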
+## Local Setup
+
+1. **Get backend configuration from a teammate** or **retrieve it from AWS**:
+
+ ```bash
+ # Get S3 bucket name (it contains the account ID)
+ aws s3 ls | grep terraform-state
+
+ # Get DynamoDB table name
+ aws dynamodb list-tables --query 'TableNames[?contains(@, `terraform-lock`)]'
+ ```
+
+2. **Create backend configuration** for each module:
+
+ Each Terraform module needs a `backend.tf` file (this file is gitignored). Create it manually:
+
+ ```bash
+ cd infra/aws/us-east-2/vpc # or any other module
+ ```
+
+ Create `backend.tf`:
+
+ ```hcl
+ terraform {
+ backend "s3" {
+ bucket = "YOUR-BUCKET-NAME-HERE"
+ key = "us-east-2/vpc/terraform.tfstate" # Update path per module
+ region = "us-east-2"
+ dynamodb_table = "YOUR-TABLE-NAME-HERE"
+ encrypt = true
+ }
+ }
+ ```
+
+ **Important**: Update the `key` path for each module:
+ - VPC: `us-east-2/vpc/terraform.tfstate`
+ - EKS: `us-east-2/eks/terraform.tfstate`
+ - ACM: `us-east-2/acm/terraform.tfstate`
+ - etc.
+
+3. **Initialize Terraform**:
+ ```bash
+ terraform init
+ ```
+
+## GitHub Actions Setup
+
+GitHub Actions uses secrets to configure the backend securely. Required secrets:
+
+1. `TF_STATE_BUCKET` - S3 bucket name
+2. `TF_STATE_LOCK_TABLE` - DynamoDB table name
+3. `AWS_ROLE_ARN` - IAM role ARN for OIDC authentication
+
+These are configured in: Repository Settings > Secrets and variables > Actions
+
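+If you use the GitHub CLI, the same secrets can be set from a terminal (values are placeholders):
+
+```bash
+gh secret set TF_STATE_BUCKET --body "YOUR-BUCKET-NAME"
+gh secret set TF_STATE_LOCK_TABLE --body "YOUR-TABLE-NAME"
+gh secret set AWS_ROLE_ARN --body "arn:aws:iam::ACCOUNT_ID:role/YOUR-ROLE"
+```
+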
+## Alternative: Using Backend Config File
+
+Instead of creating `backend.tf`, you can use a config file:
+
+1. Create `backend.conf` (gitignored):
+
+ ```
+ bucket = "YOUR-BUCKET-NAME"
+ dynamodb_table = "YOUR-TABLE-NAME"
+ region = "us-east-2"
+ encrypt = true
+ ```
+
+2. Initialize with:
+ ```bash
+ terraform init -backend-config=backend.conf -backend-config="key=us-east-2/vpc/terraform.tfstate"
+ ```
+
+## Why This Approach?
+
+- **Security**: Account IDs and resource names aren't committed to Git
+- **Flexibility**: Each developer/environment can use different backends
+- **Compliance**: Prevents accidental exposure of infrastructure details
+- **Best Practice**: Follows AWS security recommendations
+
+## Secret Scanning Protection
+
+This repository has automated secret scanning to prevent accidental exposure of credentials:
+
+### GitHub Actions (Automated)
+
+- **Gitleaks** - Scans every PR and push for secrets
+- **TruffleHog** - Additional verification layer
+- **Custom Pattern Matching** - Catches common secret patterns
+- **Auto-Revert** - Automatically reverts commits to main that contain secrets
+
+### Pre-commit Hooks (Local)
+
+Catch secrets before they reach GitHub:
+
+```bash
+# Install pre-commit
+pip install pre-commit
+
+# Install git hooks
+pre-commit install
+
+# Test on all files
+pre-commit run --all-files
+```
+
+### What Gets Detected
+
+- AWS Access Keys (AKIA...)
+- API Keys and Tokens
+- Private Keys (RSA, SSH, etc.)
+- Database connection strings with passwords
+- GitHub Personal Access Tokens
+- Stripe API keys
+- High-entropy strings (likely secrets)
+
+### If Secrets Are Detected
+
+1. **PR is blocked** - Cannot merge until secrets are removed
+2. **Automatic notification** - PR comment explains the issue
+3. **Required actions**:
+ - Remove the secret from code
+ - Use GitHub Secrets or environment variables
+ - Rotate/invalidate the exposed credential
+
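+For example, rotating a leaked AWS access key and scrubbing it from history might look like this (a sketch using the separate `git-filter-repo` tool; the user name, key ID, and remote URL are placeholders):
+
+```bash
+# 1. Rotate: create a replacement key, then delete the exposed one
+aws iam create-access-key --user-name YOUR_USER
+aws iam delete-access-key --user-name YOUR_USER --access-key-id AKIAEXAMPLEKEYID
+
+# 2. Scrub history (rewrites commits - coordinate with your team first)
+pip install git-filter-repo
+echo 'AKIAEXAMPLEKEYID==>REDACTED' > /tmp/replacements.txt
+git filter-repo --replace-text /tmp/replacements.txt
+
+# filter-repo removes remotes as a safety measure; re-add yours before pushing
+git remote add origin git@github.com:YOUR_ORG/YOUR_REPO.git
+git push --force-with-lease origin main
+```
+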
+## Migrating Existing State
+
+If you have local state to migrate:
+
+```bash
+terraform init -migrate-state
+```
+
+Terraform will prompt to copy existing state to the remote backend.
diff --git a/infra/aws/us-east-2/acm/main.tf b/infra/aws/us-east-2/acm/main.tf
new file mode 100644
index 0000000..e37c97e
--- /dev/null
+++ b/infra/aws/us-east-2/acm/main.tf
@@ -0,0 +1,107 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.0"
+ }
+ }
+}
+
+variable "cluster_region" {
+ description = "AWS region for ACM certificate"
+ type = string
+ default = "us-east-2"
+}
+
+variable "cluster_profile" {
+ description = "AWS profile"
+ type = string
+ default = "default"
+}
+
+variable "domain_name" {
+ description = "Domain name for Coder"
+ type = string
+ default = "coderdemo.io"
+}
+
+variable "hosted_zone_id" {
+ description = "Route 53 Hosted Zone ID"
+ type = string
+}
+
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+ alias = "acm"
+}
+
+# Provider for Route 53 (may be in different account)
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+ alias = "route53"
+}
+
+# ACM Certificate for Coder with wildcard
+resource "aws_acm_certificate" "coder" {
+ provider = aws.acm
+ domain_name = var.domain_name
+ validation_method = "DNS"
+
+ subject_alternative_names = [
+ "*.${var.domain_name}"
+ ]
+
+ lifecycle {
+ create_before_destroy = true
+ }
+
+ tags = {
+ Name = "coder-certificate"
+ Environment = "test"
+ ManagedBy = "terraform"
+ }
+}
+
+# Route 53 validation records
+resource "aws_route53_record" "cert_validation" {
+ provider = aws.route53
+ for_each = {
+ for dvo in aws_acm_certificate.coder.domain_validation_options : dvo.domain_name => {
+ name = dvo.resource_record_name
+ record = dvo.resource_record_value
+ type = dvo.resource_record_type
+ }
+ }
+
+ allow_overwrite = true
+ name = each.value.name
+ records = [each.value.record]
+ ttl = 60
+ type = each.value.type
+ zone_id = var.hosted_zone_id
+}
+
+# Wait for certificate validation
+resource "aws_acm_certificate_validation" "coder" {
+ provider = aws.acm
+ certificate_arn = aws_acm_certificate.coder.arn
+ validation_record_fqdns = [for record in aws_route53_record.cert_validation : record.fqdn]
+}
+
+# Outputs
+output "certificate_arn" {
+ description = "ARN of the validated ACM certificate"
+ value = aws_acm_certificate_validation.coder.certificate_arn
+}
+
+output "domain_name" {
+ description = "Domain name for Coder"
+ value = var.domain_name
+}
+
+output "validation_status" {
+ description = "Certificate validation status"
+ value = "Certificate validated and ready to use"
+}
diff --git a/infra/aws/us-east-2/acm/terraform.tfvars.example b/infra/aws/us-east-2/acm/terraform.tfvars.example
new file mode 100644
index 0000000..d9adc60
--- /dev/null
+++ b/infra/aws/us-east-2/acm/terraform.tfvars.example
@@ -0,0 +1,7 @@
+# ACM Certificate configuration for Coder
+# Copy this to terraform.tfvars and fill in your values
+
+cluster_region = "us-east-2"
+cluster_profile = "YOUR_AWS_PROFILE"
+domain_name = "YOUR_DOMAIN.com"
+hosted_zone_id = "YOUR_ROUTE53_ZONE_ID"
diff --git a/infra/aws/us-east-2/eks/main.tf b/infra/aws/us-east-2/eks/main.tf
index 80c15aa..6f59178 100644
--- a/infra/aws/us-east-2/eks/main.tf
+++ b/infra/aws/us-east-2/eks/main.tf
@@ -30,7 +30,7 @@ variable "cluster_version" {
variable "cluster_instance_type" {
description = "EKS Instance Size/Type"
- default = "t3.xlarge"
+ default = "t4g.xlarge"
type = string
}
@@ -141,17 +141,115 @@ module "eks" {
desired_size = 0 # Cant be modified after creation. Override from AWS Console
labels = local.cluster_asg_node_labels
- instance_types = [var.cluster_instance_type]
- capacity_type = "ON_DEMAND"
+ # Cost optimization: Graviton ARM instances
+ # IMPORTANT: ON_DEMAND for system nodes - production demo cannot break!
+ instance_types = [var.cluster_instance_type, "t4g.small", "t4g.large"] # ARM only
+ ami_type = "AL2023_ARM_64_STANDARD" # ARM-based AMI
+ capacity_type = "ON_DEMAND" # System infrastructure must be stable
+
iam_role_additional_policies = {
AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
STSAssumeRole = aws_iam_policy.sts.arn
}
+ # Cost optimization: gp3 volumes with smaller size
+ block_device_mappings = [{
+ device_name = "/dev/xvda"
+ ebs = {
+ volume_type = "gp3" # Better performance, same cost as gp2
+ volume_size = 20 # Reduced from default 50GB
+ delete_on_termination = true
+ encrypted = true
+ }
+ }]
+
# System Nodes should not be public
subnet_ids = var.private_subnet_ids
}
}
tags = local.tags
-}
\ No newline at end of file
+}
+# VPC Endpoints for cost optimization (reduce NAT Gateway usage)
+resource "aws_vpc_endpoint" "s3" {
+ vpc_id = var.vpc_id
+ service_name = "com.amazonaws.${var.region}.s3"
+ route_table_ids = flatten([
+ data.aws_route_tables.private.ids
+ ])
+ tags = merge(local.tags, {
+ Name = "${var.name}-s3-endpoint"
+ })
+}
+
+resource "aws_vpc_endpoint" "ecr_api" {
+ vpc_id = var.vpc_id
+ service_name = "com.amazonaws.${var.region}.ecr.api"
+ vpc_endpoint_type = "Interface"
+ subnet_ids = var.private_subnet_ids
+ security_group_ids = [aws_security_group.vpc_endpoints.id]
+ private_dns_enabled = true
+ tags = merge(local.tags, {
+ Name = "${var.name}-ecr-api-endpoint"
+ })
+}
+
+resource "aws_vpc_endpoint" "ecr_dkr" {
+ vpc_id = var.vpc_id
+ service_name = "com.amazonaws.${var.region}.ecr.dkr"
+ vpc_endpoint_type = "Interface"
+ subnet_ids = var.private_subnet_ids
+ security_group_ids = [aws_security_group.vpc_endpoints.id]
+ private_dns_enabled = true
+ tags = merge(local.tags, {
+ Name = "${var.name}-ecr-dkr-endpoint"
+ })
+}
+
+# Security group for VPC endpoints
+resource "aws_security_group" "vpc_endpoints" {
+ name_prefix = "${var.name}-vpc-endpoints"
+ description = "Security group for VPC endpoints"
+ vpc_id = var.vpc_id
+
+ ingress {
+ from_port = 443
+ to_port = 443
+ protocol = "tcp"
+ cidr_blocks = ["10.0.0.0/16"]
+ }
+
+ egress {
+ from_port = 0
+ to_port = 0
+ protocol = "-1"
+ cidr_blocks = ["0.0.0.0/0"]
+ }
+
+ tags = merge(local.tags, {
+ Name = "${var.name}-vpc-endpoints-sg"
+ })
+}
+
+# Data source for route tables
+data "aws_route_tables" "private" {
+ vpc_id = var.vpc_id
+ filter {
+ name = "tag:Name"
+ values = ["*private*"]
+ }
+}
+
+# Outputs
+output "vpc_endpoint_s3_id" {
+ description = "S3 VPC Endpoint ID"
+ value = aws_vpc_endpoint.s3.id
+}
+
+output "vpc_endpoint_ecr_ids" {
+ description = "ECR VPC Endpoint IDs"
+ value = {
+ api = aws_vpc_endpoint.ecr_api.id
+ dkr = aws_vpc_endpoint.ecr_dkr.id
+ }
+}
diff --git a/infra/aws/us-east-2/k8s/cert-manager/main.tf b/infra/aws/us-east-2/k8s/cert-manager/main.tf
index ab12c5d..d0de2cf 100644
--- a/infra/aws/us-east-2/k8s/cert-manager/main.tf
+++ b/infra/aws/us-east-2/k8s/cert-manager/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -41,11 +41,6 @@ variable "addon_version" {
default = "v1.18.2"
}
-variable "cloudflare_api_token" {
- type = string
- sensitive = true
-}
-
provider "aws" {
region = var.cluster_region
profile = var.cluster_profile
@@ -60,7 +55,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -78,7 +73,6 @@ module "cert-manager" {
cluster_name = var.cluster_name
cluster_oidc_provider_arn = var.cluster_oidc_provider_arn
- namespace = var.addon_namespace
- helm_version = var.addon_version
- cloudflare_token_secret = var.cloudflare_api_token
+ namespace = var.addon_namespace
+ helm_version = var.addon_version
}
\ No newline at end of file
diff --git a/infra/aws/us-east-2/k8s/coder-server/main.tf b/infra/aws/us-east-2/k8s/coder-server/main.tf
index 79a8fd2..fb2a908 100644
--- a/infra/aws/us-east-2/k8s/coder-server/main.tf
+++ b/infra/aws/us-east-2/k8s/coder-server/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -141,11 +141,6 @@ variable "kubernetes_create_ssl_secret" {
default = true
}
-variable "cloudflare_api_token" {
- type = string
- sensitive = true
-}
-
variable "oidc_sign_in_text" {
type = string
}
@@ -176,7 +171,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -198,6 +193,13 @@ provider "acme" {
server_url = var.acme_server_url
}
+# Fetch ACM certificate dynamically by domain to avoid hardcoding sensitive ARNs
+data "aws_acm_certificate" "coder" {
+ domain = trimsuffix(trimprefix(var.coder_access_url, "https://"), "/")
+ statuses = ["ISSUED"]
+ most_recent = true
+}
+
module "coder-server" {
source = "../../../../../modules/k8s/bootstrap/coder-server"
@@ -208,13 +210,12 @@ module "coder-server" {
namespace = "coder"
acme_registration_email = var.acme_registration_email
acme_days_until_renewal = 90
- replica_count = 2
+ replica_count = 1 # HA requires Enterprise license
helm_version = var.addon_version
image_repo = var.image_repo
image_tag = var.image_tag
primary_access_url = var.coder_access_url
wildcard_access_url = var.coder_wildcard_access_url
- cloudflare_api_token = var.cloudflare_api_token
coder_experiments = var.coder_experiments
coder_builtin_provisioner_count = var.coder_builtin_provisioner_count
coder_github_allowed_orgs = var.coder_github_allowed_orgs
@@ -237,10 +238,25 @@ module "coder-server" {
github_external_auth_secret_client_id = var.coder_github_external_auth_secret_client_id
github_external_auth_secret_client_secret = var.coder_github_external_auth_secret_client_secret
tags = {}
+ env_vars = {
+ # Disable redirect since NLB terminates TLS and forwards plain HTTP to backend
+ # Without this, Coder sees HTTP and redirects to HTTPS, causing infinite redirect loop
+ CODER_REDIRECT_TO_ACCESS_URL = "false"
+ # Disable TLS on Coder itself since NLB terminates TLS
+ CODER_TLS_ENABLE = "false"
+ # Mark auth cookies as secure since users access via HTTPS
+ CODER_SECURE_AUTH_COOKIE = "true"
+ # Enable DERP server for multi-region replica communication
+ CODER_DERP_SERVER_ENABLE = "true"
+ }
service_annotations = {
- "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type" = "instance"
- "service.beta.kubernetes.io/aws-load-balancer-scheme" = "internet-facing"
- "service.beta.kubernetes.io/aws-load-balancer-attributes" = "deletion_protection.enabled=true"
+ "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type" = "instance"
+ "service.beta.kubernetes.io/aws-load-balancer-scheme" = "internet-facing"
+ "service.beta.kubernetes.io/aws-load-balancer-attributes" = "deletion_protection.enabled=false,load_balancing.cross_zone.enabled=true"
+ "service.beta.kubernetes.io/aws-load-balancer-ssl-cert" = data.aws_acm_certificate.coder.arn
+ "service.beta.kubernetes.io/aws-load-balancer-ssl-ports" = "443"
+ "service.beta.kubernetes.io/aws-load-balancer-backend-protocol" = "tcp"
+ # Subnets will be auto-detected by Load Balancer Controller using kubernetes.io/role/elb=1 tag
}
node_selector = {
"node.coder.io/managed-by" = "karpenter"
@@ -279,4 +295,24 @@ module "coder-server" {
topology_key = "kubernetes.io/hostname"
}
}]
+}
+
+# Fix service HTTPS port to forward to HTTP backend (port 8080)
+# since Coder has TLS disabled and only listens on HTTP
+resource "null_resource" "patch_coder_service" {
+ depends_on = [module.coder-server]
+
+ triggers = {
+ # Re-run patch whenever Coder configuration changes
+ always_run = timestamp()
+ }
+
+ provisioner "local-exec" {
+ command = <<-EOT
+ sleep 10
+ kubectl patch svc coder -n coder --type='json' \
+ -p='[{"op": "replace", "path": "/spec/ports/1/targetPort", "value": "http"}]' \
+ 2>/dev/null || true
+ EOT
+ }
}
\ No newline at end of file
diff --git a/infra/aws/us-east-2/k8s/coder-ws/main.tf b/infra/aws/us-east-2/k8s/coder-ws/main.tf
index 451a056..6c9140b 100644
--- a/infra/aws/us-east-2/k8s/coder-ws/main.tf
+++ b/infra/aws/us-east-2/k8s/coder-ws/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -98,7 +98,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-east-2/k8s/ebs-controller/main.tf b/infra/aws/us-east-2/k8s/ebs-controller/main.tf
index ed4efef..0c8e7a3 100644
--- a/infra/aws/us-east-2/k8s/ebs-controller/main.tf
+++ b/infra/aws/us-east-2/k8s/ebs-controller/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -60,7 +60,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-east-2/k8s/karpenter/main.tf b/infra/aws/us-east-2/k8s/karpenter/main.tf
index a01280e..cc263f5 100644
--- a/infra/aws/us-east-2/k8s/karpenter/main.tf
+++ b/infra/aws/us-east-2/k8s/karpenter/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -53,20 +53,29 @@ data "aws_eks_cluster_auth" "this" {
name = var.cluster_name
}
-provider "helm" {
- kubernetes {
- host = data.aws_eks_cluster.this.endpoint
- cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
- token = data.aws_eks_cluster_auth.this.token
- }
-}
-
provider "kubernetes" {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
}
+provider "helm" {
+ kubernetes = {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ exec = {
+ api_version = "client.authentication.k8s.io/v1beta1"
+ command = "aws"
+ args = [
+ "eks",
+ "get-token",
+ "--cluster-name", var.cluster_name,
+ "--region", var.cluster_region
+ ]
+ }
+ }
+}
+
locals {
global_node_labels = {
"node.coder.io/instance" = "coder-v2"
@@ -153,7 +162,15 @@ locals {
node_requirements = concat(local.global_node_reqs, [{
key = "node.kubernetes.io/instance-type"
operator = "In"
- values = ["c6a.32xlarge", "c5a.32xlarge"]
+ values = [
+ # Small demos (5-10 users) - Most cost-effective
+ "c6a.4xlarge", "c5a.4xlarge", # 16 vCPU / 32 GB - ~$0.18/hr spot
+ "c6a.8xlarge", "c5a.8xlarge", # 32 vCPU / 64 GB - ~$0.37/hr spot
+ # Medium demos (10-20 users)
+ "c6a.16xlarge", "c5a.16xlarge", # 64 vCPU / 128 GB - ~$0.74/hr spot
+ # Large demos (20-40 users)
+ "c6a.32xlarge", "c5a.32xlarge" # 128 vCPU / 256 GB - ~$1.47/hr spot
+ ]
}])
node_class_ref_name = "coder-ws-class"
disruption_consolidate_after = "30m"
@@ -183,7 +200,7 @@ module "karpenter-addon" {
block_device_mappings = [{
device_name = "/dev/xvda"
ebs = {
- volume_size = "1400Gi"
+ volume_size = "500Gi" // Decreased from 1400Gi to save costs; felt overkill for coder-server nodes
volume_type = "gp3"
}
}, {
@@ -198,6 +215,7 @@ module "karpenter-addon" {
subnet_selector_tags = local.provisioner_subnet_tags
sg_selector_tags = local.provisioner_sg_tags
}]
+ nodepool_configs = local.nodepool_configs
}
# import {
diff --git a/infra/aws/us-east-2/k8s/lb-controller/main.tf b/infra/aws/us-east-2/k8s/lb-controller/main.tf
index 2bf1d2c..07ed13c 100644
--- a/infra/aws/us-east-2/k8s/lb-controller/main.tf
+++ b/infra/aws/us-east-2/k8s/lb-controller/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -60,7 +60,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-east-2/k8s/litellm/main.tf b/infra/aws/us-east-2/k8s/litellm/main.tf
index 3e99231..709707a 100644
--- a/infra/aws/us-east-2/k8s/litellm/main.tf
+++ b/infra/aws/us-east-2/k8s/litellm/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
}
backend "s3" {}
diff --git a/infra/aws/us-east-2/k8s/metrics-server/main.tf b/infra/aws/us-east-2/k8s/metrics-server/main.tf
index d808c74..cce9447 100644
--- a/infra/aws/us-east-2/k8s/metrics-server/main.tf
+++ b/infra/aws/us-east-2/k8s/metrics-server/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
}
backend "s3" {}
@@ -48,7 +48,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-east-2/rds/main.tf b/infra/aws/us-east-2/rds/main.tf
index ad0e620..2adaa05 100644
--- a/infra/aws/us-east-2/rds/main.tf
+++ b/infra/aws/us-east-2/rds/main.tf
@@ -5,6 +5,10 @@ terraform {
source = "hashicorp/aws"
version = ">= 5.46"
}
+ random = {
+ source = "hashicorp/random"
+ version = "~> 3.6"
+ }
}
backend "s3" {}
}
@@ -19,20 +23,10 @@ variable "master_username" {
type = string
}
-variable "master_password" {
- description = "Database root password"
- type = string
-}
-
variable "litellm_username" {
type = string
}
-variable "litellm_password" {
- type = string
- sensitive = true
-}
-
variable "name" {
description = "Name of resource and tag prefix"
type = string
@@ -80,6 +74,17 @@ provider "aws" {
profile = var.profile
}
+# Generate secure random passwords
+resource "random_password" "coder_master_password" {
+ length = 32
+ special = true
+}
+
+resource "random_password" "litellm_password" {
+ length = 32
+ special = true
+}
+
# https://developer.hashicorp.com/terraform/tutorials/aws/aws-rds
resource "aws_db_subnet_group" "db_subnet_group" {
name = "${var.name}-db-subnet-group"
@@ -90,52 +95,85 @@ resource "aws_db_subnet_group" "db_subnet_group" {
}
}
-resource "aws_db_instance" "db" {
- identifier = "${var.name}-db"
- instance_class = var.instance_class
- allocated_storage = var.allocated_storage
- engine = "postgres"
- engine_version = "15.12"
- # backup_retention_period = 7
- username = var.master_username
- password = var.master_password
- db_name = "coder"
- db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
- vpc_security_group_ids = [aws_security_group.allow-port-5432.id]
- publicly_accessible = false
- skip_final_snapshot = false
+# Aurora Serverless v2 Cluster for Coder
+resource "aws_rds_cluster" "coder" {
+ cluster_identifier = "${var.name}-aurora-cluster"
+ engine = "aurora-postgresql"
+ engine_mode = "provisioned"
+ engine_version = "15.8"
+ database_name = "coder"
+ master_username = var.master_username
+ master_password = random_password.coder_master_password.result
+ db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
+ vpc_security_group_ids = [aws_security_group.allow-port-5432.id]
+ backup_retention_period = 7
+ preferred_backup_window = "03:00-04:00"
+ skip_final_snapshot = false
+ storage_encrypted = true
+
+ serverlessv2_scaling_configuration {
+ min_capacity = 0.5 # 0.5 ACU = 1 GB RAM (idle state)
+ max_capacity = 16 # 16 ACU = 32 GB RAM (handles 5K-10K users)
+ }
tags = {
- Name = "${var.name}-rds-db"
+ Name = "${var.name}-aurora-coder"
}
- lifecycle {
- ignore_changes = [
- snapshot_identifier
- ]
+}
+
+# Aurora Serverless v2 Instance for Coder (Single writer instance)
+resource "aws_rds_cluster_instance" "coder_writer" {
+ identifier = "${var.name}-aurora-coder-writer"
+ cluster_identifier = aws_rds_cluster.coder.id
+ instance_class = "db.serverless"
+ engine = aws_rds_cluster.coder.engine
+ engine_version = "15.8"
+ publicly_accessible = false
+ db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
+
+ tags = {
+ Name = "${var.name}-aurora-coder-writer"
}
}
-resource "aws_db_instance" "litellm" {
- identifier = "litellm"
- instance_class = "db.m5.large"
- allocated_storage = 50
- engine = "postgres"
- engine_version = "15.12"
- username = var.litellm_username
- password = var.litellm_password
- db_name = "litellm"
- db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
- vpc_security_group_ids = [aws_security_group.allow-port-5432.id]
- publicly_accessible = false
- skip_final_snapshot = false
+# Aurora Serverless v2 Cluster for LiteLLM
+resource "aws_rds_cluster" "litellm" {
+ cluster_identifier = "litellm-aurora-cluster"
+ engine = "aurora-postgresql"
+ engine_mode = "provisioned"
+ engine_version = "15.8"
+ database_name = "litellm"
+ master_username = var.litellm_username
+ master_password = random_password.litellm_password.result
+ db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
+ vpc_security_group_ids = [aws_security_group.allow-port-5432.id]
+ backup_retention_period = 7
+ preferred_backup_window = "04:00-05:00"
+ skip_final_snapshot = false
+ storage_encrypted = true
+
+ serverlessv2_scaling_configuration {
+ min_capacity = 0.5 # 0.5 ACU = 1 GB RAM (idle state)
+ max_capacity = 8 # 8 ACU = 16 GB RAM (handles moderate usage)
+ }
tags = {
- Name = "litellm"
+ Name = "litellm-aurora"
}
- lifecycle {
- ignore_changes = [
- snapshot_identifier
- ]
+}
+
+# Aurora Serverless v2 Instance for LiteLLM
+resource "aws_rds_cluster_instance" "litellm_writer" {
+ identifier = "litellm-aurora-writer"
+ cluster_identifier = aws_rds_cluster.litellm.id
+ instance_class = "db.serverless"
+ engine = aws_rds_cluster.litellm.engine
+ engine_version = "15.8"
+ publicly_accessible = false
+ db_subnet_group_name = aws_db_subnet_group.db_subnet_group.name
+
+ tags = {
+ Name = "litellm-aurora-writer"
}
}
@@ -151,12 +189,18 @@ resource "aws_vpc_security_group_ingress_rule" "postgres" {
to_port = 5432
}
-resource "aws_vpc_security_group_egress_rule" "all" {
+# Allow access from us-west-2 VPC for multi-region deployment
+resource "aws_vpc_security_group_ingress_rule" "postgres_usw2" {
security_group_id = aws_security_group.allow-port-5432.id
- cidr_ipv4 = "0.0.0.0/0"
- ip_protocol = -1
+ cidr_ipv4 = "10.1.0.0/16"
+ ip_protocol = "tcp"
+ from_port = 5432
+ to_port = 5432
}
+# No egress rules needed - RDS only responds to inbound connections
+# This follows security best practice of least privilege
+
resource "aws_security_group" "allow-port-5432" {
vpc_id = var.vpc_id
name = "${var.name}-all-port-5432"
@@ -166,23 +210,95 @@ resource "aws_security_group" "allow-port-5432" {
}
}
-output "rds_port" {
- description = "Database instance port"
- value = aws_db_instance.db.port
+# Store Coder DB credentials in Secrets Manager
+resource "aws_secretsmanager_secret" "coder_db" {
+ name_prefix = "${var.name}-coder-db-"
+ description = "Coder PostgreSQL database credentials"
+ recovery_window_in_days = 7
+
+ tags = {
+ Name = "${var.name}-coder-db-secret"
+ }
+}
+
+resource "aws_secretsmanager_secret_version" "coder_db" {
+ secret_id = aws_secretsmanager_secret.coder_db.id
+ secret_string = jsonencode({
+ username = var.master_username
+ password = random_password.coder_master_password.result
+ host = aws_rds_cluster.coder.endpoint
+ reader_host = aws_rds_cluster.coder.reader_endpoint
+ port = aws_rds_cluster.coder.port
+ dbname = aws_rds_cluster.coder.database_name
+ url = "postgres://${var.master_username}:${random_password.coder_master_password.result}@${aws_rds_cluster.coder.endpoint}:${aws_rds_cluster.coder.port}/${aws_rds_cluster.coder.database_name}?sslmode=require"
+ reader_url = "postgres://${var.master_username}:${random_password.coder_master_password.result}@${aws_rds_cluster.coder.reader_endpoint}:${aws_rds_cluster.coder.port}/${aws_rds_cluster.coder.database_name}?sslmode=require"
+ cluster_id = aws_rds_cluster.coder.id
+ engine_version = aws_rds_cluster.coder.engine_version
+ })
+}
+
+# Store LiteLLM DB credentials in Secrets Manager
+resource "aws_secretsmanager_secret" "litellm_db" {
+ name_prefix = "litellm-db-"
+ description = "LiteLLM PostgreSQL database credentials"
+ recovery_window_in_days = 7
+
+ tags = {
+ Name = "litellm-db-secret"
+ }
+}
+
+resource "aws_secretsmanager_secret_version" "litellm_db" {
+ secret_id = aws_secretsmanager_secret.litellm_db.id
+ secret_string = jsonencode({
+ username = var.litellm_username
+ password = random_password.litellm_password.result
+ host = aws_rds_cluster.litellm.endpoint
+ reader_host = aws_rds_cluster.litellm.reader_endpoint
+ port = aws_rds_cluster.litellm.port
+ dbname = aws_rds_cluster.litellm.database_name
+ url = "postgres://${var.litellm_username}:${random_password.litellm_password.result}@${aws_rds_cluster.litellm.endpoint}:${aws_rds_cluster.litellm.port}/${aws_rds_cluster.litellm.database_name}?sslmode=require"
+ cluster_id = aws_rds_cluster.litellm.id
+ engine_version = aws_rds_cluster.litellm.engine_version
+ })
+}
+
+output "coder_cluster_endpoint" {
+ description = "Aurora cluster writer endpoint for Coder"
+ value = aws_rds_cluster.coder.endpoint
+}
+
+output "coder_cluster_reader_endpoint" {
+ description = "Aurora cluster reader endpoint for Coder"
+ value = aws_rds_cluster.coder.reader_endpoint
+}
+
+output "coder_cluster_port" {
+ description = "Aurora cluster port for Coder"
+ value = aws_rds_cluster.coder.port
+}
+
+output "coder_db_secret_arn" {
+ description = "ARN of Secrets Manager secret containing Coder DB credentials"
+ value = aws_secretsmanager_secret.coder_db.arn
+}
+
+output "litellm_cluster_endpoint" {
+ description = "Aurora cluster writer endpoint for LiteLLM"
+ value = aws_rds_cluster.litellm.endpoint
}
-output "rds_username" {
- description = "Database instance root username"
- value = aws_db_instance.db.username
+output "litellm_cluster_reader_endpoint" {
+ description = "Aurora cluster reader endpoint for LiteLLM"
+ value = aws_rds_cluster.litellm.reader_endpoint
}
-output "rds_address" {
- description = "Database instance address"
- value = aws_db_instance.db.address
+output "litellm_cluster_port" {
+ description = "Aurora cluster port for LiteLLM"
+ value = aws_rds_cluster.litellm.port
}
-output "rds_password" {
- description = "Database instance root password"
- value = aws_db_instance.db.password
- sensitive = true
+output "litellm_db_secret_arn" {
+ description = "ARN of Secrets Manager secret containing LiteLLM DB credentials"
+ value = aws_secretsmanager_secret.litellm_db.arn
}
diff --git a/infra/aws/us-east-2/route53/README.md b/infra/aws/us-east-2/route53/README.md
new file mode 100644
index 0000000..e52ef05
--- /dev/null
+++ b/infra/aws/us-east-2/route53/README.md
@@ -0,0 +1,69 @@
+# Route 53 Latency-Based Routing for Coder
+
+This Terraform configuration sets up Route 53 latency-based routing for the Coder deployment in us-east-2.
+
+## Overview
+
+Latency-based routing automatically directs users to the AWS region that provides the lowest latency, improving the user experience by connecting them to the nearest deployment.
+
+## Features
+
+- **Latency-based routing**: Routes users to the closest region automatically
+- **Health checks**: Monitors endpoint health and routes around failures
+- **Wildcard DNS**: Supports workspace application subdomains
+- **Automatic NLB discovery**: Retrieves NLB hostname from Kubernetes service
+
+## Prerequisites
+
+1. Hosted Zone ID for coderdemo.io (already configured: Z080884039133KJPAGA3S)
+2. Running EKS cluster with Coder deployed
+3. Network Load Balancer created via Kubernetes service
+
+## Deployment
+
+1. Create terraform.tfvars from the example:
+
+```bash
+cp terraform.tfvars.example terraform.tfvars
+```
+
+2. Update terraform.tfvars with your cluster name and hosted zone ID (the `hosted_zone_id` variable has no default):
+
+```hcl
+cluster_name   = "your-cluster-name"
+hosted_zone_id = "YOUR_ROUTE53_ZONE_ID"
+```
+
+3. Initialize and apply:
+
+```bash
+terraform init
+terraform plan
+terraform apply
+```
+
+## How It Works
+
+1. The configuration queries the Kubernetes service to get the NLB hostname
+2. Creates Route 53 A records with latency-based routing policy
+3. Sets up health checks to monitor endpoint availability
+4. Configures both main domain and wildcard records
+
+## Health Checks
+
+Health checks monitor the `/api/v2/buildinfo` endpoint on port 443 (HTTPS):
+
+- **Interval**: 30 seconds
+- **Failure threshold**: 3 consecutive failures
+- **Latency measurement**: Enabled for monitoring
+
+## Records Created
+
+- `coderdemo.io` - Main domain with latency routing
+- `*.coderdemo.io` - Wildcard for workspace applications
+
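+You can ask Route 53 directly what it would answer for a client in a given network (the zone ID and client subnet below are placeholders):
+
+```bash
+aws route53 test-dns-answer \
+  --hosted-zone-id YOUR_ZONE_ID \
+  --record-name coderdemo.io \
+  --record-type A \
+  --edns0-client-subnet-ip 203.0.113.0 \
+  --edns0-client-subnet-mask 24
+
+# Or simply resolve the record from your own location
+dig +short coderdemo.io
+```
+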
+## Important Notes
+
+- Deploy this configuration in **both** us-east-2 and us-west-2 with different set_identifiers
+- Each region's configuration points to its local NLB
+- Route 53 automatically routes based on measured latency
+- Health checks ensure failover if one region becomes unhealthy
diff --git a/infra/aws/us-east-2/route53/main.tf b/infra/aws/us-east-2/route53/main.tf
new file mode 100644
index 0000000..3f0e191
--- /dev/null
+++ b/infra/aws/us-east-2/route53/main.tf
@@ -0,0 +1,217 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.0"
+ }
+ kubernetes = {
+ source = "hashicorp/kubernetes"
+ version = ">= 2.0"
+ }
+ }
+}
+
+variable "cluster_region" {
+ description = "AWS region"
+ type = string
+ default = "us-east-2"
+}
+
+variable "cluster_profile" {
+ description = "AWS profile"
+ type = string
+ default = "default"
+}
+
+variable "cluster_name" {
+ description = "EKS cluster name"
+ type = string
+}
+
+variable "domain_name" {
+ description = "Domain name for Coder"
+ type = string
+ default = ""
+}
+
+variable "hosted_zone_id" {
+ description = "Route 53 Hosted Zone ID (provide via tfvars)"
+ type = string
+}
+
+variable "coder_service_name" {
+ description = "Coder service name in Kubernetes"
+ type = string
+ default = "coder"
+}
+
+variable "coder_namespace" {
+ description = "Coder namespace in Kubernetes"
+ type = string
+ default = "coder"
+}
+
+variable "set_identifier" {
+ description = "Unique identifier for this routing policy record"
+ type = string
+ default = "us-east-2"
+}
+
+variable "health_check_enabled" {
+ description = "Enable Route 53 health checks"
+ type = bool
+ default = true
+}
+
+variable "health_check_path" {
+ description = "Path for health checks"
+ type = string
+ default = "/api/v2/buildinfo"
+}
+
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+}
+
+data "aws_eks_cluster" "this" {
+ name = var.cluster_name
+}
+
+data "aws_eks_cluster_auth" "this" {
+ name = var.cluster_name
+}
+
+provider "kubernetes" {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.this.token
+}
+
+# Get the NLB hostname from the Kubernetes service
+data "kubernetes_service" "coder" {
+ metadata {
+ name = var.coder_service_name
+ namespace = var.coder_namespace
+ }
+}
+
+# Extract the NLB details
+locals {
+ nlb_hostname = try(data.kubernetes_service.coder.status[0].load_balancer[0].ingress[0].hostname, "")
+}
+
+# Get NLB by tags (AWS Load Balancer Controller tags the NLB)
+data "aws_lb" "coder_nlb" {
+ tags = {
+ "service.k8s.aws/stack" = "${var.coder_namespace}/${var.coder_service_name}"
+ }
+}
+
+# Health check for the NLB endpoint
+resource "aws_route53_health_check" "coder" {
+ count = var.health_check_enabled ? 1 : 0
+ type = "HTTPS"
+ resource_path = var.health_check_path
+ fqdn = var.domain_name
+ port = 443
+ request_interval = 30
+ failure_threshold = 3
+ measure_latency = true
+
+ tags = {
+ Name = "coder-${var.set_identifier}"
+ Region = var.cluster_region
+ Environment = "production"
+ ManagedBy = "terraform"
+ }
+}
+
+# Latency-based routing record for the main domain
+resource "aws_route53_record" "coder_latency" {
+ zone_id = var.hosted_zone_id
+ name = var.domain_name
+ type = "A"
+ set_identifier = var.set_identifier
+ allow_overwrite = true
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+
+ latency_routing_policy {
+ region = var.cluster_region
+ }
+
+ health_check_id = var.health_check_enabled ? aws_route53_health_check.coder[0].id : null
+}
+
+# Latency-based routing record for wildcard subdomains
+resource "aws_route53_record" "coder_wildcard_latency" {
+ zone_id = var.hosted_zone_id
+ name = "*.${var.domain_name}"
+ type = "A"
+ set_identifier = var.set_identifier
+ allow_overwrite = true
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+
+ latency_routing_policy {
+ region = var.cluster_region
+ }
+
+ health_check_id = var.health_check_enabled ? aws_route53_health_check.coder[0].id : null
+}
+
+# Region-specific subdomain for manual region selection
+resource "aws_route53_record" "coder_region_specific" {
+ zone_id = var.hosted_zone_id
+ name = "${var.set_identifier}.${var.domain_name}"
+ type = "A"
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+}
+
+# Wildcard for region-specific subdomain (for workspace apps)
+resource "aws_route53_record" "coder_region_specific_wildcard" {
+ zone_id = var.hosted_zone_id
+ name = "*.${var.set_identifier}.${var.domain_name}"
+ type = "A"
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+}
+
+# Outputs
+output "nlb_hostname" {
+ description = "Network Load Balancer hostname"
+ value = local.nlb_hostname
+}
+
+output "nlb_zone_id" {
+ description = "Network Load Balancer Route 53 zone ID"
+ value = data.aws_lb.coder_nlb.zone_id
+}
+
+output "health_check_id" {
+ description = "Route 53 health check ID"
+ value = var.health_check_enabled ? aws_route53_health_check.coder[0].id : null
+}
+
+output "route53_record_fqdn" {
+ description = "Fully qualified domain name of the Route 53 record"
+ value = aws_route53_record.coder_latency.fqdn
+}
diff --git a/infra/aws/us-east-2/terraform-backend/main.tf b/infra/aws/us-east-2/terraform-backend/main.tf
new file mode 100644
index 0000000..5be0f2d
--- /dev/null
+++ b/infra/aws/us-east-2/terraform-backend/main.tf
@@ -0,0 +1,144 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.0"
+ }
+ }
+}
+
+variable "region" {
+ description = "AWS region for backend resources"
+ type = string
+ default = "us-east-2"
+}
+
+variable "profile" {
+ description = "AWS profile"
+ type = string
+ default = "noah@coder.com"
+}
+
+variable "project_name" {
+ description = "Project name for resource naming"
+ type = string
+ default = "coder-demo"
+}
+
+provider "aws" {
+ region = var.region
+ profile = var.profile
+}
+
+# S3 bucket for Terraform state
+resource "aws_s3_bucket" "terraform_state" {
+ bucket = "${var.project_name}-terraform-state-${data.aws_caller_identity.current.account_id}"
+
+ tags = {
+ Name = "Terraform State Bucket"
+ Environment = "production-demo"
+ ManagedBy = "terraform"
+ Purpose = "terraform-backend"
+ }
+}
+
+# Enable versioning for state file history
+resource "aws_s3_bucket_versioning" "terraform_state" {
+ bucket = aws_s3_bucket.terraform_state.id
+
+ versioning_configuration {
+ status = "Enabled"
+ }
+}
+
+# Enable server-side encryption
+resource "aws_s3_bucket_server_side_encryption_configuration" "terraform_state" {
+ bucket = aws_s3_bucket.terraform_state.id
+
+ rule {
+ apply_server_side_encryption_by_default {
+ sse_algorithm = "AES256"
+ }
+ }
+}
+
+# Block public access
+resource "aws_s3_bucket_public_access_block" "terraform_state" {
+ bucket = aws_s3_bucket.terraform_state.id
+
+ block_public_acls = true
+ block_public_policy = true
+ ignore_public_acls = true
+ restrict_public_buckets = true
+}
+
+# Lifecycle policy to delete old state versions after 90 days
+resource "aws_s3_bucket_lifecycle_configuration" "terraform_state" {
+ bucket = aws_s3_bucket.terraform_state.id
+
+ rule {
+ id = "delete-old-versions"
+ status = "Enabled"
+
+ noncurrent_version_expiration {
+ noncurrent_days = 90
+ }
+ }
+
+ rule {
+ id = "abort-incomplete-uploads"
+ status = "Enabled"
+
+ abort_incomplete_multipart_upload {
+ days_after_initiation = 7
+ }
+ }
+}
+
+# DynamoDB table for state locking
+resource "aws_dynamodb_table" "terraform_locks" {
+ name = "${var.project_name}-terraform-locks"
+ billing_mode = "PAY_PER_REQUEST"
+ hash_key = "LockID"
+
+ attribute {
+ name = "LockID"
+ type = "S"
+ }
+
+ tags = {
+ Name = "Terraform State Lock Table"
+ Environment = "production-demo"
+ ManagedBy = "terraform"
+ Purpose = "terraform-backend"
+ }
+}
+
+# Get current AWS account ID
+data "aws_caller_identity" "current" {}
+
+# Outputs
+output "state_bucket_name" {
+ description = "S3 bucket name for Terraform state"
+ value = aws_s3_bucket.terraform_state.id
+}
+
+output "state_bucket_arn" {
+ description = "S3 bucket ARN"
+ value = aws_s3_bucket.terraform_state.arn
+}
+
+output "dynamodb_table_name" {
+ description = "DynamoDB table name for state locking"
+ value = aws_dynamodb_table.terraform_locks.id
+}
+
+output "backend_config" {
+ description = "Backend configuration to use in other modules"
+ value = {
+ bucket = aws_s3_bucket.terraform_state.id
+ region = var.region
+ dynamodb_table = aws_dynamodb_table.terraform_locks.id
+ encrypt = true
+ }
+}
diff --git a/infra/aws/us-east-2/terraform-backend/terraform.tfvars.example b/infra/aws/us-east-2/terraform-backend/terraform.tfvars.example
new file mode 100644
index 0000000..f62ce73
--- /dev/null
+++ b/infra/aws/us-east-2/terraform-backend/terraform.tfvars.example
@@ -0,0 +1,6 @@
+# Backend configuration for Coder demo environment
+# Copy this to terraform.tfvars and fill in your values
+
+region = "us-east-2"
+profile = "YOUR_AWS_PROFILE"
+project_name = "YOUR_PROJECT_NAME"
diff --git a/infra/aws/us-east-2/vpc-peering/main.tf b/infra/aws/us-east-2/vpc-peering/main.tf
new file mode 100644
index 0000000..ebfe054
--- /dev/null
+++ b/infra/aws/us-east-2/vpc-peering/main.tf
@@ -0,0 +1,164 @@
+terraform {
+ required_version = ">= 1.0"
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.100.0"
+ }
+ }
+ backend "s3" {}
+}
+
+variable "profile" {
+ type = string
+ default = "default"
+}
+
+variable "requester_vpc_id" {
+ description = "VPC ID in us-east-2 (requester)"
+ type = string
+}
+
+variable "accepter_vpc_id" {
+ description = "VPC ID in us-west-2 (accepter)"
+ type = string
+}
+
+variable "requester_vpc_cidr" {
+ description = "CIDR block for us-east-2 VPC"
+ type = string
+ default = "10.0.0.0/16"
+}
+
+variable "accepter_vpc_cidr" {
+ description = "CIDR block for us-west-2 VPC"
+ type = string
+ default = "10.1.0.0/16"
+}
+
+variable "requester_node_security_group_id" {
+ description = "Security group ID for EKS nodes in us-east-2"
+ type = string
+}
+
+variable "accepter_node_security_group_id" {
+ description = "Security group ID for EKS nodes in us-west-2"
+ type = string
+}
+
+# Provider for us-east-2 (requester)
+provider "aws" {
+ alias = "use2"
+ region = "us-east-2"
+ profile = var.profile
+}
+
+# Provider for us-west-2 (accepter)
+provider "aws" {
+ alias = "usw2"
+ region = "us-west-2"
+ profile = var.profile
+}
+
+# Create VPC peering connection from us-east-2
+resource "aws_vpc_peering_connection" "use2_to_usw2" {
+ provider = aws.use2
+
+ vpc_id = var.requester_vpc_id
+ peer_vpc_id = var.accepter_vpc_id
+ peer_region = "us-west-2"
+ auto_accept = false
+
+ tags = {
+ Name = "coderdemo-use2-usw2-peering"
+ ManagedBy = "terraform"
+ Side = "Requester"
+ }
+}
+
+# Accept the peering connection in us-west-2
+resource "aws_vpc_peering_connection_accepter" "usw2_accepter" {
+ provider = aws.usw2
+
+ vpc_peering_connection_id = aws_vpc_peering_connection.use2_to_usw2.id
+ auto_accept = true
+
+ tags = {
+ Name = "coderdemo-use2-usw2-peering"
+ ManagedBy = "terraform"
+ Side = "Accepter"
+ }
+}
+
+# Get route tables in us-east-2
+data "aws_route_tables" "use2" {
+ provider = aws.use2
+ vpc_id = var.requester_vpc_id
+}
+
+# Get route tables in us-west-2
+data "aws_route_tables" "usw2" {
+ provider = aws.usw2
+ vpc_id = var.accepter_vpc_id
+}
+
+# Add routes in us-east-2 route tables to us-west-2 CIDR
+resource "aws_route" "use2_to_usw2" {
+ provider = aws.use2
+ for_each = toset(data.aws_route_tables.use2.ids)
+
+ route_table_id = each.value
+ destination_cidr_block = var.accepter_vpc_cidr
+ vpc_peering_connection_id = aws_vpc_peering_connection.use2_to_usw2.id
+
+ depends_on = [aws_vpc_peering_connection_accepter.usw2_accepter]
+}
+
+# Add routes in us-west-2 route tables to us-east-2 CIDR
+resource "aws_route" "usw2_to_use2" {
+ provider = aws.usw2
+ for_each = toset(data.aws_route_tables.usw2.ids)
+
+ route_table_id = each.value
+ destination_cidr_block = var.requester_vpc_cidr
+ vpc_peering_connection_id = aws_vpc_peering_connection.use2_to_usw2.id
+
+ depends_on = [aws_vpc_peering_connection_accepter.usw2_accepter]
+}
+
+# Security group rule to allow Coder replica communication from us-west-2 to us-east-2
+resource "aws_security_group_rule" "use2_allow_coder_from_usw2" {
+ provider = aws.use2
+
+ type = "ingress"
+ from_port = 8080
+ to_port = 8080
+ protocol = "tcp"
+ cidr_blocks = [var.accepter_vpc_cidr]
+ security_group_id = var.requester_node_security_group_id
+ description = "Allow Coder replica communication from us-west-2"
+}
+
+# Security group rule to allow Coder replica communication from us-east-2 to us-west-2
+resource "aws_security_group_rule" "usw2_allow_coder_from_use2" {
+ provider = aws.usw2
+
+ type = "ingress"
+ from_port = 8080
+ to_port = 8080
+ protocol = "tcp"
+ cidr_blocks = [var.requester_vpc_cidr]
+ security_group_id = var.accepter_node_security_group_id
+ description = "Allow Coder replica communication from us-east-2"
+}
+
+# Outputs
+output "peering_connection_id" {
+ description = "VPC Peering Connection ID"
+ value = aws_vpc_peering_connection.use2_to_usw2.id
+}
+
+output "peering_status" {
+ description = "VPC Peering Connection Status"
+ value = aws_vpc_peering_connection.use2_to_usw2.accept_status
+}
diff --git a/infra/aws/us-west-2/acm/main.tf b/infra/aws/us-west-2/acm/main.tf
new file mode 100644
index 0000000..89122ca
--- /dev/null
+++ b/infra/aws/us-west-2/acm/main.tf
@@ -0,0 +1,108 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.0"
+ }
+ }
+}
+
+variable "cluster_region" {
+ description = "AWS region for ACM certificate"
+ type = string
+ default = "us-west-2"
+}
+
+variable "cluster_profile" {
+ description = "AWS profile"
+ type = string
+ default = "default"
+}
+
+variable "domain_name" {
+ description = "Domain name for Coder"
+ type = string
+ default = "coderdemo.io"
+}
+
+variable "hosted_zone_id" {
+ description = "Route 53 Hosted Zone ID"
+ type = string
+}
+
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+ alias = "acm"
+}
+
+# Provider for Route 53 (may be in different account)
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+ alias = "route53"
+}
+
+# ACM Certificate for Coder with wildcard
+resource "aws_acm_certificate" "coder" {
+ provider = aws.acm
+ domain_name = var.domain_name
+ validation_method = "DNS"
+
+ subject_alternative_names = [
+ "*.${var.domain_name}"
+ ]
+
+ lifecycle {
+ create_before_destroy = true
+ }
+
+ tags = {
+ Name = "coder-certificate"
+ Environment = "production"
+ ManagedBy = "terraform"
+ Region = "us-west-2"
+ }
+}
+
+# Route 53 validation records
+resource "aws_route53_record" "cert_validation" {
+ provider = aws.route53
+ for_each = {
+ for dvo in aws_acm_certificate.coder.domain_validation_options : dvo.domain_name => {
+ name = dvo.resource_record_name
+ record = dvo.resource_record_value
+ type = dvo.resource_record_type
+ }
+ }
+
+ allow_overwrite = true
+ name = each.value.name
+ records = [each.value.record]
+ ttl = 60
+ type = each.value.type
+ zone_id = var.hosted_zone_id
+}
+
+# Wait for certificate validation
+resource "aws_acm_certificate_validation" "coder" {
+ provider = aws.acm
+ certificate_arn = aws_acm_certificate.coder.arn
+ validation_record_fqdns = [for record in aws_route53_record.cert_validation : record.fqdn]
+}
+
+# Outputs
+output "certificate_arn" {
+ description = "ARN of the validated ACM certificate"
+ value = aws_acm_certificate_validation.coder.certificate_arn
+}
+
+output "domain_name" {
+ description = "Domain name for Coder"
+ value = var.domain_name
+}
+
+output "validation_status" {
+ description = "Certificate validation status"
+ value = "Certificate validated and ready to use"
+}
diff --git a/infra/aws/us-west-2/eks/main.tf b/infra/aws/us-west-2/eks/main.tf
index 2bffa33..3140818 100644
--- a/infra/aws/us-west-2/eks/main.tf
+++ b/infra/aws/us-west-2/eks/main.tf
@@ -30,10 +30,16 @@ variable "cluster_version" {
variable "cluster_instance_type" {
description = "EKS Instance Size/Type"
- default = "t3.xlarge"
+ default = "t4g.medium" # ARM Graviton for cost optimization
type = string
}
+variable "allowed_cidrs" {
+ description = "CIDR blocks allowed to access EKS API endpoint"
+ type = list(string)
+ default = ["0.0.0.0/0"] # Open by default, restrict in tfvars
+}
+
provider "aws" {
region = var.region
profile = var.profile
@@ -73,16 +79,16 @@ module "eks-network" {
source = "../../../../modules/network/eks-vpc"
name = var.name
- vpc_cidr_block = "10.0.0.0/16"
+ vpc_cidr_block = "10.1.0.0/16"
public_subnets = {
"system0" = {
- cidr_block = "10.0.10.0/24"
+ cidr_block = "10.1.10.0/24"
availability_zone = "${data.aws_region.this.name}a"
map_public_ip_on_launch = true
private_dns_hostname_type_on_launch = "ip-name"
}
"system1" = {
- cidr_block = "10.0.11.0/24"
+ cidr_block = "10.1.11.0/24"
availability_zone = "${data.aws_region.this.name}b"
map_public_ip_on_launch = true
private_dns_hostname_type_on_launch = "ip-name"
@@ -90,26 +96,26 @@ module "eks-network" {
}
private_subnets = {
"system0" = {
- cidr_block = "10.0.20.0/24"
+ cidr_block = "10.1.20.0/24"
availability_zone = "${data.aws_region.this.name}a"
private_dns_hostname_type_on_launch = "ip-name"
tags = local.system_subnet_tags
}
"system1" = {
- cidr_block = "10.0.21.0/24"
+ cidr_block = "10.1.21.0/24"
availability_zone = "${data.aws_region.this.name}b"
private_dns_hostname_type_on_launch = "ip-name"
tags = local.system_subnet_tags
}
"provisioner" = {
- cidr_block = "10.0.22.0/24"
+ cidr_block = "10.1.22.0/24"
availability_zone = "${data.aws_region.this.name}a"
map_public_ip_on_launch = true
private_dns_hostname_type_on_launch = "ip-name"
tags = local.provisioner_subnet_tags
}
"ws-all" = {
- cidr_block = "10.0.16.0/22"
+ cidr_block = "10.1.16.0/22"
availability_zone = "${data.aws_region.this.name}b"
map_public_ip_on_launch = true
private_dns_hostname_type_on_launch = "ip-name"
@@ -144,10 +150,11 @@ module "cluster" {
module.eks-network.intra_subnet_ids
))
- cluster_name = var.name
- cluster_version = var.cluster_version
- cluster_endpoint_public_access = true
- cluster_endpoint_private_access = true
+ cluster_name = var.name
+ cluster_version = var.cluster_version
+ cluster_endpoint_public_access = true
+ cluster_endpoint_private_access = true
+ cluster_endpoint_public_access_cidrs = var.allowed_cidrs
create_cluster_security_group = true
create_node_security_group = true
@@ -179,11 +186,12 @@ module "cluster" {
system = {
min_size = 0
max_size = 10
- desired_size = 0 # Cant be modified after creation. Override from AWS Console
+ desired_size = 1 # Scale to 1 node for cluster functionality
labels = local.cluster_asg_node_labels
instance_types = [var.cluster_instance_type]
capacity_type = "ON_DEMAND"
+ ami_type = "AL2023_ARM_64_STANDARD" # ARM AMI for Graviton instances
iam_role_additional_policies = {
AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
STSAssumeRole = aws_iam_policy.sts.arn
diff --git a/infra/aws/us-west-2/k8s/cert-manager/main.tf b/infra/aws/us-west-2/k8s/cert-manager/main.tf
index c2869b5..8a423e6 100644
--- a/infra/aws/us-west-2/k8s/cert-manager/main.tf
+++ b/infra/aws/us-west-2/k8s/cert-manager/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -41,11 +41,6 @@ variable "addon_version" {
default = "1.13.3"
}
-variable "cloudflare_api_token" {
- type = string
- sensitive = true
-}
-
provider "aws" {
region = var.cluster_region
profile = var.cluster_profile
@@ -60,7 +55,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -78,7 +73,6 @@ module "cert-manager" {
cluster_name = var.cluster_name
cluster_oidc_provider_arn = var.cluster_oidc_provider_arn
- namespace = var.addon_namespace
- helm_version = var.addon_version
- cloudflare_token_secret = var.cloudflare_api_token
+ namespace = var.addon_namespace
+ helm_version = var.addon_version
}
\ No newline at end of file
diff --git a/infra/aws/us-west-2/k8s/coder-proxy/main.tf b/infra/aws/us-west-2/k8s/coder-proxy/main.tf
index fc46036..06b5c6b 100644
--- a/infra/aws/us-west-2/k8s/coder-proxy/main.tf
+++ b/infra/aws/us-west-2/k8s/coder-proxy/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -101,11 +101,6 @@ variable "kubernetes_create_ssl_secret" {
default = true
}
-variable "cloudflare_api_token" {
- type = string
- sensitive = true
-}
-
provider "aws" {
region = var.cluster_region
profile = var.cluster_profile
@@ -120,7 +115,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -161,7 +156,6 @@ module "coder-proxy" {
proxy_token_config = {
name = "coder-proxy"
}
- cloudflare_api_token = var.cloudflare_api_token
ssl_cert_config = {
name = var.kubernetes_ssl_secret_name
create_secret = var.kubernetes_create_ssl_secret
@@ -208,9 +202,4 @@ module "coder-proxy" {
topology_key = "kubernetes.io/hostname"
}
}]
-}
-
-import {
- id = "coder-proxy"
- to = module.coder-proxy.kubernetes_namespace.this
}
\ No newline at end of file
diff --git a/infra/aws/us-west-2/k8s/coder-server/main.tf b/infra/aws/us-west-2/k8s/coder-server/main.tf
new file mode 100644
index 0000000..c66b01f
--- /dev/null
+++ b/infra/aws/us-west-2/k8s/coder-server/main.tf
@@ -0,0 +1,318 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ }
+ helm = {
+ source = "hashicorp/helm"
+ version = "3.1.1"
+ }
+ kubernetes = {
+ source = "hashicorp/kubernetes"
+ }
+ coderd = {
+ source = "coder/coderd"
+ }
+ acme = {
+ source = "vancluever/acme"
+ }
+ tls = {
+ source = "hashicorp/tls"
+ }
+ }
+ backend "s3" {}
+}
+
+variable "cluster_name" {
+ type = string
+}
+
+variable "cluster_region" {
+ type = string
+}
+
+variable "cluster_profile" {
+ type = string
+ default = "default"
+}
+
+variable "cluster_oidc_provider_arn" {
+ type = string
+}
+
+variable "acme_server_url" {
+ type = string
+ default = "https://acme-v02.api.letsencrypt.org/directory"
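+  # For testing, the Let's Encrypt staging directory avoids production rate limits:
+  #   https://acme-staging-v02.api.letsencrypt.org/directory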
+}
+
+variable "acme_registration_email" {
+ type = string
+}
+
+variable "addon_version" {
+ type = string
+ default = "2.25.1"
+}
+
+variable "coder_access_url" {
+ type = string
+}
+
+variable "coder_wildcard_access_url" {
+ type = string
+}
+
+variable "coder_experiments" {
+ type = list(string)
+ default = []
+}
+
+variable "coder_github_allowed_orgs" {
+ type = list(string)
+ default = []
+}
+
+variable "coder_builtin_provisioner_count" {
+ type = number
+ default = 0
+}
+
+variable "coder_github_external_auth_secret_client_secret" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_github_external_auth_secret_client_id" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_oauth_secret_client_secret" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_oauth_secret_client_id" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_oidc_secret_client_secret" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_oidc_secret_client_id" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_oidc_secret_issuer_url" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_db_secret_url" {
+ type = string
+ sensitive = true
+}
+
+variable "coder_token" {
+ type = string
+ sensitive = true
+}
+
+variable "image_repo" {
+ type = string
+ sensitive = true
+}
+
+variable "image_tag" {
+ type = string
+ default = "latest"
+}
+
+variable "kubernetes_ssl_secret_name" {
+ type = string
+}
+
+variable "kubernetes_create_ssl_secret" {
+ type = bool
+ default = true
+}
+
+variable "oidc_sign_in_text" {
+ type = string
+}
+
+variable "oidc_icon_url" {
+ type = string
+}
+
+variable "oidc_scopes" {
+ type = list(string)
+}
+
+variable "oidc_email_domain" {
+ type = string
+}
+
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+}
+
+data "aws_eks_cluster" "this" {
+ name = var.cluster_name
+}
+
+data "aws_eks_cluster_auth" "this" {
+ name = var.cluster_name
+}
+
+provider "helm" {
+ kubernetes = {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.this.token
+ }
+}
+
+provider "kubernetes" {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.this.token
+}
+
+provider "coderd" {
+ url = var.coder_access_url
+ token = var.coder_token
+}
+
+provider "acme" {
+ server_url = var.acme_server_url
+}
+
+# Fetch ACM certificate dynamically by domain to avoid hardcoding sensitive ARNs
+data "aws_acm_certificate" "coder" {
+ domain = trimsuffix(trimprefix(var.coder_access_url, "https://"), "/")
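+  # e.g. a hypothetical access URL "https://coder.example.com/" becomes the domain "coder.example.com"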
+ statuses = ["ISSUED"]
+ most_recent = true
+}
+
+module "coder-server" {
+ source = "../../../../../modules/k8s/bootstrap/coder-server"
+
+ cluster_name = var.cluster_name
+ cluster_oidc_provider_arn = var.cluster_oidc_provider_arn
+
+ namespace = "coder"
+ acme_registration_email = var.acme_registration_email
+ acme_days_until_renewal = 90
+ replica_count = 1 # HA requires Enterprise license
+ helm_version = var.addon_version
+ image_repo = var.image_repo
+ image_tag = var.image_tag
+ primary_access_url = var.coder_access_url
+ wildcard_access_url = var.coder_wildcard_access_url
+ coder_experiments = var.coder_experiments
+ coder_builtin_provisioner_count = var.coder_builtin_provisioner_count
+ coder_github_allowed_orgs = var.coder_github_allowed_orgs
+ ssl_cert_config = {
+ name = var.kubernetes_ssl_secret_name
+ create_secret = var.kubernetes_create_ssl_secret
+ }
+ oidc_config = {
+ sign_in_text = var.oidc_sign_in_text
+ icon_url = var.oidc_icon_url
+ scopes = var.oidc_scopes
+ email_domain = var.oidc_email_domain
+ }
+ db_secret_url = var.coder_db_secret_url
+ oidc_secret_issuer_url = var.coder_oidc_secret_issuer_url
+ oidc_secret_client_id = var.coder_oidc_secret_client_id
+ oidc_secret_client_secret = var.coder_oidc_secret_client_secret
+ oauth_secret_client_id = var.coder_oauth_secret_client_id
+ oauth_secret_client_secret = var.coder_oauth_secret_client_secret
+ github_external_auth_secret_client_id = var.coder_github_external_auth_secret_client_id
+ github_external_auth_secret_client_secret = var.coder_github_external_auth_secret_client_secret
+ tags = {}
+ env_vars = {
+    # Disable the redirect since the NLB terminates TLS and forwards plain HTTP to the backend.
+    # Without this, Coder sees HTTP and redirects to HTTPS, causing an infinite redirect loop.
+ CODER_REDIRECT_TO_ACCESS_URL = "false"
+ # Disable TLS on Coder itself since NLB terminates TLS
+ CODER_TLS_ENABLE = "false"
+ # Mark auth cookies as secure since users access via HTTPS
+ CODER_SECURE_AUTH_COOKIE = "true"
+ # Enable DERP server for multi-region replica communication
+ CODER_DERP_SERVER_ENABLE = "true"
+ }
+ service_annotations = {
+ "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type" = "instance"
+ "service.beta.kubernetes.io/aws-load-balancer-scheme" = "internet-facing"
+ "service.beta.kubernetes.io/aws-load-balancer-attributes" = "deletion_protection.enabled=false,load_balancing.cross_zone.enabled=true"
+ "service.beta.kubernetes.io/aws-load-balancer-ssl-cert" = data.aws_acm_certificate.coder.arn
+ "service.beta.kubernetes.io/aws-load-balancer-ssl-ports" = "443"
+ "service.beta.kubernetes.io/aws-load-balancer-backend-protocol" = "tcp"
+    # Subnets are auto-detected by the AWS Load Balancer Controller via the kubernetes.io/role/elb=1 tag
+ }
+ node_selector = {
+ "node.coder.io/managed-by" = "karpenter"
+ "node.coder.io/used-for" = "coder-server"
+ }
+ tolerations = [{
+ key = "dedicated"
+ operator = "Equal"
+ value = "coder-server"
+ effect = "NoSchedule"
+ }]
+ topology_spread_constraints = [{
+ max_skew = 1
+ topology_key = "kubernetes.io/hostname"
+ when_unsatisfiable = "ScheduleAnyway"
+ label_selector = {
+ match_labels = {
+ "app.kubernetes.io/name" = "coder"
+ "app.kubernetes.io/part-of" = "coder"
+ }
+ }
+ match_label_keys = [
+ "app.kubernetes.io/instance"
+ ]
+ }]
+ pod_anti_affinity_preferred_during_scheduling_ignored_during_execution = [{
+ weight = 100
+ pod_affinity_term = {
+ label_selector = {
+ match_labels = {
+ "app.kubernetes.io/instance" = "coder-v2"
+ "app.kubernetes.io/name" = "coder"
+ "app.kubernetes.io/part-of" = "coder"
+ }
+ }
+ topology_key = "kubernetes.io/hostname"
+ }
+ }]
+}
+
+# Fix service HTTPS port to forward to HTTP backend (port 8080)
+# since Coder has TLS disabled and only listens on HTTP
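+# Assumes kubectl on the machine running terraform apply is already authenticated against this cluster.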
+resource "null_resource" "patch_coder_service" {
+ depends_on = [module.coder-server]
+
+ triggers = {
+    # timestamp() changes on every run, so this patch re-applies on each terraform apply
+ always_run = timestamp()
+ }
+
+ provisioner "local-exec" {
+ command = <<-EOT
+ sleep 10
+ kubectl patch svc coder -n coder --type='json' \
+ -p='[{"op": "replace", "path": "/spec/ports/1/targetPort", "value": "http"}]' \
+ 2>/dev/null || true
+ EOT
+ }
+}
diff --git a/infra/aws/us-west-2/k8s/coder-ws/main.tf b/infra/aws/us-west-2/k8s/coder-ws/main.tf
index 451a056..6c9140b 100644
--- a/infra/aws/us-west-2/k8s/coder-ws/main.tf
+++ b/infra/aws/us-west-2/k8s/coder-ws/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -98,7 +98,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-west-2/k8s/ebs-controller/main.tf b/infra/aws/us-west-2/k8s/ebs-controller/main.tf
index d7f1f56..5194ec7 100644
--- a/infra/aws/us-west-2/k8s/ebs-controller/main.tf
+++ b/infra/aws/us-west-2/k8s/ebs-controller/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -55,7 +55,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-west-2/k8s/karpenter/main.tf b/infra/aws/us-west-2/k8s/karpenter/main.tf
index f5b34f8..2e9426a 100644
--- a/infra/aws/us-west-2/k8s/karpenter/main.tf
+++ b/infra/aws/us-west-2/k8s/karpenter/main.tf
@@ -5,11 +5,14 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
}
+ null = {
+ source = "hashicorp/null"
+ }
}
backend "s3" {}
}
@@ -40,6 +43,16 @@ variable "addon_namespace" {
default = "default"
}
+variable "karpenter_queue_name" {
+ type = string
+ default = ""
+}
+
+variable "karpenter_queue_rule_name" {
+ type = string
+ default = ""
+}
+
provider "aws" {
region = var.cluster_region
profile = var.cluster_profile
@@ -54,7 +67,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
@@ -101,6 +114,24 @@ locals {
locals {
nodepool_configs = [{
+ name = "coder-server"
+ node_labels = merge(local.global_node_labels, {
+ "node.coder.io/name" = "coder"
+ "node.coder.io/part-of" = "coder"
+ "node.coder.io/used-for" = "coder-server"
+ })
+ node_taints = [{
+ key = "dedicated"
+ value = "coder-server"
+ effect = "NoSchedule"
+ }]
+ node_requirements = concat(local.global_node_reqs, [{
+ key = "node.kubernetes.io/instance-type"
+ operator = "In"
+ values = ["t3.xlarge", "t3a.xlarge", "t3.2xlarge", "t3a.2xlarge"]
+ }])
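+    # Reuses the coder-proxy EC2NodeClass below; no dedicated coder-server node class is defined.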
+ node_class_ref_name = "coder-proxy-class"
+ }, {
name = "coder-proxy"
node_labels = merge(local.global_node_labels, {
"node.coder.io/name" = "coder"
@@ -115,7 +146,7 @@ locals {
node_requirements = concat(local.global_node_reqs, [{
key = "node.kubernetes.io/instance-type"
operator = "In"
- values = ["m5a.xlarge", "m6a.xlarge"]
+ values = ["m5a.xlarge", "m6a.xlarge", "t3.xlarge", "t3a.xlarge"]
}])
node_class_ref_name = "coder-proxy-class"
}, {
@@ -133,7 +164,7 @@ locals {
node_requirements = concat(local.global_node_reqs, [{
key = "node.kubernetes.io/instance-type"
operator = "In"
- values = ["m5a.4xlarge", "m6a.4xlarge"]
+ values = ["m5a.4xlarge", "m6a.4xlarge", "m5a.2xlarge", "m6a.2xlarge"]
}])
node_class_ref_name = "coder-provisioner-class"
}, {
@@ -151,7 +182,15 @@ locals {
node_requirements = concat(local.global_node_reqs, [{
key = "node.kubernetes.io/instance-type"
operator = "In"
- values = ["c6a.32xlarge", "c5a.32xlarge"]
+ values = [
+ # Small demos (5-10 users) - Most cost-effective
+ "c6a.4xlarge", "c5a.4xlarge", # 16 vCPU / 32 GB - ~$0.18/hr spot
+ "c6a.8xlarge", "c5a.8xlarge", # 32 vCPU / 64 GB - ~$0.37/hr spot
+ # Medium demos (10-20 users)
+ "c6a.16xlarge", "c5a.16xlarge", # 64 vCPU / 128 GB - ~$0.74/hr spot
+ # Large demos (20-40 users)
+ "c6a.32xlarge", "c5a.32xlarge" # 128 vCPU / 256 GB - ~$1.47/hr spot
+ ]
}])
node_class_ref_name = "coder-ws-class"
disruption_consolidate_after = "30m"
@@ -168,6 +207,9 @@ module "karpenter-addon" {
node_selector = {
"node.amazonaws.io/managed-by" : "asg"
}
+
+ karpenter_queue_name = var.karpenter_queue_name
+ karpenter_queue_rule_name = var.karpenter_queue_rule_name
ec2nodeclass_configs = [{
name = "coder-proxy-class"
subnet_selector_tags = local.provisioner_subnet_tags
@@ -181,13 +223,13 @@ module "karpenter-addon" {
block_device_mappings = [{
device_name = "/dev/xvda"
ebs = {
- volume_size = "1400Gi"
+        volume_size = "500Gi" # Kubernetes-style Gi unit, per the module's documented convention
volume_type = "gp3"
}
}, {
device_name = "/dev/xvdb"
ebs = {
- volume_size = "50Gi"
+        volume_size = "50Gi"
volume_type = "gp3"
}
}]
@@ -196,4 +238,31 @@ module "karpenter-addon" {
subnet_selector_tags = local.provisioner_subnet_tags
sg_selector_tags = local.provisioner_sg_tags
}]
+}
+
+# Create NodePools for each configuration
+module "nodepools" {
+ for_each = { for np in local.nodepool_configs : np.name => np }
+ source = "../../../../../modules/k8s/objects/nodepool"
+
+ name = each.value.name
+ node_labels = each.value.node_labels
+ node_taints = each.value.node_taints
+ node_requirements = each.value.node_requirements
+ node_class_ref_name = each.value.node_class_ref_name
+ disruption_consolidate_after = lookup(each.value, "disruption_consolidate_after", "1m")
+ disruption_consolidation_policy = lookup(each.value, "disruption_consolidation_policy", "WhenEmpty")
+
+ depends_on = [module.karpenter-addon]
+}
+
+# Apply the NodePool manifests
+resource "null_resource" "apply_nodepools" {
+ for_each = module.nodepools
+
+ provisioner "local-exec" {
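+    # NOTE: the single-quote wrapping assumes the rendered manifest contains no single quotes,
+    # and kubectl must already target this cluster.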
+ command = "echo '${each.value.manifest}' | kubectl apply -f -"
+ }
+
+ depends_on = [module.karpenter-addon]
}
\ No newline at end of file
diff --git a/infra/aws/us-west-2/k8s/lb-controller/main.tf b/infra/aws/us-west-2/k8s/lb-controller/main.tf
index 1f6a0fa..63d0c6b 100644
--- a/infra/aws/us-west-2/k8s/lb-controller/main.tf
+++ b/infra/aws/us-west-2/k8s/lb-controller/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -60,13 +60,19 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
}
}
+provider "kubernetes" {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.this.token
+}
+
module "lb-controller" {
source = "../../../../../modules/k8s/bootstrap/lb-controller"
cluster_name = data.aws_eks_cluster.this.name
diff --git a/infra/aws/us-west-2/k8s/metrics-server/main.tf b/infra/aws/us-west-2/k8s/metrics-server/main.tf
index d808c74..cce9447 100644
--- a/infra/aws/us-west-2/k8s/metrics-server/main.tf
+++ b/infra/aws/us-west-2/k8s/metrics-server/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
}
backend "s3" {}
@@ -48,7 +48,7 @@ data "aws_eks_cluster_auth" "this" {
}
provider "helm" {
- kubernetes {
+ kubernetes = {
host = data.aws_eks_cluster.this.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.this.token
diff --git a/infra/aws/us-west-2/k8s/nodepools/main.tf b/infra/aws/us-west-2/k8s/nodepools/main.tf
new file mode 100644
index 0000000..74d63c5
--- /dev/null
+++ b/infra/aws/us-west-2/k8s/nodepools/main.tf
@@ -0,0 +1,356 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.0"
+ }
+ kubernetes = {
+ source = "hashicorp/kubernetes"
+ version = ">= 2.20"
+ }
+ }
+}
+
+variable "cluster_name" {
+ description = "EKS cluster name"
+ type = string
+}
+
+variable "cluster_region" {
+ description = "AWS region"
+ type = string
+}
+
+variable "cluster_profile" {
+ description = "AWS profile"
+ type = string
+ default = "default"
+}
+
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+}
+
+data "aws_eks_cluster" "this" {
+ name = var.cluster_name
+}
+
+data "aws_eks_cluster_auth" "this" {
+ name = var.cluster_name
+}
+
+provider "kubernetes" {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.this.token
+}
+
+# NodePool for Coder Server
+resource "kubernetes_manifest" "coder_server_nodepool" {
+ manifest = {
+ apiVersion = "karpenter.sh/v1"
+ kind = "NodePool"
+ metadata = {
+ name = "coder-server"
+ }
+ spec = {
+ template = {
+ metadata = {
+ labels = {
+ "node.coder.io/instance" = "coder-v2"
+ "node.coder.io/managed-by" = "karpenter"
+ "node.coder.io/name" = "coder"
+ "node.coder.io/part-of" = "coder"
+ "node.coder.io/used-for" = "coder-server"
+ }
+ }
+ spec = {
+ expireAfter = "480h"
+ nodeClassRef = {
+ group = "eks.amazonaws.com"
+ kind = "NodeClass"
+ name = "default"
+ }
+ requirements = [
+ {
+ key = "karpenter.sh/capacity-type"
+ operator = "In"
+ values = ["on-demand"]
+ },
+ {
+ key = "kubernetes.io/arch"
+ operator = "In"
+ values = ["amd64"]
+ },
+ {
+ key = "kubernetes.io/os"
+ operator = "In"
+ values = ["linux"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-category"
+ operator = "In"
+ values = ["t", "m"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-generation"
+ operator = "Gt"
+            values   = ["2"] # Gt 2 keeps the generation-3 t3/t3a types listed below eligible
+ },
+ {
+ key = "node.kubernetes.io/instance-type"
+ operator = "In"
+ values = ["t3.xlarge", "t3.2xlarge", "t3a.xlarge", "t3a.2xlarge", "m5.xlarge", "m5.2xlarge"]
+ }
+ ]
+ taints = [
+ {
+ key = "dedicated"
+ value = "coder-server"
+ effect = "NoSchedule"
+ }
+ ]
+ terminationGracePeriod = "1h"
+ }
+ }
+ disruption = {
+ consolidationPolicy = "WhenEmpty"
+ consolidateAfter = "5m"
+ }
+ }
+ }
+}
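+# After apply, the pools can be inspected with `kubectl get nodepools`
+# (assumes the Karpenter v1 CRDs are installed in the cluster).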
+
+# NodePool for Coder Proxy
+resource "kubernetes_manifest" "coder_proxy_nodepool" {
+ manifest = {
+ apiVersion = "karpenter.sh/v1"
+ kind = "NodePool"
+ metadata = {
+ name = "coder-proxy"
+ }
+ spec = {
+ template = {
+ metadata = {
+ labels = {
+ "node.coder.io/instance" = "coder-v2"
+ "node.coder.io/managed-by" = "karpenter"
+ "node.coder.io/name" = "coder"
+ "node.coder.io/part-of" = "coder"
+ "node.coder.io/used-for" = "coder-proxy"
+ }
+ }
+ spec = {
+ expireAfter = "480h"
+ nodeClassRef = {
+ group = "eks.amazonaws.com"
+ kind = "NodeClass"
+ name = "default"
+ }
+ requirements = [
+ {
+ key = "karpenter.sh/capacity-type"
+ operator = "In"
+ values = ["on-demand", "spot"]
+ },
+ {
+ key = "kubernetes.io/arch"
+ operator = "In"
+ values = ["amd64"]
+ },
+ {
+ key = "kubernetes.io/os"
+ operator = "In"
+ values = ["linux"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-category"
+ operator = "In"
+ values = ["m", "c", "t"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-generation"
+ operator = "Gt"
+ values = ["4"]
+ }
+ ]
+ taints = [
+ {
+ key = "dedicated"
+ value = "coder-proxy"
+ effect = "NoSchedule"
+ }
+ ]
+ terminationGracePeriod = "30m"
+ }
+ }
+ disruption = {
+ consolidationPolicy = "WhenEmpty"
+ consolidateAfter = "5m"
+ }
+ }
+ }
+}
+
+# NodePool for Coder Provisioner
+resource "kubernetes_manifest" "coder_provisioner_nodepool" {
+ manifest = {
+ apiVersion = "karpenter.sh/v1"
+ kind = "NodePool"
+ metadata = {
+ name = "coder-provisioner"
+ }
+ spec = {
+ template = {
+ metadata = {
+ labels = {
+ "node.coder.io/instance" = "coder-v2"
+ "node.coder.io/managed-by" = "karpenter"
+ "node.coder.io/name" = "coder"
+ "node.coder.io/part-of" = "coder"
+ "node.coder.io/used-for" = "coder-provisioner"
+ }
+ }
+ spec = {
+ expireAfter = "480h"
+ nodeClassRef = {
+ group = "eks.amazonaws.com"
+ kind = "NodeClass"
+ name = "default"
+ }
+ requirements = [
+ {
+ key = "karpenter.sh/capacity-type"
+ operator = "In"
+ values = ["on-demand", "spot"]
+ },
+ {
+ key = "kubernetes.io/arch"
+ operator = "In"
+ values = ["amd64"]
+ },
+ {
+ key = "kubernetes.io/os"
+ operator = "In"
+ values = ["linux"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-category"
+ operator = "In"
+ values = ["m", "c"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-generation"
+ operator = "Gt"
+            values   = ["4"] # Gt 4 keeps the generation-5 m5/c5 types listed below eligible
+ },
+ {
+ key = "node.kubernetes.io/instance-type"
+ operator = "In"
+ values = ["m5.2xlarge", "m5.4xlarge", "m6a.2xlarge", "m6a.4xlarge", "c5.2xlarge", "c5.4xlarge"]
+ }
+ ]
+ taints = [
+ {
+ key = "dedicated"
+ value = "coder-provisioner"
+ effect = "NoSchedule"
+ }
+ ]
+ terminationGracePeriod = "30m"
+ }
+ }
+ disruption = {
+ consolidationPolicy = "WhenEmpty"
+ consolidateAfter = "10m"
+ }
+ }
+ }
+}
+
+# NodePool for Coder Workspaces
+resource "kubernetes_manifest" "coder_workspaces_nodepool" {
+ manifest = {
+ apiVersion = "karpenter.sh/v1"
+ kind = "NodePool"
+ metadata = {
+ name = "coder-workspaces"
+ }
+ spec = {
+ template = {
+ metadata = {
+ labels = {
+ "node.coder.io/instance" = "coder-v2"
+ "node.coder.io/managed-by" = "karpenter"
+ "node.coder.io/name" = "coder"
+ "node.coder.io/part-of" = "coder"
+ "node.coder.io/used-for" = "coder-workspaces"
+ }
+ }
+ spec = {
+ expireAfter = "336h" # 14 days for workspace nodes
+ nodeClassRef = {
+ group = "eks.amazonaws.com"
+ kind = "NodeClass"
+ name = "default"
+ }
+ requirements = [
+ {
+ key = "karpenter.sh/capacity-type"
+ operator = "In"
+ values = ["on-demand", "spot"]
+ },
+ {
+ key = "kubernetes.io/arch"
+ operator = "In"
+ values = ["amd64"]
+ },
+ {
+ key = "kubernetes.io/os"
+ operator = "In"
+ values = ["linux"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-category"
+ operator = "In"
+ values = ["c", "m", "r"]
+ },
+ {
+ key = "eks.amazonaws.com/instance-generation"
+ operator = "Gt"
+ values = ["5"]
+ }
+ ]
+ taints = [
+ {
+ key = "dedicated"
+ value = "coder-workspaces"
+ effect = "NoSchedule"
+ }
+ ]
+ terminationGracePeriod = "30m"
+ }
+ }
+ disruption = {
+ consolidationPolicy = "WhenEmptyOrUnderutilized"
+ consolidateAfter = "30m"
+ budgets = [
+ {
+ nodes = "10%"
+ }
+ ]
+ }
+ }
+ }
+}
+
+output "nodepools_created" {
+ description = "List of NodePools created"
+ value = [
+ "coder-server",
+ "coder-proxy",
+ "coder-provisioner",
+ "coder-workspaces"
+ ]
+}
diff --git a/infra/aws/us-west-2/route53/main.tf b/infra/aws/us-west-2/route53/main.tf
new file mode 100644
index 0000000..5b0221d
--- /dev/null
+++ b/infra/aws/us-west-2/route53/main.tf
@@ -0,0 +1,218 @@
+terraform {
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 5.0"
+ }
+ kubernetes = {
+ source = "hashicorp/kubernetes"
+ version = ">= 2.0"
+ }
+ }
+}
+
+variable "cluster_region" {
+ description = "AWS region"
+ type = string
+ default = "us-west-2"
+}
+
+variable "cluster_profile" {
+ description = "AWS profile"
+ type = string
+ default = "default"
+}
+
+variable "cluster_name" {
+ description = "EKS cluster name"
+ type = string
+}
+
+variable "domain_name" {
+ description = "Domain name for Coder"
+ type = string
+ default = "coderdemo.io"
+}
+
+variable "hosted_zone_id" {
+ description = "Route 53 Hosted Zone ID"
+ type = string
+ default = "Z080884039133KJPAGA3S"
+}
+
+variable "coder_service_name" {
+ description = "Coder service name in Kubernetes"
+ type = string
+ default = "coder"
+}
+
+variable "coder_namespace" {
+ description = "Coder namespace in Kubernetes"
+ type = string
+ default = "coder-proxy"
+}
+
+variable "set_identifier" {
+ description = "Unique identifier for this routing policy record"
+ type = string
+ default = "us-west-2"
+}
+
+variable "health_check_enabled" {
+ description = "Enable Route 53 health checks"
+ type = bool
+ default = true
+}
+
+variable "health_check_path" {
+ description = "Path for health checks"
+ type = string
+ default = "/api/v2/buildinfo"
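+  # Coder's unauthenticated build-info endpoint; returns 200 whenever the server is up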
+}
+
+provider "aws" {
+ region = var.cluster_region
+ profile = var.cluster_profile
+}
+
+data "aws_eks_cluster" "this" {
+ name = var.cluster_name
+}
+
+data "aws_eks_cluster_auth" "this" {
+ name = var.cluster_name
+}
+
+provider "kubernetes" {
+ host = data.aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.this.token
+}
+
+# Get the NLB hostname from the Kubernetes service
+data "kubernetes_service" "coder" {
+ metadata {
+ name = var.coder_service_name
+ namespace = var.coder_namespace
+ }
+}
+
+# Extract the NLB details
+locals {
+ nlb_hostname = try(data.kubernetes_service.coder.status[0].load_balancer[0].ingress[0].hostname, "")
+}
+
+# Get NLB by tags (AWS Load Balancer Controller tags the NLB)
+data "aws_lb" "coder_nlb" {
+ tags = {
+ "service.k8s.aws/stack" = "${var.coder_namespace}/${var.coder_service_name}"
+ }
+}
+
+# Health check against the Coder endpoint (resolves var.domain_name, the latency-routed
+# FQDN, rather than this region's NLB hostname directly)
+resource "aws_route53_health_check" "coder" {
+ count = var.health_check_enabled ? 1 : 0
+ type = "HTTPS"
+ resource_path = var.health_check_path
+ fqdn = var.domain_name
+ port = 443
+ request_interval = 30
+ failure_threshold = 3
+ measure_latency = true
+
+ tags = {
+ Name = "coder-${var.set_identifier}"
+ Region = var.cluster_region
+ Environment = "production"
+ ManagedBy = "terraform"
+ }
+}
+
+# Latency-based routing record for the main domain
+resource "aws_route53_record" "coder_latency" {
+ zone_id = var.hosted_zone_id
+ name = var.domain_name
+ type = "A"
+ set_identifier = var.set_identifier
+ allow_overwrite = true
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+
+ latency_routing_policy {
+ region = var.cluster_region
+ }
+
+ health_check_id = var.health_check_enabled ? aws_route53_health_check.coder[0].id : null
+}
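+# A second region would be added by deploying this same stack there with a different
+# set_identifier (e.g. set_identifier = "eu-west-1"); Route 53 then answers each query
+# with the lowest-latency healthy record.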
+
+# Latency-based routing record for wildcard subdomains
+resource "aws_route53_record" "coder_wildcard_latency" {
+ zone_id = var.hosted_zone_id
+ name = "*.${var.domain_name}"
+ type = "A"
+ set_identifier = var.set_identifier
+ allow_overwrite = true
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+
+ latency_routing_policy {
+ region = var.cluster_region
+ }
+
+ health_check_id = var.health_check_enabled ? aws_route53_health_check.coder[0].id : null
+}
+
+# Region-specific subdomain for manual region selection
+resource "aws_route53_record" "coder_region_specific" {
+ zone_id = var.hosted_zone_id
+ name = "${var.set_identifier}.${var.domain_name}"
+ type = "A"
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+}
+
+# Wildcard for region-specific subdomain (for workspace apps)
+resource "aws_route53_record" "coder_region_specific_wildcard" {
+ zone_id = var.hosted_zone_id
+ name = "*.${var.set_identifier}.${var.domain_name}"
+ type = "A"
+
+ alias {
+ name = local.nlb_hostname
+ zone_id = data.aws_lb.coder_nlb.zone_id
+ evaluate_target_health = true
+ }
+}
+
+# Outputs
+output "nlb_hostname" {
+ description = "Network Load Balancer hostname"
+ value = local.nlb_hostname
+}
+
+output "nlb_zone_id" {
+ description = "Network Load Balancer Route 53 zone ID"
+ value = data.aws_lb.coder_nlb.zone_id
+}
+
+output "health_check_id" {
+ description = "Route 53 health check ID"
+ value = var.health_check_enabled ? aws_route53_health_check.coder[0].id : null
+}
+
+output "route53_record_fqdn" {
+ description = "Fully qualified domain name of the Route 53 record"
+ value = aws_route53_record.coder_latency.fqdn
+}
diff --git a/modules/k8s/bootstrap/cert-manager/main.tf b/modules/k8s/bootstrap/cert-manager/main.tf
index 8183719..6f90bb0 100644
--- a/modules/k8s/bootstrap/cert-manager/main.tf
+++ b/modules/k8s/bootstrap/cert-manager/main.tf
@@ -7,7 +7,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -132,12 +132,12 @@ resource "helm_release" "cert-manager" {
chart = "cert-manager"
repository = "oci://quay.io/jetstack/charts"
create_namespace = false
- # Removed invalid upgrade_install attribute for proper error handling
- skip_crds = false
- wait = true
- wait_for_jobs = true
- version = var.helm_version
- timeout = var.helm_timeout
+ upgrade_install = true
+ skip_crds = false
+ wait = true
+ wait_for_jobs = true
+ version = var.helm_version
+ timeout = var.helm_timeout
values = [yamlencode({
crds = {
diff --git a/modules/k8s/bootstrap/coder-provisioner/main.tf b/modules/k8s/bootstrap/coder-provisioner/main.tf
index 24f22f3..3840721 100644
--- a/modules/k8s/bootstrap/coder-provisioner/main.tf
+++ b/modules/k8s/bootstrap/coder-provisioner/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
diff --git a/modules/k8s/bootstrap/coder-proxy/main.tf b/modules/k8s/bootstrap/coder-proxy/main.tf
index 72c857e..579ecec 100644
--- a/modules/k8s/bootstrap/coder-proxy/main.tf
+++ b/modules/k8s/bootstrap/coder-proxy/main.tf
@@ -7,7 +7,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -346,6 +346,7 @@ resource "helm_release" "coder-proxy" {
chart = "coder"
repository = "https://helm.coder.com/v2"
create_namespace = false
+ upgrade_install = true
skip_crds = false
wait = true
wait_for_jobs = true
diff --git a/modules/k8s/bootstrap/coder-server/main.tf b/modules/k8s/bootstrap/coder-server/main.tf
index a27723a..48d0c5b 100644
--- a/modules/k8s/bootstrap/coder-server/main.tf
+++ b/modules/k8s/bootstrap/coder-server/main.tf
@@ -5,7 +5,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -114,8 +114,8 @@ variable "image_pull_secrets" {
variable "replica_count" {
type = number
- # Changed from 0 to 1 because zero replicas results in no running server pods
- default = 1
+  # Reverted to 0: this is a demo deployment, so no server pods run by default
+ default = 0
}
variable "env_vars" {
@@ -577,6 +577,7 @@ resource "helm_release" "coder-server" {
chart = "coder"
repository = "https://helm.coder.com/v2"
create_namespace = false
+ upgrade_install = true
skip_crds = false
wait = true
wait_for_jobs = true
diff --git a/modules/k8s/bootstrap/coder-server/policy.tf b/modules/k8s/bootstrap/coder-server/policy.tf
index 9a76b7d..828f8bc 100644
--- a/modules/k8s/bootstrap/coder-server/policy.tf
+++ b/modules/k8s/bootstrap/coder-server/policy.tf
@@ -33,11 +33,30 @@ data "aws_iam_policy_document" "provisioner-policy" {
"ec2:ReleaseHosts"
]
resources = [
- "arn:aws:ec2:${local.region}:${local.account_id}:*",
- "arn:aws:ec2:${local.region}:${local.account_id}:*/*",
- "arn:aws:ec2:${local.region}:${local.account_id}:*:*",
- "arn:aws:ec2:${local.region}::image/*"
+ "arn:aws:ec2:${local.region}:${local.account_id}:dedicated-host/*"
+ ]
+ condition {
+ test = "StringEquals"
+ variable = "aws:RequestTag/ManagedBy"
+ values = ["coder"]
+ }
+ }
+
+ statement {
+ sid = "EC2ManageHostLifecycleExisting"
+ effect = "Allow"
+ actions = [
+ "ec2:ModifyHosts",
+ "ec2:ReleaseHosts"
]
+ resources = [
+ "arn:aws:ec2:${local.region}:${local.account_id}:dedicated-host/*"
+ ]
+ condition {
+ test = "StringEquals"
+ variable = "aws:ResourceTag/ManagedBy"
+ values = ["coder"]
+ }
}
statement {
diff --git a/modules/k8s/bootstrap/ebs-controller/main.tf b/modules/k8s/bootstrap/ebs-controller/main.tf
index b2a438f..b6dd29a 100644
--- a/modules/k8s/bootstrap/ebs-controller/main.tf
+++ b/modules/k8s/bootstrap/ebs-controller/main.tf
@@ -7,7 +7,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -86,13 +86,13 @@ resource "helm_release" "ebs-controller" {
chart = "aws-ebs-csi-driver"
repository = "https://kubernetes-sigs.github.io/aws-ebs-csi-driver"
create_namespace = true
- # Removed upgrade_install because it's not a valid helm_release attribute
- skip_crds = false
- replace = var.replace
- wait = true
- wait_for_jobs = true
- version = var.chart_version
- timeout = 120 # in seconds
+ upgrade_install = true
+ skip_crds = false
+ replace = var.replace
+ wait = true
+ wait_for_jobs = true
+ version = var.chart_version
+ timeout = 120 # in seconds
values = [yamlencode({
controller = {
diff --git a/modules/k8s/bootstrap/karpenter/main.tf b/modules/k8s/bootstrap/karpenter/main.tf
index 55781aa..78b15c2 100644
--- a/modules/k8s/bootstrap/karpenter/main.tf
+++ b/modules/k8s/bootstrap/karpenter/main.tf
@@ -2,7 +2,7 @@ terraform {
required_providers {
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -103,7 +103,7 @@ variable "ec2nodeclass_configs" {
block_device_mappings = optional(list(object({
device_name = string
ebs = object({
- volume_size = string
+ volume_size = string # Kubernetes-style size with unit (e.g. "1400Gi", "50Gi")
volume_type = string
encrypted = optional(bool, false)
delete_on_termination = optional(bool, true)
@@ -220,12 +220,12 @@ resource "helm_release" "karpenter" {
chart = "karpenter"
repository = "oci://public.ecr.aws/karpenter"
create_namespace = true
- # Removed invalid upgrade_install attribute
- skip_crds = false
- wait = true
- wait_for_jobs = true
- version = var.chart_version
- timeout = 120 # in seconds
+ upgrade_install = true
+ skip_crds = false
+ wait = true
+ wait_for_jobs = true
+ version = var.chart_version
+ timeout = 120 # in seconds
# Added lifecycle management for proper upgrade handling
lifecycle {
@@ -256,7 +256,13 @@ resource "helm_release" "karpenter" {
settings = {
clusterName = var.cluster_name
featureGates = {
+ # Cost optimization - consolidate workloads to better-priced spot instances
spotToSpotConsolidation = true
+ # Future features - currently disabled
+        staticCapacity        = false # Static (fixed-count) node provisioning
+        reservedCapacity      = false # For On-Demand Capacity Reservation support
+        nodeRepair            = false # Experimental - automatic node repair
+        nodeOverlay           = false # Experimental - price/capacity overlays for scheduling simulation
}
interruptionQueue = module.karpenter.queue_name
}
@@ -280,16 +286,22 @@ resource "kubernetes_manifest" "ec2nodeclass" {
manifest = yamldecode(module.ec2nodeclass[count.index].manifest)
}
-# module "nodepool" {
-# count = length(local.nodepool_configs)
-# source = "../objects/nodepool"
-# name = local.nodepool_configs[count.index].name
-# node_labels = local.nodepool_configs[count.index].node_labels
-# node_taints = local.nodepool_configs[count.index].node_taints
-# node_requirements = local.nodepool_configs[count.index].node_requirements
-# node_class_ref_name = local.nodepool_configs[count.index].node_class_ref_name
-# node_expires_after = local.nodepool_configs[count.index].node_expires_after
-# disruption_consolidation_policy = local.nodepool_configs[count.index].disruption_consolidation_policy
-# disruption_consolidate_after = local.nodepool_configs[count.index].disruption_consolidate_after
-# }
+module "nodepool" {
+ count = length(var.nodepool_configs)
+ source = "../../objects/nodepool"
+ name = var.nodepool_configs[count.index].name
+ node_labels = var.nodepool_configs[count.index].node_labels
+ node_taints = var.nodepool_configs[count.index].node_taints
+ node_requirements = var.nodepool_configs[count.index].node_requirements
+ node_class_ref_name = var.nodepool_configs[count.index].node_class_ref_name
+ node_expires_after = var.nodepool_configs[count.index].node_expires_after
+ disruption_consolidation_policy = var.nodepool_configs[count.index].disruption_consolidation_policy
+ disruption_consolidate_after = var.nodepool_configs[count.index].disruption_consolidate_after
+}
+
+resource "kubernetes_manifest" "nodepool" {
+ depends_on = [helm_release.karpenter]
+ count = length(var.nodepool_configs)
+ manifest = yamldecode(module.nodepool[count.index].manifest)
+}
diff --git a/modules/k8s/bootstrap/lb-controller/main.tf b/modules/k8s/bootstrap/lb-controller/main.tf
index a5a32ec..45c6392 100644
--- a/modules/k8s/bootstrap/lb-controller/main.tf
+++ b/modules/k8s/bootstrap/lb-controller/main.tf
@@ -7,7 +7,7 @@ terraform {
}
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
@@ -138,12 +138,12 @@ resource "helm_release" "lb-controller" {
chart = "aws-load-balancer-controller"
repository = "https://aws.github.io/eks-charts"
create_namespace = true
- # Removed invalid upgrade_install attribute - Terraform handles upgrades automatically
- skip_crds = false
- wait = true
- wait_for_jobs = true
- version = var.chart_version
- timeout = 120 # in seconds
+ upgrade_install = true
+ skip_crds = false
+ wait = true
+ wait_for_jobs = true
+ version = var.chart_version
+ timeout = 120 # in seconds
values = [yamlencode({
clusterName = var.cluster_name
diff --git a/modules/k8s/bootstrap/metrics-server/main.tf b/modules/k8s/bootstrap/metrics-server/main.tf
index e588554..7940b5f 100644
--- a/modules/k8s/bootstrap/metrics-server/main.tf
+++ b/modules/k8s/bootstrap/metrics-server/main.tf
@@ -2,7 +2,7 @@ terraform {
required_providers {
helm = {
source = "hashicorp/helm"
- version = "2.17.0"
+ version = "3.1.1"
}
}
}
@@ -31,6 +31,7 @@ resource "helm_release" "metrics-server" {
chart = "metrics-server"
repository = "https://kubernetes-sigs.github.io/metrics-server/"
create_namespace = true
+ upgrade_install = true
skip_crds = false
wait = true
wait_for_jobs = true
diff --git a/modules/k8s/objects/ec2nodeclass/main.tf b/modules/k8s/objects/ec2nodeclass/main.tf
index 64c5015..7062bc0 100644
--- a/modules/k8s/objects/ec2nodeclass/main.tf
+++ b/modules/k8s/objects/ec2nodeclass/main.tf
@@ -27,7 +27,7 @@ variable "block_device_mappings" {
type = list(object({
device_name = string
ebs = object({
- volume_size = number # Changed from string to number because AWS EBS volume sizes are numeric GiB values
+ volume_size = string # Kubernetes-style size with unit (e.g. "1400Gi", "50Gi")
volume_type = string
encrypted = optional(bool, false)
delete_on_termination = optional(bool, true)