Commit 5cf04ed

fix: Creating a proper RBAC to AWS role mapping to be able to access K8s APIs (#16)
1 parent 75c94cb commit 5cf04ed

6 files changed: 162 additions & 102 deletions


README.md

Lines changed: 22 additions & 8 deletions
````diff
@@ -1,18 +1,19 @@
 # terraform-aws-recycle-eks
 
 This module creates a terraform module to recycle EKS worker nodes. The high level functionalities are explained below,
-- Use a lamdba to take an instance id as an input, to put it in standby state. Using autoscaling api to automatically add a new instance to the group while putting the old instance to standby state. The old instance will get into "Standby" state only when the new instance is in fully "Inservice" state
+- Creates a step-function that will consist of 4 lambdas. This step function will handle the transfer of inputs across the lambda functions.
+- The first lambda takes an instance id as an input, to put it in standby state. Using autoscaling api to automatically add a new instance to the group while putting the old instance to standby state. The old instance will get into "Standby" state only when the new instance is in fully "Inservice" state
 - Taint this "Standby" node in EKS using K8S API in Lambda to prevent new pods from getting scheduled into this node
 - Periodically use K8S API check for status of “stateful” pods on that node based on the label selector provided. Another Lambda will do that
-- Once all stateful pods have completed on the node, use K8S API in another Lambda to drain the standby node
-- Once the number of running pod reached 0, shut down that standby instance using AWS SDK.
-- We are not termnating the node, only shutting it down, hust in case. In future releases, we will be start terminating the nodes
+- Once all stateful pods have completed on the node, i.e number of running pod reached 0, shut down that standby instance using AWS SDK via lambda. We are not terminating the node, only shutting it down, just in case. In future releases, we will be start terminating the nodes
+
 
 ## TODO:
 - Check for new node in service before proceeding to put the existing node in standby state. Right now we are putting a sleep of 300 sec.
-- Stop using anonymous role and find a way to map the role with a proper user
-- get_bearer_token() function used in all lambda. Refactor the code to use as a Python module.
+- Refactor the code to use as a common module for getting the access token.
 - Better logging and exception handling
+- Make use of namespace input while selecting the pods. Currently it checks for pods in all namespaces.
+- Find a terraform way to edit configmap/aws-auth, this step is still manual to make this module work.
 
 There are two main components:
 
@@ -22,10 +23,9 @@ There are two main components:
 
 ## Usage
 
-**Set up all supported AWS / Datadog integrations**
 
 ```
-module "recycl-eks-worker-npde" {
+module "recycl-eks-worker-node" {
   source = "git::git@github.com:scribd/terraform-aws-recycle-eks.git"
   name = "string"
   tags = {
@@ -35,9 +35,20 @@ module "recycl-eks-worker-npde" {
   vpc_subnet_ids = ["subnet-12345678", "subnet-87654321"]
   vpc_security_group_ids = ["sg-12345678"]
   aws_region = "us-east-2"
+  namespace = "your pod namespace" # As of now it is just a place holder we check for all namespaces now
 
 }
+
+```
+After running the module, Run `kubectl edit -n kube-system configmap/aws-auth` and add the following:
 ```
+mapRoles: |
+# ...
+  - rolearn: <IAM role for the lamda execution>
+    username: lambda
+
+```
+You can get IAM role for the lamda execution from the output variable of "lambda_exec_arn" in this module
 
 ## Running of step function
 
@@ -52,6 +63,9 @@ Step function takes an json input
 This label selector will be the identifier on which the step function will wait and rest all pods will be ignored.
 
 ```
+## Sample Output of a step function
+
+![](images/Step-Function-sample-output.png)
 
 ## Development
 
````
Binary image file added (183 KB); presumably images/Step-Function-sample-output.png referenced in the README above.
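The "Running of step function" section of the README is only partially visible in this diff. The Lambda handler changed in this commit reads `region`, `cluster_name`, `node_name`, `instance_id`, and `label_selector` from its event, so kicking off the recycle step function from Python would presumably look something like the sketch below; the state machine ARN and all input values are placeholders, not taken from this commit.

```python
import json

import boto3

# Hypothetical input for the recycle step function. The keys mirror what the
# Lambda handler in this commit reads from its event; every value is a placeholder.
execution_input = {
    "region": "us-east-2",
    "cluster_name": "my-eks-cluster",
    "node_name": "ip-10-0-1-23.us-east-2.compute.internal",
    "instance_id": "i-0123456789abcdef0",
    "label_selector": "app=my-stateful-app",
}

sfn = boto3.client("stepfunctions", region_name="us-east-2")
response = sfn.start_execution(
    # Placeholder ARN; use the state machine created by this module.
    stateMachineArn="arn:aws:states:us-east-2:123456789012:stateMachine:recycle-eks-worker-node",
    input=json.dumps(execution_input),
)
print(response["executionArn"])
```

Per the README, only pods matching `label_selector` are waited on; all other pods on the node are ignored.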

lambdas/checkNodesForRunningPods.py

Lines changed: 64 additions & 47 deletions
````diff
@@ -5,10 +5,11 @@
 import os.path
 import base64
 import logging
+import re
 import yaml
 import boto3
+import kubernetes as k8s
 from botocore.signers import RequestSigner
-from kubernetes import client, config
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -17,53 +18,59 @@
 MIRROR_POD_ANNOTATION_KEY = "kubernetes.io/config.mirror"
 CONTROLLER_KIND_DAEMON_SET = "DaemonSet"
 
-def get_bearer_token(cluster_id, region):
-    ''' create the bearer token
-    '''
-    if not os.path.exists(KUBE_FILEPATH):
-        kube_content = dict()
-        # Get data from EKS API
-        eks_api = boto3.client('eks',region_name=region)
-        cluster_info = eks_api.describe_cluster(name=cluster_id)
-        certificate = cluster_info['cluster']['certificateAuthority']['data']
-        endpoint = cluster_info['cluster']['endpoint']
-        # Generating kubeconfig
-        kube_content = dict()
-        kube_content['apiVersion'] = 'v1'
-        kube_content['clusters'] = [
+def create_kube_config(eks, cluster_name):
+    """Creates the Kubernetes config file required when instantiating the API client."""
+    cluster_info = eks.describe_cluster(name=cluster_name)['cluster']
+    certificate = cluster_info['certificateAuthority']['data']
+    endpoint = cluster_info['endpoint']
+
+    kube_config = {
+        'apiVersion': 'v1',
+        'clusters': [
         {
-        'cluster':
-          {
-          'server': endpoint,
-          'certificate-authority-data': certificate
-          },
-        'name':cluster_id
-        }]
+            'cluster':
+            {
+                'server': endpoint,
+                'certificate-authority-data': certificate
+            },
+            'name': 'k8s'
 
-        kube_content['contexts'] = [
+        }],
+        'contexts': [
         {
-        'context':
-          {
-          'cluster':cluster_id,
-          'user':'aws'
-          },
-        'name':'aws'
+            'context':
+            {
+                'cluster': 'k8s',
+                'user': 'aws'
+            },
+            'name': 'aws'
+        }],
+        'current-context': 'aws',
+        'Kind': 'config',
+        'users': [
+        {
+            'name': 'aws',
+            'user': 'lambda'
         }]
-        kube_content['current-context'] = 'aws'
-        kube_content['Kind'] = 'config'
-        kube_content['users'] = [
-        {
-        'name':'aws',
-        'user':'lambda'
-        }]
-        # Write kubeconfig
-        with open(KUBE_FILEPATH, 'w') as outfile:
-            yaml.dump(kube_content, outfile, default_flow_style=False)
+    }
+
+    with open(KUBE_FILEPATH, 'w') as kube_file_content:
+        yaml.dump(kube_config, kube_file_content, default_flow_style=False)
+
+
+def get_bearer_token(cluster, region):
+    """Creates the authentication to token required by AWS IAM Authenticator. This is
+    done by creating a base64 encoded string which represents a HTTP call to the STS
+    GetCallerIdentity Query Request
+    (https://docs.aws.amazon.com/STS/latest/APIReference/API_GetCallerIdentity.html).
+    The AWS IAM Authenticator decodes the base64 string and makes the request on behalf of the user.
+    """
     STS_TOKEN_EXPIRES_IN = 60
     session = boto3.session.Session()
 
     client = session.client('sts', region_name=region)
     service_id = client.meta.service_model.service_id
+
     signer = RequestSigner(
         service_id,
         region,
@@ -72,23 +79,29 @@ def get_bearer_token(cluster_id, region):
         session.get_credentials(),
         session.events
     )
+
     params = {
         'method': 'GET',
         'url': 'https://sts.{}.amazonaws.com/?Action=GetCallerIdentity&Version=2011-06-15'.format(region),
         'body': {},
         'headers': {
-            'x-k8s-aws-id': cluster_id
+            'x-k8s-aws-id': cluster
         },
         'context': {}
     }
+
     signed_url = signer.generate_presigned_url(
         params,
         region_name=region,
         expires_in=STS_TOKEN_EXPIRES_IN,
         operation_name=''
     )
+
     base64_url = base64.urlsafe_b64encode(signed_url.encode('utf-8')).decode('utf-8')
 
+    # need to remove base64 encoding padding:
+    # https://github.com/kubernetes-sigs/aws-iam-authenticator/issues/202
+    return 'k8s-aws-v1.' + re.sub(r'=*', '', base64_url)
 
 def get_evictable_pods(api, node_name,label_selector):
     '''
@@ -112,20 +125,24 @@ def handler(event, context):
     Lambda handler, this function will call the
     private functions to get the running pod count based on the label selector provided
     '''
+    eks = boto3.client('eks', region_name=event['region'])
+    #loading Kube Config
+    if not os.path.exists(KUBE_FILEPATH):
+        create_kube_config(eks, event['cluster_name'])
+    k8s.config.load_kube_config(KUBE_FILEPATH)
+    configuration = k8s.client.Configuration()
+    #getting the auth token
     token = get_bearer_token(event['cluster_name'],event['region'])
-    # Configure
-    config.load_kube_config(KUBE_FILEPATH)
-    configuration = client.Configuration()
     configuration.api_key['authorization'] = token
     configuration.api_key_prefix['authorization'] = 'Bearer'
     # API
-    api = client.ApiClient(configuration)
-    v1 = client.CoreV1Api(api)
+    api = k8s.client.ApiClient(configuration)
+    core_v1_api = k8s.client.CoreV1Api(api)
 
     # Get all the pods
-    runningPodCount=count_running_pods(v1,node_name=event['node_name'],
+    running_pod_count=count_running_pods(core_v1_api,node_name=event['node_name'],
                                        label_selector=event['label_selector'])
     output_json = {"region": event['region'], "node_name" : event['node_name'] ,
                    "instance_id" : event['instance_id'], "cluster_name": event['cluster_name'],
-                   "activePodCount": runningPodCount}
+                   "activePodCount": running_pod_count}
     return output_json
````
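As background on what the new `get_bearer_token` returns: the token is the `k8s-aws-v1.` prefix plus a base64url-encoded, padding-stripped presigned STS `GetCallerIdentity` URL. aws-iam-authenticator on the cluster decodes it and resolves it to the Lambda's IAM role, which the `mapRoles` entry added in the README then maps to the `lambda` user. Below is a minimal sketch of reversing that encoding, which can help when debugging the mapping; the helper is illustrative only and not part of this commit.

```python
import base64


def decode_eks_token(token: str) -> str:
    """Recover the presigned STS GetCallerIdentity URL embedded in a token
    produced by get_bearer_token(). Illustrative helper, not part of this commit."""
    assert token.startswith("k8s-aws-v1.")
    body = token[len("k8s-aws-v1."):]
    body += "=" * (-len(body) % 4)  # restore the base64 padding that get_bearer_token strips
    return base64.urlsafe_b64decode(body.encode("utf-8")).decode("utf-8")


# Example usage (assuming the lambda module above is importable):
# token = get_bearer_token("my-eks-cluster", "us-east-2")
# print(decode_eks_token(token))
```

If the decoded URL resolves to the expected role but the Kubernetes API still returns 401/403, the aws-auth mapping or the RBAC bindings are the more likely culprit than the token itself.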

0 commit comments
