From 76cc8869e318c7f2c355424cee0cf9e7361a99c1 Mon Sep 17 00:00:00 2001
From: Michele Cereda
Date: Wed, 24 Jul 2024 18:06:11 +0200
Subject: [PATCH] chore: make cluster-autoscaler work on eks

---
 knowledge base/cloud computing/aws/cli.md |   8 +-
 knowledge base/cloud computing/aws/eks.md | 155 +++++++++++++++++-
 .../kubernetes/cluster autoscaler.md      |  72 ++++++++
 knowledge base/kubernetes/kubectl.md      |  13 ++
 snippets/aws.fish                         |   8 +
 snippets/kubectl.sh                       |  18 +-
 6 files changed, 262 insertions(+), 12 deletions(-)
 create mode 100644 knowledge base/kubernetes/cluster autoscaler.md

diff --git a/knowledge base/cloud computing/aws/cli.md b/knowledge base/cloud computing/aws/cli.md
index ce83242..3c904dd 100644
--- a/knowledge base/cloud computing/aws/cli.md
+++ b/knowledge base/cloud computing/aws/cli.md
@@ -1,7 +1,5 @@
 # AWS CLI

-## Table of contents
-
 1. [TL;DR](#tldr)
    1. [Profiles](#profiles)
    1. [Configuration](#configuration)
@@ -17,11 +15,12 @@
 Do *not* use `--max-items` together with `--query`: the items limit is applied before the query runs, and can cause it
 to show no results.
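+
+An illustration of the pitfall (the item count and instance state filter are arbitrary examples): `--max-items`
+truncates the result set *before* the client-side `--query` filter runs, so the filtered output can come back empty
+even when matching items exist.
+
+```sh
+# Only the first 3 reservations are considered, *then* filtered by the query.
+aws ec2 describe-instances --max-items '3' \
+  --query "Reservations[].Instances[?State.Name=='stopped'].InstanceId"
+```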

 <details>
-  <summary>Installation and configuration</summary>
+  <summary>Setup</summary>

 ```sh
 # Install the CLI.
 brew install 'awscli'
+docker pull 'amazon/aws-cli'
 pip install 'awscli'

 # Configure profiles.
@@ -53,6 +52,9 @@ rm -r ~'/.aws/cli/cache'
 <details>
   <summary>Usage</summary>

 ```sh
+# Use the docker version.
+docker run --rm -ti -v "$HOME/.aws:/root/.aws:ro" 'amazon/aws-cli:2.17.16' autoscaling describe-auto-scaling-groups
+
 # List applications in CodeDeploy.
 aws deploy list-applications

diff --git a/knowledge base/cloud computing/aws/eks.md b/knowledge base/cloud computing/aws/eks.md
index 3cd21b8..6a5495e 100644
--- a/knowledge base/cloud computing/aws/eks.md
+++ b/knowledge base/cloud computing/aws/eks.md
@@ -11,11 +11,14 @@
 1. [Storage](#storage)
    1. [Use EBS as volumes](#use-ebs-as-volumes)
    1. [EBS CSI driver IAM role](#ebs-csi-driver-iam-role)
+1. [Pod identity](#pod-identity)
+1. [Autoscaling](#autoscaling)
+    1. [Cluster autoscaler](#cluster-autoscaler)
 1. [Troubleshooting](#troubleshooting)
-   1. [Identify common issues](#identify-common-issues)
-   1. [The worker nodes fail to join the cluster](#the-worker-nodes-fail-to-join-the-cluster)
+    1. [Identify common issues](#identify-common-issues)
+    1. [The worker nodes fail to join the cluster](#the-worker-nodes-fail-to-join-the-cluster)
 1. [Further readings](#further-readings)
-   1. [Sources](#sources)
+    1. [Sources](#sources)

 ## TL;DR

@@ -34,9 +37,9 @@
 both the control plane and nodes.
 Such security group cannot be avoided nor customized in the cluster's definition (e.g. using IaC tools like [Pulumi]
 or [Terraform]):

-> ```txt
-> error: aws:eks/cluster:Cluster resource 'cluster' has a problem: Value for unconfigurable attribute. Can't configure a value for "vpc_config.0.cluster_security_group_id": its value will be decided automatically based on the result of applying this configuration.
-> ```
+> error: aws:eks/cluster:Cluster resource 'cluster' has a problem: Value for unconfigurable attribute. Can't configure a
+> value for "vpc_config.0.cluster_security_group_id": its value will be decided automatically based on the result of
+> applying this configuration.

 For some reason, giving resources a tag like `aks:eks:cluster-name=value` succeeds, but has no effect (it is not really
 applied).

@@ -83,6 +86,7 @@ aws eks associate-access-policy --cluster-name 'DeepThought' \

 # Connect to clusters.
 aws eks update-kubeconfig --name 'DeepThought' && kubectl cluster-info
+aws eks --region 'eu-west-1' update-kubeconfig --name 'oneForAll' --profile 'dev-user' && kubectl cluster-info

 # Create EC2 node groups.
@@ -100,6 +104,10 @@ aws eks create-fargate-profile \
   --pod-execution-role-arn 'arn:aws:iam::000011112222:role/DeepThinkerFargate' \
   --subnets 'subnet-11112222333344445' 'subnet-66667777888899990' \
   --selectors 'namespace=string'
+
+
+# Get addon names.
+aws eks describe-addon-versions --query 'addons[].addonName'
 ```
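+
+A sketch for narrowing the addon listing down to a single addon and cluster version (the addon name and version below
+are examples only):
+
+```sh
+aws eks describe-addon-versions --addon-name 'eks-pod-identity-agent' --kubernetes-version '1.30' \
+  --query 'addons[].addonVersions[].addonVersion'
+```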
@@ -685,17 +693,137 @@ Requirements:
 1. ClusterRole, ClusterRoleBinding, and other RBAC components.
 1. Snapshot controller's Deployment.

+## Pod identity
+
+Refer to [Learn how EKS Pod Identity grants pods access to AWS services].
+
+Provides pods the ability to manage AWS credentials in a similar way to how EC2 instance profiles provide credentials
+to EC2 instances.
+
+Limitations:
+
+- Pod Identity Agents are DaemonSets.
+  This means they **cannot** run on Fargate hosts and **will** require EC2 nodes.
+- Does **not** work with **Amazon-provided EKS add-ons** that need IAM credentials.
+  Such controllers, drivers, and plugins only support EKS Pod Identities when installed as **self-managed** add-ons
+  instead.
+
+Procedure:
+
+1. Set up the Pod Identity Agent on clusters.
+
+   <details>
+     <summary>Requirements</summary>
+
+   - The **nodes**' service role **must** have permissions for the agent to execute `AssumeRoleForPodIdentity` actions
+     in the EKS Auth API.
+
+     Use the AWS-managed `AmazonEKSWorkerNodePolicy` policy.
+     Alternatively, add a custom policy with the following:
+
+     ```json
+     {
+       "Version": "2012-10-17",
+       "Statement": [{
+         "Effect": "Allow",
+         "Action": [ "eks-auth:AssumeRoleForPodIdentity" ],
+         "Resource": "*"
+       }]
+     }
+     ```
+
+     Limit this action using tags to restrict which roles can be assumed by pods that use the agent.
+
+   - Nodes must **be able** to reach ECR and download images from it.
+     Required, since the add-on's container image is distributed there.
+   - Nodes must **be able** to reach the EKS Auth API.
+     Private clusters **will** require the `eks-auth` endpoint in PrivateLink; a sketch of its creation follows.
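+
+     A sketch of creating that endpoint (all IDs and the region are placeholders):
+
+     ```sh
+     aws ec2 create-vpc-endpoint --vpc-endpoint-type 'Interface' \
+       --vpc-id 'vpc-01234567890abcdef' \
+       --service-name 'com.amazonaws.eu-west-1.eks-auth' \
+       --subnet-ids 'subnet-11112222333344445' 'subnet-66667777888899990' \
+       --security-group-ids 'sg-01234567890abcdef'
+     ```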
+
+   </details>
+   <details>
+     <summary>CLI</summary>
+
+   ```sh
+   aws eks create-addon --cluster-name 'cluster' --addon-name 'eks-pod-identity-agent'
+   aws eks create-addon --cluster-name 'cluster' --addon-name 'eks-pod-identity-agent' --resolve-conflicts 'OVERWRITE'
+   ```
+
+   </details>
+   <details>
+     <summary>Pulumi</summary>
+
+   ```ts
+   new aws.eks.Addon("pod-identity", {
+     clusterName: cluster.name,
+     addonName: "eks-pod-identity-agent",
+     resolveConflictsOnCreate: "OVERWRITE",
+     resolveConflictsOnUpdate: "OVERWRITE",
+   });
+   ```
+
+   </details>
+
+1. Associate IAM roles with Kubernetes service accounts:
+
+   <details>
+     <summary>CLI</summary>
+
+   ```sh
+   aws eks create-pod-identity-association \
+     --cluster-name 'cluster' --namespace 'default' \
+     --service-account 'default' --role-arn 'arn:aws:iam::012345678901:role/CustomRole'
+   ```
+
+   </details>
+   <details>
+     <summary>Pulumi</summary>
+
+   ```ts
+   new aws.eks.PodIdentityAssociation("customRole-to-defaultServiceAccount", {
+     clusterName: cluster.name,
+     roleArn: customRole.arn,
+     serviceAccount: "default",
+     namespace: "default",
+   });
+   ```
+
+   </details>
+
+   There is no need for the service account to exist before the association.
+   The moment it is created in the defined namespace, it will also be able to assume the role.
+
+1. Configure pods to use those service accounts (see the sketches right after this procedure).
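+
+The role referenced by an association must trust the EKS Pod Identity service principal.
+A minimal sketch of such a trust policy (verify the action list against the current AWS documentation):
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [{
+    "Effect": "Allow",
+    "Principal": { "Service": "pods.eks.amazonaws.com" },
+    "Action": [ "sts:AssumeRole", "sts:TagSession" ]
+  }]
+}
+```
+
+Pods running under the associated service account then pick credentials up automatically.
+A minimal sketch of such a pod (pod name and image are illustrative; namespace and service account match the
+association examples above):
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: awscli            # hypothetical pod name
+  namespace: default      # namespace used in the association above
+spec:
+  serviceAccountName: default   # service account associated with the role above
+  containers:
+    - name: awscli
+      image: amazon/aws-cli:2.17.16
+      # Prints the assumed identity; it should show the associated role.
+      command: [ "aws", "sts", "get-caller-identity" ]
+```
+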
+## Autoscaling
+
+Autoscaling of EKS clusters can happen:
+
+- _Horizontally_ (as in **number** of nodes) through the use of [Cluster Autoscaler].
+- _Vertically_ (as in **size** of nodes) through the use of [Karpenter].
+
+The pods running the autoscaling components **will need** the necessary permissions to operate on the cluster's
+resources.
+This means giving those pods access keys, or enabling [Pod Identity].
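+
+A sketch of the kind of IAM policy such a principal needs for the cluster autoscaler specifically (check the
+autoscaler's AWS provider documentation for the current minimal set of actions):
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": [
+        "autoscaling:DescribeAutoScalingGroups",
+        "autoscaling:DescribeAutoScalingInstances",
+        "autoscaling:DescribeLaunchConfigurations",
+        "autoscaling:DescribeScalingActivities",
+        "autoscaling:DescribeTags",
+        "ec2:DescribeInstanceTypes",
+        "ec2:DescribeLaunchTemplateVersions"
+      ],
+      "Resource": "*"
+    },
+    {
+      "Effect": "Allow",
+      "Action": [
+        "autoscaling:SetDesiredCapacity",
+        "autoscaling:TerminateInstanceInAutoScalingGroup"
+      ],
+      "Resource": "*"
+    }
+  ]
+}
+```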
+
+### Cluster autoscaler
+
+Nothing more than the [Kubernetes' cluster autoscaler component].
+
+After any operation, the cluster autoscaler will wait for the ASG cooldown time to end.
+Only then will it start counting down its own timers.
+
 ## Troubleshooting

 See [Amazon EKS troubleshooting].

 ### Identify common issues

-Use the [AWSSupport-TroubleshootEKSWorkerNode](https://docs.aws.amazon.com/systems-manager-automation-runbooks/latest/userguide/automation-awssupport-troubleshooteksworkernode.html) runbook.
+Use the [AWSSupport-TroubleshootEKSWorkerNode runbook].

 > For the automation to work, worker nodes **must** have permission to access Systems Manager and have Systems Manager
 > running.
-> Grant this permission by attaching the [`AmazonSSMManagedInstanceCore`](https://docs.aws.amazon.com/systems-manager/latest/userguide/setup-instance-profile.html#instance-profile-policies-overview) policy to the node role.
+> Grant this permission by attaching the `AmazonSSMManagedInstanceCore` policy to the node role.
+> See [Configure instance permissions required for Systems Manager].

 Procedure:

 1.
@@ -754,6 +882,9 @@ Debug: see [Identify common issues].
 - [How to Add IAM User and IAM Role to AWS EKS Cluster?]
 - [Amazon Elastic Block Store (EBS) CSI driver]
 - [Manage the Amazon EBS CSI driver as an Amazon EKS add-on]
+- [How do you get kubectl to log in to an AWS EKS cluster?]
+- [Learn how EKS Pod Identity grants pods access to AWS services]
+- [Configure instance permissions required for Systems Manager]

 [access management]: #access-management
+[cluster autoscaler]: #cluster-autoscaler
 [create worker nodes]: #create-worker-nodes
 [ebs csi driver iam role]: #ebs-csi-driver-iam-role
 [identify common issues]: #identify-common-issues
+[pod identity]: #pod-identity
 [requirements]: #requirements
 [secrets encryption through kms]: #secrets-encryption-through-kms

 [amazon web services]: README.md
 [cli]: cli.md
+[kubernetes' cluster autoscaler component]: ../../kubernetes/cluster%20autoscaler.md
 [ebs]: ebs.md
+[karpenter]: ../../kubernetes/karpenter.placeholder
 [kubernetes]: ../../kubernetes/README.md
 [pulumi]: ../../pulumi.md
 [terraform]: ../../pulumi.md
@@ -790,7 +925,9 @@ Debug: see [Identify common issues].
 [aws eks create-cluster]: https://docs.aws.amazon.com/cli/latest/reference/eks/create-cluster.html
 [aws eks create-fargate-profile]: https://docs.aws.amazon.com/cli/latest/reference/eks/create-fargate-profile.html
 [aws eks create-nodegroup]: https://docs.aws.amazon.com/cli/latest/reference/eks/create-nodegroup.html
+[AWSSupport-TroubleshootEKSWorkerNode runbook]: https://docs.aws.amazon.com/systems-manager-automation-runbooks/latest/userguide/automation-awssupport-troubleshooteksworkernode.html
 [choosing an amazon ec2 instance type]: https://docs.aws.amazon.com/eks/latest/userguide/choosing-instance-type.html
+[configure instance permissions required for systems manager]: https://docs.aws.amazon.com/systems-manager/latest/userguide/setup-instance-profile.html#instance-profile-policies-overview
 [de-mystifying cluster networking for amazon eks worker nodes]: https://aws.amazon.com/blogs/containers/de-mystifying-cluster-networking-for-amazon-eks-worker-nodes/
 [eks workshop]: https://www.eksworkshop.com/
 [enabling iam principal access to your cluster]: https://docs.aws.amazon.com/eks/latest/userguide/add-user-role.html
@@ -802,6 +939,7 @@
 [how do i resolve the error "you must be logged in to the server (unauthorized)" when i connect to the amazon eks api server?]: https://repost.aws/knowledge-center/eks-api-server-unauthorized-error
 [how do i use persistent storage in amazon eks?]: https://repost.aws/knowledge-center/eks-persistent-storage
 [identity and access management]: https://aws.github.io/aws-eks-best-practices/security/docs/iam/
+[learn how eks pod identity grants pods access to aws services]: https://docs.aws.amazon.com/eks/latest/userguide/pod-identities.html
 [manage the amazon ebs csi driver as an amazon eks add-on]: https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html
 [managed node groups]: https://docs.aws.amazon.com/eks/latest/userguide/managed-node-groups.html
 [private cluster requirements]: https://docs.aws.amazon.com/eks/latest/userguide/private-clusters.html
@@ -817,5 +955,6 @@
 [amazon elastic block store (ebs) csi driver]: https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/README.md
 [external-snapshotter]: https://github.com/kubernetes-csi/external-snapshotter
+[how do you get kubectl to log in to an aws eks cluster?]: https://stackoverflow.com/questions/53266960/how-do-you-get-kubectl-to-log-in-to-an-aws-eks-cluster
 [how to add iam user and iam role to aws eks cluster?]: https://antonputra.com/kubernetes/add-iam-user-and-iam-role-to-eks/
 [visualizing aws eks kubernetes clusters with relationship graphs]: https://dev.to/aws-builders/visualizing-aws-eks-kubernetes-clusters-with-relationship-graphs-46a4
diff --git a/knowledge base/kubernetes/cluster autoscaler.md b/knowledge base/kubernetes/cluster autoscaler.md
new file mode 100644
index 0000000..f18d85b
--- /dev/null
+++ b/knowledge base/kubernetes/cluster autoscaler.md
@@ -0,0 +1,72 @@
+# Cluster autoscaler
+
+Automatically adjusts the number of nodes in Kubernetes clusters.
+
+1. [TL;DR](#tldr)
+1. [Further readings](#further-readings)
+   1. [Sources](#sources)
+
+## TL;DR
+
+Acts when one of the following conditions is true:
+
+- Pods failed to run in the cluster due to insufficient resources.
+- Nodes in the cluster have been underutilized for an extended period of time, and their pods can be placed on other
+  existing nodes (see the annotation sketch after this list).
+
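+Scale-down can be vetoed for specific workloads; a minimal sketch using the upstream `safe-to-evict` annotation (pod
+name and image are illustrative only):
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: do-not-evict-me
+  annotations:
+    # The autoscaler will not evict this pod, so it will not scale down the node hosting it.
+    cluster-autoscaler.kubernetes.io/safe-to-evict: "false"
+spec:
+  containers:
+    - name: app
+      image: alpine
+      command: [ "sleep", "infinity" ]
+```
+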
+<details>
+  <summary>Setup</summary>
+
+```sh
+helm repo add 'autoscaler' 'https://kubernetes.github.io/autoscaler'
+helm show values 'autoscaler/cluster-autoscaler'
+
+helm install 'cluster-autoscaler' 'autoscaler/cluster-autoscaler' --set 'autoDiscovery.clusterName'=clusterName
+helm --namespace 'kube-system' upgrade --install 'cluster-autoscaler' 'autoscaler/cluster-autoscaler' \
+  --set 'autoDiscovery.clusterName'=clusterName
+
+helm uninstall 'cluster-autoscaler'
+helm --namespace 'kube-system' uninstall 'cluster-autoscaler'
+```
+
+</details>
+
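+A quick way to check what the autoscaler is doing after installation (the Deployment name assumes the Helm release
+name and values used further down in this document; adjust it to the actual release):
+
+```sh
+# The autoscaler publishes its view of the cluster in this ConfigMap by default.
+kubectl --namespace 'kube-system' describe configmap 'cluster-autoscaler-status'
+
+# Follow the autoscaler's own logs.
+kubectl --namespace 'kube-system' logs --follow 'deployment/cluster-autoscaler-aws-cluster-autoscaler'
+```
+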
+<details>
+  <summary>Real world use cases</summary>
+
+```sh
+helm --namespace 'kube-system' upgrade --install 'cluster-autoscaler' 'autoscaler/cluster-autoscaler' \
+  --set 'cloudProvider'='aws' --set 'awsRegion'='eu-west-1' \
+  --set 'autoDiscovery.clusterName'='defaultCluster' --set 'rbac.serviceAccount.name'='cluster-autoscaler-aws'
+```
+
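+For `autoDiscovery.clusterName` to find node groups, the backing ASGs must carry the autoscaler's discovery tags.
+A sketch with a hypothetical ASG name (only the tag *keys* matter; EKS managed node groups usually apply them to their
+own ASGs already):
+
+```sh
+aws autoscaling create-or-update-tags --tags \
+  "ResourceId=defaultCluster-nodes,ResourceType=auto-scaling-group,PropagateAtLaunch=false,Key=k8s.io/cluster-autoscaler/enabled,Value=true" \
+  "ResourceId=defaultCluster-nodes,ResourceType=auto-scaling-group,PropagateAtLaunch=false,Key=k8s.io/cluster-autoscaler/defaultCluster,Value=owned"
+```
+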
+</details>
+
+## Further readings
+
+- [Main repository]
+
+### Sources
+
+[main repository]: https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler
diff --git a/knowledge base/kubernetes/kubectl.md b/knowledge base/kubernetes/kubectl.md
index 5fe6fbf..6415540 100644
--- a/knowledge base/kubernetes/kubectl.md
+++ b/knowledge base/kubernetes/kubectl.md
@@ -405,6 +405,13 @@ kubectl top node 'my-node'
 # Forward local connections to cluster resources.
 kubectl port-forward 'my-pod' '5000:6000'
 kubectl -n 'default' port-forward 'service/my-service' '8443:https'
+
+# Start pods and attach to them.
+kubectl run --rm -it --image 'alpine' 'alpine' --command -- sh
+kubectl run --rm -it --image 'amazon/aws-cli:2.17.16' 'awscli' -- autoscaling describe-auto-scaling-groups
+
+# Attach to running pods.
+kubectl attach 'alpine' -c 'alpine' -it
 ```

@@ -422,6 +429,12 @@
 kubectl -n 'awx' port-forward 'service/awx-service' '8080:http'

 # Delete leftovers CRDs from helm charts by release name.
 kubectl delete crds -l "helm.sh/chart=awx-operator"

+# Run pods with specific specs.
+kubectl -n 'kube-system' run --rm -it 'awscli' --overrides '{"spec":{"serviceAccountName":"cluster-autoscaler-aws"}}' \
+  --image '012345678901.dkr.ecr.eu-west-1.amazonaws.com/cache/amazon/aws-cli:2.17.16' \
+  -- \
+  autoscaling describe-auto-scaling-groups
+
 # Show Containers' status, properties and capabilities from the inside.
 # Run the command from *inside* the container.
 cat '/proc/1/status'
diff --git a/snippets/aws.fish b/snippets/aws.fish
index 231e4d9..4fd6d3d 100644
--- a/snippets/aws.fish
+++ b/snippets/aws.fish
@@ -133,6 +133,8 @@ aws kms get-key-policy --output 'text' --key-id '01234567-89ab-cdef-0123-456789a
 aws ec2 describe-images --image-ids 'ami-01234567890abcdef'
 aws ec2 describe-images --image-ids 'ami-01234567890abcdef' --query 'Images[].Description'

+aws autoscaling describe-auto-scaling-groups
+aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names 'ProductionServers'
 aws autoscaling start-instance-refresh --auto-scaling-group-name 'ProductionServers'
 aws autoscaling describe-instance-refreshes \
   --auto-scaling-group-name 'ProductionServers' --instance-refresh-ids '01234567-89ab-cdef-0123-456789abcdef'
@@ -147,3 +149,9 @@ aws kms encrypt --key-id '01234567-89ab-cdef-0123-456789abcdef' --plaintext 'My
 aws kms decrypt --ciphertext-blob 'fileb://ciphertext.dat'
 aws kms decrypt --ciphertext-blob 'fileb://ciphertext.dat' --query 'Plaintext' --output 'text' \
   | base64 --decode
+
+aws eks --region 'eu-west-1' update-kubeconfig --name 'oneForAll' --profile 'dev-user'
+
+aws eks describe-addon-versions --query 'sort(addons[].addonName)'
+
+docker run --rm -ti -v "$HOME/.aws:/root/.aws:ro" 'amazon/aws-cli:2.17.16' autoscaling describe-auto-scaling-groups
diff --git a/snippets/kubectl.sh b/snippets/kubectl.sh
index 769c49d..b76ae82 100644
--- a/snippets/kubectl.sh
+++ b/snippets/kubectl.sh
@@ -2,10 +2,26 @@
 kubectl create namespace 'gitlab'
+kubectl create --namespace 'gitlab' secret generic 'gitlab-runner-token' --dry-run='client' --output 'yaml' \
+  --from-literal='runner-registration-token=""' --from-literal='runner-token=glrt-…'
 kubectl apply --namespace 'gitlab' --values 'secrets.yaml'
+
+kubectl get nodes 'fargate-ip-172-31-83-147.eu-west-1.compute.internal' -o 'yaml' | yq -y '.metadata.labels'
+kubectl get nodes -o jsonpath='{.items[].metadata.labels}' | yq -y
+
+kubectl get events -n 'monitoring' --sort-by '.metadata.creationTimestamp'
+
 # Requires the metrics server to be running in the cluster
 kubectl top nodes
 kubectl top pods
-kubectl get events -n 'monitoring' --sort-by '.metadata.creationTimestamp'
+
+kubectl run --rm -it --image 'alpine' 'alpine' --command -- sh
+kubectl run --rm -t --image 'amazon/aws-cli:2.17.16' 'awscli' -- autoscaling describe-auto-scaling-groups
+kubectl -n 'kube-system' run --rm -it 'awscli' --overrides '{"spec":{"serviceAccountName":"cluster-autoscaler-aws"}}' \
+  --image '012345678901.dkr.ecr.eu-west-1.amazonaws.com/cache/amazon/aws-cli:2.17.16' \
+  -- \
+  autoscaling describe-auto-scaling-groups
+
+kubectl scale deployment -n 'kube-system' 'cluster-autoscaler-aws-cluster-autoscaler' --replicas '0'
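+
+# Bring the autoscaler back once done.
+# Assumes it normally runs with a single replica; adjust to the deployment's original size.
+kubectl scale deployment -n 'kube-system' 'cluster-autoscaler-aws-cluster-autoscaler' --replicas '1'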