chore: add section about privileged containers

This commit is contained in:
Michele Cereda
2023-04-22 00:02:26 +02:00
parent 171cdab453
commit 6fd0975a29
2 changed files with 199 additions and 65 deletions

View File

@@ -23,21 +23,21 @@ kubectl get pods -l app=nginx,tier=frontend
### Table of contents
- [TL;DR](#tldr)
- [Configuration](#configuration)
- [Configure access to multiple clusters](#configure-access-to-multiple-clusters)
- [Create resources](#create-resources)
- [Output formatting](#output-formatting)
- [Verbosity and debugging](#verbosity-and-debugging)
- [Further readings](#further-readings)
- [Sources](#sources)
1. [TL;DR](#tldr)
1. [Configuration](#configuration)
1. [Configure access to multiple clusters](#configure-access-to-multiple-clusters)
1. [Create resources](#create-resources)
1. [Output formatting](#output-formatting)
1. [Verbosity and debugging](#verbosity-and-debugging)
1. [Further readings](#further-readings)
1. [Sources](#sources)
## TL;DR
```sh
# Enable shell completion.
source <(kubectl completion bash)
echo "[[ $commands[kubectl] ]] && source <(kubectl completion zsh)" >> ~/.zshrc
source <(kubectl completion 'bash')
echo "[[ $commands[kubectl] ]] && source <(kubectl completion 'zsh')" >> ~/.zshrc
# Show the merged configuration.
kubectl config view
@@ -48,13 +48,13 @@ kubectl config view -o jsonpath='{.users[*].name}'
kubectl config view -o jsonpath='{.users[?(@.name == "e2e")].user.password}'
# Set configuration values.
kubectl config set-context --current --namespace=keda
kubectl config set-context gce --user=cluster-admin --namespace=foo
kubectl config set-context --current --namespace='keda'
kubectl config set-context 'gce' --user='cluster-admin' --namespace='foo'
kubectl config set-credentials \
kubeuser/foo.kubernetes.com --username=kubeuser --password=kubepassword
'kubeuser/foo.kubernetes.com' --username='kubeuser' --password='kubepassword'
# Delete configuration values.
kubectl config unset users.foo
kubectl config unset 'users.foo'
# Use multiple config files at once.
# This will temporarily merge them in one big configuration file.
@@ -65,47 +65,47 @@ kubectl config get-contexts
kubectl config current-context
# Set context as the default one.
kubectl config use-context docker-desktop
kubectl config use-context gce
kubectl config use-context 'docker-desktop'
kubectl config use-context 'gce'
# Display addresses of the master and services.
kubectl cluster-info
# Dump the complete current cluster state.
kubectl cluster-info dump
kubectl cluster-info dump --output-directory=/path/to/cluster-state
kubectl cluster-info dump --output-directory='/path/to/cluster-state'
# List supported resources types along with their short name, API group, Kind,
# and whether they are namespaced.
kubectl api-resources
kubectl api-resources --namespaced=true
kubectl api-resources -o name
kubectl api-resources -o wide
kubectl api-resources --verbs=list,get
kubectl api-resources --namespaced='true'
kubectl api-resources -o 'name'
kubectl api-resources -o 'wide'
kubectl api-resources --verbs='list,get'
# Show the documentation about resources or their fields.
kubectl explain pods
kubectl explain pods.spec.containers
kubectl explain 'pods'
kubectl explain 'pods.spec.containers'
# List and filter resources.
kubectl get pods
kubectl get pod/coredns-845757d86-47np2 -n kube-system
kubectl get 'pod/coredns-845757d86-47np2' -n 'kube-system'
kubectl get namespaces,pods --show-labels
kubectl get services -A -o wide
kubectl get rs --sort-by=.metadata.name
kubectl get pv --sort-by=.spec.capacity.storage --no-headers
kubectl get services -A -o 'wide'
kubectl get rs --sort-by='.metadata.name'
kubectl get pv --sort-by='.spec.capacity.storage' --no-headers
kubectl get po --sort-by='.status.containerStatuses[0].restartCount'
kubectl get events --sort-by .metadata.creationTimestamp
kubectl get pods --field-selector=status.phase=Running
kubectl get events --sort-by '.metadata.creationTimestamp'
kubectl get pods --field-selector='status.phase=Running'
kubectl get node -l='!node-role.kubernetes.io/master'
kubectl get replicasets -l 'environment in (prod, qa)'
kubectl get deploy --selector 'tier,tier notin (frontend)'
# Extract information from resources' definition.
kubectl get deployment nginx -o yaml
kubectl get cm kube-root-ca.crt -o jsonpath='{.data.ca\.crt}'
kubectl get deployment 'nginx' -o 'yaml'
kubectl get cm 'kube-root-ca.crt' -o jsonpath='{.data.ca\.crt}'
kubectl get po -o=jsonpath='{.items..metadata.name}'
kubectl get po -l app=redis -o jsonpath='{.items[*].metadata.labels.version}'
kubectl get po -l 'app=redis' -o jsonpath='{.items[*].metadata.labels.version}'
kubectl get nodes \
-o jsonpath='{.items[*].status.addresses[?(@.type=="ExternalIP")].address}'
@@ -126,9 +126,9 @@ kubectl get nodes \
| grep "Ready=True"
# List all secrets currently in use by a Pod.
kubectl get pods -o json \
kubectl get pods -o 'json' \
| jq '.items[].spec.containers[].env[]?.valueFrom.secretKeyRef.name' \
| grep -v null | sort | uniq
| grep -v 'null' | sort | uniq
# List the name of Pods belonging to a particular RC.
SELECTOR=${$(kubectl get rc my-rc --output=json | jq -j '.spec.selector | to_entries | .[] | "\(.key)=\(.value),"')%?} kubectl get pods -l=$SELECTOR \
@@ -144,17 +144,17 @@ kubectl get pods --all-namespaces \
# Produce a period-delimited tree of all keys returned for nodes.
# Helpful when trying to locate a specific key within a complex nested JSON
# structure.
kubectl get nodes -o json | jq -c 'path(..)|[.[]|tostring]|join(".")'
kubectl get nodes -o 'json' | jq -c 'path(..)|[.[]|tostring]|join(".")'
# Show detailed information about resources.
kubectl describe node pi
kubectl describe deploy,rs,po -l app=redis
kubectl describe deploy,rs,po -l 'app=redis'
# Create resources from manifests.
kubectl apply -f manifest.yaml
kubectl apply -f path/to/m1.yaml -f ./m2.yaml
kubectl apply -f dir/
kubectl apply -f https://git.io/vPieo
kubectl apply -f 'manifest.yaml'
kubectl apply -f 'path/to/m1.yaml' -f './m2.yaml'
kubectl apply -f 'dir/'
kubectl apply -f 'https://git.io/vPieo'
cat <<-EOF | kubectl apply -f -
apiVersion: v1
kind: Secret
@@ -171,18 +171,27 @@ EOF
kubectl diff -f ./manifest.yaml
# Start a Pod.
kubectl run nginx --image nginx
kubectl run busybox --rm -it --image=busybox -n keda -- sh
kubectl run 'nginx' --image 'nginx'
kubectl run 'busybox' --rm -it --image='busybox' -n 'keda' -- sh
kubectl run 'alpine' --restart=Never -it --image 'alpine' -- sh
kubectl run 'ephemeral' --image=registry.k8s.io/pause:3.1 --restart=Never
# Start a Pod and write its specs into a file.
kubectl run nginx --image=nginx --dry-run=client -o yaml > pod.yaml
kubectl run 'nginx' --image='nginx' --dry-run='client' -o 'yaml' > 'pod.yaml'
# Create a single instance deployment of 'nginx'.
kubectl create deployment nginx --image=nginx
kubectl create deployment 'nginx' --image 'nginx'
# Start a Job using an existing Job as template
kubectl create job backup-before-upgrade-13.6.2-to-13.9.2 \
--from=cronjob.batch/backup -n gitlab
# Start a Job printing "Hello World".
kubectl create job 'hello' --image 'busybox:1.28' -- echo "Hello World"
# Start a Job using an existing Job as template.
kubectl create job 'backup-before-upgrade-13.6.2-to-13.9.2' \
--from=cronjob.batch/backup -n 'gitlab'
# Start a CronJob printing "Hello World" every minute.
kubectl create cronjob 'hello' --image=busybox:1.28 --schedule="*/1 * * * *" \
-- echo "Hello World"
# Wait for a pod to be 'ready'.
kubectl wait --for 'condition=ready' --timeout 120s \
@@ -210,7 +219,7 @@ kubectl rollout status -w deployment/frontend
kubectl rollout restart deployment/frontend
# Replace a Pod based on the JSON passed into stdin.
cat pod.json | kubectl replace -f -
cat 'pod.json' | kubectl replace -f -
# Force replacement, deletion and recreation (in this order) of resources.
# This will cause a service outage.
@@ -225,8 +234,9 @@ kubectl get pod mypod -o yaml \
| sed 's/\(image: myimage\):.*$/\1:v4/' \
| kubectl replace -f -
# Add Labels to resources.
kubectl label pods nginx custom-name=awesome
# Add Labels.
kubectl label pods 'nginx' 'custom-name=awesome'
kubectl label ns 'default' 'pod-security.kubernetes.io/enforce=privileged'
# Add Annotations.
kubectl annotate pods alpine icon-url=http://goo.gl/XXBTWq
@@ -336,6 +346,9 @@ kubectl taint nodes node1 key1=value1:NoSchedule-
# If a taint with that key and effect already exists, replace its value.
kubectl taint nodes foo dedicated=special-user:NoSchedule
# Execute a privileged, debug container.
kubectl debug -it 'node/docker-desktop' --image 'busybox:1.28'
# Mark Nodes as unschedulable.
kubectl cordon my-node
@@ -351,6 +364,14 @@ kubectl top node my-node
# Listen on port 5000 on the local machine and forward connections to port 6000
# of my-pod
kubectl port-forward my-pod 5000:6000
# Show Containers' status, properties and capabilities from the inside.
# Run the command from *inside* the container.
cat /proc/1/status
# Check a container's capabilities.
# Run the command from *inside* the container.
grep 'Cap' /proc/1/status
```
## Configuration
@@ -546,6 +567,8 @@ Verbosity | Description
- [Taints and Tolerations]
- [Commands reference]
- [Configure access to multiple clusters]
- [Configure a Security Context for a Pod or Container]
- [Enforce Pod Security Standards with Namespace Labels]
## Sources
@@ -554,14 +577,16 @@ Verbosity | Description
- [Run a replicated stateful application]
- [Accessing an application on Kubernetes in Docker]
<!-- docs -->
<!-- project's references -->
[assigning pods to nodes]: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/
[cheatsheet]: https://kubernetes.io/docs/reference/kubectl/cheatsheet
[commands reference]: https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands
[configure a security context for a pod or container]: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/
[configure access to multiple clusters]: https://kubernetes.io/docs/tasks/access-application-cluster/configure-access-multiple-clusters/
[enforce pod security standards with namespace labels]: https://kubernetes.io/docs/tasks/configure-pod-container/enforce-standards-namespace-labels/
[taints and tolerations]: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/
<!-- other articles -->
<!-- external references articles -->
[accessing an application on kubernetes in docker]: https://medium.com/@lizrice/accessing-an-application-on-kubernetes-in-docker-1054d46b64b1
[run a replicated stateful application]: https://kubernetes.io/docs/tasks/run-application/run-replicated-stateful-application/
[run a single-instance stateful application]: https://kubernetes.io/docs/tasks/run-application/run-single-instance-stateful-application/

View File

@@ -6,18 +6,23 @@ Hosted by the [Cloud Native Computing Foundation][cncf].
1. [Composition](#composition)
1. [The control plane](#the-control-plane)
1. [kube-apiserver](#kube-apiserver)
2. [etcd](#etcd)
3. [kube-scheduler](#kube-scheduler)
4. [kube-controller-manager](#kube-controller-manager)
5. [cloud-controller-manager](#cloud-controller-manager)
2. [The worker Nodes](#the-worker-nodes)
1. [etcd](#etcd)
1. [kube-scheduler](#kube-scheduler)
1. [kube-controller-manager](#kube-controller-manager)
1. [cloud-controller-manager](#cloud-controller-manager)
1. [The worker Nodes](#the-worker-nodes)
1. [kubelet](#kubelet)
2. [kube-proxy](#kube-proxy)
3. [Container runtime](#container-runtime)
3. [Addons](#addons)
2. [The API](#the-api)
3. [Managed Kubernetes Services](#managed-kubernetes-services)
4. [Sources](#sources)
1. [kube-proxy](#kube-proxy)
1. [Container runtime](#container-runtime)
1. [Addons](#addons)
1. [The API](#the-api)
1. [Managed Kubernetes Services](#managed-kubernetes-services)
1. [Security](#security)
1. [Highly privileged containers](#highly-privileged-containers)
1. [Capabilities](#capabilities)
1. [Privileged container vs privilege escalation](#privileged-container-vs-privilege-escalation)
1. [Further readings](#further-readings)
1. [Sources](#sources)
## Composition
@@ -107,7 +112,7 @@ The software that is responsible for running containers.
Kubernetes supports container runtimes like `containerd`, `CRI-O`, and any other implementation of the Kubernetes CRI (Container Runtime Interface).
### Addons
#### Addons
Addons use Kubernetes resources (_DaemonSet_, _Deployment_, etc) to implement cluster features, and as such namespaced resources for addons belong within the `kube-system` namespace.
@@ -131,15 +136,119 @@ The Kubernetes API can be extended:
Cloud providers offer managed versions.
## Security
### Highly privileged containers
Some workloads (e.g. [ElasticSearch]) might require changing one or more system settings for performance, stability, or other reasons.<br/>
This is usually achieved by executing the change from a Container with high privileges, which has access to the Node's resources and breaks the isolation Containers are usually famous for. If such a highly privileged Container is compromised, an attacker can use it to gain access to the underlying Node.
To mitigate this, [Kubernetes introduced the design of a Security Context][security context design proposal].<br/>
From this document:
> A security context is a set of constraints that are applied to a Container in order to achieve the following goals (from the [Security design][Security Design Proposal]):
>
> - ensure a **clear isolation** between the Container and the underlying host it runs on;
> - **limit** the ability of the Container to negatively impact the infrastructure or other Containers.
>
> [The main idea is that] **Containers should only be granted the access they need to perform their work**. The Security Context takes advantage of containerization features such as the ability to [add or remove capabilities][Runtime privilege and Linux capabilities in Docker containers] to give a process some privileges, but not all the privileges of the `root` user.
#### Capabilities
Adding capabilities to a Container is **not** making it _privileged_, **nor** allowing _privilege escalation_. It is just giving the Container the ability to write to specific files or devices depending on the given capability.
This means having a capability assigned does **not** automatically make the Container able to wreak havoc on a Node, and this practice **can be a legitimate use** of this feature instead.
From the feature's `man` page:
> Linux divides the privileges traditionally associated with superuser into distinct units, known as _capabilities_, which can be independently enabled and disabled. Capabilities are a per-thread attribute.
This also means a Container will be **limited** to its contents, plus the capabilities it has been assigned.
Some capabilities are assigned to all Containers by default, while others (the ones which could cause more issues) need to be **explicitly** set using the Containers' `securityContext.capabilities.add` property.<br/>
If a Container is _privileged_ (see [Privileged container vs privilege escalation](#privileged-container-vs-privilege-escalation)), it will have access to **all** the capabilities, regardless of which ones are explicitly assigned to it.
Check:
- [Linux capabilities], to see what capabilities can be assigned to a process **in a Linux system**;
- [Runtime privilege and Linux capabilities in Docker containers] for the capabilities available **inside Kubernetes**, and
- [Container capabilities in Kubernetes] for a handy table associating capabilities in Kubernetes to their Linux variant.
#### Privileged container vs privilege escalation
A _privileged container_ is very different from a _container leveraging privilege escalation_.
A **privileged container** can do whatever a process running directly on the Node can.<br/>
It will automatically be assigned **all** [capabilities](#capabilities), and being `root` in this container is effectively being `root` on the Node it is running on.
> For a Container to be _privileged_, its definition **requires the `securityContext.privileged` property set to `true`**.
**Privilege escalation** allows **a process inside the Container** to gain more privileges than its parent process.<br/>
The process will be able to assume `root`-like powers, but will have access only to the **assigned** [capabilities](#capabilities) and generally have limited to no access to the Node like any other Container.
> For a Container to _leverage privilege escalation_, its definition **requires the `securityContext.allowPrivilegeEscalation` property**:
>
> - to **either** be set to `true`, or
> - to **not be set** at all **if**:
> - the Container is already privileged, or
> - the Container has `SYS_ADMIN` capabilities.
>
> This property directly controls whether the [`no_new_privs`][No New Privileges Design Proposal] flag gets set on the Container's process.
From the [design document for `no_new_privs`][No New Privileges Design Proposal]:
> In Linux, the `execve` system call can grant more privileges to a newly-created process than its parent process. Considering security issues, since Linux kernel v3.5, there is a new flag named `no_new_privs` added to prevent those new privileges from being granted to the processes.
>
> `no_new_privs` is inherited across `fork`, `clone` and `execve` and **can not be unset**. With `no_new_privs` set, `execve` promises not to grant the privilege to do anything that could not have been done without the `execve` call.
>
> For more details about `no_new_privs`, please check the [Linux kernel documentation][no_new_privs linux kernel documentation].
>
> […]
>
> To recap, below is a table defining the default behavior at the pod security policy level and what can be set as a default with a pod security policy:
>
> | allowPrivilegeEscalation setting | uid = 0 or unset | uid != 0 | privileged/CAP_SYS_ADMIN |
> | -------------------------------- | ------------------ | ------------------ | ------------------------ |
> | nil | no_new_privs=true | no_new_privs=false | no_new_privs=false |
> | false | no_new_privs=true | no_new_privs=true | no_new_privs=false |
> | true | no_new_privs=false | no_new_privs=false | no_new_privs=false |
## Further readings
- Kubernetes' [security context design proposal]
- Kubernetes' [No New Privileges Design Proposal]
- [Linux kernel documentation about `no_new_privs`][no_new_privs linux kernel documentation]
- [Linux capabilities]
- [Runtime privilege and Linux capabilities in Docker containers]
- [Container capabilities in Kubernetes]
- [Configure a Security Context for a Pod or a Container], specifically the [Set capabilities for a Container] section
- [Kubernetes SecurityContext Capabilities Explained]
- [Best practices for pod security in Azure Kubernetes Service (AKS)]
- [`kubectl`][kubectl]
## Sources
- [Concepts]
All the references in the [further readings](#further-readings) section, plus the following:
- Kubernetes' [concepts]
<!-- project's documentation -->
[api deprecation policy]: https://kubernetes.io/docs/reference/using-api/deprecation-policy/
[concepts]: https://kubernetes.io/docs/concepts/
[configure a security context for a pod or a container]: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/
[no new privileges design proposal]: https://github.com/kubernetes/design-proposals-archive/blob/main/auth/no-new-privs.md
[security context design proposal]: https://github.com/kubernetes/design-proposals-archive/blob/main/auth/security_context.md
[security design proposal]: https://github.com/kubernetes/design-proposals-archive/blob/main/auth/security.md
[set capabilities for a container]: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-capabilities-for-a-container
<!-- internal references -->
[kubectl]: kubectl.md
<!-- external references -->
[best practices for pod security in azure kubernetes service (aks)]: https://learn.microsoft.com/en-us/azure/aks/developer-best-practices-pod-security
[cncf]: https://www.cncf.io/
[container capabilities in kubernetes]: https://unofficial-kubernetes.readthedocs.io/en/latest/concepts/policy/container-capabilities/
[elasticsearch]: https://github.com/elastic/helm-charts/issues/689
[kubernetes securitycontext capabilities explained]: https://www.golinuxcloud.com/kubernetes-securitycontext-capabilities/
[linux capabilities]: https://man7.org/linux/man-pages/man7/capabilities.7.html
[no_new_privs linux kernel documentation]: https://www.kernel.org/doc/Documentation/prctl/no_new_privs.txt
[runtime privilege and linux capabilities in docker containers]: https://docs.docker.com/engine/reference/run/#runtime-privilege-and-linux-capabilities