diff --git a/config.toml b/config.toml index c054c77990521..9b649a423039f 100644 --- a/config.toml +++ b/config.toml @@ -138,10 +138,10 @@ time_format_default = "January 02, 2006 at 3:04 PM PST" description = "Production-Grade Container Orchestration" showedit = true -latest = "v1.25" +latest = "v1.26" -fullversion = "v1.25.0" -version = "v1.25" +fullversion = "v1.26.0" +version = "v1.26" githubbranch = "main" docsbranch = "main" deprecated = false @@ -180,6 +180,13 @@ js = [ "script" ] +[[params.versions]] +fullversion = "v1.26.0" +version = "v1.26" +githubbranch = "v1.26.0" +docsbranch = "main" +url = "https://kubernetes.io" + [[params.versions]] fullversion = "v1.25.0" version = "v1.25" @@ -208,13 +215,6 @@ githubbranch = "v1.22.11" docsbranch = "release-1.22" url = "https://v1-22.docs.kubernetes.io" -[[params.versions]] -fullversion = "v1.21.14" -version = "v1.21" -githubbranch = "v1.21.14" -docsbranch = "release-1.21" -url = "https://v1-21.docs.kubernetes.io" - # User interface configuration [params.ui] # Enable to show the side bar menu in its compact state. diff --git a/content/en/blog/_posts/2022-05-03-kubernetes-release-1.24.md b/content/en/blog/_posts/2022-05-03-kubernetes-release-1.24.md index f29c6c92fe7a8..be4c435009cc4 100644 --- a/content/en/blog/_posts/2022-05-03-kubernetes-release-1.24.md +++ b/content/en/blog/_posts/2022-05-03-kubernetes-release-1.24.md @@ -32,7 +32,7 @@ Existing beta APIs and new versions of existing beta APIs will continue to be en Release artifacts are [signed](https://github.com/kubernetes/enhancements/issues/3031) using [cosign](https://github.com/sigstore/cosign) signatures, -and there is experimental support for [verifying image signatures](/docs/tasks/administer-cluster/verify-signed-images/). +and there is experimental support for [verifying image signatures](/docs/tasks/administer-cluster/verify-signed-artifacts/). Signing and verification of release artifacts is part of [increasing software supply chain security for the Kubernetes release process](https://github.com/kubernetes/enhancements/issues/3027). ### OpenAPI v3 @@ -84,8 +84,7 @@ that enables the caller of a function to control all aspects of logging (output ### Avoiding Collisions in IP allocation to Services Kubernetes 1.24 introduces a new opt-in feature that allows you to -[soft-reserve a range for static IP address assignments](/docs/concepts/services-networking/service/#service-ip-static-sub-range) -to Services. +soft-reserve a range for static IP address assignments to Services. With the manual enablement of this feature, the cluster will prefer automatic assignment from the pool of Service IP addresses, thereby reducing the risk of collision. diff --git a/content/en/blog/_posts/2022-08-18-kubernetes-1.24-release-interview.md b/content/en/blog/_posts/2022-08-18-kubernetes-1.24-release-interview.md index 49cd30184ec6d..5aed0ffdb133a 100644 --- a/content/en/blog/_posts/2022-08-18-kubernetes-1.24-release-interview.md +++ b/content/en/blog/_posts/2022-08-18-kubernetes-1.24-release-interview.md @@ -203,7 +203,7 @@ JAMES LAVERACK: This is really about encouraging the use of stable APIs. There w JAMES LAVERACK: That's correct. There's no breaking changes in beta APIs other than the ones we've documented this release. It's only new things. 
-**CRAIG BOX: Now in this release, [the artifacts are signed](https://github.com/kubernetes/enhancements/issues/3031) using Cosign signatures, and there is [experimental support for verification of those signatures](https://kubernetes.io/docs/tasks/administer-cluster/verify-signed-images/). What needed to happen to make that process possible?**
+**CRAIG BOX: Now in this release, [the artifacts are signed](https://github.com/kubernetes/enhancements/issues/3031) using Cosign signatures, and there is [experimental support for verification of those signatures](https://kubernetes.io/docs/tasks/administer-cluster/verify-signed-artifacts/). What needed to happen to make that process possible?**

JAMES LAVERACK: This was a huge process from the other half of SIG Release. SIG Release has the release team, but it also has the release engineering team that handles the mechanics of actually pushing releases out. They have spent, and one of my friends over there, Adolfo, has spent a lot of time trying to bring us in line with [SLSA](https://slsa.dev/) compliance. I believe we're [looking now at Level 3 compliance](https://github.com/kubernetes/enhancements/issues/3027).

diff --git a/content/en/docs/concepts/architecture/leases.md b/content/en/docs/concepts/architecture/leases.md
new file mode 100644
index 0000000000000..f7fbd3906da61
--- /dev/null
+++ b/content/en/docs/concepts/architecture/leases.md
@@ -0,0 +1,80 @@
+---
+title: Leases
+content_type: concept
+weight: 30
+---
+
+
+
+Distributed systems often have a need for "leases", which provide a mechanism to lock shared resources and coordinate activity between nodes.
+In Kubernetes, the "lease" concept is represented by `Lease` objects in the `coordination.k8s.io` API group, which are used for system-critical
+capabilities like node heartbeats and component-level leader election.
+
+
+
+## Node heartbeats
+
+Kubernetes uses the Lease API to communicate kubelet node heartbeats to the Kubernetes API server.
+For every `Node`, there is a `Lease` object with a matching name in the `kube-node-lease`
+namespace. Under the hood, every kubelet heartbeat is an UPDATE request to this `Lease` object, updating
+the `spec.renewTime` field for the Lease. The Kubernetes control plane uses the timestamp of this field
+to determine the availability of this `Node`.
+
+See [Node Lease objects](/docs/concepts/architecture/nodes/#heartbeats) for more details.
+
+## Leader Election
+
+Leases are also used in Kubernetes to ensure only one instance of a component is running at any given time.
+This is used by control plane components like `kube-controller-manager` and `kube-scheduler` in
+HA configurations, where only one instance of the component should be actively running while the other
+instances are on standby.
+
+## API Server Identity
+
+{{< feature-state for_k8s_version="v1.26" state="beta" >}}
+
+Starting in Kubernetes v1.26, each `kube-apiserver` uses the Lease API to publish its identity to the
+rest of the system. While not particularly useful on its own, this provides a mechanism for clients to
+discover how many instances of `kube-apiserver` are operating the Kubernetes control plane.
+Existence of kube-apiserver leases enables future capabilities that may require coordination between
+each kube-apiserver.
+
+You can inspect Leases owned by each kube-apiserver by checking for lease objects in the `kube-system` namespace
+with the name `kube-apiserver-`.
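For example, you can list the Lease objects in that namespace and look for entries whose names carry the `kube-apiserver-` prefix. This is a minimal sketch; the hash suffixes and ages you see depend entirely on your cluster:

```shell
# List Leases in kube-system and keep only the kube-apiserver entries
# (the hash suffix in each name is cluster-specific).
kubectl -n kube-system get lease | grep kube-apiserver
```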
Alternatively, you can use the label selector `k8s.io/component=kube-apiserver`:
+
+```shell
+$ kubectl -n kube-system get lease -l k8s.io/component=kube-apiserver
+NAME                                        HOLDER                                                                           AGE
+kube-apiserver-c4vwjftbvpc5os2vvzle4qg27a   kube-apiserver-c4vwjftbvpc5os2vvzle4qg27a_9cbf54e5-1136-44bd-8f9a-1dcd15c346b4   5m33s
+kube-apiserver-dz2dqprdpsgnm756t5rnov7yka   kube-apiserver-dz2dqprdpsgnm756t5rnov7yka_84f2a85d-37c1-4b14-b6b9-603e62e4896f   4m23s
+kube-apiserver-fyloo45sdenffw2ugwaz3likua   kube-apiserver-fyloo45sdenffw2ugwaz3likua_c5ffa286-8a9a-45d4-91e7-61118ed58d2e   4m43s
+```
+
+The SHA256 hash used in the lease name is based on the OS hostname as seen by kube-apiserver. Each kube-apiserver should be
+configured to use a hostname that is unique within the cluster. New instances of kube-apiserver that use the same hostname
+will take over existing Leases using a new holder identity, as opposed to instantiating new lease objects. You can check the
+hostname used by kube-apiserver by checking the value of the `kubernetes.io/hostname` label:
+
+```shell
+$ kubectl -n kube-system get lease kube-apiserver-c4vwjftbvpc5os2vvzle4qg27a -o yaml
+```
+
+```yaml
+apiVersion: coordination.k8s.io/v1
+kind: Lease
+metadata:
+  creationTimestamp: "2022-11-30T15:37:15Z"
+  labels:
+    k8s.io/component: kube-apiserver
+    kubernetes.io/hostname: kind-control-plane
+  name: kube-apiserver-c4vwjftbvpc5os2vvzle4qg27a
+  namespace: kube-system
+  resourceVersion: "18171"
+  uid: d6c68901-4ec5-4385-b1ef-2d783738da6c
+spec:
+  holderIdentity: kube-apiserver-c4vwjftbvpc5os2vvzle4qg27a_9cbf54e5-1136-44bd-8f9a-1dcd15c346b4
+  leaseDurationSeconds: 3600
+  renewTime: "2022-11-30T18:04:27.912073Z"
+```
+
+Expired leases from kube-apiservers that no longer exist are garbage collected by new kube-apiservers after 1 hour.

diff --git a/content/en/docs/concepts/architecture/nodes.md b/content/en/docs/concepts/architecture/nodes.md
index ef48d31fa2596..d36d82174b70d 100644
--- a/content/en/docs/concepts/architecture/nodes.md
+++ b/content/en/docs/concepts/architecture/nodes.md
@@ -456,7 +456,7 @@ Message: Pod was terminated in response to imminent node shutdown.

## Non Graceful node shutdown {#non-graceful-node-shutdown}

-{{< feature-state state="alpha" for_k8s_version="v1.24" >}}
+{{< feature-state state="beta" for_k8s_version="v1.26" >}}

A node shutdown action may not be detected by kubelet's Node Shutdown Manager,
either because the command does not trigger the inhibitor locks mechanism used by

diff --git a/content/en/docs/concepts/containers/images.md b/content/en/docs/concepts/containers/images.md
index d7d037d21b59a..a135d1d1b6d46 100644
--- a/content/en/docs/concepts/containers/images.md
+++ b/content/en/docs/concepts/containers/images.md
@@ -167,6 +167,9 @@ Credentials can be provided in several ways:
  - Configuring Nodes to Authenticate to a Private Registry
    - all pods can read any configured private registries
    - requires node configuration by cluster administrator
+  - Kubelet Credential Provider to dynamically fetch credentials for private registries
+    - the kubelet can be configured to use a credential provider exec plugin
+      for the respective private registry.
  - Pre-pulled Images
    - all pods can use any images cached on a node
    - requires root access to all nodes to set up
@@ -187,6 +190,18 @@ For an example of configuring a private container image registry, see the
[Pull an Image from a Private Registry](/docs/tasks/configure-pod-container/pull-image-private-registry)
task. That example uses a private registry in Docker Hub.
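As a minimal sketch of that kind of setup (the registry address, credential values, Secret name, and image below are placeholders, not values taken from this page), you create a registry Secret and reference it from a Pod:

```shell
# Create a kubernetes.io/dockerconfigjson Secret holding registry credentials.
# All values are illustrative.
kubectl create secret docker-registry my-registry-cred \
  --docker-server=registry.example.com \
  --docker-username=my-user \
  --docker-password='s3cr3t'
```

```yaml
# Reference the Secret so the kubelet can pull the private image.
apiVersion: v1
kind: Pod
metadata:
  name: private-image-pod   # illustrative name
spec:
  containers:
  - name: app
    image: registry.example.com/team/app:1.0.0   # illustrative image
  imagePullSecrets:
  - name: my-registry-cred
```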
+### Kubelet credential provider for authenticated image pulls {#kubelet-credential-provider}
+
+{{< note >}}
+This approach is especially suitable when the kubelet needs to fetch registry credentials dynamically.
+It is most commonly used for registries provided by cloud providers where auth tokens are short-lived.
+{{< /note >}}
+
+You can configure the kubelet to invoke a plugin binary to dynamically fetch registry credentials for a container image.
+This is the most robust and versatile way to fetch credentials for private registries, but it also requires kubelet-level configuration to enable.
+
+See [Configure a kubelet image credential provider](/docs/tasks/administer-cluster/kubelet-credential-provider/) for more details.
+
### Interpretation of config.json {#config-json}

The interpretation of `config.json` varies between the original Docker

diff --git a/content/en/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins.md b/content/en/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins.md
index c845b949ff07b..6645984559bb1 100644
--- a/content/en/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins.md
+++ b/content/en/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins.md
@@ -8,7 +8,7 @@ weight: 20
---

-{{< feature-state for_k8s_version="v1.10" state="beta" >}}
+{{< feature-state for_k8s_version="v1.26" state="stable" >}}

Kubernetes provides a
[device plugin framework](https://git.k8s.io/design-proposals-archive/resource-management/device-plugin.md)
that you can use to advertise system hardware resources to the
@@ -145,8 +145,8 @@ The general workflow of a device plugin includes the following steps:

### Handling kubelet restarts

A device plugin is expected to detect kubelet restarts and re-register itself with the new
-kubelet instance. In the current implementation, a new kubelet instance deletes all the existing Unix sockets
-under `/var/lib/kubelet/device-plugins` when it starts. A device plugin can monitor the deletion
+kubelet instance. A new kubelet instance deletes all the existing Unix sockets under
+`/var/lib/kubelet/device-plugins` when it starts. A device plugin can monitor the deletion
of its Unix socket and re-register itself upon such an event.

## Device plugin deployment
@@ -165,16 +165,28 @@ Pod onto Nodes, to restart the daemon Pod after failure, and to help automate up

## API compatibility

-Kubernetes device plugin support is in beta. The API may change before stabilization,
-in incompatible ways. As a project, Kubernetes recommends that device plugin developers:
+Previously, the versioning scheme required the Device Plugin's API version to match
+exactly the Kubelet's version. Since the graduation of this feature to Beta in v1.12,
+this is no longer a hard requirement. The API is versioned and has been stable since
+the Beta graduation of this feature. Because of this, kubelet upgrades should be
+seamless, but there may still be changes in the API before stabilization, so upgrades
+are not guaranteed to be non-breaking.

-* Watch for changes in future releases.
+{{< caution >}}
+Although the Device Manager component of Kubernetes is a generally available feature,
+the _device plugin API_ is not stable. For information on the device plugin API and
+version compatibility, read [Device Plugin API versions](/docs/reference/node/device-plugin-api-versions/).
+{{< /caution >}}
+
+As a project, Kubernetes recommends that device plugin developers:
+
+* Watch for Device Plugin API changes in future releases.
* Support multiple versions of the device plugin API for backward/forward compatibility. -If you enable the DevicePlugins feature and run device plugins on nodes that need to be upgraded to -a Kubernetes release with a newer device plugin API version, upgrade your device plugins -to support both versions before upgrading these nodes. Taking that approach will -ensure the continuous functioning of the device allocations during the upgrade. +To run device plugins on nodes that need to be upgraded to a Kubernetes release with +a newer device plugin API version, upgrade your device plugins to support both versions +before upgrading these nodes. Taking that approach will ensure the continuous functioning +of the device allocations during the upgrade. ## Monitoring device plugin resources diff --git a/content/en/docs/concepts/scheduling-eviction/_index.md b/content/en/docs/concepts/scheduling-eviction/_index.md index 00330b5a4b3eb..4ceb552edccac 100644 --- a/content/en/docs/concepts/scheduling-eviction/_index.md +++ b/content/en/docs/concepts/scheduling-eviction/_index.md @@ -26,8 +26,10 @@ of terminating one or more Pods on Nodes. * [Pod Topology Spread Constraints](/docs/concepts/scheduling-eviction/topology-spread-constraints/) * [Taints and Tolerations](/docs/concepts/scheduling-eviction/taint-and-toleration/) * [Scheduling Framework](/docs/concepts/scheduling-eviction/scheduling-framework) +* [Dynamic Resource Allocation](/docs/concepts/scheduling-eviction/dynamic-resource-allocation) * [Scheduler Performance Tuning](/docs/concepts/scheduling-eviction/scheduler-perf-tuning/) * [Resource Bin Packing for Extended Resources](/docs/concepts/scheduling-eviction/resource-bin-packing/) +* [Pod Scheduling Readiness](/docs/concepts/scheduling-eviction/pod-scheduling-readiness/) ## Pod Disruption diff --git a/content/en/docs/concepts/scheduling-eviction/dynamic-resource-allocation.md b/content/en/docs/concepts/scheduling-eviction/dynamic-resource-allocation.md new file mode 100644 index 0000000000000..e1c468f58f099 --- /dev/null +++ b/content/en/docs/concepts/scheduling-eviction/dynamic-resource-allocation.md @@ -0,0 +1,215 @@ +--- +reviewers: +- klueska +- pohly +title: Dynamic Resource Allocation +content_type: concept +weight: 65 +--- + + + +{{< feature-state for_k8s_version="v1.26" state="alpha" >}} + +Dynamic resource allocation is a new API for requesting and sharing resources +between pods and containers inside a pod. It is a generalization of the +persistent volumes API for generic resources. Third-party resource drivers are +responsible for tracking and allocating resources. Different kinds of +resources support arbitrary parameters for defining requirements and +initialization. + +## {{% heading "prerequisites" %}} + +Kubernetes v{{< skew currentVersion >}} includes cluster-level API support for +dynamic resource allocation, but it [needs to be +enabled](#enabling-dynamic-resource-allocation) explicitly. You also must +install a resource driver for specific resources that are meant to be managed +using this API. If you are not running Kubernetes v{{< skew currentVersion>}}, +check the documentation for that version of Kubernetes. + + + +## API + +The new `resource.k8s.io/v1alpha1` {{< glossary_tooltip text="API group" +term_id="api-group" >}} provides four new types: + +ResourceClass +: Defines which resource driver handles a certain kind of + resource and provides common parameters for it. ResourceClasses + are created by a cluster administrator when installing a resource + driver. 
+
+ResourceClaim
+: Defines a particular resource instance that is required by a
+  workload. Created by a user (lifecycle managed manually, can be shared
+  between different Pods) or for individual Pods by the control plane based on
+  a ResourceClaimTemplate (automatic lifecycle, typically used by just one
+  Pod).
+
+ResourceClaimTemplate
+: Defines the spec and some metadata for creating
+  ResourceClaims. Created by a user when deploying a workload.
+
+PodScheduling
+: Used internally by the control plane and resource drivers
+  to coordinate pod scheduling when ResourceClaims need to be allocated
+  for a Pod.
+
+Parameters for ResourceClass and ResourceClaim are stored in separate objects,
+typically using the type defined by a {{< glossary_tooltip
+term_id="CustomResourceDefinition" text="CRD" >}} that was created when
+installing a resource driver.
+
+The `core/v1` `PodSpec` defines ResourceClaims that are needed for a Pod in a new
+`resourceClaims` field. Entries in that list reference either a ResourceClaim
+or a ResourceClaimTemplate. When referencing a ResourceClaim, all Pods using
+this PodSpec (for example, inside a Deployment or StatefulSet) share the same
+ResourceClaim instance. When referencing a ResourceClaimTemplate, each Pod gets
+its own instance.
+
+The `resources.claims` list for container resources defines whether a container gets
+access to these resource instances, which makes it possible to share resources
+between one or more containers.
+
+Here is an example for a fictional resource driver. Two ResourceClaim objects
+will get created for this Pod and each container gets access to one of them.
+
+```yaml
+apiVersion: resource.k8s.io/v1alpha1
+kind: ResourceClass
+name: resource.example.com
+driverName: resource-driver.example.com
+---
+apiVersion: cats.resource.example.com/v1
+kind: ClaimParameters
+name: large-black-cat-claim-parameters
+spec:
+  color: black
+  size: large
+---
+apiVersion: resource.k8s.io/v1alpha1
+kind: ResourceClaimTemplate
+metadata:
+  name: large-black-cat-claim-template
+spec:
+  spec:
+    resourceClassName: resource.example.com
+    parametersRef:
+      apiGroup: cats.resource.example.com
+      kind: ClaimParameters
+      name: large-black-cat-claim-parameters
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: pod-with-cats
+spec:
+  containers:
+  - name: container0
+    image: ubuntu:20.04
+    command: ["sleep", "9999"]
+    resources:
+      claims:
+      - name: cat-0
+  - name: container1
+    image: ubuntu:20.04
+    command: ["sleep", "9999"]
+    resources:
+      claims:
+      - name: cat-1
+  resourceClaims:
+  - name: cat-0
+    source:
+      resourceClaimTemplateName: large-black-cat-claim-template
+  - name: cat-1
+    source:
+      resourceClaimTemplateName: large-black-cat-claim-template
+```
+
+## Scheduling
+
+In contrast to native resources (CPU, RAM) and extended resources (managed by a
+device plugin, advertised by kubelet), the scheduler has no knowledge of what
+dynamic resources are available in a cluster or how they could be split up to
+satisfy the requirements of a specific ResourceClaim. Resource drivers are
+responsible for that. They mark ResourceClaims as "allocated" once resources
+for them are reserved. This also then tells the scheduler where in the cluster a
+ResourceClaim is available.
+
+ResourceClaims can get allocated as soon as they are created ("immediate
+allocation"), without considering which Pods will use them. The default is to
+delay allocation until a Pod that needs the ResourceClaim gets scheduled
+(i.e. "wait for first consumer").
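For contrast with the template-based example above, here is a sketch of a standalone ResourceClaim that a user could create directly and share between Pods. It reuses the fictional ResourceClass and ClaimParameters from the example; the claim name is illustrative, and the `allocationMode` field is shown on the assumption that the `resource.k8s.io/v1alpha1` ResourceClaim spec exposes the allocation-timing choice this way:

```yaml
apiVersion: resource.k8s.io/v1alpha1
kind: ResourceClaim
metadata:
  name: shared-cat-claim   # illustrative name
spec:
  resourceClassName: resource.example.com
  parametersRef:
    apiGroup: cats.resource.example.com
    kind: ClaimParameters
    name: large-black-cat-claim-parameters
  # Assumption: request allocation at creation time rather than waiting for a
  # consumer; omit this field to keep the default delayed allocation.
  allocationMode: Immediate
```

Most workloads are expected to rely on the default delayed allocation instead.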
+ +In that mode, the scheduler checks all ResourceClaims needed by a Pod and +creates a PodScheduling object where it informs the resource drivers +responsible for those ResourceClaims about nodes that the scheduler considers +suitable for the Pod. The resource drivers respond by excluding nodes that +don't have enough of the driver's resources left. Once the scheduler has that +information, it selects one node and stores that choice in the PodScheduling +object. The resource drivers then allocate their ResourceClaims so that the +resources will be available on that node. Once that is complete, the Pod +gets scheduled. + +As part of this process, ResourceClaims also get reserved for the +Pod. Currently ResourceClaims can either be used exclusively by a single Pod or +an unlimited number of Pods. + +One key feature is that Pods do not get scheduled to a node unless all of +their resources are allocated and reserved. This avoids the scenario where a Pod +gets scheduled onto one node and then cannot run there, which is bad because +such a pending Pod also blocks all other resources like RAM or CPU that were +set aside for it. + +## Limitations + +The scheduler plugin must be involved in scheduling Pods which use +ResourceClaims. Bypassing the scheduler by setting the `nodeName` field leads +to Pods that the kubelet refuses to start because the ResourceClaims are not +reserved or not even allocated. It may be possible to [remove this +limitation](https://github.com/kubernetes/kubernetes/issues/114005) in the +future. + +## Enabling dynamic resource allocation + +Dynamic resource allocation is an *alpha feature* and only enabled when the +`DynamicResourceAllocation` [feature +gate](/docs/reference/command-line-tools-reference/feature-gates/) and the +`resource.k8s.io/v1alpha1` {{< glossary_tooltip text="API group" +term_id="api-group" >}} are enabled. For details on that, see the +`--feature-gates` and `--runtime-config` [kube-apiserver +parameters](/docs/reference/command-line-tools-reference/kube-apiserver/). +kube-scheduler, kube-controller-manager and kubelet also need the feature gate. + +A quick check whether a Kubernetes cluster supports the feature is to list +ResourceClass objects with: + +```shell +kubectl get resourceclasses +``` + +If your cluster supports dynamic resource allocation, the response is either a +list of ResourceClass objects or: + +``` +No resources found +``` + +If not supported, this error is printed instead: + +``` +error: the server doesn't have a resource type "resourceclasses" +``` + +The default configuration of kube-scheduler enables the "DynamicResources" +plugin if and only if the feature gate is enabled. Custom configurations may +have to be modified to include it. + +In addition to enabling the feature in the cluster, a resource driver also has to +be installed. Please refer to the driver's documentation for details. + +## {{% heading "whatsnext" %}} + + - For more information on the design, see the +[Dynamic Resource Allocation KEP](https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/3063-dynamic-resource-allocation/README.md). 
diff --git a/content/en/docs/concepts/scheduling-eviction/pod-scheduling-readiness.md b/content/en/docs/concepts/scheduling-eviction/pod-scheduling-readiness.md
new file mode 100644
index 0000000000000..5a5647985d589
--- /dev/null
+++ b/content/en/docs/concepts/scheduling-eviction/pod-scheduling-readiness.md
@@ -0,0 +1,109 @@
+---
+title: Pod Scheduling Readiness
+content_type: concept
+weight: 40
+---
+
+
+
+{{< feature-state for_k8s_version="v1.26" state="alpha" >}}
+
+Pods were considered ready for scheduling as soon as they were created. The Kubernetes
+scheduler does its due diligence to find nodes to place all pending Pods. However, in a
+real-world case, some Pods may stay in a "miss-essential-resources" state for a long period.
+These Pods actually churn the scheduler (and downstream integrators like Cluster Autoscaler)
+in an unnecessary manner.
+
+By specifying/removing a Pod's `.spec.schedulingGates`, you can control when a Pod is ready
+to be considered for scheduling.
+
+
+
+## Configuring Pod schedulingGates
+
+The `schedulingGates` field contains a list of strings, and each string literal is treated as a
+criterion that the Pod must satisfy before it is considered schedulable. This field can be initialized
+only when a Pod is created (either by the client, or mutated during admission). After creation,
+each schedulingGate can be removed in arbitrary order, but addition of a new scheduling gate is disallowed.
+
+{{< mermaid >}}
+stateDiagram-v2
+    s1: pod created
+    s2: pod scheduling gated
+    s3: pod scheduling ready
+    s4: pod running
+    if: empty scheduling gates?
+    [*] --> s1
+    s1 --> if
+    s2 --> if: scheduling gate removed
+    if --> s2: no
+    if --> s3: yes
+    s3 --> s4
+    s4 --> [*]
+{{< /mermaid >}}
+
+## Usage example
+
+To mark a Pod not-ready for scheduling, you can create it with one or more scheduling gates like this:
+
+{{< codenew file="pods/pod-with-scheduling-gates.yaml" >}}
+
+After the Pod's creation, you can check its state using:
+
+```bash
+kubectl get pod test-pod
+```
+
+The output reveals it's in the `SchedulingGated` state:
+
+```none
+NAME       READY   STATUS            RESTARTS   AGE
+test-pod   0/1     SchedulingGated   0          7s
+```
+
+You can also check its `schedulingGates` field by running:
+
+```bash
+kubectl get pod test-pod -o jsonpath='{.spec.schedulingGates}'
+```
+
+The output is:
+
+```none
+[{"name":"foo"},{"name":"bar"}]
+```
+
+To inform the scheduler that this Pod is ready for scheduling, you can remove its `schedulingGates` entirely
+by re-applying a modified manifest:
+
+{{< codenew file="pods/pod-without-scheduling-gates.yaml" >}}
+
+You can check whether the `schedulingGates` field is cleared by running:
+
+```bash
+kubectl get pod test-pod -o jsonpath='{.spec.schedulingGates}'
+```
+
+The output is expected to be empty. And you can check its latest status by running:
+
+```bash
+kubectl get pod test-pod -o wide
+```
+
+Given that the test-pod doesn't request any CPU/memory resources, this Pod's status is expected to
+transition from `SchedulingGated` to `Running`:
+
+```none
+NAME       READY   STATUS    RESTARTS   AGE   IP         NODE
+test-pod   1/1     Running   0          15s   10.0.0.4   node-2
+```
+
+## Observability
+
+The metric `scheduler_pending_pods` comes with a new label `"gated"` to distinguish whether a Pod
+has been tried for scheduling but reported as unschedulable, or explicitly marked as not ready for
+scheduling. You can use `scheduler_pending_pods{queue="gated"}` to check the metric result.
+ +## {{% heading "whatsnext" %}} + +* Read the [PodSchedulingReadiness KEP](https://github.com/kubernetes/enhancements/blob/master/keps/sig-scheduling/3521-pod-scheduling-readiness) for more details diff --git a/content/en/docs/concepts/scheduling-eviction/topology-spread-constraints.md b/content/en/docs/concepts/scheduling-eviction/topology-spread-constraints.md index 9628e4a906da2..62098a0928f42 100644 --- a/content/en/docs/concepts/scheduling-eviction/topology-spread-constraints.md +++ b/content/en/docs/concepts/scheduling-eviction/topology-spread-constraints.md @@ -65,8 +65,8 @@ spec: whenUnsatisfiable: labelSelector: matchLabelKeys: # optional; alpha since v1.25 - nodeAffinityPolicy: [Honor|Ignore] # optional; alpha since v1.25 - nodeTaintsPolicy: [Honor|Ignore] # optional; alpha since v1.25 + nodeAffinityPolicy: [Honor|Ignore] # optional; beta since v1.26 + nodeTaintsPolicy: [Honor|Ignore] # optional; beta since v1.26 ### other Pod fields go here ``` @@ -157,9 +157,8 @@ your cluster. Those fields are: If this value is null, the behavior is equivalent to the Honor policy. {{< note >}} - The `nodeAffinityPolicy` is an alpha-level field added in 1.25. You have to enable the - `NodeInclusionPolicyInPodTopologySpread` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/) - in order to use it. + The `nodeAffinityPolicy` is a beta-level field and enabled by default in 1.26. You can disable it by disabling the + `NodeInclusionPolicyInPodTopologySpread` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/). {{< /note >}} - **nodeTaintsPolicy** indicates how we will treat node taints when calculating @@ -171,9 +170,8 @@ your cluster. Those fields are: If this value is null, the behavior is equivalent to the Ignore policy. {{< note >}} - The `nodeTaintsPolicy` is an alpha-level field added in 1.25. You have to enable the - `NodeInclusionPolicyInPodTopologySpread` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/) - in order to use it. + The `nodeTaintsPolicy` is a beta-level field and enabled by default in 1.26. You can disable it by disabling the + `NodeInclusionPolicyInPodTopologySpread` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/). {{< /note >}} When a Pod defines more than one `topologySpreadConstraint`, those constraints are diff --git a/content/en/docs/concepts/security/security-checklist.md b/content/en/docs/concepts/security/security-checklist.md index 9da2c30103b87..e2d352fa6b152 100644 --- a/content/en/docs/concepts/security/security-checklist.md +++ b/content/en/docs/concepts/security/security-checklist.md @@ -278,7 +278,7 @@ for time-bound service account credentials. - [ ] Container images are configured to be run as unprivileged user. - [ ] References to container images are made by sha256 digests (rather than tags) or the provenance of the image is validated by verifying the image's -digital signature at deploy time [via admission control](/docs/tasks/administer-cluster/verify-signed-images/#verifying-image-signatures-with-admission-controller). +digital signature at deploy time [via admission control](/docs/tasks/administer-cluster/verify-signed-artifacts/#verifying-image-signatures-with-admission-controller). - [ ] Container images are regularly scanned during creation and in deployment, and known vulnerable software is patched. 
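As a hedged illustration of the digest item above (the registry, image name, and digest are placeholders, not values from this checklist):

```yaml
# Pinning a container image by its immutable sha256 digest instead of a tag.
apiVersion: v1
kind: Pod
metadata:
  name: digest-pinned-pod   # illustrative name
spec:
  containers:
  - name: app
    image: registry.example.com/team/app@sha256:4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba873c2f11161202b945
```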
@@ -301,7 +301,7 @@ Avoid using image tags to reference an image, especially the `latest` tag, the image behind a tag can be easily modified in a registry. Prefer using the complete `sha256` digest which is unique to the image manifest. This policy can be enforced via an [ImagePolicyWebhook](/docs/reference/access-authn-authz/admission-controllers/#imagepolicywebhook). -Image signatures can also be automatically [verified with an admission controller](/docs/tasks/administer-cluster/verify-signed-images/#verifying-image-signatures-with-admission-controller) +Image signatures can also be automatically [verified with an admission controller](/docs/tasks/administer-cluster/verify-signed-artifacts/#verifying-image-signatures-with-admission-controller) at deploy time to validate their authenticity and integrity. Scanning a container image can prevent critical vulnerabilities from being diff --git a/content/en/docs/concepts/services-networking/cluster-ip-allocation.md b/content/en/docs/concepts/services-networking/cluster-ip-allocation.md new file mode 100644 index 0000000000000..848cf7ba9230a --- /dev/null +++ b/content/en/docs/concepts/services-networking/cluster-ip-allocation.md @@ -0,0 +1,149 @@ +--- +reviewers: +- sftim +- thockin +title: Service ClusterIP allocation +content_type: concept +weight: 120 +--- + + + + +In Kubernetes, [Services](/docs/concepts/services-networking/service/) are an abstract way to expose +an application running on a set of Pods. Services +can have a cluster-scoped virtual IP address (using a Service of `type: ClusterIP`). +Clients can connect using that virtual IP address, and Kubernetes then load-balances traffic to that +Service across the different backing Pods. + + + +## How Service ClusterIPs are allocated? + +When Kubernetes needs to assign a virtual IP address for a Service, +that assignment happens one of two ways: + +_dynamically_ +: the cluster's control plane automatically picks a free IP address from within the configured IP range for `type: ClusterIP` Services. + +_statically_ +: you specify an IP address of your choice, from within the configured IP range for Services. + +Across your whole cluster, every Service `ClusterIP` must be unique. +Trying to create a Service with a specific `ClusterIP` that has already +been allocated will return an error. + +## Why do you need to reserve Service Cluster IPs? + +Sometimes you may want to have Services running in well-known IP addresses, so other components and +users in the cluster can use them. + +The best example is the DNS Service for the cluster. As a soft convention, some Kubernetes installers assign the 10th IP address from +the Service IP range to the DNS service. 
Assuming you configured your cluster with Service IP range
+10.96.0.0/16 and you want your DNS Service IP to be 10.96.0.10, you'd have to create a Service like
+this:
+
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    k8s-app: kube-dns
+    kubernetes.io/cluster-service: "true"
+    kubernetes.io/name: CoreDNS
+  name: kube-dns
+  namespace: kube-system
+spec:
+  clusterIP: 10.96.0.10
+  ports:
+  - name: dns
+    port: 53
+    protocol: UDP
+    targetPort: 53
+  - name: dns-tcp
+    port: 53
+    protocol: TCP
+    targetPort: 53
+  selector:
+    k8s-app: kube-dns
+  type: ClusterIP
+```
+
+but as it was explained before, the IP address 10.96.0.10 has not been reserved; if other Services are created
+before or in parallel with dynamic allocation, there is a chance they can allocate this IP, hence,
+you will not be able to create the DNS Service because it will fail with a conflict error.
+
+## How can you avoid Service ClusterIP conflicts? {#avoid-ClusterIP-conflict}
+
+The allocation strategy implemented in Kubernetes to allocate ClusterIPs to Services reduces the
+risk of collision.
+
+The `ClusterIP` range is divided, based on the formula `min(max(16, cidrSize / 16), 256)`,
+described as _never less than 16 or more than 256 with a graduated step between them_.
+
+Dynamic IP assignment uses the upper band by default, once this has been exhausted it will
+use the lower range. This will allow users to use static allocations on the lower band with a low
+risk of collision.
+
+## Examples {#allocation-examples}
+
+### Example 1 {#allocation-example-1}
+
+This example uses the IP address range: 10.96.0.0/24 (CIDR notation) for the IP addresses
+of Services.
+
+Range Size: 2<sup>8</sup> - 2 = 254
+Band Offset: `min(max(16, 256/16), 256)` = `min(16, 256)` = 16
+Static band start: 10.96.0.1
+Static band end: 10.96.0.16
+Range end: 10.96.0.254
+
+{{< mermaid >}}
+pie showData
+    title 10.96.0.0/24
+    "Static" : 16
+    "Dynamic" : 238
+{{< /mermaid >}}
+
+### Example 2 {#allocation-example-2}
+
+This example uses the IP address range: 10.96.0.0/20 (CIDR notation) for the IP addresses
+of Services.
+
+Range Size: 2<sup>12</sup> - 2 = 4094
+Band Offset: `min(max(16, 4096/16), 256)` = `min(256, 256)` = 256
+Static band start: 10.96.0.1
+Static band end: 10.96.1.0
+Range end: 10.96.15.254
+
+{{< mermaid >}}
+pie showData
+    title 10.96.0.0/20
+    "Static" : 256
+    "Dynamic" : 3838
+{{< /mermaid >}}
+
+### Example 3 {#allocation-example-3}
+
+This example uses the IP address range: 10.96.0.0/16 (CIDR notation) for the IP addresses
+of Services.
+
+Range Size: 2<sup>16</sup> - 2 = 65534
+Band Offset: `min(max(16, 65536/16), 256)` = `min(4096, 256)` = 256
+Static band start: 10.96.0.1
+Static band end: 10.96.1.0
+Range end: 10.96.255.254
+
+{{< mermaid >}}
+pie showData
+    title 10.96.0.0/16
+    "Static" : 256
+    "Dynamic" : 65278
+{{< /mermaid >}}
+
+## {{% heading "whatsnext" %}}
+
+* Read about [Service External Traffic Policy](/docs/tasks/access-application-cluster/create-external-load-balancer/#preserving-the-client-source-ip)
+* Read about [Connecting Applications with Services](/docs/concepts/services-networking/connect-applications-service/)
+* Read about [Services](/docs/concepts/services-networking/service/)
+

diff --git a/content/en/docs/concepts/services-networking/dns-pod-service.md b/content/en/docs/concepts/services-networking/dns-pod-service.md
index 1341d5e7c0b49..adb626830497d 100644
--- a/content/en/docs/concepts/services-networking/dns-pod-service.md
+++ b/content/en/docs/concepts/services-networking/dns-pod-service.md
@@ -323,16 +323,24 @@ search default.svc.cluster-domain.example svc.cluster-domain.example cluster-dom
options ndots:5
```

-#### Expanded DNS Configuration
+## DNS search domain list limits

-{{< feature-state for_k8s_version="1.22" state="alpha" >}}
+{{< feature-state for_k8s_version="1.26" state="beta" >}}

-By default, for Pod's DNS Config, Kubernetes allows at most 6 search domains and
-a list of search domains of up to 256 characters.
+Kubernetes itself does not limit the DNS Config until the length of the search
+domain list exceeds 32 or the total length of all search domains exceeds 2048.
+This limit applies to the node's resolver configuration file, the Pod's DNS
+Config, and the merged DNS Config respectively.

-If the feature gate `ExpandedDNSConfig` is enabled for the kube-apiserver and
-the kubelet, it is allowed for Kubernetes to have at most 32 search domains and
-a list of search domains of up to 2048 characters.
+{{< note >}}
+Some container runtimes of earlier versions may have their own restrictions on
+the number of DNS search domains. Depending on the container runtime
+environment, Pods with a large number of DNS search domains may get stuck in
+the pending state.
+
+It is known that containerd v1.5.5 or earlier and CRI-O v1.21 or earlier have
+this problem.
+{{< /note >}}

## DNS resolution on Windows nodes {#dns-windows}

diff --git a/content/en/docs/concepts/services-networking/service-traffic-policy.md b/content/en/docs/concepts/services-networking/service-traffic-policy.md
index 9342da3cbc428..92e2c524bfa8e 100644
--- a/content/en/docs/concepts/services-networking/service-traffic-policy.md
+++ b/content/en/docs/concepts/services-networking/service-traffic-policy.md
@@ -14,7 +14,7 @@ description: >-

-{{< feature-state for_k8s_version="v1.23" state="beta" >}}
+{{< feature-state for_k8s_version="v1.26" state="stable" >}}

_Service Internal Traffic Policy_ enables internal traffic restrictions to only
route internal traffic to endpoints within the node the traffic originated from. The
"internal" traffic here refers to traffic originated from Pods in the current
cluster. This can help to reduce costs and improve performance.

## Using Service Internal Traffic Policy

-The `ServiceInternalTrafficPolicy` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
-is a Beta feature and enabled by default.
-When the feature is enabled, you can enable the internal-only traffic policy for a +You can enable the internal-only traffic policy for a {{< glossary_tooltip text="Service" term_id="service" >}}, by setting its -`.spec.internalTrafficPolicy` to `Local`. -This tells kube-proxy to only use node local endpoints for cluster internal traffic. +`.spec.internalTrafficPolicy` to `Local`. This tells kube-proxy to only use node local +endpoints for cluster internal traffic. {{< note >}} For pods on nodes with no endpoints for a given Service, the Service @@ -60,10 +58,8 @@ spec: The kube-proxy filters the endpoints it routes to based on the `spec.internalTrafficPolicy` setting. When it's set to `Local`, only node local -endpoints are considered. When it's `Cluster` or missing, all endpoints are -considered. -When the [feature gate](/docs/reference/command-line-tools-reference/feature-gates/) -`ServiceInternalTrafficPolicy` is enabled, `spec.internalTrafficPolicy` defaults to "Cluster". +endpoints are considered. When it's `Cluster` (the default), or is not set, +Kubernetes considers all endpoints. ## {{% heading "whatsnext" %}} diff --git a/content/en/docs/concepts/storage/persistent-volumes.md b/content/en/docs/concepts/storage/persistent-volumes.md index 06127e7c5f1a5..2ae3d42964d2e 100644 --- a/content/en/docs/concepts/storage/persistent-volumes.md +++ b/content/en/docs/concepts/storage/persistent-volumes.md @@ -297,7 +297,6 @@ the following types of volumes: * {{< glossary_tooltip text="csi" term_id="csi" >}} * flexVolume (deprecated) * gcePersistentDisk -* glusterfs (deprecated) * rbd * portworxVolume @@ -438,8 +437,6 @@ The following types of PersistentVolume are deprecated. This means that support (**deprecated** in v1.23) * [`gcePersistentDisk`](/docs/concepts/storage/volumes/#gcepersistentdisk) - GCE Persistent Disk (**deprecated** in v1.17) -* [`glusterfs`](/docs/concepts/storage/volumes/#glusterfs) - Glusterfs volume - (**deprecated** in v1.25) * [`portworxVolume`](/docs/concepts/storage/volumes/#portworxvolume) - Portworx volume (**deprecated** in v1.25) * [`vsphereVolume`](/docs/concepts/storage/volumes/#vspherevolume) - vSphere VMDK volume @@ -616,7 +613,6 @@ The following volume types support mount options: * `cephfs` * `cinder` (**deprecated** in v1.18) * `gcePersistentDisk` -* `glusterfs` (**deprecated** in v1.25) * `iscsi` * `nfs` * `rbd` @@ -745,10 +741,9 @@ it won't be supported in a future Kubernetes release. #### Retroactive default StorageClass assignment -{{< feature-state for_k8s_version="v1.25" state="alpha" >}} +{{< feature-state for_k8s_version="v1.26" state="beta" >}} You can create a PersistentVolumeClaim without specifying a `storageClassName` for the new PVC, and you can do so even when no default StorageClass exists in your cluster. In this case, the new PVC creates as you defined it, and the `storageClassName` of that PVC remains unset until default becomes available. -However, if you enable the [`RetroactiveDefaultStorageClass` feature gate](/docs/reference/command-line-tools-reference/feature-gates/) then Kubernetes behaves differently: existing PVCs without `storageClassName` update to use the new default StorageClass. When a default StorageClass becomes available, the control plane identifies any existing PVCs without `storageClassName`. For the PVCs that either have an empty value for `storageClassName` or do not have this key, the control plane then updates those PVCs to set `storageClassName` to match the new default StorageClass. 
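As a minimal sketch of that behavior (the claim name and size are placeholders), such a PVC simply omits `storageClassName` and waits for a default StorageClass to appear:

```yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: pvc-awaiting-default   # illustrative name
spec:
  # No storageClassName: under the beta behavior described above, the control
  # plane sets it once a default StorageClass becomes available.
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
```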
If you have an existing PVC where the `storageClassName` is `""`, and you configure a
default StorageClass, then this PVC will not get updated.

@@ -953,6 +948,25 @@ or to a VolumeSnapshot, the `dataSourceRef` field can contain a reference to any
same namespace, except for core objects other than PVCs. For clusters that have the feature
gate enabled, use of the `dataSourceRef` is preferred over `dataSource`.

+## Cross namespace data sources
+{{< feature-state for_k8s_version="v1.26" state="alpha" >}}
+
+Kubernetes supports cross namespace volume data sources.
+To use cross namespace volume data sources, you must enable the `AnyVolumeDataSource` and `CrossNamespaceVolumeDataSource`
+[feature gates](/docs/reference/command-line-tools-reference/feature-gates/) for
+the kube-apiserver and kube-controller-manager.
+Also, you must enable the `CrossNamespaceVolumeDataSource` feature gate for the csi-provisioner.
+
+Enabling the `CrossNamespaceVolumeDataSource` feature gate allows you to specify a namespace in the `dataSourceRef` field.
+{{< note >}}
+When you specify a namespace for a volume data source, Kubernetes checks for a
+ReferenceGrant in the other namespace before accepting the reference.
+ReferenceGrant is part of the `gateway.networking.k8s.io` extension APIs.
+See [ReferenceGrant](https://gateway-api.sigs.k8s.io/api-types/referencegrant/) in the Gateway API documentation for details.
+This means that you must extend your Kubernetes cluster with at least ReferenceGrant from the
+Gateway API before you can use this mechanism.
+{{< /note >}}
+
## Data source references

The `dataSourceRef` field behaves almost the same as the `dataSource` field. If either one is
@@ -970,6 +984,11 @@ users should be aware of:
* The `dataSourceRef` field may contain different types of objects, while the `dataSource` field
  only allows PVCs and VolumeSnapshots.

+When the `CrossNamespaceVolumeDataSource` feature is enabled, there are additional differences:
+
+* The `dataSource` field only allows local objects, while the `dataSourceRef` field allows objects in any namespace.
+* When a namespace is specified, `dataSource` and `dataSourceRef` are not synced.
+
Users should always use `dataSourceRef` on clusters that have the feature gate enabled, and fall
back to `dataSource` on clusters that do not. It is not necessary to look at both fields under
any circumstance. The duplicated values with slightly different semantics exist only for
@@ -1010,6 +1029,50 @@ is registered to handle that kind of data source. When a suitable populator is i
responsibility of that populator controller to report Events that relate to volume creation and issues during
the process.

+### Using a cross-namespace volume data source
+{{< feature-state for_k8s_version="v1.26" state="alpha" >}}
+
+Create a ReferenceGrant to allow the namespace owner to accept the reference.
+You define a populated volume by specifying a cross namespace volume data source using the `dataSourceRef` field.
You must already have a valid ReferenceGrant in the source namespace: + + ```yaml + apiVersion: gateway.networking.k8s.io/v1beta1 + kind: ReferenceGrant + metadata: + name: allow-ns1-pvc + namespace: default + spec: + from: + - group: "" + kind: PersistentVolumeClaim + namespace: ns1 + to: + - group: snapshot.storage.k8s.io + kind: VolumeSnapshot + name: new-snapshot-demo + ``` + + ```yaml + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: foo-pvc + namespace: ns1 + spec: + storageClassName: example + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi + dataSourceRef: + apiGroup: snapshot.storage.k8s.io + kind: VolumeSnapshot + name: new-snapshot-demo + namespace: default + volumeMode: Filesystem + ``` + ## Writing Portable Configuration If you're writing configuration templates or examples that run on a wide range of clusters diff --git a/content/en/docs/concepts/storage/storage-classes.md b/content/en/docs/concepts/storage/storage-classes.md index e733bb2debafd..cffed404563cc 100644 --- a/content/en/docs/concepts/storage/storage-classes.md +++ b/content/en/docs/concepts/storage/storage-classes.md @@ -72,7 +72,6 @@ for provisioning PVs. This field must be specified. | FC | - | - | | FlexVolume | - | - | | GCEPersistentDisk | ✓ | [GCE PD](#gce-pd) | -| Glusterfs | ✓ | [Glusterfs](#glusterfs) | | iSCSI | - | - | | NFS | - | [NFS](#nfs) | | RBD | ✓ | [Ceph RBD](#ceph-rbd) | @@ -123,7 +122,6 @@ Volume type | Required Kubernetes version gcePersistentDisk | 1.11 awsElasticBlockStore | 1.11 Cinder | 1.11 -glusterfs | 1.11 rbd | 1.11 Azure File | 1.11 Azure Disk | 1.11 @@ -338,87 +336,6 @@ using `allowedTopologies`. [allowedTopologies](#allowed-topologies) {{< /note >}} -### Glusterfs (deprecated) {#glusterfs} - -```yaml -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: slow -provisioner: kubernetes.io/glusterfs -parameters: - resturl: "http://127.0.0.1:8081" - clusterid: "630372ccdc720a92c681fb928f27b53f" - restauthenabled: "true" - restuser: "admin" - secretNamespace: "default" - secretName: "heketi-secret" - gidMin: "40000" - gidMax: "50000" - volumetype: "replicate:3" -``` - -* `resturl`: Gluster REST service/Heketi service url which provision gluster - volumes on demand. The general format should be `IPaddress:Port` and this is - a mandatory parameter for GlusterFS dynamic provisioner. If Heketi service is - exposed as a routable service in openshift/kubernetes setup, this can have a - format similar to `http://heketi-storage-project.cloudapps.mystorage.com` - where the fqdn is a resolvable Heketi service url. -* `restauthenabled` : Gluster REST service authentication boolean that enables - authentication to the REST server. If this value is `"true"`, `restuser` and - `restuserkey` or `secretNamespace` + `secretName` have to be filled. This - option is deprecated, authentication is enabled when any of `restuser`, - `restuserkey`, `secretName` or `secretNamespace` is specified. -* `restuser` : Gluster REST service/Heketi user who has access to create volumes - in the Gluster Trusted Pool. -* `restuserkey` : Gluster REST service/Heketi user's password which will be used - for authentication to the REST server. This parameter is deprecated in favor - of `secretNamespace` + `secretName`. -* `secretNamespace`, `secretName` : Identification of Secret instance that - contains user password to use when talking to Gluster REST service. 
These - parameters are optional, empty password will be used when both - `secretNamespace` and `secretName` are omitted. The provided secret must have - type `"kubernetes.io/glusterfs"`, for example created in this way: - - ``` - kubectl create secret generic heketi-secret \ - --type="kubernetes.io/glusterfs" --from-literal=key='opensesame' \ - --namespace=default - ``` - - Example of a secret can be found in - [glusterfs-provisioning-secret.yaml](https://github.com/kubernetes/examples/tree/master/staging/persistent-volume-provisioning/glusterfs/glusterfs-secret.yaml). - -* `clusterid`: `630372ccdc720a92c681fb928f27b53f` is the ID of the cluster - which will be used by Heketi when provisioning the volume. It can also be a - list of clusterids, for example: - `"8452344e2becec931ece4e33c4674e4e,42982310de6c63381718ccfa6d8cf397"`. This - is an optional parameter. -* `gidMin`, `gidMax` : The minimum and maximum value of GID range for the - StorageClass. A unique value (GID) in this range ( gidMin-gidMax ) will be - used for dynamically provisioned volumes. These are optional values. If not - specified, the volume will be provisioned with a value between 2000-2147483647 - which are defaults for gidMin and gidMax respectively. -* `volumetype` : The volume type and its parameters can be configured with this - optional value. If the volume type is not mentioned, it's up to the provisioner - to decide the volume type. - - For example: - * Replica volume: `volumetype: replicate:3` where '3' is replica count. - * Disperse/EC volume: `volumetype: disperse:4:2` where '4' is data and '2' is the redundancy count. - * Distribute volume: `volumetype: none` - - For available volume types and administration options, refer to the - [Administration Guide](https://access.redhat.com/documentation/en-us/red_hat_gluster_storage/). - - For further reference information, see - [How to configure Heketi](https://github.com/heketi/heketi/wiki/Setting-up-the-topology). - - When persistent volumes are dynamically provisioned, the Gluster plugin - automatically creates an endpoint and a headless service in the name - `gluster-dynamic-`. The dynamic endpoint and service are automatically - deleted when the persistent volume claim is deleted. - ### NFS ```yaml diff --git a/content/en/docs/concepts/storage/volumes.md b/content/en/docs/concepts/storage/volumes.md index a9cf03fe5b872..4b265eceb7820 100644 --- a/content/en/docs/concepts/storage/volumes.md +++ b/content/en/docs/concepts/storage/volumes.md @@ -172,7 +172,7 @@ For more details, see the [`azureFile` volume plugin](https://github.com/kuberne #### azureFile CSI migration -{{< feature-state for_k8s_version="v1.21" state="beta" >}} +{{< feature-state for_k8s_version="v1.26" state="stable" >}} The `CSIMigration` feature for `azureFile`, when enabled, redirects all plugin operations from the existing in-tree plugin to the `file.csi.azure.com` Container @@ -542,24 +542,15 @@ spec: repository: "git@somewhere:me/my-git-repository.git" revision: "22f1d8406d464b0c0874075539c1f2e96c253775" ``` +### glusterfs (removed) {#glusterfs} -### glusterfs (deprecated) {#glusterfs} + +- +Kubernetes {{< skew currentVersion >}} does not include a `glusterfs` volume type. -{{< feature-state for_k8s_version="v1.25" state="deprecated" >}} - -A `glusterfs` volume allows a [Glusterfs](https://www.gluster.org) (an open -source networked filesystem) volume to be mounted into your Pod. 
Unlike
-`emptyDir`, which is erased when a Pod is removed, the contents of a
-`glusterfs` volume are preserved and the volume is merely unmounted. This
-means that a `glusterfs` volume can be pre-populated with data, and that data can
-be shared between pods. GlusterFS can be mounted by multiple writers
-simultaneously.
-
-{{< note >}}
-You must have your own GlusterFS installation running before you can use it.
-{{< /note >}}
-
-See the [GlusterFS example](https://github.com/kubernetes/examples/tree/master/volumes/glusterfs) for more details.
+The GlusterFS in-tree storage driver was deprecated in the Kubernetes v1.25 release
+and then removed entirely in the v1.26 release.

### hostPath {#hostpath}

@@ -952,20 +943,17 @@ For more information, see the [vSphere volume](https://github.com/kubernetes/exa

#### vSphere CSI migration {#vsphere-csi-migration}

-{{< feature-state for_k8s_version="v1.19" state="beta" >}}
-
-The `CSIMigrationvSphere` feature for `vsphereVolume` is enabled by default as of Kubernetes v1.25.
-All plugin operations from the in-tree `vspherevolume` will be redirected to the `csi.vsphere.vmware.com` {{< glossary_tooltip text="CSI" term_id="csi" >}} driver unless `CSIMigrationvSphere` feature gate is disabled.
+{{< feature-state for_k8s_version="v1.26" state="stable" >}}
+In Kubernetes {{< skew currentVersion >}}, all operations for the in-tree `vsphereVolume` type
+are redirected to the `csi.vsphere.vmware.com` {{< glossary_tooltip text="CSI" term_id="csi" >}} driver.

[vSphere CSI driver](https://github.com/kubernetes-sigs/vsphere-csi-driver) must be installed on the cluster. You can find additional advice on how to migrate in-tree `vsphereVolume` in VMware's documentation page
-[Migrating In-Tree vSphere Volumes to vSphere Container Storage Plug-in](https://docs.vmware.com/en/VMware-vSphere-Container-Storage-Plug-in/2.0/vmware-vsphere-csp-getting-started/GUID-968D421F-D464-4E22-8127-6CB9FF54423F.html).
+[Migrating In-Tree vSphere Volumes to vSphere Container Storage Plug-in](https://docs.vmware.com/en/VMware-vSphere-Container-Storage-Plug-in/2.0/vmware-vsphere-csp-getting-started/GUID-968D421F-D464-4E22-8127-6CB9FF54423F.html).
+If the vSphere CSI driver is not installed, volume operations cannot be performed on PVs created with the in-tree `vsphereVolume` type.

-As of Kubernetes v1.25, vSphere releases less than 7.0u2 are not supported for the
-(deprecated) in-tree vSphere storage driver. You must run vSphere 7.0u2 or later
-in order to either continue using the deprecated driver, or to migrate to
-the replacement CSI driver.
+You must run vSphere 7.0u2 or later in order to migrate to the vSphere CSI driver.

If you are running a version of Kubernetes other than v{{< skew currentVersion >}}, consult
the documentation for that version of Kubernetes.
diff --git a/content/en/docs/concepts/windows/intro.md b/content/en/docs/concepts/windows/intro.md index 89553a4f38e90..2d0fa8d0a7856 100644 --- a/content/en/docs/concepts/windows/intro.md +++ b/content/en/docs/concepts/windows/intro.md @@ -238,11 +238,11 @@ work between Windows and Linux: The following list documents differences between how Pod specifications work between Windows and Linux: * `hostIPC` and `hostpid` - host namespace sharing is not possible on Windows -* `hostNetwork` - There is no Windows OS support to share the host network +* `hostNetwork` - [see below](/docs/concepts/windows/intro#compatibility-v1-pod-spec-containers-hostnetwork) * `dnsPolicy` - setting the Pod `dnsPolicy` to `ClusterFirstWithHostNet` is not supported on Windows because host networking is not provided. Pods always run with a container network. -* `podSecurityContext` (see below) +* `podSecurityContext` [see below](/docs/concepts/windows/intro#compatibility-v1-pod-spec-containers-securitycontext) * `shareProcessNamespace` - this is a beta feature, and depends on Linux namespaces which are not implemented on Windows. Windows cannot share process namespaces or the container's root filesystem. Only the network can be shared. @@ -261,6 +261,17 @@ The following list documents differences between how Pod specifications work bet * You cannot enable `mountPropagation` for volume mounts as this is not supported on Windows. +#### Field compatibility for hostNetwork {#compatibility-v1-pod-spec-containers-hostnetwork} + +{{< feature-state for_k8s_version="v1.26" state="alpha" >}} + +The kubelet can now request that pods running on Windows nodes use the host's network namespace instead +of creating a new pod network namespace. To enable this functionality pass `--feature-gates=WindowsHostNetwork=true` to the kubelet. + +{{< note >}} +This functionality requires a container runtime that supports this functionality. +{{< /note >}} + #### Field compatibility for Pod security context {#compatibility-v1-pod-spec-containers-securitycontext} None of the Pod [`securityContext`](/docs/reference/kubernetes-api/workload-resources/pod-v1/#security-context) fields work on Windows. diff --git a/content/en/docs/concepts/workloads/controllers/job.md b/content/en/docs/concepts/workloads/controllers/job.md index 3d5fd8441b4a5..08855b8b08eed 100644 --- a/content/en/docs/concepts/workloads/controllers/job.md +++ b/content/en/docs/concepts/workloads/controllers/job.md @@ -1,5 +1,6 @@ --- reviewers: +- alculquicondor - erictune - soltysh title: Jobs @@ -290,6 +291,10 @@ starts a new Pod. This means that your application needs to handle the case whe pod. In particular, it needs to handle temporary files, locks, incomplete output and the like caused by previous runs. +By default, each pod failure is counted towards the `.spec.backoffLimit` limit, +see [pod backoff failure policy](#pod-backoff-failure-policy). However, you can +customize handling of pod failures by setting the Job's [pod failure policy](#pod-failure-policy). + Note that even if you specify `.spec.parallelism = 1` and `.spec.completions = 1` and `.spec.template.spec.restartPolicy = "Never"`, the same program may sometimes be started twice. @@ -297,6 +302,19 @@ sometimes be started twice. If you do specify `.spec.parallelism` and `.spec.completions` both greater than 1, then there may be multiple pods running at once. Therefore, your pods must also be tolerant of concurrency. 
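As a minimal sketch of such a Job (the name, image, and counts are illustrative), several Pods run at once and each worker must tolerate retries and concurrency:

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: parallel-work   # illustrative name
spec:
  completions: 6     # total number of successful Pods required
  parallelism: 2     # number of Pods running at the same time
  backoffLimit: 4    # retries before the Job is marked as failed
  template:
    spec:
      restartPolicy: Never
      containers:
      - name: worker
        image: busybox:1.36
        command: ["sh", "-c", "echo processing one work item && sleep 5"]
```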
+When the [feature gates](/docs/reference/command-line-tools-reference/feature-gates/) +`PodDisruptionConditions` and `JobPodFailurePolicy` are both enabled, +and the `.spec.podFailurePolicy` field is set, the Job controller does not consider a terminating +Pod (a pod that has a `.metadata.deletionTimestamp` field set) as a failure until that Pod is +terminal (its `.status.phase` is `Failed` or `Succeeded`). However, the Job controller +creates a replacement Pod as soon as the termination becomes apparent. Once the +pod terminates, the Job controller evaluates `.backoffLimit` and `.podFailurePolicy` +for the relevant Job, taking this now-terminated Pod into consideration. + +If either of these requirements is not satisfied, the Job controller counts +a terminating Pod as an immediate failure, even if that Pod later terminates +with `phase: "Succeeded"`. + ### Pod backoff failure policy There are situations where you want to fail a Job after some amount of retries @@ -314,10 +332,6 @@ The number of retries is calculated in two ways: If either of the calculations reaches the `.spec.backoffLimit`, the Job is considered failed. -When the [`JobTrackingWithFinalizers`](#job-tracking-with-finalizers) feature is -disabled, the number of failed Pods is only based on Pods that are still present -in the API. - {{< note >}} If your job has `restartPolicy = "OnFailure"`, keep in mind that your Pod running the Job will be terminated once the job backoff limit has been reached. This can make debugging the Job's executable more difficult. We suggest setting @@ -701,7 +715,7 @@ mismatch. ### Pod failure policy {#pod-failure-policy} -{{< feature-state for_k8s_version="v1.25" state="alpha" >}} +{{< feature-state for_k8s_version="v1.26" state="beta" >}} {{< note >}} You can only configure a Pod failure policy for a Job if you have the @@ -710,7 +724,7 @@ enabled in your cluster. Additionally, it is recommended to enable the `PodDisruptionConditions` feature gate in order to be able to detect and handle Pod disruption conditions in the Pod failure policy (see also: [Pod disruption conditions](/docs/concepts/workloads/pods/disruptions#pod-disruption-conditions)). Both feature gates are -available in Kubernetes v1.25. +available in Kubernetes {{< skew currentVersion >}}. {{< /note >}} A Pod failure policy, defined with the `.spec.podFailurePolicy` field, enables @@ -784,43 +798,33 @@ These are some requirements and semantics of the API: ### Job tracking with finalizers -{{< feature-state for_k8s_version="v1.23" state="beta" >}} +{{< feature-state for_k8s_version="v1.26" state="stable" >}} {{< note >}} -In order to use this behavior, you must enable the `JobTrackingWithFinalizers` -[feature gate](/docs/reference/command-line-tools-reference/feature-gates/) -on the [API server](/docs/reference/command-line-tools-reference/kube-apiserver/) -and the [controller manager](/docs/reference/command-line-tools-reference/kube-controller-manager/). - -When enabled, the control plane tracks new Jobs using the behavior described -below. Jobs created before the feature was enabled are unaffected. As a user, -the only difference you would see is that the control plane tracking of Job -completion is more accurate. +The control plane doesn't track Jobs using finalizers, if the Jobs were created +when the feature gate `JobTrackingWithFinalizers` was disabled, even after you +upgrade the control plane to 1.26. 
{{< /note >}} -When this feature isn't enabled, the Job {{< glossary_tooltip term_id="controller" >}} -relies on counting the Pods that exist in the cluster to track the Job status, -that is, to keep the counters for `succeeded` and `failed` Pods. -However, Pods can be removed for a number of reasons, including: -- The garbage collector that removes orphan Pods when a Node goes down. -- The garbage collector that removes finished Pods (in `Succeeded` or `Failed` - phase) after a threshold. -- Human intervention to delete Pods belonging to a Job. -- An external controller (not provided as part of Kubernetes) that removes or - replaces Pods. - -If you enable the `JobTrackingWithFinalizers` feature for your cluster, the -control plane keeps track of the Pods that belong to any Job and notices if any -such Pod is removed from the API server. To do that, the Job controller creates Pods with -the finalizer `batch.kubernetes.io/job-tracking`. The controller removes the -finalizer only after the Pod has been accounted for in the Job status, allowing -the Pod to be removed by other controllers or users. - -The Job controller uses the new algorithm for new Jobs only. Jobs created -before the feature is enabled are unaffected. You can determine if the Job -controller is tracking a Job using Pod finalizers by checking if the Job has the -annotation `batch.kubernetes.io/job-tracking`. You should **not** manually add -or remove this annotation from Jobs. +The control plane keeps track of the Pods that belong to any Job and notices if +any such Pod is removed from the API server. To do that, the Job controller +creates Pods with the finalizer `batch.kubernetes.io/job-tracking`. The +controller removes the finalizer only after the Pod has been accounted for in +the Job status, allowing the Pod to be removed by other controllers or users. + +Jobs created before upgrading to Kubernetes 1.26 or before the feature gate +`JobTrackingWithFinalizers` is enabled are tracked without the use of Pod +finalizers. +The Job {{< glossary_tooltip term_id="controller" text="controller" >}} updates +the status counters for `succeeded` and `failed` Pods based only on the Pods +that exist in the cluster. The contol plane can lose track of the progress of +the Job if Pods are deleted from the cluster. + +You can determine if the control plane is tracking a Job using Pod finalizers by +checking if the Job has the annotation +`batch.kubernetes.io/job-tracking`. You should **not** manually add or remove +this annotation from Jobs. Instead, you can recreate the Jobs to ensure they +are tracked using Pod finalizers. ## Alternatives diff --git a/content/en/docs/concepts/workloads/controllers/statefulset.md b/content/en/docs/concepts/workloads/controllers/statefulset.md index e7482851c40f4..bfe29e81a84ff 100644 --- a/content/en/docs/concepts/workloads/controllers/statefulset.md +++ b/content/en/docs/concepts/workloads/controllers/statefulset.md @@ -154,8 +154,23 @@ regardless of which node it's (re)scheduled on. ### Ordinal Index -For a StatefulSet with N replicas, each Pod in the StatefulSet will be -assigned an integer ordinal, from 0 up through N-1, that is unique over the Set. +For a StatefulSet with N [replicas](#replicas), each Pod in the StatefulSet +will be assigned an integer ordinal, that is unique over the Set. By default, +pods will be assigned ordinals from 0 up through N-1. 
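
For illustration, here is a minimal StatefulSet sketch that relies on the default ordinal assignment described above; with the (assumed) name `web` and three replicas, the control plane creates Pods named `web-0`, `web-1`, and `web-2`.

```yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: web                        # Pods are named web-0, web-1, web-2
spec:
  serviceName: "web"               # assumes a matching headless Service exists
  replicas: 3
  selector:
    matchLabels:
      app: web
  template:
    metadata:
      labels:
        app: web
    spec:
      containers:
        - name: nginx
          image: nginx:1.23        # illustrative image
```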
+ +### Start ordinal + +{{< feature-state for_k8s_version="v1.26" state="alpha" >}} + +`.spec.ordinals` is an optional field that allows you to configure the integer +ordinals assigned to each Pod. It defaults to nil. You must enable the +`StatefulSetStartOrdinal` +[feature gate](/docs/reference/command-line-tools-reference/feature-gates/) to +use this field. Once enabled, you can configure the following options: + +* `.spec.ordinals.start`: If the `.spec.ordinals.start` field is set, Pods will + be assigned ordinals from `.spec.ordinals.start` up through + `.spec.ordinals.start + .spec.replicas - 1`. ### Stable Network ID diff --git a/content/en/docs/concepts/workloads/pods/disruptions.md b/content/en/docs/concepts/workloads/pods/disruptions.md index eec529f2e6ca7..66d05ef92a3f1 100644 --- a/content/en/docs/concepts/workloads/pods/disruptions.md +++ b/content/en/docs/concepts/workloads/pods/disruptions.md @@ -229,12 +229,17 @@ can happen, according to: ## Pod disruption conditions {#pod-disruption-conditions} -{{< feature-state for_k8s_version="v1.25" state="alpha" >}} +{{< feature-state for_k8s_version="v1.26" state="beta" >}} {{< note >}} -In order to use this behavior, you must enable the `PodDisruptionConditions` +If you are using an older version of Kubernetes than {{< skew currentVersion >}} +please refer to the corresponding version of the documentation. +{{< /note >}} + +{{< note >}} +In order to use this behavior, you must have the `PodDisruptionConditions` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/) -in your cluster. +enabled in your cluster. {{< /note >}} When enabled, a dedicated Pod `DisruptionTarget` [condition](/docs/concepts/workloads/pods/pod-lifecycle/#pod-conditions) is added to indicate @@ -254,6 +259,9 @@ indicates one of the following reasons for the Pod termination: `DeletionByPodGC` : Pod, that is bound to a no longer existing Node, is due to be deleted by [Pod garbage collection](/docs/concepts/workloads/pods/pod-lifecycle/#pod-garbage-collection). +`TerminationByKubelet` +: Pod has been terminated by the kubelet, because of either {{}} or the [graceful node shutdown](/docs/concepts/architecture/nodes/#graceful-node-shutdown). + {{< note >}} A Pod disruption might be interrupted. The control plane might re-attempt to continue the disruption of the same Pod, but it is not guaranteed. As a result, @@ -262,6 +270,10 @@ deleted. In such a situation, after some time, the Pod disruption condition will be cleared. {{< /note >}} +When the `PodDisruptionConditions` feature gate is enabled, +along with cleaning up the pods, the Pod garbage collector (PodGC) will also mark them as failed if they are in a non-terminal +phase (see also [Pod garbage collection](/docs/concepts/workloads/pods/pod-lifecycle/#pod-garbage-collection)). + When using a Job (or CronJob), you may want to use these Pod disruption conditions as part of your Job's [Pod failure policy](/docs/concepts/workloads/controllers/job#pod-failure-policy). diff --git a/content/en/docs/concepts/workloads/pods/pod-lifecycle.md b/content/en/docs/concepts/workloads/pods/pod-lifecycle.md index 06734ee51d495..852f965530c63 100644 --- a/content/en/docs/concepts/workloads/pods/pod-lifecycle.md +++ b/content/en/docs/concepts/workloads/pods/pod-lifecycle.md @@ -507,17 +507,27 @@ If you need to force-delete Pods that are part of a StatefulSet, refer to the ta documentation for [deleting Pods from a StatefulSet](/docs/tasks/run-application/force-delete-stateful-set-pod/). 
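
Building on the note above that Pod disruption conditions can feed into a Job's pod failure policy, the following is a hedged sketch (the Job name, image, and command are assumptions) in which an `Ignore` rule prevents Pods that carry the `DisruptionTarget` condition from counting against `.spec.backoffLimit`.

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: job-ignore-disruptions     # illustrative name
spec:
  backoffLimit: 3
  podFailurePolicy:
    rules:
      - action: Ignore             # a disrupted Pod does not count towards backoffLimit
        onPodConditions:
          - type: DisruptionTarget # condition added by the control plane on disruption
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: main
          image: busybox:1.36      # illustrative image
          command: ["sh", "-c", "echo doing work && sleep 10"]
```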
-### Garbage collection of terminated Pods {#pod-garbage-collection} +### Garbage collection of Pods {#pod-garbage-collection} For failed Pods, the API objects remain in the cluster's API until a human or {{< glossary_tooltip term_id="controller" text="controller" >}} process explicitly removes them. -The control plane cleans up terminated Pods (with a phase of `Succeeded` or +The Pod garbage collector (PodGC), which is a controller in the control plane, cleans up terminated Pods (with a phase of `Succeeded` or `Failed`), when the number of Pods exceeds the configured threshold (determined by `terminated-pod-gc-threshold` in the kube-controller-manager). This avoids a resource leak as Pods are created and terminated over time. +Additionally, PodGC cleans up any Pods which satisfy any of the following conditions: +1. are orphan pods - bound to a node which no longer exists, +2. are unscheduled terminating pods, +3. are terminating pods, bound to a non-ready node tainted with [`node.kubernetes.io/out-of-service`](/docs/reference/labels-annotations-taints/#node-kubernetes-io-out-of-service), when the `NodeOutOfServiceVolumeDetach` feature gate is enabled. + +When the `PodDisruptionConditions` feature gate is enabled, along with +cleaning up the pods, PodGC will also mark them as failed if they are in a non-terminal +phase. Also, PodGC adds a pod disruption condition when cleaning up an orphan +pod (see also: +[Pod disruption conditions](/docs/concepts/workloads/pods/disruptions#pod-disruption-conditions)). ## {{% heading "whatsnext" %}} diff --git a/content/en/docs/reference/access-authn-authz/admission-controllers.md b/content/en/docs/reference/access-authn-authz/admission-controllers.md index 099409f2e5a56..f819c237b1684 100644 --- a/content/en/docs/reference/access-authn-authz/admission-controllers.md +++ b/content/en/docs/reference/access-authn-authz/admission-controllers.md @@ -102,9 +102,16 @@ kube-apiserver -h | grep enable-admission-plugins In Kubernetes {{< skew currentVersion >}}, the default ones are: ```shell -CertificateApproval, CertificateSigning, CertificateSubjectRestriction, DefaultIngressClass, DefaultStorageClass, DefaultTolerationSeconds, LimitRanger, MutatingAdmissionWebhook, NamespaceLifecycle, PersistentVolumeClaimResize, PodSecurity, Priority, ResourceQuota, RuntimeClass, ServiceAccount, StorageObjectInUseProtection, TaintNodesByCondition, ValidatingAdmissionWebhook +CertificateApproval, CertificateSigning, CertificateSubjectRestriction, DefaultIngressClass, DefaultStorageClass, DefaultTolerationSeconds, LimitRanger, MutatingAdmissionWebhook, NamespaceLifecycle, PersistentVolumeClaimResize, PodSecurity, Priority, ResourceQuota, RuntimeClass, ServiceAccount, StorageObjectInUseProtection, TaintNodesByCondition, ValidatingAdmissionPolicy, ValidatingAdmissionWebhook ``` +{{< note >}} +The [`ValidatingAdmissionPolicy`](#validatingadmissionpolicy) admission plugin is enabled +by default, but is only active if you enable the `ValidatingAdmissionPolicy` +[feature gate](/docs/reference/command-line-tools-reference/feature-gates/) **and** +the `admissionregistration.k8s.io/v1alpha1` API. +{{< /note >}} + ## What does each admission controller do? ### AlwaysAdmit {#alwaysadmit} @@ -774,6 +781,12 @@ Nodes as `NotReady` and `NoSchedule`. That tainting avoids a race condition that to be scheduled on new Nodes before their taints were updated to accurately reflect their reported conditions.
+### ValidatingAdmissionPolicy {#validatingadmissionpolicy} + +[This admission controller](/docs/reference/access-authn-authz/validating-admission-policy/) implements the CEL validation for incoming matched requests. +It is enabled when both feature gate `validatingadmissionpolicy` and `admissionregistration.k8s.io/v1alpha1` group/version are enabled. +If any of the ValidatingAdmissionPolicy fails, the request fails. + ### ValidatingAdmissionWebhook {#validatingadmissionwebhook} This admission controller calls any validating webhooks which match the request. Matching diff --git a/content/en/docs/reference/access-authn-authz/authentication.md b/content/en/docs/reference/access-authn-authz/authentication.md index e2e422a1a1299..6ab9ba3c75d89 100644 --- a/content/en/docs/reference/access-authn-authz/authentication.md +++ b/content/en/docs/reference/access-authn-authz/authentication.md @@ -1219,6 +1219,147 @@ The following `ExecCredential` manifest describes a cluster information sample. {{% /tab %}} {{< /tabs >}} +## API access to authentication information for a client {#self-subject-review} + +{{< feature-state for_k8s_version="v1.26" state="alpha" >}} + +If your cluster has the API enabled, you can use the `SelfSubjectReview` API to find out how your Kubernetes cluster maps your authentication +information to identify you as a client. This works whether you are authenticating as a user (typically representing +a real person) or as a ServiceAccount. + +`SelfSubjectReview` objects do not have any configurable fields. On receiving a request, the Kubernetes API server fills the status with the user attributes and returns it to the user. + +Request example (the body would be a `SelfSubjectReview`): +``` +POST /apis/authentication.k8s.io/v1alpha1/selfsubjectreviews +``` +```json +{ + "apiVersion": "authentication.k8s.io/v1alpha1", + "kind": "SelfSubjectReview" +} +``` +Response example: + +```json +{ + "apiVersion": "authentication.k8s.io/v1alpha1", + "kind": "SelfSubjectReview", + "status": { + "userInfo": { + "name": "jane.doe", + "uid": "b6c7cfd4-f166-11ec-8ea0-0242ac120002", + "groups": [ + "viewers", + "editors", + "system:authenticated" + ], + "extra": { + "provider_id": ["token.company.example"] + } + } + } +} +``` + +For convenience, the `kubectl alpha auth whoami` command is present. 
Executing this command will produce the following output (yet different user attributes will be shown): + +* Simple output example + ``` + ATTRIBUTE VALUE + Username jane.doe + Groups [system:authenticated] + ``` + +* Complex example including extra attributes + ``` + ATTRIBUTE VALUE + Username jane.doe + UID b79dbf30-0c6a-11ed-861d-0242ac120002 + Groups [students teachers system:authenticated] + Extra: skills [reading learning] + Extra: subjects [math sports] + ``` +By providing the output flag, it is also possible to print the JSON or YAML representation of the result: + +{{< tabs name="self_subject_attributes_review_Example_1" >}} +{{% tab name="JSON" %}} +```json +{ + "apiVersion": "authentication.k8s.io/v1alpha1", + "kind": "SelfSubjectReview", + "status": { + "userInfo": { + "username": "jane.doe", + "uid": "b79dbf30-0c6a-11ed-861d-0242ac120002", + "groups": [ + "students", + "teachers", + "system:authenticated" + ], + "extra": { + "skills": [ + "reading", + "learning" + ], + "subjects": [ + "math", + "sports" + ] + } + } + } +} +``` +{{% /tab %}} + +{{% tab name="YAML" %}} +```yaml +apiVersion: authentication.k8s.io/v1alpha1 +kind: SelfSubjectReview +status: + userInfo: + username: jane.doe + uid: b79dbf30-0c6a-11ed-861d-0242ac120002 + groups: + - students + - teachers + - system:authenticated + extra: + skills: + - reading + - learning + subjects: + - math + - sports +``` +{{% /tab %}} +{{< /tabs >}} + +This feature is extremely useful when a complicated authentication flow is used in a Kubernetes cluster, +for example, if you use [webhook token authentication](/docs/reference/access-authn-authz/authentication/#webhook-token-authentication) or [authenticating proxy](/docs/reference/access-authn-authz/authentication/#authenticating-proxy). + +{{< note >}} +The Kubernetes API server fills the `userInfo` after all authentication mechanisms are applied, +including [impersonation](/docs/reference/access-authn-authz/authentication/#user-impersonation). +If you, or an authentication proxy, make a SelfSubjectReview using impersonation, +you see the user details and properties for the user that was impersonated. +{{< /note >}} + +By default, all authenticated users can create `SelfSubjectReview` objects when the `APISelfSubjectReview` feature is enabled. It is allowed by the `system:basic-user` cluster role. + +{{< note >}} +You can only make `SelfSubjectReview` requests if: +* the `APISelfSubjectReview` + [feature gate](/docs/reference/command-line-tools-reference/feature-gates/) + is enabled for your cluster +* the API server for your cluster has the `authentication.k8s.io/v1alpha1` + {{< glossary_tooltip term_id="api-group" text="API group" >}} + enabled. +{{< /note >}} + + + ## {{% heading "whatsnext" %}} * Read the [client authentication reference (v1beta1)](/docs/reference/config-api/client-authentication.v1beta1/) diff --git a/content/en/docs/reference/access-authn-authz/validating-admission-policy.md b/content/en/docs/reference/access-authn-authz/validating-admission-policy.md new file mode 100644 index 0000000000000..1cb2e0a2f579d --- /dev/null +++ b/content/en/docs/reference/access-authn-authz/validating-admission-policy.md @@ -0,0 +1,312 @@ +--- +reviewers: +- liggitt +- jpbetz +- cici37 +title: Validating Admission Policy +content_type: concept +--- + + + +{{< feature-state state="alpha" for_k8s_version="v1.26" >}} + +This page provides an overview of Validating Admission Policy. + + + + +## What is Validating Admission Policy? 
+ +Validating admission policies offer a declarative, in-process alternative to validating admission webhooks. + +Validating admission policies use the Common Expression Language (CEL) to declare the validation rules of a policy. +Validation admission policies are highly configurable, enabling policy authors to define policies that can be parameterized and scoped to resources as needed by cluster administrators. + +## What Resources Make a Policy + +A policy is generally made up of three resources: + +- The `ValidatingAdmissionPolicy` describes the abstract logic of a policy (think: "this policy makes sure a particular label is set to a particular value"). + +- A `ValidatingAdmissionPolicyBinding` links the above resources together and provides scoping. If you only want to require an `owner` label to be set for `Pods`, the binding is where you would specify this restriction. + +- A parameter resource provides information to a ValidatingAdmissionPolicy to make it a concrete statement (think "the `owner` label must be set to something that ends in `.company.com`"). A native type such as ConfigMap or a CRD defines the schema of a parameter resource. `ValidatingAdmissionPolicy` objects specify what Kind they are expecting for their parameter resource. + + +At least a `ValidatingAdmissionPolicy` and a corresponding `ValidatingAdmissionPolicyBinding` must be defined for a policy to have an effect. + +If a `ValidatingAdmissionPolicy` does not need to be configured via parameters, simply leave `spec.paramKind` in `ValidatingAdmissionPolicy` unset. + +## {{% heading "prerequisites" %}} + +- Ensure the `ValidatingAdmissionPolicy` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/) is enabled. +- Ensure that the `admissionregistration.k8s.io/v1alpha1` API is enabled. + +## Getting Started with Validating Admission Policy + +Validating Admission Policy is part of the cluster control-plane. You should write and deploy them with great caution. The following describes how to quickly experiment with Validating Admission Policy. + +### Creating a ValidatingAdmissionPolicy + +The following is an example of a ValidatingAdmissionPolicy. +```yaml +apiVersion: admissionregistration.k8s.io/v1alpha1 +kind: ValidatingAdmissionPolicy +metadata: + name: "demo-policy.example.com" +spec: + failurePolicy: Fail + matchConstraints: + resourceRules: + - apiGroups: ["apps"] + apiVersions: ["v1"] + operations: ["CREATE", "UPDATE"] + resources: ["deployments"] + validations: + - expression: "object.spec.replicas <= 5" +``` +`spec.validations` contains CEL expressions which use the [Common Expression Language (CEL)](https://github.com/google/cel-spec) +to validate the request. If an expression evaluates to false, the validation check is enforced according to the `spec.failurePolicy` field. + +To configure a validating admission policy for use in a cluster, a binding is required. 
The following is an example of a ValidatingAdmissionPolicyBinding: +```yaml +apiVersion: admissionregistration.k8s.io/v1alpha1 +kind: ValidatingAdmissionPolicyBinding +metadata: + name: "demo-binding-test.example.com" +spec: + policy: "replicalimit-policy.example.com" + matchResources: + namespaceSelectors: + - key: environment + operator: In + values: ["test"] +``` + +When you try to create a Deployment with a replica count that does not satisfy the validation expression, an error is returned containing the message: +``` +ValidatingAdmissionPolicy 'demo-policy.example.com' with binding 'demo-binding-test.example.com' denied request: failed expression: object.spec.replicas <= 5 +``` + +The above provides a simple example of using ValidatingAdmissionPolicy without a parameter configured. + +#### Parameter resources + +Parameter resources allow a policy configuration to be separate from its definition. +A policy can define `paramKind`, which outlines the GVK of the parameter resource, +and then a policy binding ties a policy by name (via `policyName`) to a particular parameter resource via `paramRef`. + +If parameter configuration is needed, the following is an example of a ValidatingAdmissionPolicy with parameter configuration. +```yaml +apiVersion: admissionregistration.k8s.io/v1alpha1 +kind: ValidatingAdmissionPolicy +metadata: + name: "replicalimit-policy.example.com" +spec: + failurePolicy: Fail + paramKind: + apiVersion: rules.example.com/v1 + kind: ReplicaLimit + matchConstraints: + resourceRules: + - apiGroups: ["apps"] + apiVersions: ["v1"] + operations: ["CREATE", "UPDATE"] + resources: ["deployments"] + validations: + - expression: "object.spec.replicas <= params.maxReplicas" + reason: Invalid +``` +The `spec.paramKind` field of the ValidatingAdmissionPolicy specifies the kind of resources used to parameterize this policy. For this example, it is configured by ReplicaLimit custom resources. +Note in this example how the CEL expression references the parameters via the CEL `params` variable, e.g. `params.maxReplicas`. +`spec.matchConstraints` specifies what resources this policy is designed to validate. +Note that native types such as `ConfigMap` can also be used as a parameter reference. + +The `spec.validations` field contains CEL expressions. If an expression evaluates to false, the validation check is enforced according to the `spec.failurePolicy` field. + +The validating admission policy author is responsible for providing the ReplicaLimit parameter CRD. + +To configure a validating admission policy for use in a cluster, a binding and parameter resource are created. The following is an example of a ValidatingAdmissionPolicyBinding. +```yaml +apiVersion: admissionregistration.k8s.io/v1alpha1 +kind: ValidatingAdmissionPolicyBinding +metadata: + name: "replicalimit-binding-test.example.com" +spec: + policy: "replicalimit-policy.example.com" + paramsRef: + name: "replica-limit-test.example.com" + matchResources: + namespaceSelectors: + - key: environment + operator: In + values: ["test"] +``` +The parameter resource could be as follows: +```yaml +apiVersion: rules.example.com/v1 +kind: ReplicaLimit +metadata: + name: "replica-limit-test.example.com" +maxReplicas: 3 +``` +This policy parameter resource limits deployments to a max of 3 replicas in all namespaces in the test environment. +An admission policy may have multiple bindings.
To bind all other environments to a maxReplicas limit of 100, create another ValidatingAdmissionPolicyBinding: +```yaml +apiVersion: admissionregistration.k8s.io/v1alpha1 +kind: ValidatingAdmissionPolicyBinding +metadata: + name: "replicalimit-binding-nontest" +spec: + policy: "replicalimit-policy.example.com" + paramsRef: + name: "replica-limit-clusterwide.example.com" + matchResources: + namespaceSelectors: + - key: environment + operator: NotIn + values: ["test"] +``` +And have a parameter resource like: +```yaml +apiVersion: rules.example.com/v1 +kind: ReplicaLimit +metadata: + name: "replica-limit-clusterwide.example.com" +maxReplicas: 100 +``` +Bindings can have overlapping match criteria. The policy is evaluated for each matching binding. In the above example, the "nontest" policy binding could instead have been defined as a global policy: +```yaml +apiVersion: admissionregistration.k8s.io/v1alpha1 +kind: ValidatingAdmissionPolicyBinding +metadata: + name: "replicalimit-binding-global" +spec: + policy: "replicalimit-policy.example.com" + params: "replica-limit-clusterwide.example.com" + matchResources: + namespaceSelectors: + - key: environment + operator: Exists +``` + +The `params` object representing a parameter resource will not be set if a parameter resource has not been bound, +so for policies requiring a parameter resource, +it can be useful to add a check to ensure one has been bound. + +For use cases that require parameter configuration, +we recommend adding a param check in `spec.validations[0].expression`: +``` +- expression: "params != null" + message: "params missing but required to bind to this policy" +``` + +It can be convenient to be able to have optional parameters as part of a parameter resource, and only validate them if present. +CEL provides `has()`, which checks if the key passed to it exists. CEL also implements Boolean short-circuiting: +if the first half of a logical OR evaluates to true, it won't evaluate the other half (since the result of the entire OR will be true regardless). +Combining the two, we can provide a way to validate optional parameters: +`!has(params.optionalNumber) || (params.optionalNumber >= 5 && params.optionalNumber <= 10)` +Here, we first check that the optional parameter is present with `!has(params.optionalNumber)`. +If `optionalNumber` hasn't been defined, then the expression short-circuits since `!has(params.optionalNumber)` will evaluate to true. +If `optionalNumber` has been defined, then the latter half of the CEL expression will be evaluated, and `optionalNumber` will be checked to ensure that it contains a value between 5 and 10 inclusive. + +#### Authorization Check + +An authorization check is performed for parameter resources. +Users are expected to have `read` access to the resources referenced by `paramKind` in `ValidatingAdmissionPolicy` and `paramRef` in `ValidatingAdmissionPolicyBinding`. + +Note that if a resource in `paramKind` fails to resolve via the restmapper, `read` access to all resources of groups is required. + +### Failure Policy + +`failurePolicy` defines how mis-configurations and CEL expressions that evaluate to an error in the admission policy are handled. +Allowed values are `Ignore` or `Fail`. + +- `Ignore` means that an error calling the ValidatingAdmissionPolicy is ignored and the API request is allowed to continue. +- `Fail` means that an error calling the ValidatingAdmissionPolicy causes the admission to fail and the API request to be rejected.
+ +Note that the `failurePolicy` is defined inside `ValidatingAdmissionPolicy`: +```yaml +apiVersion: admissionregistration.k8s.io/v1alpha1 +kind: ValidatingAdmissionPolicy +spec: +... +failurePolicy: Ignore # The default is "Fail" +validations: +- expression: "object.spec.xyz == params.x" +``` + +### Validation Expression + +`spec.validations[i].expression` represents the expression which will be evaluated by CEL. +To learn more, see the [CEL language specification](https://github.com/google/cel-spec). +CEL expressions have access to the contents of the Admission request/response, organized into CEL variables as well as some other useful variables: +- 'object' - The object from the incoming request. The value is null for DELETE requests. +- 'oldObject' - The existing object. The value is null for CREATE requests. +- 'request' - Attributes of the [admission request](/pkg/apis/admission/types.go#AdmissionRequest). +- 'params' - Parameter resource referred to by the policy binding being evaluated. The value is null if `ParamKind` is unset. + +The `apiVersion`, `kind`, `metadata.name` and `metadata.generateName` are always accessible from the root of the +object. No other metadata properties are accessible. + +Only property names of the form `[a-zA-Z_.-/][a-zA-Z0-9_.-/]*` are accessible. +Accessible property names are escaped according to the following rules when accessed in the expression: + +| escape sequence | property name equivalent | +| ----------------------- | -----------------------| +| `__underscores__` | `__` | +| `__dot__` | `.` | +|`__dash__` | `-` | +| `__slash__` | `/` | +| `__{keyword}__` | [CEL RESERVED keyword](https://github.com/google/cel-spec/blob/v0.6.0/doc/langdef.md#syntax) | + +{{< note >}} +A **CEL reserved** keyword only needs to be escaped if the token is an exact match +for the reserved keyword. +For example, `int` in the word “sprint” would not be escaped. +{{< /note >}} + +Examples of escaping: + +|property name | rule with escaped property name | +| ----------------| ----------------------- | +| namespace | `self.__namespace__ > 0` | +| x-prop | `self.x__dash__prop > 0` | +| redact__d | `self.redact__underscores__d > 0` | +| string | `self.startsWith('kube')` | + +Equality on arrays with list type of 'set' or 'map' ignores element order, i.e. [1, 2] == [2, 1]. +Concatenation on arrays with `x-kubernetes-list-type` uses the semantics of the list type: + - 'set': `X + Y` performs a union where the array positions of all elements in `X` are preserved and + non-intersecting elements in `Y` are appended, retaining their partial order. + - 'map': `X + Y` performs a merge where the array positions of all keys in `X` are preserved but the values + are overwritten by values in `Y` when the key sets of `X` and `Y` intersect. Elements in `Y` with + non-intersecting keys are appended, retaining their partial order.
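
To illustrate the `object` and `oldObject` variables listed above, here is a hedged sketch of a policy (the name and match constraints are illustrative) that rejects updates which lower `spec.replicas`; because `oldObject` is null for CREATE requests, the expression explicitly allows that case.

```yaml
apiVersion: admissionregistration.k8s.io/v1alpha1
kind: ValidatingAdmissionPolicy
metadata:
  name: "no-scale-down.example.com"      # illustrative name
spec:
  failurePolicy: Fail
  matchConstraints:
    resourceRules:
      - apiGroups:   ["apps"]
        apiVersions: ["v1"]
        operations:  ["CREATE", "UPDATE"]
        resources:   ["deployments"]
  validations:
    - expression: "oldObject == null || object.spec.replicas >= oldObject.spec.replicas"
      message: "spec.replicas must not be decreased"
```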
+ +#### Validation expression examples + +| Expression | Purpose | +|----------------------------------------------------------------------------------------------| ------------ | +| `object.minReplicas <= object.replicas && object.replicas <= object.maxReplicas` | Validate that the three fields defining replicas are ordered appropriately | +| `'Available' in object.stateCounts` | Validate that an entry with the 'Available' key exists in a map | +| `(size(object.list1) == 0) != (size(object.list2) == 0)` | Validate that one of two lists is non-empty, but not both | +| `!('MY_KEY' in object.map1) \|\| object['MY_KEY'].matches('^[a-zA-Z]*$')` | Validate the value of a map for a specific key, if it is in the map | +| `object.envars.filter(e, e.name == 'MY_ENV').all(e, e.value.matches('^[a-zA-Z]*$'))` | Validate the 'value' field of a listMap entry where key field 'name' is 'MY_ENV' | +| `has(object.expired) && object.created + object.ttl < object.expired` | Validate that 'expired' date is after a 'create' date plus a 'ttl' duration | +| `object.health.startsWith('ok')` | Validate a 'health' string field has the prefix 'ok' | +| `object.widgets.exists(w, w.key == 'x' && w.foo < 10)` | Validate that the 'foo' property of a listMap item with a key 'x' is less than 10 | +| `type(object) == string ? object == '100%' : object == 1000` | Validate an int-or-string field for both the int and string cases | +| `object.metadata.name.startsWith(object.prefix)` | Validate that an object's name has the prefix of another field value | +| `object.set1.all(e, !(e in object.set2))` | Validate that two listSets are disjoint | +| `size(object.names) == size(object.details) && object.names.all(n, n in object.details)` | Validate the 'details' map is keyed by the items in the 'names' listSet | +| `size(object.clusters.filter(c, c.name == object.primary)) == 1` | Validate that the 'primary' property has one and only one occurrence in the 'clusters' listMap | + +Read [Supported evaluation on CEL](https://github.com/google/cel-spec/blob/v0.6.0/doc/langdef.md#evaluation) for more information about CEL rules. + +`spec.validations[i].reason` represents a machine-readable description of why this validation failed. +If this is the first validation in the list to fail, this reason, as well as the corresponding HTTP response code, are used in the +HTTP response to the client. +The currently supported reasons are: `Unauthorized`, `Forbidden`, `Invalid`, `RequestEntityTooLarge`. +If not set, `StatusReasonInvalid` is used in the response to the client.
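
As a final sketch of the `reason` field described above (reusing the illustrative replica-limit expression from earlier in this page), a validation can combine `expression`, `message`, and a machine-readable `reason` that is returned to the client when the check fails.

```yaml
validations:
  - expression: "object.spec.replicas <= 5"
    message: "replica count must not exceed 5"    # human-readable text returned to the client
    reason: Invalid                               # one of Unauthorized, Forbidden, Invalid, RequestEntityTooLarge
```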
diff --git a/content/en/docs/reference/command-line-tools-reference/feature-gates-removed.md b/content/en/docs/reference/command-line-tools-reference/feature-gates-removed.md index a9c2b072d8c2d..26f6663e90291 100644 --- a/content/en/docs/reference/command-line-tools-reference/feature-gates-removed.md +++ b/content/en/docs/reference/command-line-tools-reference/feature-gates-removed.md @@ -86,6 +86,8 @@ In the following table: | `CronJobControllerV2` | `false` | Alpha | 1.20 | 1.20 | | `CronJobControllerV2` | `true` | Beta | 1.21 | 1.21 | | `CronJobControllerV2` | `true` | GA | 1.22 | 1.23 | +| `CSRDuration` | `true` | Beta | 1.22 | 1.23 | +| `CSRDuration` | `true` | GA | 1.24 | 1.25 | | `CustomPodDNS` | `false` | Alpha | 1.9 | 1.9 | | `CustomPodDNS` | `true` | Beta| 1.10 | 1.13 | | `CustomPodDNS` | `true` | GA | 1.14 | 1.16 | @@ -181,6 +183,9 @@ In the following table: | `PodDisruptionBudget` | `false` | Alpha | 1.3 | 1.4 | | `PodDisruptionBudget` | `true` | Beta | 1.5 | 1.20 | | `PodDisruptionBudget` | `true` | GA | 1.21 | 1.25 | +| `PodOverhead` | `false` | Alpha | 1.16 | 1.17 | +| `PodOverhead` | `true` | Beta | 1.18 | 1.23 | +| `PodOverhead` | `true` | GA | 1.24 | 1.25 | | `PodPriority` | `false` | Alpha | 1.8 | 1.10 | | `PodPriority` | `true` | Beta | 1.11 | 1.13 | | `PodPriority` | `true` | GA | 1.14 | 1.18 | @@ -408,6 +413,9 @@ In the following table: This field controls whether volumes created by a CSIDriver support volume ownership and permission modifications when these volumes are mounted. +- `CSRDuration`: Allows clients to request a duration for certificates issued + via the Kubernetes CSR API. + - `ConfigurableFSGroupPolicy`: Allows user to configure volume permission change policy for fsGroups when mounting a volume in a Pod. See [Configure volume permission and ownership change policy for Pods](/docs/tasks/configure-pod-container/security-context/#configure-volume-permission-and-ownership-change-policy-for-pods) @@ -533,6 +541,9 @@ In the following table: - `PodDisruptionBudget`: Enable the [PodDisruptionBudget](/docs/tasks/run-application/configure-pdb/) feature. +- `PodOverhead`: Enable the [PodOverhead](/docs/concepts/scheduling-eviction/pod-overhead/) + feature to account for pod overheads. + - `PodPriority`: Enable the descheduling and preemption of Pods based on their [priorities](/docs/concepts/scheduling-eviction/pod-priority-preemption/). 
diff --git a/content/en/docs/reference/command-line-tools-reference/feature-gates.md b/content/en/docs/reference/command-line-tools-reference/feature-gates.md index de91fa9c2f836..48de9a13dd29c 100644 --- a/content/en/docs/reference/command-line-tools-reference/feature-gates.md +++ b/content/en/docs/reference/command-line-tools-reference/feature-gates.md @@ -62,43 +62,35 @@ For a reference to old feature gates that are removed, please refer to | `APIPriorityAndFairness` | `true` | Beta | 1.20 | | | `APIResponseCompression` | `false` | Alpha | 1.7 | 1.15 | | `APIResponseCompression` | `true` | Beta | 1.16 | | -| `APIServerIdentity` | `false` | Alpha | 1.20 | | +| `APISelfSubjectAttributesReview` | `false` | Alpha | 1.26 | | +| `APIServerIdentity` | `false` | Alpha | 1.20 | 1.25 | +| `APIServerIdentity` | `true` | Beta | 1.26 | | | `APIServerTracing` | `false` | Alpha | 1.22 | | | `AllowInsecureBackendProxy` | `true` | Beta | 1.17 | | | `AnyVolumeDataSource` | `false` | Alpha | 1.18 | 1.23 | | `AnyVolumeDataSource` | `true` | Beta | 1.24 | | | `AppArmor` | `true` | Beta | 1.4 | | -| `CPUManager` | `false` | Alpha | 1.8 | 1.9 | -| `CPUManager` | `true` | Beta | 1.10 | | | `CPUManagerPolicyAlphaOptions` | `false` | Alpha | 1.23 | | | `CPUManagerPolicyBetaOptions` | `true` | Beta | 1.23 | | | `CPUManagerPolicyOptions` | `false` | Alpha | 1.22 | 1.22 | | `CPUManagerPolicyOptions` | `true` | Beta | 1.23 | | -| `CSIMigrationAzureFile` | `false` | Alpha | 1.15 | 1.20 | -| `CSIMigrationAzureFile` | `false` | Beta | 1.21 | 1.23 | -| `CSIMigrationAzureFile` | `true` | Beta | 1.24 | | | `CSIMigrationPortworx` | `false` | Alpha | 1.23 | 1.24 | | `CSIMigrationPortworx` | `false` | Beta | 1.25 | | | `CSIMigrationRBD` | `false` | Alpha | 1.23 | | -| `CSIMigrationvSphere` | `false` | Alpha | 1.18 | 1.18 | -| `CSIMigrationvSphere` | `false` | Beta | 1.19 | 1.24 | -| `CSIMigrationvSphere` | `true` | Beta | 1.25 | | | `CSINodeExpandSecret` | `false` | Alpha | 1.25 | | | `CSIVolumeHealth` | `false` | Alpha | 1.21 | | +| `CrossNamespaceVolumeDataSource` | `false` | Alpha| 1.26 | | | `ContainerCheckpoint` | `false` | Alpha | 1.25 | | | `ContextualLogging` | `false` | Alpha | 1.24 | | | `CustomCPUCFSQuotaPeriod` | `false` | Alpha | 1.12 | | | `CustomResourceValidationExpressions` | `false` | Alpha | 1.23 | 1.24 | | `CustomResourceValidationExpressions` | `true` | Beta | 1.25 | | -| `DelegateFSGroupToCSIDriver` | `false` | Alpha | 1.22 | 1.22 | -| `DelegateFSGroupToCSIDriver` | `true` | Beta | 1.23 | | -| `DevicePlugins` | `false` | Alpha | 1.8 | 1.9 | -| `DevicePlugins` | `true` | Beta | 1.10 | | | `DisableCloudProviders` | `false` | Alpha | 1.22 | | | `DisableKubeletCloudCredentialProviders` | `false` | Alpha | 1.23 | | | `DownwardAPIHugePages` | `false` | Alpha | 1.20 | 1.20 | | `DownwardAPIHugePages` | `false` | Beta | 1.21 | 1.21 | | `DownwardAPIHugePages` | `true` | Beta | 1.22 | | +| `DynamicResourceAllocation` | `false` | Alpha | 1.26 | | | `EndpointSliceTerminatingCondition` | `false` | Alpha | 1.20 | 1.21 | | `EndpointSliceTerminatingCondition` | `true` | Beta | 1.22 | | | `ExpandedDNSConfig` | `false` | Alpha | 1.22 | | @@ -122,22 +114,21 @@ For a reference to old feature gates that are removed, please refer to | `InTreePluginvSphereUnregister` | `false` | Alpha | 1.21 | | | `IPTablesOwnershipCleanup` | `false` | Alpha | 1.25 | | | `JobMutableNodeSchedulingDirectives` | `true` | Beta | 1.23 | | -| `JobPodFailurePolicy` | `false` | Alpha | 1.25 | - | +| `JobPodFailurePolicy` | `false` | Alpha | 1.25 | 
1.25 | +| `JobPodFailurePolicy` | `true` | Beta | 1.26 | | | `JobReadyPods` | `false` | Alpha | 1.23 | 1.23 | | `JobReadyPods` | `true` | Beta | 1.24 | | | `JobTrackingWithFinalizers` | `false` | Alpha | 1.22 | 1.22 | | `JobTrackingWithFinalizers` | `false` | Beta | 1.23 | 1.24 | | `JobTrackingWithFinalizers` | `true` | Beta | 1.25 | | | `KMSv2` | `false` | Alpha | 1.25 | | -| `KubeletCredentialProviders` | `false` | Alpha | 1.20 | 1.23 | -| `KubeletCredentialProviders` | `true` | Beta | 1.24 | | | `KubeletInUserNamespace` | `false` | Alpha | 1.22 | | | `KubeletPodResources` | `false` | Alpha | 1.13 | 1.14 | | `KubeletPodResources` | `true` | Beta | 1.15 | | | `KubeletPodResourcesGetAllocatable` | `false` | Alpha | 1.21 | 1.22 | | `KubeletPodResourcesGetAllocatable` | `true` | Beta | 1.23 | | | `KubeletTracing` | `false` | Alpha | 1.25 | | -| `LegacyServiceAccountTokenNoAutoGeneration` | `true` | Beta | 1.24 | | +| `LegacyServiceAccountTokenTracking` | `false` | Alpha | 1.26 | | | `LocalStorageCapacityIsolationFSQuotaMonitoring` | `false` | Alpha | 1.15 | 1.24 | | `LocalStorageCapacityIsolationFSQuotaMonitoring` | `true` | Beta | 1.25 | | | `LogarithmicScaleDown` | `false` | Alpha | 1.21 | 1.21 | @@ -154,28 +145,34 @@ For a reference to old feature gates that are removed, please refer to | `MultiCIDRRangeAllocator` | `false` | Alpha | 1.25 | | | `NetworkPolicyStatus` | `false` | Alpha | 1.24 | | | `NodeInclusionPolicyInPodTopologySpread` | `false` | Alpha | 1.25 | | -| `NodeOutOfServiceVolumeDetach` | `false` | Alpha | 1.24 | | +| `NodeOutOfServiceVolumeDetach` | `false` | Alpha | 1.24 | 1.25 | +| `NodeOutOfServiceVolumeDetach` | `true` | Beta | 1.26 | | | `NodeSwap` | `false` | Alpha | 1.22 | | | `OpenAPIEnums` | `false` | Alpha | 1.23 | 1.23 | | `OpenAPIEnums` | `true` | Beta | 1.24 | | | `OpenAPIV3` | `false` | Alpha | 1.23 | 1.23 | | `OpenAPIV3` | `true` | Beta | 1.24 | | +| `PDBUnhealthyPodEvictionPolicy` | `false` | Alpha | 1.26 | | | `PodAndContainerStatsFromCRI` | `false` | Alpha | 1.23 | | | `PodDeletionCost` | `false` | Alpha | 1.21 | 1.21 | | `PodDeletionCost` | `true` | Beta | 1.22 | | -| `PodDisruptionConditions` | `false` | Alpha | 1.25 | - | +| `PodDisruptionConditions` | `false` | Alpha | 1.25 | 1.25 | +| `PodDisruptionConditions` | `true` | Beta | 1.26 | | | `PodHasNetworkCondition` | `false` | Alpha | 1.25 | | +| `PodSchedulingReadiness` | `false` | Alpha | 1.26 | | | `ProbeTerminationGracePeriod` | `false` | Alpha | 1.21 | 1.21 | | `ProbeTerminationGracePeriod` | `false` | Beta | 1.22 | 1.24 | | `ProbeTerminationGracePeriod` | `true` | Beta | 1.25 | | | `ProcMountType` | `false` | Alpha | 1.12 | | -| `ProxyTerminatingEndpoints` | `false` | Alpha | 1.22 | | +| `ProxyTerminatingEndpoints` | `false` | Alpha | 1.22 | 1.25 | +| `ProxyTerminatingEndpoints` | `true` | Beta | 1.26 | | | `QOSReserved` | `false` | Alpha | 1.11 | | | `ReadWriteOncePod` | `false` | Alpha | 1.22 | | | `RecoverVolumeExpansionFailure` | `false` | Alpha | 1.23 | | | `RemainingItemCount` | `false` | Alpha | 1.15 | 1.15 | | `RemainingItemCount` | `true` | Beta | 1.16 | | -| `RetroactiveDefaultStorageClass` | `false` | Alpha | 1.25 | | +| `RetroactiveDefaultStorageClass` | `false` | Alpha | 1.25 | 1.25 | +| `RetroactiveDefaultStorageClass` | `true` | Beta | 1.26 | | | `RotateKubeletServerCertificate` | `false` | Alpha | 1.7 | 1.11 | | `RotateKubeletServerCertificate` | `true` | Beta | 1.12 | | | `SELinuxMountReadWriteOncePod` | `false` | Alpha | 1.25 | | @@ -183,13 +180,10 @@ For a reference to old feature 
gates that are removed, please refer to | `SeccompDefault` | `true` | Beta | 1.25 | | | `ServerSideFieldValidation` | `false` | Alpha | 1.23 | 1.24 | | `ServerSideFieldValidation` | `true` | Beta | 1.25 | | -| `ServiceIPStaticSubrange` | `false` | Alpha | 1.24 | 1.24 | -| `ServiceIPStaticSubrange` | `true` | Beta | 1.25 | | -| `ServiceInternalTrafficPolicy` | `false` | Alpha | 1.21 | 1.21 | -| `ServiceInternalTrafficPolicy` | `true` | Beta | 1.22 | | | `SizeMemoryBackedVolumes` | `false` | Alpha | 1.20 | 1.21 | | `SizeMemoryBackedVolumes` | `true` | Beta | 1.22 | | | `StatefulSetAutoDeletePVC` | `false` | Alpha | 1.22 | | +| `StatefulSetStartOrdinal` | `false` | Alpha | 1.26 | | | `StorageVersionAPI` | `false` | Alpha | 1.20 | | | `StorageVersionHash` | `false` | Alpha | 1.14 | 1.14 | | `StorageVersionHash` | `true` | Beta | 1.15 | | @@ -198,13 +192,16 @@ For a reference to old feature gates that are removed, please refer to | `TopologyAwareHints` | `true` | Beta | 1.24 | | | `TopologyManager` | `false` | Alpha | 1.16 | 1.17 | | `TopologyManager` | `true` | Beta | 1.18 | | +| `TopologyManagerPolicyAlphaOptions` | `false` | Alpha | 1.26 | | +| `TopologyManagerPolicyBetaOptions` | `false` | Beta | 1.26 | | +| `TopologyManagerPolicyOptions` | `false` | Alpha | 1.26 | | | `UserNamespacesStatelessPodsSupport` | `false` | Alpha | 1.25 | | +| `ValidatingAdmissionPolicy` | `false` | Alpha | 1.26 | | | `VolumeCapacityPriority` | `false` | Alpha | 1.21 | - | | `WinDSR` | `false` | Alpha | 1.14 | | | `WinOverlay` | `false` | Alpha | 1.14 | 1.19 | | `WinOverlay` | `true` | Beta | 1.20 | | -| `WindowsHostProcessContainers` | `false` | Alpha | 1.22 | 1.22 | -| `WindowsHostProcessContainers` | `true` | Beta | 1.23 | | +| `WindowsHostNetwork` | `false` | Alpha | 1.26| | {{< /table >}} ### Feature gates for graduated or deprecated features @@ -216,6 +213,9 @@ For a reference to old feature gates that are removed, please refer to | `AdvancedAuditing` | `false` | Alpha | 1.7 | 1.7 | | `AdvancedAuditing` | `true` | Beta | 1.8 | 1.11 | | `AdvancedAuditing` | `true` | GA | 1.12 | - | +| `CPUManager` | `false` | Alpha | 1.8 | 1.9 | +| `CPUManager` | `true` | Beta | 1.10 | 1.25 | +| `CPUManager` | `true` | GA | 1.26 | - | | `CSIInlineVolume` | `false` | Alpha | 1.15 | 1.15 | | `CSIInlineVolume` | `true` | Beta | 1.16 | 1.24 | | `CSIInlineVolume` | `true` | GA | 1.25 | - | @@ -230,18 +230,24 @@ For a reference to old feature gates that are removed, please refer to | `CSIMigrationAzureDisk` | `false` | Beta | 1.19 | 1.22 | | `CSIMigrationAzureDisk` | `true` | Beta | 1.23 | 1.23 | | `CSIMigrationAzureDisk` | `true` | GA | 1.24 | | +| `CSIMigrationAzureFile` | `false` | Alpha | 1.15 | 1.20 | +| `CSIMigrationAzureFile` | `false` | Beta | 1.21 | 1.23 | +| `CSIMigrationAzureFile` | `true` | Beta | 1.24 | 1.25 | +| `CSIMigrationAzureFile` | `true` | GA | 1.26 | | | `CSIMigrationGCE` | `false` | Alpha | 1.14 | 1.16 | | `CSIMigrationGCE` | `false` | Beta | 1.17 | 1.22 | | `CSIMigrationGCE` | `true` | Beta | 1.23 | 1.24 | | `CSIMigrationGCE` | `true` | GA | 1.25 | - | +| `CSIMigrationvSphere` | `false` | Alpha | 1.18 | 1.18 | +| `CSIMigrationvSphere` | `false` | Beta | 1.19 | 1.24 | +| `CSIMigrationvSphere` | `true` | Beta | 1.25 | 1.25 | +| `CSIMigrationvSphere` | `true` | GA | 1.26 | - | | `CSIMigrationOpenStack` | `false` | Alpha | 1.14 | 1.17 | | `CSIMigrationOpenStack` | `true` | Beta | 1.18 | 1.23 | | `CSIMigrationOpenStack` | `true` | GA | 1.24 | | | `CSIStorageCapacity` | `false` | Alpha | 1.19 | 1.20 | | 
`CSIStorageCapacity` | `true` | Beta | 1.21 | 1.23 | | `CSIStorageCapacity` | `true` | GA | 1.24 | - | -| `CSRDuration` | `true` | Beta | 1.22 | 1.23 | -| `CSRDuration` | `true` | GA | 1.24 | - | | `ControllerManagerLeaderMigration` | `false` | Alpha | 1.21 | 1.21 | | `ControllerManagerLeaderMigration` | `true` | Beta | 1.22 | 1.23 | | `ControllerManagerLeaderMigration` | `true` | GA | 1.24 | - | @@ -253,9 +259,15 @@ For a reference to old feature gates that are removed, please refer to | `DefaultPodTopologySpread` | `false` | Alpha | 1.19 | 1.19 | | `DefaultPodTopologySpread` | `true` | Beta | 1.20 | 1.23 | | `DefaultPodTopologySpread` | `true` | GA | 1.24 | - | +| `DelegateFSGroupToCSIDriver` | `false` | Alpha | 1.22 | 1.22 | +| `DelegateFSGroupToCSIDriver` | `true` | Beta | 1.23 | 1.25 | +| `DelegateFSGroupToCSIDriver` | `true` | GA | 1.26 |-| | `DisableAcceleratorUsageMetrics` | `false` | Alpha | 1.19 | 1.19 | | `DisableAcceleratorUsageMetrics` | `true` | Beta | 1.20 | 1.24 | | `DisableAcceleratorUsageMetrics` | `true` | GA | 1.25 |- | +| `DevicePlugins` | `false` | Alpha | 1.8 | 1.9 | +| `DevicePlugins` | `true` | Beta | 1.10 | 1.25 | +| `DevicePlugins` | `true` | GA | 1.26 | - | | `DryRun` | `false` | Alpha | 1.12 | 1.12 | | `DryRun` | `true` | Beta | 1.13 | 1.18 | | `DryRun` | `true` | GA | 1.19 | - | @@ -268,6 +280,7 @@ For a reference to old feature gates that are removed, please refer to | `EphemeralContainers` | `false` | Alpha | 1.16 | 1.22 | | `EphemeralContainers` | `true` | Beta | 1.23 | 1.24 | | `EphemeralContainers` | `true` | GA | 1.25 | - | +| `EventedPLEG` | `false` | Alpha | 1.26 | - | | `ExecProbeTimeout` | `true` | GA | 1.20 | - | | `ExpandCSIVolumes` | `false` | Alpha | 1.14 | 1.15 | | `ExpandCSIVolumes` | `true` | Beta | 1.16 | 1.23 | @@ -284,6 +297,15 @@ For a reference to old feature gates that are removed, please refer to | `IndexedJob` | `false` | Alpha | 1.21 | 1.21 | | `IndexedJob` | `true` | Beta | 1.22 | 1.23 | | `IndexedJob` | `true` | GA | 1.24 | - | +| `JobTrackingWithFinalizers` | `false` | Alpha | 1.22 | 1.22 | +| `JobTrackingWithFinalizers` | `false` | Beta | 1.23 | 1.24 | +| `JobTrackingWithFinalizers` | `true` | Beta | 1.25 | 1.25 | +| `JobTrackingWithFinalizers` | `true` | GA | 1.26 | - | +| `KubeletCredentialProviders` | `false` | Alpha | 1.20 | 1.23 | +| `KubeletCredentialProviders` | `true` | Beta | 1.24 | 1.25 | +| `KubeletCredentialProviders` | `true` | GA | 1.26 | - | +| `LegacyServiceAccountTokenNoAutoGeneration` | `true` | Beta | 1.24 | 1.25 | +| `LegacyServiceAccountTokenNoAutoGeneration` | `true` | GA | 1.26 | - | | `LocalStorageCapacityIsolation` | `false` | Alpha | 1.7 | 1.9 | | `LocalStorageCapacityIsolation` | `true` | Beta | 1.10 | 1.24 | | `LocalStorageCapacityIsolation` | `true` | GA | 1.25 | - | @@ -296,9 +318,6 @@ For a reference to old feature gates that are removed, please refer to | `PodAffinityNamespaceSelector` | `false` | Alpha | 1.21 | 1.21 | | `PodAffinityNamespaceSelector` | `true` | Beta | 1.22 | 1.23 | | `PodAffinityNamespaceSelector` | `true` | GA | 1.24 | - | -| `PodOverhead` | `false` | Alpha | 1.16 | 1.17 | -| `PodOverhead` | `true` | Beta | 1.18 | 1.23 | -| `PodOverhead` | `true` | GA | 1.24 | - | | `PodSecurity` | `false` | Alpha | 1.22 | 1.22 | | `PodSecurity` | `true` | Beta | 1.23 | 1.24 | | `PodSecurity` | `true` | GA | 1.25 | | @@ -311,6 +330,12 @@ For a reference to old feature gates that are removed, please refer to | `ServerSideApply` | `false` | Alpha | 1.14 | 1.15 | | `ServerSideApply` | `true` | Beta | 
1.16 | 1.21 | | `ServerSideApply` | `true` | GA | 1.22 | - | +| `ServiceInternalTrafficPolicy` | `false` | Alpha | 1.21 | 1.21 | +| `ServiceInternalTrafficPolicy` | `true` | Beta | 1.22 | 1.25 | +| `ServiceInternalTrafficPolicy` | `true` | GA | 1.26 | - | +| `ServiceIPStaticSubrange` | `false` | Alpha | 1.24 | 1.24 | +| `ServiceIPStaticSubrange` | `true` | Beta | 1.25 | 1.25 | +| `ServiceIPStaticSubrange` | `true` | GA | 1.26 | - | | `ServiceLBNodePortControl` | `false` | Alpha | 1.20 | 1.21 | | `ServiceLBNodePortControl` | `true` | Beta | 1.22 | 1.23 | | `ServiceLBNodePortControl` | `true` | GA | 1.24 | - | @@ -326,6 +351,9 @@ For a reference to old feature gates that are removed, please refer to | `WatchBookmark` | `false` | Alpha | 1.15 | 1.15 | | `WatchBookmark` | `true` | Beta | 1.16 | 1.16 | | `WatchBookmark` | `true` | GA | 1.17 | - | +| `WindowsHostProcessContainers` | `false` | Alpha | 1.22 | 1.22 | +| `WindowsHostProcessContainers` | `true` | Beta | 1.23 | 1.25 | +| `WindowsHostProcessContainers` | `true` | GA | 1.26 | - | {{< /table >}} ## Using a feature @@ -379,6 +407,10 @@ Each feature gate is designed for enabling/disabling a specific feature: - `APIServerIdentity`: Assign each API server an ID in a cluster. - `APIServerTracing`: Add support for distributed tracing in the API server. See [Traces for Kubernetes System Components](/docs/concepts/cluster-administration/system-traces) for more details. +- `APISelfSubjectAttributesReview`: Activate the `SelfSubjectReview` API which allows users + to see the requesting subject's authentication information. + See [API access to authentication information for a client](/docs/reference/access-authn-authz/authentication/#self-subject-review) + for more details. - `AdvancedAuditing`: Enable [advanced auditing](/docs/tasks/debug/debug-cluster/audit/#advanced-audit) - `AllowInsecureBackendProxy`: Enable the users to skip TLS verification of kubelets on Pod log requests. @@ -405,6 +437,9 @@ Each feature gate is designed for enabling/disabling a specific feature: This feature gate guards *a group* of CPUManager options whose quality level is beta. This feature gate will never graduate to stable. - `CPUManagerPolicyOptions`: Allow fine-tuning of CPUManager policies. +- `CrossNamespaceVolumeDataSource`: Enable the usage of cross namespace volume data source + to allow you to specify a source namespace in the `dataSourceRef` field of a + PersistentVolumeClaim. - `CSIInlineVolume`: Enable CSI Inline volumes support for pods. - `CSIMigration`: Enables shims and translation logic to route volume operations from in-tree plugins to corresponding pre-installed CSI plugins @@ -465,8 +500,6 @@ Each feature gate is designed for enabling/disabling a specific feature: [Storage Capacity](/docs/concepts/storage/storage-capacity/). Check the [`csi` volume type](/docs/concepts/storage/volumes/#csi) documentation for more details. - `CSIVolumeHealth`: Enable support for CSI volume health monitoring on node. -- `CSRDuration`: Allows clients to request a duration for certificates issued - via the Kubernetes CSR API. - `ContextualLogging`: When you enable this feature gate, Kubernetes components that support contextual logging add extra detail to log output. 
- `ControllerManagerLeaderMigration`: Enables leader migration for @@ -508,6 +541,14 @@ Each feature gate is designed for enabling/disabling a specific feature: - `EphemeralContainers`: Enable the ability to add {{< glossary_tooltip text="ephemeral containers" term_id="ephemeral-container" >}} to running pods. +- `EventedPLEG`: Enable support for the kubelet to receive container life cycle events from the + {{< glossary_tooltip text="container runtime" term_id="container-runtime" >}} via + an extension to {{}}. + (PLEG is an abbreviation for “Pod lifecycle event generator”). + For this feature to be useful, you also need to enable support for container lifecycle events + in each container runtime running in your cluster. If the container runtime does not announce + support for container lifecycle events then the kubelet automatically switches to the legacy + generic PLEG mechanism, even if you have this feature gate enabled. - `ExecProbeTimeout`: Ensure kubelet respects exec probe timeouts. This feature gate exists in case any of your existing workloads depend on a now-corrected fault where Kubernetes ignored exec probe timeouts. See @@ -599,6 +640,8 @@ Each feature gate is designed for enabling/disabling a specific feature: See [Traces for Kubernetes System Components](/docs/concepts/cluster-administration/system-traces) for more details. - `LegacyServiceAccountTokenNoAutoGeneration`: Stop auto-generation of Secret-based [service account tokens](/docs/reference/access-authn-authz/authentication/#service-account-tokens). +- `LegacyServiceAccountTokenTracking`: Track usage of Secret-based + [service account tokens](/docs/reference/access-authn-authz/authentication/#service-account-tokens). - `LocalStorageCapacityIsolation`: Enable the consumption of [local ephemeral storage](/docs/concepts/configuration/manage-resources-containers/) and also the `sizeLimit` property of an @@ -612,7 +655,7 @@ Each feature gate is designed for enabling/disabling a specific feature: filesystem walk for better performance and accuracy. - `LogarithmicScaleDown`: Enable semi-random selection of pods to evict on controller scaledown based on logarithmic bucketing of pod timestamps. -- `MatchLabelKeysInPodTopologySpread`: Enable the `matchLabelKeys` field for +- `MatchLabelKeysInPodTopologySpread`: Enable the `matchLabelKeys` field for [Pod topology spread constraints](/docs/concepts/scheduling-eviction/topology-spread-constraints/). - `MaxUnavailableStatefulSet`: Enables setting the `maxUnavailable` field for the [rolling update strategy](/docs/concepts/workloads/controllers/statefulset/#rolling-updates) @@ -644,18 +687,20 @@ Each feature gate is designed for enabling/disabling a specific feature: - `OpenAPIEnums`: Enables populating "enum" fields of OpenAPI schemas in the spec returned from the API server. - `OpenAPIV3`: Enables the API server to publish OpenAPI v3. +- `PDBUnhealthyPodEvictionPolicy`: Enables the `unhealthyPodEvictionPolicy` field of a `PodDisruptionBudget`. This specifies + when unhealthy pods should be considered for eviction. Please see [Unhealthy Pod Eviction Policy](/docs/tasks/run-application/configure-pdb/#unhealthy-pod-eviction-policy) + for more details. - `PodDeletionCost`: Enable the [Pod Deletion Cost](/docs/concepts/workloads/controllers/replicaset/#pod-deletion-cost) feature which allows users to influence ReplicaSet downscaling order. 
- `PodAffinityNamespaceSelector`: Enable the
  [Pod Affinity Namespace Selector](/docs/concepts/scheduling-eviction/assign-pod-node/#namespace-selector)
  and [CrossNamespacePodAffinity](/docs/concepts/policy/resource-quotas/#cross-namespace-pod-affinity-quota)
  quota scope features.
-- `PodAndContainerStatsFromCRI`: Configure the kubelet to gather container and
-  pod stats from the CRI container runtime rather than gathering them from cAdvisor.
+- `PodAndContainerStatsFromCRI`: Configure the kubelet to gather container and pod stats from the CRI container runtime rather than gathering them from cAdvisor.
+  As of 1.26, this also includes gathering metrics from CRI and emitting them over `/metrics/cadvisor` (rather than having cAdvisor emit them directly).
 - `PodDisruptionConditions`: Enables support for appending a dedicated pod condition indicating that the pod is being deleted due to a disruption.
 - `PodHasNetworkCondition`: Enable the kubelet to mark the [PodHasNetwork](/docs/concepts/workloads/pods/pod-lifecycle/#pod-has-network)
   condition on pods.
-- `PodOverhead`: Enable the [PodOverhead](/docs/concepts/scheduling-eviction/pod-overhead/)
-  feature to account for pod overheads.
+- `PodSchedulingReadiness`: Enable setting the `schedulingGates` field to control a Pod's [scheduling readiness](/docs/concepts/scheduling-eviction/pod-scheduling-readiness).
 - `PodSecurity`: Enables the `PodSecurity` admission plugin.
 - `PreferNominatedNode`: This flag tells the scheduler whether the nominated
   nodes will be checked first before looping through all the other nodes in
@@ -717,6 +762,10 @@ Each feature gate is designed for enabling/disabling a specific feature:
   memory-backed volumes (mainly `emptyDir` volumes).
 - `StatefulSetMinReadySeconds`: Allows `minReadySeconds` to be respected by
   the StatefulSet controller.
+- `StatefulSetStartOrdinal`: Allow configuration of the start ordinal in a
+  StatefulSet. See
+  [Start ordinal](/docs/concepts/workloads/controllers/statefulset/#start-ordinal)
+  for more details.
 - `StorageVersionAPI`: Enable the
  [storage version API](/docs/reference/generated/kubernetes-api/{{< param "version" >}}/#storageversion-v1alpha1-internal-apiserver-k8s-io).
 - `StorageVersionHash`: Allow API servers to expose the storage version hash in the
@@ -730,7 +779,17 @@ Each feature gate is designed for enabling/disabling a specific feature:
 - `TopologyManager`: Enable a mechanism to coordinate fine-grained hardware resource
   assignments for different components in Kubernetes. See
   [Control Topology Management Policies on a node](/docs/tasks/administer-cluster/topology-manager/).
+- `TopologyManagerPolicyAlphaOptions`: Allow fine-tuning of topology manager policies,
+  experimental, Alpha-quality options.
+  This feature gate guards *a group* of topology manager options whose quality level is alpha.
+  This feature gate will never graduate to beta or stable.
+- `TopologyManagerPolicyBetaOptions`: Allow fine-tuning of topology manager policies,
+  experimental, Beta-quality options.
+  This feature gate guards *a group* of topology manager options whose quality level is beta.
+  This feature gate will never graduate to stable.
+- `TopologyManagerPolicyOptions`: Allow fine-tuning of topology manager policies.
 - `UserNamespacesStatelessPodsSupport`: Enable user namespace support for stateless Pods.
+- `ValidatingAdmissionPolicy`: Enable [ValidatingAdmissionPolicy](/docs/reference/access-authn-authz/validating-admission-policy/) support for CEL validations to be used in Admission Control.
- `VolumeCapacityPriority`: Enable support for prioritizing nodes in different topologies based on available PV capacity. - `WatchBookmark`: Enable support for watch bookmark events. diff --git a/content/en/docs/reference/instrumentation/slis.md b/content/en/docs/reference/instrumentation/slis.md new file mode 100644 index 0000000000000..744df09336e58 --- /dev/null +++ b/content/en/docs/reference/instrumentation/slis.md @@ -0,0 +1,76 @@ +--- +reviewers: +- logicalhan +title: Kubernetes Component SLI Metrics +linkTitle: Service Level Indicator Metrics +content_type: reference +weight: 20 +--- + + + +{{< feature-state for_k8s_version="v1.26" state="alpha" >}} + +As an alpha feature, Kubernetes lets you configure Service Level Indicator (SLI) metrics +for each Kubernetes component binary. This metric endpoint is exposed on the serving +HTTPS port of each component, at the path `/metrics/slis`. You must enable the +`ComponentSLIs` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/) +for every component from which you want to scrape SLI metrics. + + + +## SLI Metrics + +With SLI metrics enabled, each Kubernetes component exposes two metrics, +labeled per healthcheck: + +- a gauge (which represents the current state of the healthcheck) +- a counter (which records the cumulative counts observed for each healthcheck state) + +You can use the metric information to calculate per-component availability statistics. +For example, the API server checks the health of etcd. You can work out and report how +available or unavailable etcd has been - as reported by its client, the API server. + + +The prometheus gauge data looks like this: + +``` +# HELP kubernetes_healthcheck [ALPHA] This metric records the result of a single healthcheck. +# TYPE kubernetes_healthcheck gauge +kubernetes_healthcheck{name="autoregister-completion",type="healthz"} 1 +kubernetes_healthcheck{name="autoregister-completion",type="readyz"} 1 +kubernetes_healthcheck{name="etcd",type="healthz"} 1 +kubernetes_healthcheck{name="etcd",type="readyz"} 1 +kubernetes_healthcheck{name="etcd-readiness",type="readyz"} 1 +kubernetes_healthcheck{name="informer-sync",type="readyz"} 1 +kubernetes_healthcheck{name="log",type="healthz"} 1 +kubernetes_healthcheck{name="log",type="readyz"} 1 +kubernetes_healthcheck{name="ping",type="healthz"} 1 +kubernetes_healthcheck{name="ping",type="readyz"} 1 +``` + +While the counter data looks like this: + +``` +# HELP kubernetes_healthchecks_total [ALPHA] This metric records the results of all healthcheck. 
+# TYPE kubernetes_healthchecks_total counter +kubernetes_healthchecks_total{name="autoregister-completion",status="error",type="readyz"} 1 +kubernetes_healthchecks_total{name="autoregister-completion",status="success",type="healthz"} 15 +kubernetes_healthchecks_total{name="autoregister-completion",status="success",type="readyz"} 14 +kubernetes_healthchecks_total{name="etcd",status="success",type="healthz"} 15 +kubernetes_healthchecks_total{name="etcd",status="success",type="readyz"} 15 +kubernetes_healthchecks_total{name="etcd-readiness",status="success",type="readyz"} 15 +kubernetes_healthchecks_total{name="informer-sync",status="error",type="readyz"} 1 +kubernetes_healthchecks_total{name="informer-sync",status="success",type="readyz"} 14 +kubernetes_healthchecks_total{name="log",status="success",type="healthz"} 15 +kubernetes_healthchecks_total{name="log",status="success",type="readyz"} 15 +kubernetes_healthchecks_total{name="ping",status="success",type="healthz"} 15 +kubernetes_healthchecks_total{name="ping",status="success",type="readyz"} 15 +``` + +## Using this data + +The component SLIs metrics endpoint is intended to be scraped at a high frequency. Scraping +at a high frequency means that you end up with greater granularity of the gauge's signal, which +can be then used to calculate SLOs. The `/metrics/slis` endpoint provides the raw data necessary +to calculate an availability SLO for the respective Kubernetes component. diff --git a/content/en/docs/reference/kubectl/_index.md b/content/en/docs/reference/kubectl/_index.md index 781cd22f11f33..6d72bd0579cdf 100644 --- a/content/en/docs/reference/kubectl/_index.md +++ b/content/en/docs/reference/kubectl/_index.md @@ -136,6 +136,7 @@ Operation | Syntax | Description `diff` | `kubectl diff -f FILENAME [flags]`| Diff file or stdin against live configuration. `drain` | `kubectl drain NODE [options]` | Drain node in preparation for maintenance. `edit` | kubectl edit (-f FILENAME | TYPE NAME | TYPE/NAME) [flags] | Edit and update the definition of one or more resources on the server by using the default editor. +`events` | `kubectl events` | List events `exec` | `kubectl exec POD [-c CONTAINER] [-i] [-t] [flags] [-- COMMAND [args...]]` | Execute a command against a container in a pod. `explain` | `kubectl explain [--recursive=false] [flags]` | Get documentation of various resources. For instance pods, nodes, services, etc. `expose` | kubectl expose (-f FILENAME | TYPE NAME | TYPE/NAME) [--port=port] [--protocol=TCP|UDP] [--target-port=number-or-name] [--name=name] [--external-ip=external-ip-of-service] [--type=type] [flags] | Expose a replication controller, service, or pod as a new Kubernetes service. diff --git a/content/en/docs/reference/kubectl/cheatsheet.md b/content/en/docs/reference/kubectl/cheatsheet.md index 72ea96b24c808..0e3e3f285d54e 100644 --- a/content/en/docs/reference/kubectl/cheatsheet.md +++ b/content/en/docs/reference/kubectl/cheatsheet.md @@ -225,6 +225,9 @@ kubectl get pods --all-namespaces -o jsonpath='{range .items[*].status.initConta # List Events sorted by timestamp kubectl get events --sort-by=.metadata.creationTimestamp +# List all warning events +kubectl events --types=Warning + # Compares the current state of the cluster against the state that the cluster would be in if the manifest was applied. 
kubectl diff -f ./my-manifest.yaml diff --git a/content/en/docs/reference/kubectl/kubectl.md b/content/en/docs/reference/kubectl/kubectl.md index 1b855168e5a72..00314826750c4 100644 --- a/content/en/docs/reference/kubectl/kubectl.md +++ b/content/en/docs/reference/kubectl/kubectl.md @@ -351,6 +351,16 @@ kubectl [flags] When set to false, turns off extra HTTP headers detailing invoked kubectl command (Kubernetes version v1.22 or later) + + + +KUBECTL_EXPLAIN_OPENAPIV3 + + +Toggles whether calls to `kubectl explain` use the new OpenAPIv3 data source available. OpenAPIV3 is enabled by default since Kubernetes 1.24. + + + @@ -376,6 +386,7 @@ kubectl [flags] * [kubectl diff](/docs/reference/generated/kubectl/kubectl-commands#diff) - Diff live version against would-be applied version * [kubectl drain](/docs/reference/generated/kubectl/kubectl-commands#drain) - Drain node in preparation for maintenance * [kubectl edit](/docs/reference/generated/kubectl/kubectl-commands#edit) - Edit a resource on the server +* [kubectl events](/docs/reference/generated/kubectl/kubectl-commands#events) - List events * [kubectl exec](/docs/reference/generated/kubectl/kubectl-commands#exec) - Execute a command in a container * [kubectl explain](/docs/reference/generated/kubectl/kubectl-commands#explain) - Documentation of resources * [kubectl expose](/docs/reference/generated/kubectl/kubectl-commands#expose) - Take a replication controller, service, deployment or pod and expose it as a new Kubernetes Service diff --git a/content/en/docs/reference/kubernetes-api/config-and-storage-resources/persistent-volume-v1.md b/content/en/docs/reference/kubernetes-api/config-and-storage-resources/persistent-volume-v1.md index ffe70099304a2..0aa9021a429f5 100644 --- a/content/en/docs/reference/kubernetes-api/config-and-storage-resources/persistent-volume-v1.md +++ b/content/en/docs/reference/kubernetes-api/config-and-storage-resources/persistent-volume-v1.md @@ -519,29 +519,6 @@ PersistentVolumeSpec is the specification of a persistent volume. readOnly here will force the ReadOnly setting in VolumeMounts. Defaults to false. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk -- **glusterfs** (GlusterfsPersistentVolumeSource) - - glusterfs represents a Glusterfs volume that is attached to a host and exposed to the pod. Provisioned by an admin. More info: https://examples.k8s.io/volumes/glusterfs/README.md - - - *Represents a Glusterfs mount that lasts the lifetime of a pod. Glusterfs volumes do not support ownership management or SELinux relabeling.* - - - **glusterfs.endpoints** (string), required - - endpoints is the endpoint name that details Glusterfs topology. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod - - - **glusterfs.path** (string), required - - path is the Glusterfs volume path. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod - - - **glusterfs.endpointsNamespace** (string) - - endpointsNamespace is the namespace that contains Glusterfs endpoint. If this field is empty, the EndpointNamespace defaults to the same namespace as the bound PVC. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod - - - **glusterfs.readOnly** (boolean) - - readOnly here will force the Glusterfs volume to be mounted with read-only permissions. Defaults to false. 
More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod - - **iscsi** (ISCSIPersistentVolumeSource) iscsi represents an ISCSI Disk resource that is attached to a kubelet's host machine and then exposed to the pod. Provisioned by an admin. diff --git a/content/en/docs/reference/kubernetes-api/config-and-storage-resources/volume.md b/content/en/docs/reference/kubernetes-api/config-and-storage-resources/volume.md index 227261b735d28..1586c8cdd4b95 100644 --- a/content/en/docs/reference/kubernetes-api/config-and-storage-resources/volume.md +++ b/content/en/docs/reference/kubernetes-api/config-and-storage-resources/volume.md @@ -541,25 +541,6 @@ Volume represents a named volume in a pod that may be accessed by any container readOnly here will force the ReadOnly setting in VolumeMounts. Defaults to false. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk -- **glusterfs** (GlusterfsVolumeSource) - - glusterfs represents a Glusterfs mount on the host that shares a pod's lifetime. More info: https://examples.k8s.io/volumes/glusterfs/README.md - - - *Represents a Glusterfs mount that lasts the lifetime of a pod. Glusterfs volumes do not support ownership management or SELinux relabeling.* - - - **glusterfs.endpoints** (string), required - - endpoints is the endpoint name that details Glusterfs topology. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod - - - **glusterfs.path** (string), required - - path is the Glusterfs volume path. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod - - - **glusterfs.readOnly** (boolean) - - readOnly here will force the Glusterfs volume to be mounted with read-only permissions. Defaults to false. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod - - **iscsi** (ISCSIVolumeSource) iscsi represents an ISCSI Disk resource that is attached to a kubelet's host machine and then exposed to the pod. More info: https://examples.k8s.io/volumes/iscsi/README.md diff --git a/content/en/docs/reference/labels-annotations-taints/_index.md b/content/en/docs/reference/labels-annotations-taints/_index.md index 684455129d8fb..75974454588c5 100644 --- a/content/en/docs/reference/labels-annotations-taints/_index.md +++ b/content/en/docs/reference/labels-annotations-taints/_index.md @@ -426,6 +426,20 @@ Used on: Secret This annotation records the {{< glossary_tooltip term_id="uid" text="unique ID" >}} of the ServiceAccount that the token (stored in the Secret of type `kubernetes.io/service-account-token`) represents. +### kubernetes.io/legacy-token-last-used + +Example: `kubernetes.io/legacy-token-last-used: 2022-10-24` + +Used on: Secret + +The control plane only adds this label for Secrets that have the type `kubernetes.io/service-account-token`. +The value of this label records the date (ISO 8601 format, UTC time zone) when the control plane last saw +a request where the client authenticated using the service account token. + +If a legacy token was last used before the cluster gained the feature (added in Kubernetes v1.26), then +the label isn't set. + + ### endpointslice.kubernetes.io/managed-by {#endpointslicekubernetesiomanaged-by} Example: `endpointslice.kubernetes.io/managed-by: "controller"` @@ -521,7 +535,7 @@ The {{< glossary_tooltip text="control plane" term_id="control-plane" >}} adds t If the number of backend endpoints falls below 1000, the control plane removes this annotation. 
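As a quick way to see whether any Endpoints objects in a cluster currently carry this marker, you can filter on the annotation key discussed above, `endpoints.kubernetes.io/over-capacity` (a sketch, assuming `jq` is installed; not part of the original page):

```shell
# Hedged sketch: list Endpoints objects that carry the over-capacity annotation.
# Requires jq; prints "namespace/name" for each match.
kubectl get endpoints --all-namespaces -o json \
  | jq -r '.items[]
      | select(.metadata.annotations["endpoints.kubernetes.io/over-capacity"] != null)
      | "\(.metadata.namespace)/\(.metadata.name)"'
```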
-### batch.kubernetes.io/job-tracking +### batch.kubernetes.io/job-tracking (deprecated) {#batch-kubernetes-io-job-tracking} Example: `batch.kubernetes.io/job-tracking: ""` @@ -529,7 +543,15 @@ Used on: Jobs The presence of this annotation on a Job indicates that the control plane is [tracking the Job status using finalizers](/docs/concepts/workloads/controllers/job/#job-tracking-with-finalizers). -You should **not** manually add or remove this annotation. +The control plane uses this annotation to safely transition to tracking Jobs +using finalizers, while the feature is in development. +You should **not** manually add or remove this annotation. + +{{< note >}} +Starting from Kubernetes 1.26, this annotation is deprecated. +Kubernetes 1.27 and newer will ignore this annotation and always track Jobs +using finalizers. +{{< /note >}} ### scheduler.alpha.kubernetes.io/defaultTolerations {#scheduleralphakubernetesio-defaulttolerations} diff --git a/content/en/docs/reference/networking/service-protocols.md b/content/en/docs/reference/networking/service-protocols.md index 578020d30cbc3..e6cf721e05381 100644 --- a/content/en/docs/reference/networking/service-protocols.md +++ b/content/en/docs/reference/networking/service-protocols.md @@ -46,11 +46,6 @@ The support of multihomed SCTP associations requires that the CNI plugin can sup NAT for multihomed SCTP associations requires special logic in the corresponding kernel modules. -{{< note >}} -The kube-proxy does not support the management of SCTP associations when it is in userspace mode. -{{< /note >}} - - ### `TCP` {#protocol-tcp} You can use TCP for any kind of Service, and it's the default network protocol. diff --git a/content/en/docs/reference/networking/virtual-ips.md b/content/en/docs/reference/networking/virtual-ips.md index bf08efb1a91b3..4022317a82952 100644 --- a/content/en/docs/reference/networking/virtual-ips.md +++ b/content/en/docs/reference/networking/virtual-ips.md @@ -61,63 +61,6 @@ Note that the kube-proxy starts up in different modes, which are determined by i - The ConfigMap parameters for the kube-proxy cannot all be validated and verified on startup. For example, if your operating system doesn't allow you to run iptables commands, the standard kernel kube-proxy implementation will not work. - Likewise, if you have an operating system which doesn't support `netsh`, - it will not run in Windows userspace mode. - -### User space proxy mode {#proxy-mode-userspace} - -{{< feature-state for_k8s_version="v1.23" state="deprecated" >}} - -This (legacy) mode uses iptables to install interception rules, and then performs -traffic forwarding with the assistance of the kube-proxy tool. -The kube-procy watches the Kubernetes control plane for the addition, modification -and removal of Service and EndpointSlice objects. For each Service, the kube-proxy -opens a port (randomly chosen) on the local node. Any connections to this _proxy port_ -are proxied to one of the Service's backend Pods (as reported via -EndpointSlices). The kube-proxy takes the `sessionAffinity` setting of the Service into -account when deciding which backend Pod to use. - -The user-space proxy installs iptables rules which capture traffic to the -Service's `clusterIP` (which is virtual) and `port`. Those rules redirect that traffic -to the proxy port which proxies the backend Pod. - -By default, kube-proxy in userspace mode chooses a backend via a round-robin algorithm. 
- -{{< figure src="/images/docs/services-userspace-overview.svg" title="Services overview diagram for userspace proxy" class="diagram-medium" >}} - - -#### Example {#packet-processing-userspace} - -As an example, consider the image processing application described [earlier](#example) -in the page. -When the backend Service is created, the Kubernetes control plane assigns a virtual -IP address, for example 10.0.0.1. Assuming the Service port is 1234, the -Service is observed by all of the kube-proxy instances in the cluster. -When a proxy sees a new Service, it opens a new random port, establishes an -iptables redirect from the virtual IP address to this new port, and starts accepting -connections on it. - -When a client connects to the Service's virtual IP address, the iptables -rule kicks in, and redirects the packets to the proxy's own port. -The "Service proxy" chooses a backend, and starts proxying traffic from the client to the backend. - -This means that Service owners can choose any port they want without risk of -collision. Clients can connect to an IP and port, without being aware -of which Pods they are actually accessing. - -#### Scaling challenges {#scaling-challenges-userspace} - -Using the userspace proxy for VIPs works at small to medium scale, but will -not scale to very large clusters with thousands of Services. The -[original design proposal for portals](https://github.com/kubernetes/kubernetes/issues/1107) -has more details on this. - -Using the userspace proxy obscures the source IP address of a packet accessing -a Service. -This makes some kinds of network filtering (firewalling) impossible. The iptables -proxy mode does not -obscure in-cluster source IPs, but it does still impact clients coming through -a load balancer or node-port. ### `iptables` proxy mode {#proxy-mode-iptables} @@ -135,7 +78,7 @@ is handled by Linux netfilter without the need to switch between userspace and t kernel space. This approach is also likely to be more reliable. If kube-proxy is running in iptables mode and the first Pod that's selected -does not respond, the connection fails. This is different from userspace +does not respond, the connection fails. This is different from the old `userspace` mode: in that scenario, kube-proxy would detect that the connection to the first Pod had failed and would automatically retry with a different backend Pod. @@ -148,7 +91,8 @@ having traffic sent via kube-proxy to a Pod that's known to have failed. #### Example {#packet-processing-iptables} -Again, consider the image processing application described [earlier](#example). +As an example, consider the image processing application described [earlier](#example) +in the page. When the backend Service is created, the Kubernetes control plane assigns a virtual IP address, for example 10.0.0.1. For this example, assume that the Service port is 1234. @@ -162,10 +106,7 @@ endpoint rules redirect traffic (using destination NAT) to the backends. When a client connects to the Service's virtual IP address the iptables rule kicks in. A backend is chosen (either based on session affinity or randomly) and packets are -redirected to the backend. Unlike the userspace proxy, packets are never -copied to userspace, the kube-proxy does not have to be running for the virtual -IP address to work, and Nodes see traffic arriving from the unaltered client IP -address. +redirected to the backend without rewriting the client IP address. 
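If you want to see how this maps onto the packet path of a real node, one quick (and implementation-dependent) check is to dump the NAT table that kube-proxy programs; the `KUBE-*` chain names below are internal implementation details rather than API, and may differ between releases:

```shell
# On a node running kube-proxy in iptables mode, show a sample of the NAT rules
# it has programmed for Services. Chain names such as KUBE-SERVICES, KUBE-SVC-*
# and KUBE-SEP-* are implementation details and can change between versions.
sudo iptables-save -t nat | grep -E 'KUBE-(SERVICES|SVC|SEP)' | head -n 20
```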
This same basic flow executes when traffic comes in through a node-port or through a load-balancer, though in those cases the client IP address does get altered. @@ -289,6 +230,16 @@ that are used for `type: ClusterIP` Services. You can set the `.spec.internalTrafficPolicy` and `.spec.externalTrafficPolicy` fields to control how Kubernetes routes traffic to healthy (“ready”) backends. +### Internal traffic policy + +{{< feature-state for_k8s_version="v1.22" state="beta" >}} + +You can set the `.spec.internalTrafficPolicy` field to control how traffic from +internal sources is routed. Valid values are `Cluster` and `Local`. Set the field to +`Cluster` to route internal traffic to all ready endpoints and `Local` to only route +to ready node-local endpoints. If the traffic policy is `Local` and there are no +node-local endpoints, traffic is dropped by kube-proxy. + ### External traffic policy You can set the `.spec.externalTrafficPolicy` field to control how traffic from @@ -298,34 +249,29 @@ route to ready node-local endpoints. If the traffic policy is `Local` and there are no node-local endpoints, the kube-proxy does not forward any traffic for the relevant Service. -{{< note >}} -{{< feature-state for_k8s_version="v1.22" state="alpha" >}} +### Traffic to terminating endpoints + +{{< feature-state for_k8s_version="v1.26" state="beta" >}} -If you enable the `ProxyTerminatingEndpoints` +If the `ProxyTerminatingEndpoints` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/) -for the kube-proxy, the kube-proxy checks if the node +is enabled in kube-proxy and the traffic policy is `Local`, that node's +kube-proxy uses a more complicated algorithm to select endpoints for a Service. +With the feature enabled, kube-proxy checks if the node has local endpoints and whether or not all the local endpoints are marked as terminating. -If there are local endpoints and **all** of those are terminating, then the kube-proxy ignores -any external traffic policy of `Local`. Instead, whilst the node-local endpoints remain as all -terminating, the kube-proxy forwards traffic for that Service to healthy endpoints elsewhere, -as if the external traffic policy were set to `Cluster`. - -This forwarding behavior for terminating endpoints exists to allow external load balancers to -gracefully drain connections that are backed by `NodePort` Services, even when the health check -node port starts to fail. Otherwise, traffic can be lost between the time a node is -still in the node pool of a load balancer and traffic is being dropped during the -termination period of a pod. -{{< /note >}} - -### Internal traffic policy - -{{< feature-state for_k8s_version="v1.22" state="beta" >}} - -You can set the `.spec.internalTrafficPolicy` field to control how traffic from -internal sources is routed. Valid values are `Cluster` and `Local`. Set the field to -`Cluster` to route internal traffic to all ready endpoints and `Local` to only route -to ready node-local endpoints. If the traffic policy is `Local` and there are no -node-local endpoints, traffic is dropped by kube-proxy. +If there are local endpoints and **all** of them are terminating, then kube-proxy +will forward traffic to those terminating endpoints. Otherwise, kube-proxy will always +prefer forwarding traffic to endpoints that are not terminating. + +This forwarding behavior for terminating endpoints exist to allow `NodePort` and `LoadBalancer` +Services to gracefully drain connections when using `externalTrafficPolicy: Local`. 
+
+As a deployment goes through a rolling update, nodes backing a load balancer may transition from
+N to 0 replicas of that deployment. In some cases, external load balancers can send traffic to
+a node with 0 replicas in between health check probes. Routing traffic to terminating endpoints
+ensures that Nodes that are scaling down Pods can gracefully receive and drain traffic to
+those terminating Pods. By the time the Pod completes termination, the external load balancer
+should have seen the node's health check failing and fully removed the node from the backend pool.

 ## {{% heading "whatsnext" %}}

diff --git a/content/en/docs/reference/node/device-plugin-api-versions.md b/content/en/docs/reference/node/device-plugin-api-versions.md
new file mode 100644
index 0000000000000..c242068e6729b
--- /dev/null
+++ b/content/en/docs/reference/node/device-plugin-api-versions.md
@@ -0,0 +1,33 @@
+---
+content_type: "reference"
+title: Kubelet Device Manager API Versions
+weight: 10
+---
+
+This page provides details of version compatibility between the Kubernetes
+[device plugin API](https://github.com/kubernetes/kubelet/tree/master/pkg/apis/deviceplugin)
+and different versions of Kubernetes itself.
+
+## Compatibility matrix
+
+|                 | `v1alpha1` | `v1beta1` |
+|-----------------|------------|-----------|
+| Kubernetes 1.21 | -          | ✓         |
+| Kubernetes 1.22 | -          | ✓         |
+| Kubernetes 1.23 | -          | ✓         |
+| Kubernetes 1.24 | -          | ✓         |
+| Kubernetes 1.25 | -          | ✓         |
+| Kubernetes 1.26 | -          | ✓         |
+
+Key:
+
+* `✓` Exactly the same features / API objects in both the device plugin API and
+  the Kubernetes version.
+* `+` The device plugin API has features or API objects that may not be present in the
+  Kubernetes cluster, either because the device plugin API has added additional new API
+  calls, or because the server has removed an old API call. However, everything they have in
+  common (most other APIs) will work. Note that alpha APIs may vanish or
+  change significantly between one minor release and the next.
+* `-` The Kubernetes cluster has features the device plugin API can't use,
+  either because the server has added additional API calls, or because the device plugin API has
+  removed an old API call. However, everything they share in common (most APIs) will work.
diff --git a/content/en/docs/tasks/access-application-cluster/configure-access-multiple-clusters.md b/content/en/docs/tasks/access-application-cluster/configure-access-multiple-clusters.md
index e3503dffb130f..03dc6b4dc031f 100644
--- a/content/en/docs/tasks/access-application-cluster/configure-access-multiple-clusters.md
+++ b/content/en/docs/tasks/access-application-cluster/configure-access-multiple-clusters.md
@@ -398,6 +398,18 @@ export KUBECONFIG="$KUBECONFIG_SAVED"
 $Env:KUBECONFIG=$ENV:KUBECONFIG_SAVED
 ```
+## Check the subject represented by the kubeconfig
+
+It is not always obvious what attributes (username, groups) you will get after authenticating to the cluster.
+It can be even more challenging if you are managing more than one cluster at the same time.
+
+There is a `kubectl` alpha subcommand to check subject attributes, such as username,
+for your selected Kubernetes client context: `kubectl alpha auth whoami`.
+
+Read [API access to authentication information for a client](/docs/reference/access-authn-authz/authentication/#self-subject-review)
+to learn about this in more detail.
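For example (a minimal sketch; this subcommand is alpha in Kubernetes v1.26 and its output format may change in later releases):

```shell
# Show the subject attributes that the currently selected context resolves to.
# Requires kubectl v1.26 or newer; the command may change while it is alpha.
kubectl alpha auth whoami
```

The output is a small table of attributes, such as the username and group memberships; the exact values depend on how you authenticate to that cluster.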
+
+
 ## {{% heading "whatsnext" %}}

 * [Organizing Cluster Access Using kubeconfig Files](/docs/concepts/configuration/organize-cluster-access-kubeconfig/)
diff --git a/content/en/docs/tasks/administer-cluster/cluster-upgrade.md b/content/en/docs/tasks/administer-cluster/cluster-upgrade.md
index e3f9595d26bf1..17473ac2895ba 100644
--- a/content/en/docs/tasks/administer-cluster/cluster-upgrade.md
+++ b/content/en/docs/tasks/administer-cluster/cluster-upgrade.md
@@ -91,3 +91,12 @@ kubectl convert -f pod.yaml --output-version v1
 The `kubectl` tool replaces the contents of `pod.yaml` with a manifest that sets `kind` to
 Pod (unchanged), but with a revised `apiVersion`.
+
+### Device Plugins
+
+If your cluster is running device plugins and the node needs to be upgraded to a Kubernetes
+release with a newer device plugin API version, device plugins must be upgraded to support
+both versions before the node is upgraded in order to guarantee that device allocations
+continue to complete successfully during the upgrade.
+
+Refer to [API compatibility](/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/#api-compatibility) and [Kubelet Device Manager API Versions](/docs/reference/node/device-plugin-api-versions/) for more details.
diff --git a/content/en/docs/tasks/administer-cluster/cpu-management-policies.md b/content/en/docs/tasks/administer-cluster/cpu-management-policies.md
index 345775ddebbb6..b077415a05aac 100644
--- a/content/en/docs/tasks/administer-cluster/cpu-management-policies.md
+++ b/content/en/docs/tasks/administer-cluster/cpu-management-policies.md
@@ -4,12 +4,14 @@ reviewers:
 - sjenning
 - ConnorDoyle
 - balajismaniam
+
 content_type: task
+min-kubernetes-server-version: v1.26
 ---

-{{< feature-state for_k8s_version="v1.12" state="beta" >}}
+{{< feature-state for_k8s_version="v1.26" state="stable" >}}

 Kubernetes keeps many aspects of how pods execute on nodes abstracted
 from the user. This is by design. However, some workloads require
@@ -26,6 +28,7 @@ directives.
 {{< include "task-tutorial-prereqs.md" >}} {{< version-check >}}
+If you are running an older version of Kubernetes, please look at the documentation for the version you are actually running.
@@ -61,10 +64,14 @@ duration as `--node-status-update-frequency`.
 The behavior of the static policy can be fine-tuned using the `--cpu-manager-policy-options` flag.
 The flag takes a comma-separated list of `key=value` policy options.
-This feature can be disabled completely using the `CPUManagerPolicyOptions` feature gate.
-
-The policy options are split into two groups: alpha quality (hidden by default) and beta quality
-(visible by default). The groups are guarded respectively by the `CPUManagerPolicyAlphaOptions`
+If you disable the `CPUManagerPolicyOptions`
+[feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
+then you cannot fine-tune CPU manager policies. In that case, the CPU manager
+operates only using its default settings.
+
+In addition to the top-level `CPUManagerPolicyOptions` feature gate, the policy options are split
+into two groups: alpha quality (hidden by default) and beta quality (visible by default).
+The groups are guarded respectively by the `CPUManagerPolicyAlphaOptions`
 and `CPUManagerPolicyBetaOptions` feature gates. Diverging from the Kubernetes standard, these
 feature gates guard groups of options, because it would have been too cumbersome to add a feature
 gate for each individual option.
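As an illustration (not part of the original page), a node could be configured roughly like this; `full-pcpus-only` is shown here only as one example of a beta-quality option, so check the options and feature gates available in your kubelet version:

```shell
# Hedged sketch: kubelet flags enabling the static CPU manager policy with one
# beta-quality policy option. Other required kubelet flags are omitted, and the
# option name shown (full-pcpus-only) is just an example.
kubelet \
  --cpu-manager-policy=static \
  --cpu-manager-policy-options=full-pcpus-only=true \
  --feature-gates=CPUManagerPolicyOptions=true,CPUManagerPolicyBetaOptions=true
```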
diff --git a/content/en/docs/tasks/administer-cluster/dns-debugging-resolution.md b/content/en/docs/tasks/administer-cluster/dns-debugging-resolution.md index ff52a11f05407..755a6cc717ce9 100644 --- a/content/en/docs/tasks/administer-cluster/dns-debugging-resolution.md +++ b/content/en/docs/tasks/administer-cluster/dns-debugging-resolution.md @@ -334,14 +334,12 @@ Kubernetes installs do not configure the nodes' `resolv.conf` files to use the cluster DNS by default, because that process is inherently distribution-specific. This should probably be implemented eventually. -Linux's libc (a.k.a. glibc) has a limit for the DNS `nameserver` records to 3 by default. What's more, for the glibc versions which are older than glibc-2.17-222 ([the new versions update see this issue](https://access.redhat.com/solutions/58028)), the allowed number of DNS `search` records has been limited to 6 ([see this bug from 2005](https://bugzilla.redhat.com/show_bug.cgi?id=168253)). Kubernetes needs to consume 1 `nameserver` record and 3 `search` records. This means that if a local installation already uses 3 `nameserver`s or uses more than 3 `search`es while your glibc version is in the affected list, some of those settings will be lost. To work around the DNS `nameserver` records limit, the node can run `dnsmasq`, which will provide more `nameserver` entries. You can also use kubelet's `--resolv-conf` flag. To fix the DNS `search` records limit, consider upgrading your linux distribution or upgrading to an unaffected version of glibc. - -{{< note >}} - -With [Expanded DNS Configuration](/docs/concepts/services-networking/dns-pod-service/#expanded-dns-configuration), -Kubernetes allows more DNS `search` records. - -{{< /note >}} +Linux's libc (a.k.a. glibc) has a limit for the DNS `nameserver` records to 3 by +default and Kubernetes needs to consume 1 `nameserver` record. This means that +if a local installation already uses 3 `nameserver`s, some of those entries will +be lost. To work around this limit, the node can run `dnsmasq`, which will +provide more `nameserver` entries. You can also use kubelet's `--resolv-conf` +flag. If you are using Alpine version 3.3 or earlier as your base image, DNS may not work properly due to a known issue with Alpine. diff --git a/content/en/docs/tasks/administer-cluster/encrypt-data.md b/content/en/docs/tasks/administer-cluster/encrypt-data.md index 03a28ccd60dd6..a740b890ac515 100644 --- a/content/en/docs/tasks/administer-cluster/encrypt-data.md +++ b/content/en/docs/tasks/administer-cluster/encrypt-data.md @@ -2,6 +2,7 @@ title: Encrypting Secret Data at Rest reviewers: - smarterclayton +- enj content_type: task min-kubernetes-server-version: 1.13 --- @@ -15,6 +16,9 @@ This page shows how to enable and configure encryption of secret data at rest. * etcd v3.0 or later is required +* To encrypt a custom resource, your cluster must be running Kubernetes v1.26 or newer. + + ## Configuration and determining whether encryption at rest is already enabled @@ -22,8 +26,7 @@ This page shows how to enable and configure encryption of secret data at rest. The `kube-apiserver` process accepts an argument `--encryption-provider-config` that controls how API data is encrypted in etcd. The configuration is provided as an API named -[`EncryptionConfiguration`](/docs/reference/config-api/apiserver-encryption.v1/). -An example configuration is provided below. +[`EncryptionConfiguration`](/docs/reference/config-api/apiserver-encryption.v1/). 
`--encryption-provider-config-automatic-reload` boolean argument determines if the file set by `--encryption-provider-config` should be automatically reloaded if the disk contents change. This enables key rotation without API server restarts. An example configuration is provided below. {{< caution >}} **IMPORTANT:** For high-availability configurations (with two or more control plane nodes), the @@ -39,6 +42,8 @@ kind: EncryptionConfiguration resources: - resources: - secrets + - configmaps + - pandas.awesome.bears.example providers: - identity: {} - aesgcm: @@ -60,9 +65,16 @@ resources: ``` Each `resources` array item is a separate config and contains a complete configuration. The -`resources.resources` field is an array of Kubernetes resource names (`resource` or `resource.group`) -that should be encrypted. The `providers` array is an ordered list of the possible encryption -providers. +`resources.resources` field is an array of Kubernetes resource names (`resource` or `resource.group` +that should be encrypted like Secrets, ConfigMaps, or other resources. + +If custom resources are added to `EncryptionConfiguration` and the cluster version is 1.26 or newer, +any newly created custom resources mentioned in the `EncryptionConfiguration` will be encrypted. +Any custom resources that existed in etcd prior to that version and configuration will be unencrypted +until they are next written to storage. This is the same behavior as built-in resources. +See the [Ensure all secrets are encrypted](#ensure-all-secrets-are-encrypted) section. + +The `providers` array is an ordered list of the possible encryption providers to use for the APIs that you listed. Only one provider type may be specified per entry (`identity` or `aescbc` may be provided, but not both in the same item). @@ -100,11 +112,11 @@ Storing the raw encryption key in the EncryptionConfig only moderately improves posture, compared to no encryption. Please use `kms` provider for additional security. {{< /caution >}} -By default, the `identity` provider is used to protect Secrets in etcd, which provides no -encryption. `EncryptionConfiguration` was introduced to encrypt Secrets locally, with a locally +By default, the `identity` provider is used to protect secret data in etcd, which provides no +encryption. `EncryptionConfiguration` was introduced to encrypt secret data locally, with a locally managed key. -Encrypting Secrets with a locally managed key protects against an etcd compromise, but it fails to +Encrypting secret data with a locally managed key protects against an etcd compromise, but it fails to protect against a host compromise. Since the encryption keys are stored on the host in the EncryptionConfiguration YAML file, a skilled attacker can access that file and extract the encryption keys. @@ -123,6 +135,8 @@ kind: EncryptionConfiguration resources: - resources: - secrets + - configmaps + - pandas.awesome.bears.example providers: - aescbc: keys: @@ -191,8 +205,9 @@ permissions on your control-plane nodes so only the user who runs the `kube-apis ## Verifying that data is encrypted Data is encrypted when written to etcd. After restarting your `kube-apiserver`, any newly created or -updated Secret should be encrypted when stored. To check this, you can use the `etcdctl` command line -program to retrieve the contents of your Secret. +updated Secret or other resource types configured in `EncryptionConfiguration` should be encrypted +when stored. 
To check this, you can use the `etcdctl` command line +program to retrieve the contents of your secret data. 1. Create a new Secret called `secret1` in the `default` namespace: @@ -307,4 +322,3 @@ kubectl get secrets --all-namespaces -o json | kubectl replace -f - ## {{% heading "whatsnext" %}} * Learn more about the [EncryptionConfiguration configuration API (v1)](/docs/reference/config-api/apiserver-encryption.v1/). - diff --git a/content/en/docs/tasks/administer-cluster/kms-provider.md b/content/en/docs/tasks/administer-cluster/kms-provider.md index a2f794d3d8b0a..5900be0c4ff34 100644 --- a/content/en/docs/tasks/administer-cluster/kms-provider.md +++ b/content/en/docs/tasks/administer-cluster/kms-provider.md @@ -1,6 +1,7 @@ --- reviewers: - smarterclayton +- enj title: Using a KMS provider for data encryption content_type: task --- @@ -146,10 +147,14 @@ Ensure that the KMS plugin runs on the same host(s) as the Kubernetes master(s). To encrypt the data: -1. Create a new `EncryptionConfiguration` file using the appropriate properties for the `kms` provider to encrypt resources like Secrets and ConfigMaps. +1. Create a new `EncryptionConfiguration` file using the appropriate properties for the `kms` provider +to encrypt resources like Secrets and ConfigMaps. If you want to encrypt an extension API that is +defined in a CustomResourceDefinition, your cluster must be running Kubernetes v1.26 or newer. 1. Set the `--encryption-provider-config` flag on the kube-apiserver to point to the location of the configuration file. +1. `--encryption-provider-config-automatic-reload` boolean argument determines if the file set by `--encryption-provider-config` should be automatically reloaded if the disk contents change. This enables key rotation without API server restarts. + 1. Restart your API server. ### KMS v1 {#encrypting-your-data-with-the-kms-provider-kms-v1} @@ -160,6 +165,8 @@ To encrypt the data: resources: - resources: - secrets + - configmaps + - pandas.awesome.bears.example providers: - kms: name: myKmsPluginFoo @@ -181,6 +188,8 @@ To encrypt the data: resources: - resources: - secrets + - configmaps + - pandas.awesome.bears.example providers: - kms: apiVersion: v2 @@ -195,6 +204,23 @@ To encrypt the data: timeout: 3s ``` +Setting `--encryption-provider-config-automatic-reload` to `true` collapses all health checks to a single health check endpoint. Individual health checks are only available when KMS v1 providers are in use and the encryption config is not auto-reloaded. + +Following table summarizes the health check endpoints for each KMS version: + +| KMS configurations | Without Automatic Reload | With Automatic Reload | +| ------------------------- |------------------------------------| -----------------------| +| KMS v1 only | Individual Healthchecks | Single Healthcheck | +| KMS v2 only | Single Healthcheck | Single Healthcheck | +| Both KMS v1 and v2 | Individual Healthchecks | Single Healthcheck | +| No KMS | None | Single Healthcheck | + +`Single Healthcheck` means that the only health check endpoint is `/healthz/kms-providers`. + +`Individual Healthchecks` means that each KMS plugin has an associated health check endpoint based on its location in the encryption config: `/healthz/kms-provider-0`, `/healthz/kms-provider-1` etc. + +These healthcheck endpoint paths are hard coded and generated/controlled by the server. The indices for individual healthchecks corresponds to the order in which the KMS encryption config is processed. 
+ Until the steps defined in [Ensuring all secrets are encrypted](#ensuring-all-secrets-are-encrypted) are performed, the `providers` list should end with the `identity: {}` provider to allow unencrypted data to be read. Once all resources are encrypted, the `identity` provider should be removed to prevent the API server from honoring unencrypted data. For details about the `EncryptionConfiguration` format, please check the @@ -203,8 +229,8 @@ For details about the `EncryptionConfiguration` format, please check the ## Verifying that the data is encrypted Data is encrypted when written to etcd. After restarting your `kube-apiserver`, -any newly created or updated secret should be encrypted when stored. To verify, -you can use the `etcdctl` command line program to retrieve the contents of your secret. +any newly created or updated Secret or other resource types configured in `EncryptionConfiguration` should be encrypted when stored. To verify, +you can use the `etcdctl` command line program to retrieve the contents of your secret data. 1. Create a new secret called `secret1` in the `default` namespace: diff --git a/content/en/docs/tasks/kubelet-credential-provider/kubelet-credential-provider.md b/content/en/docs/tasks/administer-cluster/kubelet-credential-provider.md similarity index 89% rename from content/en/docs/tasks/kubelet-credential-provider/kubelet-credential-provider.md rename to content/en/docs/tasks/administer-cluster/kubelet-credential-provider.md index 16547f0bf4507..3da341dbccc0a 100644 --- a/content/en/docs/tasks/kubelet-credential-provider/kubelet-credential-provider.md +++ b/content/en/docs/tasks/administer-cluster/kubelet-credential-provider.md @@ -5,9 +5,10 @@ reviewers: - cheftako description: Configure the kubelet's image credential provider plugin content_type: task +min-kubernetes-server-version: v1.26 --- -{{< feature-state for_k8s_version="v1.24" state="beta" >}} +{{< feature-state for_k8s_version="v1.26" state="stable" >}} @@ -27,10 +28,13 @@ This guide demonstrates how to configure the kubelet's image credential provider ## {{% heading "prerequisites" %}} -* The kubelet image credential provider is introduced in v1.20 as an alpha feature. As with other alpha features, - a feature gate `KubeletCredentialProviders` must be enabled on only the kubelet for the feature to work. +* You need a Kubernetes cluster with nodes that support kubelet credential + provider plugins. This support is available in Kubernetes {{< skew currentVersion >}}; + Kubernetes v1.24 and v1.25 included this as a beta feature, enabled by default. * A working implementation of a credential provider exec plugin. You can build your own plugin or use one provided by cloud providers. +{{< version-check >}} + ## Installing Plugins on Nodes @@ -52,9 +56,9 @@ should be invoked for which container images. Here's an example configuration fi [ECR](https://aws.amazon.com/ecr/)-based plugin: ```yaml -apiVersion: kubelet.config.k8s.io/v1alpha1 +apiVersion: kubelet.config.k8s.io/v1 kind: CredentialProviderConfig -# providers is a list of credential provider plugins that will be enabled by the kubelet. +# providers is a list of credential provider helper plugins that will be enabled by the kubelet. # Multiple providers may match against a single image, in which case credentials # from all providers will be returned to the kubelet. If multiple providers are called # for a single image, the results are combined. 
If providers return overlapping @@ -74,7 +78,7 @@ providers: # Globs can be used in the domain, but not in the port or the path. Globs are supported # as subdomains like '*.k8s.io' or 'k8s.*.io', and top-level-domains such as 'k8s.*'. # Matching partial subdomains like 'app*.k8s.io' is also supported. Each glob can only match - # a single subdomain segment, so *.io does not match *.k8s.io. + # a single subdomain segment, so `*.io` does **not** match `*.k8s.io`. # # A match exists between an image and a matchImage when all of the below are true: # - Both contain the same number of domain parts and each part matches. @@ -98,8 +102,8 @@ providers: defaultCacheDuration: "12h" # Required input version of the exec CredentialProviderRequest. The returned CredentialProviderResponse # MUST use the same encoding version as the input. Current supported values are: - # - credentialprovider.kubelet.k8s.io/v1alpha1 - apiVersion: credentialprovider.kubelet.k8s.io/v1alpha1 + # - credentialprovider.kubelet.k8s.io/v1 + apiVersion: credentialprovider.kubelet.k8s.io/v1 # Arguments to pass to the command when executing it. # +optional args: @@ -151,6 +155,6 @@ Some example values of `matchImages` patterns are: ## {{% heading "whatsnext" %}} * Read the details about `CredentialProviderConfig` in the - [kubelet configuration API (v1alpha1) reference](/docs/reference/config-api/kubelet-config.v1alpha1/). -* Read the [kubelet credential provider API reference (v1alpha1)](/docs/reference/config-api/kubelet-credentialprovider.v1alpha1/). + [kubelet configuration API (v1) reference](/docs/reference/config-api/kubelet-config.v1/). +* Read the [kubelet credential provider API reference (v1)](/docs/reference/config-api/kubelet-credentialprovider.v1/). diff --git a/content/en/docs/tasks/administer-cluster/memory-manager.md b/content/en/docs/tasks/administer-cluster/memory-manager.md index d73c5c5cccabf..55d61c3313c74 100644 --- a/content/en/docs/tasks/administer-cluster/memory-manager.md +++ b/content/en/docs/tasks/administer-cluster/memory-manager.md @@ -459,8 +459,9 @@ by using `--reserved-memory` flag. ### Device plugin resource API -By employing the [API](/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/), -the information about reserved memory for each container can be retrieved, which is contained +The kubelet provides a `PodResourceLister` gRPC service to enable discovery of resources and associated metadata. +By using its [List gRPC endpoint](/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/#grpc-endpoint-list), +information about reserved memory for each container can be retrieved, which is contained in protobuf `ContainerMemory` message. This information can be retrieved solely for pods in Guaranteed QoS class. diff --git a/content/en/docs/tasks/administer-cluster/safely-drain-node.md b/content/en/docs/tasks/administer-cluster/safely-drain-node.md index 6dc5cf93c0f60..f2ffbacac4392 100644 --- a/content/en/docs/tasks/administer-cluster/safely-drain-node.md +++ b/content/en/docs/tasks/administer-cluster/safely-drain-node.md @@ -88,10 +88,11 @@ respect the PodDisruptionBudget you specify. 
For example, if you have a StatefulSet with three replicas and have set a PodDisruptionBudget for that set specifying `minAvailable: 2`, `kubectl drain` only evicts a pod from the StatefulSet if all three -replicas pods are ready; if then you issue multiple drain commands in -parallel, Kubernetes respects the PodDisruptionBudget and ensure -that only 1 (calculated as `replicas - minAvailable`) Pod is unavailable -at any given time. Any drains that would cause the number of ready +replicas pods are [healthy](/docs/tasks/run-application/configure-pdb/#healthiness-of-a-pod); +if then you issue multiple drain commands in parallel, +Kubernetes respects the PodDisruptionBudget and ensures that +only 1 (calculated as `replicas - minAvailable`) Pod is unavailable +at any given time. Any drains that would cause the number of [healthy](/docs/tasks/run-application/configure-pdb/#healthiness-of-a-pod) replicas to fall below the specified budget are blocked. ## The Eviction API {#eviction-api} diff --git a/content/en/docs/tasks/administer-cluster/securing-a-cluster.md b/content/en/docs/tasks/administer-cluster/securing-a-cluster.md index c5e253a7accd5..d864bb1d32e6f 100644 --- a/content/en/docs/tasks/administer-cluster/securing-a-cluster.md +++ b/content/en/docs/tasks/administer-cluster/securing-a-cluster.md @@ -2,8 +2,7 @@ reviewers: - smarterclayton - liggitt -- ericchiang -- destijl +- enj title: Securing a Cluster content_type: task --- @@ -255,11 +254,14 @@ and may grant an attacker significant visibility into the state of your cluster. your backups using a well reviewed backup and encryption solution, and consider using full disk encryption where possible. -Kubernetes supports [encryption at rest](/docs/tasks/administer-cluster/encrypt-data/), a feature -introduced in 1.7, v1 beta since 1.13, and v2 alpha since 1.25. This will encrypt resources like `Secret` and `ConfigMap` in etcd, preventing -parties that gain access to your etcd backups from viewing the content of those secrets. While -this feature is currently beta, it offers an additional level of defense when backups -are not encrypted or an attacker gains read access to etcd. +Kubernetes supports optional [encryption at rest](/docs/tasks/administer-cluster/encrypt-data/) for information in the Kubernetes API. +This lets you ensure that when Kubernetes stores data for objects (for example, `Secret` or +`ConfigMap` objects), the API server writes an encrypted representation of the object. +That encryption means that even someone who has access to etcd backup data is unable +to view the content of those objects. +In Kubernetes {{< skew currentVersion >}} you can also encrypt custom resources; +encryption-at-rest for extension APIs defined in CustomResourceDefinitions was added to +Kubernetes as part of the v1.26 release. ### Receiving alerts for security updates and reporting vulnerabilities diff --git a/content/en/docs/tasks/administer-cluster/topology-manager.md b/content/en/docs/tasks/administer-cluster/topology-manager.md index a238f132a1b4e..b02b2531b600f 100644 --- a/content/en/docs/tasks/administer-cluster/topology-manager.md +++ b/content/en/docs/tasks/administer-cluster/topology-manager.md @@ -213,6 +213,28 @@ reschedule the pod. It is recommended to use a Deployment with replicas to trigg the Pod.An external control loop could be also implemented to trigger a redeployment of pods that have the `Topology Affinity` error. 
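Such a control loop needs a way to find the affected Pods. One possible sketch is shown below; it assumes `jq` is installed and that the kubelet reports the admission failure reason as a string containing `TopologyAffinity`, which is an implementation detail that may differ in your version:

```shell
# Hedged sketch: list Pods whose status.reason suggests a Topology Affinity
# admission failure. The exact reason string is a kubelet implementation detail.
kubectl get pods --all-namespaces -o json \
  | jq -r '.items[]
      | select((.status.reason // "") | test("TopologyAffinity"))
      | "\(.metadata.namespace)/\(.metadata.name)\t\(.status.reason)"'
```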
+### Topology manager policy options
+
+Support for the Topology Manager policy options requires the `TopologyManagerPolicyOptions`
+[feature gate](/docs/reference/command-line-tools-reference/feature-gates/) to be enabled.
+
+You can toggle groups of options on and off based upon their maturity level using the following feature gates:
+* `TopologyManagerPolicyBetaOptions` default disabled. Enable to show beta-level options. Currently there are no beta-level options.
+* `TopologyManagerPolicyAlphaOptions` default disabled. Enable to show alpha-level options. You will still have to enable each option using the `TopologyManagerPolicyOptions` kubelet option.
+
+The following policy options exist:
+* `prefer-closest-numa-nodes` (alpha, invisible by default; the `TopologyManagerPolicyOptions` and `TopologyManagerPolicyAlphaOptions` feature gates have to be enabled) (Kubernetes 1.26 or higher)
+
+If the `prefer-closest-numa-nodes` policy option is specified, the `best-effort` and `restricted`
+policies will favor sets of NUMA nodes with shorter distance between them when making admission decisions.
+You can enable this option by adding `prefer-closest-numa-nodes=true` to the Topology Manager policy options.
+By default, without this option, Topology Manager aligns resources on either a single NUMA node or
+the minimum number of NUMA nodes (in cases where more than one NUMA node is required). However,
+the `TopologyManager` is not aware of NUMA distances and does not take them into account when making admission decisions.
+This limitation surfaces in multi-socket, as well as single-socket multi-NUMA systems,
+and can cause significant performance degradation in latency-critical execution and high-throughput applications if the
+Topology Manager decides to align resources on non-adjacent NUMA nodes.
+
 ### Pod Interactions with Topology Manager Policies

 Consider the containers in the following pod specs:
@@ -330,4 +352,3 @@ assignments.

 2. The scheduler is not topology-aware, so it is possible to be scheduled on a node and then
    fail on the node due to the Topology Manager.
-
diff --git a/content/en/docs/tasks/administer-cluster/verify-signed-images.md b/content/en/docs/tasks/administer-cluster/verify-signed-artifacts.md
similarity index 66%
rename from content/en/docs/tasks/administer-cluster/verify-signed-images.md
rename to content/en/docs/tasks/administer-cluster/verify-signed-artifacts.md
index 0a3ebbb651c17..19f99ab4c8dcf 100644
--- a/content/en/docs/tasks/administer-cluster/verify-signed-images.md
+++ b/content/en/docs/tasks/administer-cluster/verify-signed-artifacts.md
@@ -1,12 +1,12 @@
 ---
-title: Verify Signed Container Images
+title: Verify Signed Kubernetes Artifacts
 content_type: task
-min-kubernetes-server-version: v1.24
+min-kubernetes-server-version: v1.26
 ---

-{{< feature-state state="alpha" for_k8s_version="v1.24" >}}
+{{< feature-state state="beta" for_k8s_version="v1.26" >}}

 ## {{% heading "prerequisites" %}}

 You will need to have the following tools installed:

 - `cosign` ([install guide](https://docs.sigstore.dev/cosign/installation/))
 - `curl` (often provided by your operating system)

+## Verifying binary signatures
+
+The Kubernetes release process signs all binary artifacts (tarballs, SPDX files,
+standalone binaries) by using cosign's keyless signing.
To verify a particular +binary, retrieve it together with its signature and certificate: + +```bash +URL=https://dl.k8s.io/release/v{{< skew currentVersion >}}.0/bin/linux/amd64 +BINARY=kubectl + +FILES=( + "$BINARY" + "$BINARY.sig" + "$BINARY.cert" +) + +for FILE in "${FILES[@]}"; do + curl -sSfL --retry 3 --retry-delay 3 "$URL/$FILE" -o "$FILE" +done +``` + +Then verify the blob by using `cosign`: + +```shell +cosign verify-blob "$BINARY" --signature "$BINARY".sig --certificate "$BINARY".cert +``` + +{{< note >}} +To learn more about keyless signing, please refer to [Keyless +Signatures](https://github.com/sigstore/cosign/blob/main/KEYLESS.md#keyless-signatures). +{{< /note >}} + ## Verifying image signatures For a complete list of images that are signed please refer @@ -28,7 +60,7 @@ Let's pick one image from this list and verify its signature using the `cosign verify` command: ```shell -COSIGN_EXPERIMENTAL=1 cosign verify registry.k8s.io/kube-apiserver-amd64:v1.24.0 +COSIGN_EXPERIMENTAL=1 cosign verify registry.k8s.io/kube-apiserver-amd64:v{{< skew currentVersion >}}.0 ``` {{< note >}} @@ -68,5 +100,5 @@ e.g. [conformance image](https://github.com/kubernetes/kubernetes/blob/master/te admission controller. To get started with `policy-controller` here are a few helpful resources: -* [Installation](https://github.com/sigstore/helm-charts/tree/main/charts/policy-controller) -* [Configuration Options](https://github.com/sigstore/policy-controller/tree/main/config) +- [Installation](https://github.com/sigstore/helm-charts/tree/main/charts/policy-controller) +- [Configuration Options](https://github.com/sigstore/policy-controller/tree/main/config) diff --git a/content/en/docs/tasks/configure-pod-container/create-hostprocess-pod.md b/content/en/docs/tasks/configure-pod-container/create-hostprocess-pod.md index ed5506c45b689..cad26cf29a374 100644 --- a/content/en/docs/tasks/configure-pod-container/create-hostprocess-pod.md +++ b/content/en/docs/tasks/configure-pod-container/create-hostprocess-pod.md @@ -7,7 +7,7 @@ min-kubernetes-server-version: 1.23 -{{< feature-state for_k8s_version="v1.23" state="beta" >}} +{{< feature-state for_k8s_version="v1.26" state="stable" >}} Windows HostProcess containers enable you to run containerized workloads on a Windows host. These containers operate as @@ -42,7 +42,6 @@ HostProcess containers have access to the host's network interfaces and IP addre - Consolidation of administrative tasks and security policies. This reduces the degree of privileges needed by Windows nodes. - ## {{% heading "prerequisites" %}} @@ -56,24 +55,13 @@ communicate with containerd directly by passing the hostprocess flag via CRI. Yo latest version of containerd (v1.6+) to run HostProcess containers. [How to install containerd.](/docs/setup/production-environment/container-runtimes/#containerd) -To *disable* HostProcess containers you need to pass the following feature gate flag to the -**kubelet** and **kube-apiserver**: - -```powershell ---feature-gates=WindowsHostProcessContainers=false -``` - -See [Features Gates](/docs/reference/command-line-tools-reference/feature-gates/#overview) -documentation for more details. - - - ## Limitations These limitations are relevant for Kubernetes v{{< skew currentVersion >}}: - HostProcess containers require containerd 1.6 or higher - {{< glossary_tooltip text="container runtime" term_id="container-runtime" >}}. + {{< glossary_tooltip text="container runtime" term_id="container-runtime" >}} and + containerd 1.7 is recommended. 
- HostProcess pods can only contain HostProcess containers. This is a current limitation of the Windows OS; non-privileged Windows containers cannot share a vNIC with the host IP namespace. - HostProcess containers run as a process on the host and do not have any degree of @@ -121,9 +109,7 @@ the configurations which need to be set to enable the creation of a HostProcess hostNetwork -

Will be in host network by default initially. Support - to set network to a different compartment may be desirable in - the future.

+

Pods containing HostProcess containers must use the host's network namespace.

Allowed Values

  • true
  • @@ -131,7 +117,7 @@ the configurations which need to be set to enable the creation of a HostProcess - securityContext.windowsOptions.runAsUsername + securityContext.windowsOptions.runAsUserName

    Specification of which user the HostProcess container should run as is required for the pod spec.

    Allowed Values

    @@ -139,6 +125,7 @@ the configurations which need to be set to enable the creation of a HostProcess
  • NT AUTHORITY\SYSTEM
  • NT AUTHORITY\Local service
  • NT AUTHORITY\NetworkService
  • +
  • Local usergroup names (see below)
@@ -179,18 +166,33 @@ spec: ## Volume mounts HostProcess containers support the ability to mount volumes within the container volume space. +Volume mount behavior differs depending on the version of containerd runtime used on the node. + +### Containerd v1.6 + Applications running inside the container can access volume mounts directly via relative or absolute paths. An environment variable `$CONTAINER_SANDBOX_MOUNT_POINT` is set upon container creation and provides the absolute host path to the container volume. Relative paths are based upon the `.spec.containers.volumeMounts.mountPath` configuration. -### Example {#volume-mount-example} +To access service account tokens (for example), the following path structures are supported within the container: + +- `.\var\run\secrets\kubernetes.io\serviceaccount\` +- `$CONTAINER_SANDBOX_MOUNT_POINT\var\run\secrets\kubernetes.io\serviceaccount\` + +### Containerd v1.7 (and greater) -To access service account tokens the following path structures are supported within the container: +Applications running inside the container can access volume mounts directly via the volumeMount's +specified `mountPath` (just like Linux and non-HostProcess Windows containers). -`.\var\run\secrets\kubernetes.io\serviceaccount\` +For backwards compatibility, volumes can also be accessed using the same relative paths configured +by containerd v1.6. -`$CONTAINER_SANDBOX_MOUNT_POINT\var\run\secrets\kubernetes.io\serviceaccount\` +As an example, to access service account tokens within the container you would use one of the following paths: + +- `c:\var\run\secrets\kubernetes.io\serviceaccount` +- `/var/run/secrets/kubernetes.io/serviceaccount/` +- `$CONTAINER_SANDBOX_MOUNT_POINT\var\run\secrets\kubernetes.io\serviceaccount\` ## Resource limits @@ -203,7 +205,9 @@ used for resource tracking due to the difference in how HostProcess containers a ## Choosing a user account -HostProcess containers support the ability to run as one of three supported Windows service accounts: +### System accounts + +By default, HostProcess containers support the ability to run as one of three supported Windows service accounts: - **[LocalSystem](https://docs.microsoft.com/windows/win32/services/localsystem-account)** - **[LocalService](https://docs.microsoft.com/windows/win32/services/localservice-account)** @@ -215,6 +219,51 @@ malicious) damage to the host. The LocalSystem service account has the highest level of privilege of the three and should be used only if absolutely necessary. Where possible, use the LocalService service account as it is the least privileged of the three options. +### Local accounts {#local-accounts} + +If configured, HostProcess containers can also run as local user accounts, which allows node operators to give +fine-grained access to workloads. + +To run HostProcess containers as a local user, a local usergroup must first be created on the node +and the name of that local usergroup must be specified in the `runAsUserName` field in the deployment. +Prior to initializing the HostProcess container, a new **ephemeral** local user account is created and joined to the specified usergroup, from which the container is run. +This provides a number of benefits, including eliminating the need to manage +passwords for local user accounts. An initial HostProcess container running as a service account can be used to +prepare the user groups for later HostProcess containers.
+ +{{< note >}} +Running HostProcess containers as local user accounts requires containerd v1.7+ +{{< /note >}} + +Example: + +1. Create a local user group on the node (this can be done in another HostProcess container). + + ```cmd + net localgroup hpc-localgroup /add + ``` + +1. Grant access to desired resources on the node to the local usergroup. + This can be done with tools like [icacls](https://learn.microsoft.com/en-us/windows-server/administration/windows-commands/icacls). + +1. Set `runAsUserName` to the name of the local usergroup for the pod or individual containers. + + ```yaml + securityContext: + windowsOptions: + hostProcess: true + runAsUserName: hpc-localgroup + ``` + +1. Schedule the pod! + +## Base Image for HostProcess Containers + +HostProcess containers can be built from any of the existing [Windows Container base images](https://learn.microsoft.com/virtualization/windowscontainers/manage-containers/container-base-images). + +Additionally, a new base image has been created just for HostProcess containers! +For more information, please check out the [windows-host-process-containers-base-image GitHub project](https://github.com/microsoft/windows-host-process-containers-base-image#overview). + +## Troubleshooting HostProcess containers - HostProcess containers fail to start with `failed to create user process token: failed to logon user: Access is denied.: unknown` diff --git a/content/en/docs/tasks/configure-pod-container/security-context.md b/content/en/docs/tasks/configure-pod-container/security-context.md index b91764898897a..d5399ca9f58d2 100644 --- a/content/en/docs/tasks/configure-pod-container/security-context.md +++ b/content/en/docs/tasks/configure-pod-container/security-context.md @@ -197,24 +197,18 @@ and [`emptydir`](/docs/concepts/storage/volumes/#emptydir). ## Delegating volume permission and ownership change to CSI driver -{{< feature-state for_k8s_version="v1.23" state="beta" >}} +{{< feature-state for_k8s_version="v1.26" state="stable" >}} If you deploy a [Container Storage Interface (CSI)](https://github.com/container-storage-interface/spec/blob/master/spec.md) driver which supports the `VOLUME_MOUNT_GROUP` `NodeServiceCapability`, the process of setting file ownership and permissions based on the `fsGroup` specified in the `securityContext` will be performed by the CSI driver -instead of Kubernetes, provided that the `DelegateFSGroupToCSIDriver` Kubernetes -feature gate is enabled. In this case, since Kubernetes doesn't perform any +instead of Kubernetes. In this case, since Kubernetes doesn't perform any ownership and permission change, `fsGroupChangePolicy` does not take effect, and as specified by CSI, the driver is expected to mount the volume with the provided `fsGroup`, resulting in a volume that is readable/writable by the `fsGroup`. -Please refer to the [KEP](https://github.com/gnufied/enhancements/blob/master/keps/sig-storage/2317-fsgroup-on-mount/README.md) -and the description of the `VolumeCapability.MountVolume.volume_mount_group` -field in the [CSI spec](https://github.com/container-storage-interface/spec/blob/master/spec.md#createvolume) -for more information.
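To make the delegation described above concrete, here is a minimal sketch of a Pod that sets `fsGroup` on a CSI-backed volume. The names (`fsgroup-demo`, `my-csi-pvc`) are illustrative assumptions, and whether the ownership change is applied by Kubernetes or delegated to the driver depends on the driver advertising the `VOLUME_MOUNT_GROUP` capability:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: fsgroup-demo            # illustrative name
spec:
  securityContext:
    fsGroup: 2000               # delegated to the CSI driver if it supports VOLUME_MOUNT_GROUP
  containers:
  - name: app
    image: registry.k8s.io/pause:3.6
    volumeMounts:
    - name: data
      mountPath: /data
  volumes:
  - name: data
    persistentVolumeClaim:
      claimName: my-csi-pvc     # hypothetical claim provisioned by a CSI driver
```

When the driver performs the change, `fsGroupChangePolicy` has no effect; the driver is expected to mount the volume so that it is readable and writable by GID 2000.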
- ## Set the security context for a Container To specify security settings for a Container, include the `securityContext` field diff --git a/content/en/docs/tasks/debug/debug-application/debug-service.md b/content/en/docs/tasks/debug/debug-application/debug-service.md index 788254ea71359..9ddb4709ac2ad 100644 --- a/content/en/docs/tasks/debug/debug-application/debug-service.md +++ b/content/en/docs/tasks/debug/debug-application/debug-service.md @@ -527,7 +527,6 @@ should see something like: ```none I1027 22:14:53.995134 5063 server.go:200] Running in resource-only container "/kube-proxy" I1027 22:14:53.998163 5063 server.go:247] Using iptables Proxier. -I1027 22:14:53.999055 5063 server.go:255] Tearing down userspace rules. Errors here are acceptable. I1027 22:14:54.038140 5063 proxier.go:352] Setting endpoints for "kube-system/kube-dns:dns-tcp" to [10.244.1.3:53] I1027 22:14:54.038164 5063 proxier.go:352] Setting endpoints for "kube-system/kube-dns:dns" to [10.244.1.3:53] I1027 22:14:54.038209 5063 proxier.go:352] Setting endpoints for "default/kubernetes:https" to [10.240.0.2:443] @@ -549,8 +548,7 @@ and then retry. Kube-proxy can run in one of a few modes. In the log listed above, the line `Using iptables Proxier` indicates that kube-proxy is running in -"iptables" mode. The most common other mode is "ipvs". The older "userspace" -mode has largely been replaced by these. +"iptables" mode. The most common other mode is "ipvs". #### Iptables mode @@ -602,24 +600,6 @@ endpoint, it will create corresponding real servers. In this example, service hostnames(`10.0.1.175:80`) has 3 endpoints(`10.244.0.5:9376`, `10.244.0.6:9376`, `10.244.0.7:9376`). -#### Userspace mode - -In rare cases, you may be using "userspace" mode. From your Node: - -```shell -iptables-save | grep hostnames -``` -```none --A KUBE-PORTALS-CONTAINER -d 10.0.1.175/32 -p tcp -m comment --comment "default/hostnames:default" -m tcp --dport 80 -j REDIRECT --to-ports 48577 --A KUBE-PORTALS-HOST -d 10.0.1.175/32 -p tcp -m comment --comment "default/hostnames:default" -m tcp --dport 80 -j DNAT --to-destination 10.240.115.247:48577 -``` - -There should be 2 rules for each port of your Service (only one in this -example) - a "KUBE-PORTALS-CONTAINER" and a "KUBE-PORTALS-HOST". - -Almost nobody should be using the "userspace" mode any more, so you won't spend -more time on it here. - ### Is kube-proxy proxying? Assuming you do see one the above cases, try again to access your Service by @@ -632,20 +612,6 @@ curl 10.0.1.175:80 hostnames-632524106-bbpiw ``` -If this fails and you are using the userspace proxy, you can try accessing the -proxy directly. If you are using the iptables proxy, skip this section. - -Look back at the `iptables-save` output above, and extract the -port number that `kube-proxy` is using for your Service. In the above -examples it is "48577". 
Now connect to that: - -```shell -curl localhost:48577 -``` -```none -hostnames-632524106-tlaok -``` - If this still fails, look at the `kube-proxy` logs for specific lines like: ```none diff --git a/content/en/docs/tasks/job/pod-failure-policy.md b/content/en/docs/tasks/job/pod-failure-policy.md index 954880c894a08..3cafd35ae98ea 100644 --- a/content/en/docs/tasks/job/pod-failure-policy.md +++ b/content/en/docs/tasks/job/pod-failure-policy.md @@ -5,7 +5,7 @@ min-kubernetes-server-version: v1.25 weight: 60 --- -{{< feature-state for_k8s_version="v1.25" state="alpha" >}} +{{< feature-state for_k8s_version="v1.26" state="beta" >}} @@ -28,14 +28,6 @@ You should already be familiar with the basic use of [Job](/docs/concepts/worklo {{< include "task-tutorial-prereqs.md" >}} {{< version-check >}} - - -{{< note >}} -As the features are in Alpha, prepare the Kubernetes cluster with the two -[feature gates](/docs/reference/command-line-tools-reference/feature-gates/) -enabled: `JobPodFailurePolicy` and `PodDisruptionConditions`. -{{< /note >}} - ## Using Pod failure policy to avoid unnecessary Pod retries With the following example, you can learn how to use Pod failure policy to diff --git a/content/en/docs/tasks/manage-gpus/scheduling-gpus.md b/content/en/docs/tasks/manage-gpus/scheduling-gpus.md index c6c37a41de826..40e7bf547b167 100644 --- a/content/en/docs/tasks/manage-gpus/scheduling-gpus.md +++ b/content/en/docs/tasks/manage-gpus/scheduling-gpus.md @@ -8,10 +8,11 @@ description: Configure and schedule GPUs for use as a resource by nodes in a clu -{{< feature-state state="beta" for_k8s_version="v1.10" >}} +{{< feature-state state="stable" for_k8s_version="v1.26" >}} -Kubernetes includes **experimental** support for managing GPUs -(graphical processing units) across several nodes. +Kubernetes includes **stable** support for managing AMD and NVIDIA GPUs +(graphical processing units) across different nodes in your cluster, using +{{< glossary_tooltip text="device plugins" term_id="device-plugin" >}}. This page describes how users can consume GPUs, and outlines some of the limitations in the implementation. @@ -20,8 +21,7 @@ some of the limitations in the implementation. ## Using device plugins -Kubernetes implements {{< glossary_tooltip text="device plugins" term_id="device-plugin" >}} -to let Pods access specialized hardware features such as GPUs. +Kubernetes implements device plugins to let Pods access specialized hardware features such as GPUs. {{% thirdparty-content %}} diff --git a/content/en/docs/tasks/run-application/configure-pdb.md b/content/en/docs/tasks/run-application/configure-pdb.md index b4c735a330f38..ecfb1f5a6ce02 100644 --- a/content/en/docs/tasks/run-application/configure-pdb.md +++ b/content/en/docs/tasks/run-application/configure-pdb.md @@ -127,7 +127,7 @@ is the `scale` of the controller managing the pods being selected by the `PodDisruptionBudget`. Example 1: With a `minAvailable` of 5, evictions are allowed as long as they leave behind -5 or more healthy pods among those selected by the PodDisruptionBudget's `selector`. +5 or more [healthy](#healthiness-of-a-pod) pods among those selected by the PodDisruptionBudget's `selector`. Example 2: With a `minAvailable` of 30%, evictions are allowed as long as at least 30% of the number of desired replicas are healthy. 
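As a concrete companion to the `minAvailable` examples above, a percentage-based PodDisruptionBudget might look like the following minimal sketch; the name and `app: demo` selector are illustrative and must be adapted to the workload being protected:

```yaml
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: demo-pdb               # illustrative name
spec:
  minAvailable: "30%"          # evictions are allowed while at least 30% of desired replicas stay healthy
  selector:
    matchLabels:
      app: demo                # illustrative label; must match the guarded pods
```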
@@ -229,6 +229,51 @@ status: observedGeneration: 1 +### Healthiness of a Pod + +The current implementation considers a pod to be healthy if it has a `.status.conditions` item with `type="Ready"` and `status="True"`. +These pods are tracked via the `.status.currentHealthy` field in the PDB status. + +## Unhealthy Pod Eviction Policy + +{{< feature-state for_k8s_version="v1.26" state="alpha" >}} + +{{< note >}} +In order to use this behavior, you must enable the `PDBUnhealthyPodEvictionPolicy` +[feature gate](/docs/reference/command-line-tools-reference/feature-gates/) +on the [API server](/docs/reference/command-line-tools-reference/kube-apiserver/). +{{< /note >}} + +A PodDisruptionBudget guarding an application ensures that the number of healthy pods (`.status.currentHealthy`) +does not fall below the number specified in `.status.desiredHealthy`, by disallowing eviction of healthy pods. +By using `.spec.unhealthyPodEvictionPolicy`, you can also define the criteria for when unhealthy pods +should be considered for eviction. The default behavior when no policy is specified corresponds +to the `IfHealthyBudget` policy (see the manifest sketch at the end of this section). + +Policies: + +`IfHealthyBudget` +: Running pods (`.status.phase="Running"`) that are not yet healthy can be evicted only if the guarded application is not +disrupted (`.status.currentHealthy` is at least equal to `.status.desiredHealthy`). + +: This policy ensures that running pods of an already disrupted application have the best chance to become healthy. +This has negative implications for draining nodes, which can be blocked by misbehaving applications that are guarded by a PDB. +More specifically, applications with pods in a `CrashLoopBackOff` state (due to a bug or misconfiguration), +or pods that are just failing to report the `Ready` condition. + +`AlwaysAllow` +: Running pods (`.status.phase="Running"`) that are not yet healthy are considered disrupted and can be evicted +regardless of whether the criteria in a PDB are met. + +: This means prospective running pods of a disrupted application might not get a chance to become healthy. +By using this policy, cluster managers can easily evict misbehaving applications that are guarded by a PDB. +More specifically, applications with pods in a `CrashLoopBackOff` state (due to a bug or misconfiguration), +or pods that are just failing to report the `Ready` condition. + +{{< note >}} +Pods in `Pending`, `Succeeded` or `Failed` phase are always considered for eviction. +{{< /note >}} + ## Arbitrary Controllers and Selectors You can skip this section if you only use PDBs with the built-in diff --git a/content/en/docs/tutorials/stateful-application/basic-stateful-set.md b/content/en/docs/tutorials/stateful-application/basic-stateful-set.md index 024fdacdbfd8a..c78ac76fff172 100644 --- a/content/en/docs/tutorials/stateful-application/basic-stateful-set.md +++ b/content/en/docs/tutorials/stateful-application/basic-stateful-set.md @@ -131,6 +131,11 @@ Notice that the `web-1` Pod is not launched until the `web-0` Pod is _Running_ (see [Pod Phase](/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase)) and _Ready_ (see `type` in [Pod Conditions](/docs/concepts/workloads/pods/pod-lifecycle/#pod-conditions)). +{{< note >}} +To configure the integer ordinal assigned to each Pod in a StatefulSet, see +[Start ordinal](/docs/concepts/workloads/controllers/statefulset/#start-ordinal). +{{< /note >}} + ## Pods in a StatefulSet Pods in a StatefulSet have a unique ordinal index and a stable network identity.
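Returning to the unhealthy Pod eviction policy described earlier in this section: the following is a minimal sketch of a PodDisruptionBudget that opts into `AlwaysAllow`, assuming the `PDBUnhealthyPodEvictionPolicy` feature gate is enabled on the API server (the name and selector are illustrative):

```yaml
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: demo-pdb-always-allow             # illustrative name
spec:
  minAvailable: 2
  unhealthyPodEvictionPolicy: AlwaysAllow # running-but-not-yet-healthy pods may be evicted even when the budget is not met
  selector:
    matchLabels:
      app: demo                           # illustrative label
```

With `IfHealthyBudget` (the default), the same manifest would only permit eviction of not-yet-healthy running pods while the budget is satisfied.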
diff --git a/content/en/examples/pods/pod-with-scheduling-gates.yaml b/content/en/examples/pods/pod-with-scheduling-gates.yaml new file mode 100644 index 0000000000000..b0b012fb72ca8 --- /dev/null +++ b/content/en/examples/pods/pod-with-scheduling-gates.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Pod +metadata: + name: test-pod +spec: + schedulingGates: + - name: foo + - name: bar + containers: + - name: pause + image: registry.k8s.io/pause:3.6 diff --git a/content/en/examples/pods/pod-without-scheduling-gates.yaml b/content/en/examples/pods/pod-without-scheduling-gates.yaml new file mode 100644 index 0000000000000..5638b6e97af5f --- /dev/null +++ b/content/en/examples/pods/pod-without-scheduling-gates.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Pod +metadata: + name: test-pod +spec: + containers: + - name: pause + image: registry.k8s.io/pause:3.6 diff --git a/content/en/releases/download.md b/content/en/releases/download.md index d82d610d4c3ed..a2c6f85b3b858 100644 --- a/content/en/releases/download.md +++ b/content/en/releases/download.md @@ -79,7 +79,7 @@ you can verify integrity for is a container image, using the experimental signing support. To manually verify signed container images of Kubernetes core components, refer to -[Verify Signed Container Images](/docs/tasks/administer-cluster/verify-signed-images). +[Verify Signed Container Images](/docs/tasks/administer-cluster/verify-signed-artifacts). diff --git a/data/releases/schedule.yaml b/data/releases/schedule.yaml index f1226419ea93a..d42a5005041d9 100644 --- a/data/releases/schedule.yaml +++ b/data/releases/schedule.yaml @@ -1,4 +1,12 @@ schedules: +- release: 1.26 + releaseDate: 2022-12-09 + maintenanceModeStartDate: 2023-12-28 + endOfLifeDate: 2024-02-24 + next: + release: 1.26.1 + cherryPickDeadline: 2022-01-06 + targetDate: 2023-01-11 - release: 1.25 releaseDate: 2022-08-23 maintenanceModeStartDate: 2023-08-28 diff --git a/static/_redirects b/static/_redirects index 99c40ce0cda32..ba55969fc1337 100644 --- a/static/_redirects +++ b/static/_redirects @@ -316,6 +316,7 @@ /docs/tasks/kubectl/install/ /docs/tasks/tools/ 301 /docs/tasks/tools/install-kubectl/ /docs/tasks/tools/ 301 /docs/tasks/kubectl/list-all-running-container-images/ /docs/tasks/access-application-cluster/list-all-running-container-images/ 301 +/docs/tasks/kubelet-credential-provider/kubelet-credential-provider/ /docs/tasks/administer-cluster/kubelet-credential-provider/ 301 /docs/tasks/manage-stateful-set/debugging-a-statefulset/ /docs/tasks/debug/debug-application/debug-statefulset/ 301 /docs/tasks/manage-stateful-set/delete-pods/ /docs/tasks/run-application/delete-stateful-set/ 301 /docs/tasks/manage-stateful-set/deleting-a-statefulset/ /docs/tasks/run-application/delete-stateful-set/ 301 @@ -590,3 +591,4 @@ /zh/* /zh-cn/:splat 302! /docs/concepts/overview/what-is-kubernetes/ /docs/concepts/overview/ 301 +/docs/tasks/administer-cluster/verify-signed-images/ /docs/tasks/administer-cluster/verify-signed-artifacts/ 301
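The two scheduling-gates example manifests added above can be exercised with a short smoke test — a minimal sketch, assuming a v1.26 cluster with the `PodSchedulingReadiness` feature gate enabled and assuming the example is published at the usual `https://k8s.io/examples/...` location; the output shown is illustrative:

```shell
kubectl apply -f https://k8s.io/examples/pods/pod-with-scheduling-gates.yaml
kubectl get pod test-pod
# Illustrative output: the Pod stays unscheduled while any scheduling gate remains
# NAME       READY   STATUS            RESTARTS   AGE
# test-pod   0/1     SchedulingGated   0          5s
```

Once every entry is removed from `.spec.schedulingGates` (entries can only be removed, not added, after the Pod is created), the scheduler starts considering the Pod, as in the gate-free variant of the manifest.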