From c4fe7d49ec378e4fe691533afb4acc3cffba8c75 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Thu, 25 Jun 2026 13:59:01 -0700 Subject: [PATCH] Add upgradePolicy to NVIDIADriver CRD The upgradePolicy influences how the driver-upgrade controller upgrades GPU driver daemonsets. Adding this field to the NVIDIADriver CRD allows users to define different upgrade policies for different NVIDIADriver CRs. If nil or empty, the driver-upgrade controller will fall back to a default upgradePolicy defined in the code, which aligns with the defaults in our helm values. Signed-off-by: Christopher Desiniotis --- api/nvidia/v1alpha1/nvidiadriver_types.go | 117 ++++++++++++ api/nvidia/v1alpha1/zz_generated.deepcopy.go | 41 +++++ ...rator-certified.clusterserviceversion.yaml | 19 ++ .../manifests/nvidia.com_nvidiadrivers.yaml | 101 +++++++++++ .../crd/bases/nvidia.com_nvidiadrivers.yaml | 101 +++++++++++ .../samples/nvidia_v1alpha1_nvidiadriver.yaml | 22 +++ controllers/upgrade_controller.go | 170 +++++++++++++++++- .../crds/nvidia.com_nvidiadrivers.yaml | 101 +++++++++++ .../gpu-operator/templates/nvidiadriver.yaml | 23 +++ 9 files changed, 690 insertions(+), 5 deletions(-) diff --git a/api/nvidia/v1alpha1/nvidiadriver_types.go b/api/nvidia/v1alpha1/nvidiadriver_types.go index 0ec6ecbc1b..b9ecdcd5ac 100644 --- a/api/nvidia/v1alpha1/nvidiadriver_types.go +++ b/api/nvidia/v1alpha1/nvidiadriver_types.go @@ -24,6 +24,9 @@ import ( "golang.org/x/mod/semver" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + + upgrade_v1alpha1 "github.com/NVIDIA/k8s-operator-libs/api/upgrade/v1alpha1" "github.com/NVIDIA/gpu-operator/internal/consts" "github.com/NVIDIA/gpu-operator/internal/image" @@ -173,6 +176,11 @@ type NVIDIADriverSpec struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Name of the Kubernetes Secret with secret environment variables for the NVIDIA Driver" SecretEnv
string `json:"secretEnv,omitempty"`
 
+	// UpgradePolicy allows to control automatic upgrade of the driver on nodes
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Driver Upgrade Policy"
+	UpgradePolicy *DriverUpgradePolicySpec `json:"upgradePolicy,omitempty"`
+
 	// +kubebuilder:validation:Optional
 	// NodeSelector specifies a selector for installation of NVIDIA driver
 	NodeSelector map[string]string `json:"nodeSelector,omitempty"`
@@ -780,3 +788,112 @@ func (l *DriverLicensingConfigSpec) IsNLSEnabled() bool {
 	}
 	return *l.NLSEnabled
 }
+
+// DriverUpgradePolicySpec describes policy configuration for automatic upgrades of the driver.
+// AutoUpgrade and MaxParallelUpgrades are serialized without omitempty so that an explicit false/0 is not re-defaulted.
+type DriverUpgradePolicySpec struct {
+	// AutoUpgrade is a switch for automatic upgrade feature.
+	// If set to false all other options are ignored.
+	// +optional
+	// +kubebuilder:default=true
+	AutoUpgrade bool `json:"autoUpgrade"`
+	// MaxParallelUpgrades indicates how many nodes can be upgraded in parallel.
+	// 0 means no limit, all nodes will be upgraded in parallel.
+	// +optional
+	// +kubebuilder:default=1
+	// +kubebuilder:validation:Minimum=0
+	MaxParallelUpgrades int `json:"maxParallelUpgrades"`
+	// MaxUnavailable is the maximum number of nodes with the driver installed, that can be unavailable during the upgrade.
+	// Value can be an absolute number (ex: 5) or a percentage of total nodes at the start of upgrade (ex: 10%).
+	// Absolute number is calculated from percentage by rounding up.
+	// By default, a fixed value of 25% is used.
+	// +optional
+	// +kubebuilder:default="25%"
+	MaxUnavailable    *intstr.IntOrString    `json:"maxUnavailable,omitempty"`
+	PodDeletion       *PodDeletionSpec       `json:"podDeletion,omitempty"`
+	WaitForCompletion *WaitForCompletionSpec `json:"waitForCompletion,omitempty"`
+	DrainSpec         *DrainSpec             `json:"drain,omitempty"`
+}
+
+type PodDeletionSpec = upgrade_v1alpha1.PodDeletionSpec
+type WaitForCompletionSpec = upgrade_v1alpha1.WaitForCompletionSpec
+type DrainSpec = upgrade_v1alpha1.DrainSpec
+
+// GetUpgradePolicyWithDefaults returns the upgrade policy for this driver
+// with default values applied for any unset fields. The result never aliases
+// s.UpgradePolicy, so callers may modify it without changing the NVIDIADriver spec.
+func (s *NVIDIADriverSpec) GetUpgradePolicyWithDefaults() *upgrade_v1alpha1.DriverUpgradePolicySpec {
+	result := getDefaultUpgradePolicySpec()
+	if s.UpgradePolicy == nil {
+		return result
+	}
+
+	// Scalars are taken as-is: an explicit false/0 cannot be told apart from an unset field.
+	result.AutoUpgrade = s.UpgradePolicy.AutoUpgrade
+	result.MaxParallelUpgrades = s.UpgradePolicy.MaxParallelUpgrades
+
+	// The sub-specs only hold scalar fields, so copying the value yields an independent copy.
+	if src := s.UpgradePolicy.MaxUnavailable; src != nil {
+		maxUnavailable := *src
+		result.MaxUnavailable = &maxUnavailable
+	}
+	if src := s.UpgradePolicy.PodDeletion; src != nil {
+		podDeletion := *src
+		result.PodDeletion = &podDeletion
+	}
+	if src := s.UpgradePolicy.WaitForCompletion; src != nil {
+		waitForCompletion := *src
+		result.WaitForCompletion = &waitForCompletion
+	}
+	if src := s.UpgradePolicy.DrainSpec; src != nil {
+		drainSpec := *src
+		result.DrainSpec = &drainSpec
+	}
+	return result
+}
+
+// getDefaultUpgradePolicySpec returns a freshly allocated policy holding the default for every field.
+func getDefaultUpgradePolicySpec() *upgrade_v1alpha1.DriverUpgradePolicySpec {
+	return &upgrade_v1alpha1.DriverUpgradePolicySpec{
+		AutoUpgrade:         true,
+		MaxParallelUpgrades: 1,
+		MaxUnavailable:      getDefaultMaxUnavailable(),
+		PodDeletion:         getDefaultPodDeletionSpec(),
+		WaitForCompletion:   getDefaultWaitForCompletionSpec(),
+		DrainSpec:           getDefaultDrainSpec(),
+	}
+}
+
+// getDefaultMaxUnavailable returns the default maxUnavailable value of "25%".
+func getDefaultMaxUnavailable() *intstr.IntOrString {
+	defaultMaxUnavailable := intstr.FromString("25%")
+	return &defaultMaxUnavailable
+}
+
+// getDefaultPodDeletionSpec returns the default pod deletion settings: no force, 300 second timeout.
+func getDefaultPodDeletionSpec() *PodDeletionSpec {
+	return &PodDeletionSpec{
+		Force:          false,
+		TimeoutSecond:  300,
+		DeleteEmptyDir: false,
+	}
+}
+
+// getDefaultWaitForCompletionSpec returns the default wait settings: no pod selector, zero timeout.
+func getDefaultWaitForCompletionSpec() *WaitForCompletionSpec {
+	return &WaitForCompletionSpec{
+		PodSelector:   "",
+		TimeoutSecond: 0,
+	}
+}
+
+// getDefaultDrainSpec returns the default drain settings: draining disabled, 300 second timeout.
+func getDefaultDrainSpec() *DrainSpec {
+	return &DrainSpec{
+		Enable:         false,
+		Force:          false,
+		PodSelector:    "",
+		TimeoutSecond:  300,
+		DeleteEmptyDir: false,
+	}
+}
diff --git a/api/nvidia/v1alpha1/zz_generated.deepcopy.go b/api/nvidia/v1alpha1/zz_generated.deepcopy.go
index 466ecebbea..09a218dde5 100644
--- a/api/nvidia/v1alpha1/zz_generated.deepcopy.go
+++ b/api/nvidia/v1alpha1/zz_generated.deepcopy.go
@@ -24,6 +24,7 @@ import (
 	"k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/intstr"
 )
 
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
@@ -116,6 +117,41 @@ func (in *DriverRepoConfigSpec) DeepCopy() *DriverRepoConfigSpec {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *DriverUpgradePolicySpec) DeepCopyInto(out *DriverUpgradePolicySpec) { + *out = *in + if in.MaxUnavailable != nil { + in, out := &in.MaxUnavailable, &out.MaxUnavailable + *out = new(intstr.IntOrString) + **out = **in + } + if in.PodDeletion != nil { + in, out := &in.PodDeletion, &out.PodDeletion + *out = new(PodDeletionSpec) + **out = **in + } + if in.WaitForCompletion != nil { + in, out := &in.WaitForCompletion, &out.WaitForCompletion + *out = new(WaitForCompletionSpec) + **out = **in + } + if in.DrainSpec != nil { + in, out := &in.DrainSpec, &out.DrainSpec + *out = new(DrainSpec) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DriverUpgradePolicySpec. +func (in *DriverUpgradePolicySpec) DeepCopy() *DriverUpgradePolicySpec { + if in == nil { + return nil + } + out := new(DriverUpgradePolicySpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *EnvVar) DeepCopyInto(out *EnvVar) { *out = *in @@ -389,6 +425,11 @@ func (in *NVIDIADriverSpec) DeepCopyInto(out *NVIDIADriverSpec) { *out = new(KernelModuleConfigSpec) **out = **in } + if in.UpgradePolicy != nil { + in, out := &in.UpgradePolicy, &out.UpgradePolicy + *out = new(DriverUpgradePolicySpec) + (*in).DeepCopyInto(*out) + } if in.NodeSelector != nil { in, out := &in.NodeSelector, &out.NodeSelector *out = make(map[string]string, len(*in)) diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index 0edaf5d0de..961cabd507 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -167,6 +167,25 @@ metadata: "version": "sha256:3d7ae961d7bce5e193885aa99e91ba87421bccc3d187cd9997083faf021208be", "nodeSelector": {}, "manager": {}, + "upgradePolicy": { + "autoUpgrade": true, + "drain": { + "deleteEmptyDir": false, + "enable": false, + "force": false, + "timeoutSeconds": 300 + }, + "maxParallelUpgrades": 1, + "maxUnavailable": "25%", + "podDeletion": { + "deleteEmptyDir": false, + "force": false, + "timeoutSeconds": 300 + }, + "waitForCompletion": { + "timeoutSeconds": 0 + } + }, "repoConfig": { "name": "" }, diff --git a/bundle/manifests/nvidia.com_nvidiadrivers.yaml b/bundle/manifests/nvidia.com_nvidiadrivers.yaml index 62336ab570..0b858cd192 100644 --- a/bundle/manifests/nvidia.com_nvidiadrivers.yaml +++ b/bundle/manifests/nvidia.com_nvidiadrivers.yaml @@ -953,6 +953,107 @@ spec: type: string type: object type: array + upgradePolicy: + description: UpgradePolicy allows to control automatic upgrade of + the driver on nodes + properties: + autoUpgrade: + default: true + description: |- + AutoUpgrade is a switch for automatic upgrade feature. + If set to false all other options are ignored. 
+ type: boolean + drain: + description: DrainSpec describes configuration for node drain + during automatic upgrade + properties: + deleteEmptyDir: + default: false + description: |- + DeleteEmptyDir indicates if should continue even if there are pods using emptyDir + (local data that will be deleted when the node is drained) + type: boolean + enable: + default: false + description: Enable indicates if node draining is allowed + during upgrade + type: boolean + force: + default: false + description: Force indicates if force draining is allowed + type: boolean + podSelector: + description: |- + PodSelector specifies a label selector to filter pods on the node that need to be drained + For more details on label selectors, see: + https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + type: string + timeoutSeconds: + default: 300 + description: TimeoutSecond specifies the length of time in + seconds to wait before giving up drain, zero means infinite + minimum: 0 + type: integer + type: object + maxParallelUpgrades: + default: 1 + description: |- + MaxParallelUpgrades indicates how many nodes can be upgraded in parallel. + 0 means no limit, all nodes will be upgraded in parallel. + minimum: 0 + type: integer + maxUnavailable: + anyOf: + - type: integer + - type: string + default: 25% + description: |- + MaxUnavailable is the maximum number of nodes with the driver installed, that can be unavailable during the upgrade. + Value can be an absolute number (ex: 5) or a percentage of total nodes at the start of upgrade (ex: 10%). + Absolute number is calculated from percentage by rounding up. + By default, a fixed value of 25% is used. 
+ x-kubernetes-int-or-string: true + podDeletion: + description: PodDeletionSpec describes configuration for deletion + of pods using special resources during automatic upgrade + properties: + deleteEmptyDir: + default: false + description: |- + DeleteEmptyDir indicates if should continue even if there are pods using emptyDir + (local data that will be deleted when the pod is deleted) + type: boolean + force: + default: false + description: Force indicates if force deletion is allowed + type: boolean + timeoutSeconds: + default: 300 + description: |- + TimeoutSecond specifies the length of time in seconds to wait before giving up on pod termination, zero means + infinite + minimum: 0 + type: integer + type: object + waitForCompletion: + description: WaitForCompletionSpec describes the configuration + for waiting on job completions + properties: + podSelector: + description: |- + PodSelector specifies a label selector for the pods to wait for completion + For more details on label selectors, see: + https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + type: string + timeoutSeconds: + default: 0 + description: |- + TimeoutSecond specifies the length of time in seconds to wait before giving up on pod termination, zero means + infinite + minimum: 0 + type: integer + type: object + type: object useOpenKernelModules: description: |- Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. 
diff --git a/config/crd/bases/nvidia.com_nvidiadrivers.yaml b/config/crd/bases/nvidia.com_nvidiadrivers.yaml index 62336ab570..0b858cd192 100644 --- a/config/crd/bases/nvidia.com_nvidiadrivers.yaml +++ b/config/crd/bases/nvidia.com_nvidiadrivers.yaml @@ -953,6 +953,107 @@ spec: type: string type: object type: array + upgradePolicy: + description: UpgradePolicy allows to control automatic upgrade of + the driver on nodes + properties: + autoUpgrade: + default: true + description: |- + AutoUpgrade is a switch for automatic upgrade feature. + If set to false all other options are ignored. + type: boolean + drain: + description: DrainSpec describes configuration for node drain + during automatic upgrade + properties: + deleteEmptyDir: + default: false + description: |- + DeleteEmptyDir indicates if should continue even if there are pods using emptyDir + (local data that will be deleted when the node is drained) + type: boolean + enable: + default: false + description: Enable indicates if node draining is allowed + during upgrade + type: boolean + force: + default: false + description: Force indicates if force draining is allowed + type: boolean + podSelector: + description: |- + PodSelector specifies a label selector to filter pods on the node that need to be drained + For more details on label selectors, see: + https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + type: string + timeoutSeconds: + default: 300 + description: TimeoutSecond specifies the length of time in + seconds to wait before giving up drain, zero means infinite + minimum: 0 + type: integer + type: object + maxParallelUpgrades: + default: 1 + description: |- + MaxParallelUpgrades indicates how many nodes can be upgraded in parallel. + 0 means no limit, all nodes will be upgraded in parallel. 
+ minimum: 0 + type: integer + maxUnavailable: + anyOf: + - type: integer + - type: string + default: 25% + description: |- + MaxUnavailable is the maximum number of nodes with the driver installed, that can be unavailable during the upgrade. + Value can be an absolute number (ex: 5) or a percentage of total nodes at the start of upgrade (ex: 10%). + Absolute number is calculated from percentage by rounding up. + By default, a fixed value of 25% is used. + x-kubernetes-int-or-string: true + podDeletion: + description: PodDeletionSpec describes configuration for deletion + of pods using special resources during automatic upgrade + properties: + deleteEmptyDir: + default: false + description: |- + DeleteEmptyDir indicates if should continue even if there are pods using emptyDir + (local data that will be deleted when the pod is deleted) + type: boolean + force: + default: false + description: Force indicates if force deletion is allowed + type: boolean + timeoutSeconds: + default: 300 + description: |- + TimeoutSecond specifies the length of time in seconds to wait before giving up on pod termination, zero means + infinite + minimum: 0 + type: integer + type: object + waitForCompletion: + description: WaitForCompletionSpec describes the configuration + for waiting on job completions + properties: + podSelector: + description: |- + PodSelector specifies a label selector for the pods to wait for completion + For more details on label selectors, see: + https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + type: string + timeoutSeconds: + default: 0 + description: |- + TimeoutSecond specifies the length of time in seconds to wait before giving up on pod termination, zero means + infinite + minimum: 0 + type: integer + type: object + type: object useOpenKernelModules: description: |- Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. 
diff --git a/config/samples/nvidia_v1alpha1_nvidiadriver.yaml b/config/samples/nvidia_v1alpha1_nvidiadriver.yaml index a39869621e..5b01de3a73 100644 --- a/config/samples/nvidia_v1alpha1_nvidiadriver.yaml +++ b/config/samples/nvidia_v1alpha1_nvidiadriver.yaml @@ -13,6 +13,28 @@ spec: imagePullSecrets: [] nodeSelector: {} manager: {} + upgradePolicy: + # global switch for automatic upgrade feature + # if set to false all other options are ignored + autoUpgrade: true + # how many nodes can be upgraded in parallel + # 0 means no limit, all nodes will be upgraded in parallel + maxParallelUpgrades: 1 + # maximum number of nodes with the driver installed, that can be unavailable during + # the upgrade. Value can be an absolute number (ex: 5) or + # a percentage of total nodes at the start of upgrade (ex: + # 10%). Absolute number is calculated from percentage by rounding + # up. By default, a fixed value of 25% is used. + maxUnavailable: 25% + # options for waiting on pod(job) completions + waitForCompletion: + timeoutSeconds: 0 + podSelector: "" + # options for gpu pod deletion + podDeletion: + force: false + timeoutSeconds: 300 + deleteEmptyDir: false rdma: enabled: false useHostMofed: false diff --git a/controllers/upgrade_controller.go b/controllers/upgrade_controller.go index 9ba941ef7e..38947f78cb 100644 --- a/controllers/upgrade_controller.go +++ b/controllers/upgrade_controller.go @@ -47,6 +47,7 @@ import ( gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + gpuconsts "github.com/NVIDIA/gpu-operator/internal/consts" ) // UpgradeReconciler reconciles Driver Daemon Sets for upgrade @@ -108,6 +109,19 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, r.removeNodeUpgradeStateLabels(ctx) } + // TODO: When integrating the NVIDIA DRA Driver for GPUs, decouple + // the driver-upgrade controller from ClusterPolicy.
If a ClusterPolicy
+	// CR does not exist, take the NVIDIADriver code path.
+	if clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType() {
+		return r.reconcileNVIDIADriverUpgrades(ctx, reqLogger)
+	}
+
+	return r.reconcileClusterPolicyDriverUpgrades(ctx, reqLogger, clusterPolicy)
+}
+
+// reconcileClusterPolicyDriverUpgrades handles driver upgrade reconciliation when the
+// ClusterPolicy CR is used for driver management.
+func (r *UpgradeReconciler) reconcileClusterPolicyDriverUpgrades(ctx context.Context, reqLogger logr.Logger, clusterPolicy *gpuv1.ClusterPolicy) (ctrl.Result, error) {
 	if clusterPolicy.Spec.Driver.UpgradePolicy == nil ||
 		!clusterPolicy.Spec.Driver.UpgradePolicy.AutoUpgrade {
 		reqLogger.V(consts.LogLevelInfo).Info("Advanced driver upgrade policy is disabled, cleaning up upgrade state and skipping reconciliation")
@@ -122,11 +136,7 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
 	driverLabelKey := DriverLabelKey
 	driverLabelValue := DriverLabelValue
 
-	if clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType() {
-		// app component label is added for all new driver daemonsets deployed by NVIDIADriver controller
-		driverLabelKey = AppComponentLabelKey
-		driverLabelValue = DriverAppComponentLabelValue
-	} else if clusterPolicyCtrl.openshift != "" && clusterPolicyCtrl.ocpDriverToolkit.enabled {
+	if clusterPolicyCtrl.openshift != "" && clusterPolicyCtrl.ocpDriverToolkit.enabled {
 		// For OCP, when DTK is enabled app=nvidia-driver-daemonset label is not constant and changes
 		// based on rhcos version. Hence use DTK label instead
 		driverLabelKey = ocpDriverToolkitIdentificationLabel
@@ -190,6 +200,130 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
 	return ctrl.Result{Requeue: true, RequeueAfter: plannedRequeueInterval}, nil
 }
 
+// reconcileNVIDIADriverUpgrades handles driver upgrade reconciliation when the NVIDIADriver CRD
+// is used for driver management. Each NVIDIADriver instance may have its own upgrade policy.
+func (r *UpgradeReconciler) reconcileNVIDIADriverUpgrades(ctx context.Context, reqLogger logr.Logger) (ctrl.Result, error) {
+	var upgradesInProgress, upgradesDone, upgradesAvailable, upgradesFailed, upgradesPending int
+
+	nvidiaDriverList := &nvidiav1alpha1.NVIDIADriverList{}
+	if err := r.List(ctx, nvidiaDriverList); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	// Check if all NVIDIADriver instances have disabled automatic upgrades
+	noAutoUpgradesEnabled := true
+	for _, nvd := range nvidiaDriverList.Items {
+		upgradePolicy := nvd.Spec.GetUpgradePolicyWithDefaults()
+		if upgradePolicy.AutoUpgrade {
+			noAutoUpgradesEnabled = false
+			break
+		}
+	}
+
+	if noAutoUpgradesEnabled {
+		reqLogger.V(consts.LogLevelInfo).Info("No NVIDIADriver instance has upgrade policy enabled, cleaning up upgrade state and skipping reconciliation")
+		r.OperatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled)
+		return ctrl.Result{}, r.removeNodeUpgradeStateLabels(ctx)
+	}
+
+	r.OperatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeEnabled)
+
+	// Build a cluster-wide upgrade state using only the component label so that ALL
+	// driver pods are captured, including orphaned pods (e.g. pods left over from a
+	// ClusterPolicy-managed DaemonSet).
+	// TODO: decouple the operatorNamespace field from the ClusterPolicyController object
+	clusterState, err := r.StateManager.BuildState(ctx, clusterPolicyCtrl.operatorNamespace, map[string]string{AppComponentLabelKey: DriverAppComponentLabelValue})
+	if err != nil {
+		r.Log.Error(err, "Failed to build cluster upgrade state")
+		return ctrl.Result{}, err
+	}
+
+	// Partition the cluster upgrade state into per-NVIDIADriver buckets by reading the
+	// nvidia.com/gpu-operator.driver.owner label from each node.
+	statesByNVD := make(map[string]*upgrade.ClusterUpgradeState)
+	for stateKey, nodeStates := range clusterState.NodeStates {
+		for _, nodeState := range nodeStates {
+			ownerName := nodeState.Node.Labels[gpuconsts.NVIDIADriverOwnerLabel]
+			if ownerName == "" {
+				reqLogger.V(consts.LogLevelInfo).Info("Node does not have nvidia.com/gpu-operator.driver.owner label, skipping ...", "NodeName", nodeState.Node.Name)
+				continue
+			}
+			if statesByNVD[ownerName] == nil {
+				s := upgrade.NewClusterUpgradeState()
+				statesByNVD[ownerName] = &s
+			}
+			statesByNVD[ownerName].NodeStates[stateKey] = append(statesByNVD[ownerName].NodeStates[stateKey], nodeState)
+		}
+	}
+
+	// Apply the upgrade policy for each NVIDIADriver instance using its partitioned cluster upgrade state
+	for _, nvd := range nvidiaDriverList.Items {
+		upgradePolicy := nvd.Spec.GetUpgradePolicyWithDefaults()
+		if !upgradePolicy.AutoUpgrade {
+			reqLogger.V(consts.LogLevelInfo).Info("Auto upgrade is disabled for NVIDIADriver, cleaning up upgrade state for nodes it manages", "name", nvd.Name)
+			if err := r.removeNodeUpgradeStateLabelsForNVD(ctx, nvd.Name); err != nil {
+				r.Log.Error(err, "Failed to remove upgrade state labels for NVIDIADriver", "name", nvd.Name)
+				return ctrl.Result{}, err
+			}
+			continue
+		}
+
+		state, ok := statesByNVD[nvd.Name]
+		if !ok {
+			continue
+		}
+
+		reqLogger.V(consts.LogLevelDebug).Info("Current cluster upgrade state for NVIDIADriver", "name", nvd.Name, "state", state)
+
+		totalNodes := r.StateManager.GetTotalManagedNodes(state)
+		maxUnavailable, err := intstr.GetScaledValueFromIntOrPercent(upgradePolicy.MaxUnavailable, totalNodes, true)
+		if err != nil {
+			r.Log.Error(err, "Failed to compute maxUnavailable for NVIDIADriver", "name", nvd.Name)
+			return ctrl.Result{}, err
+		}
+
+		upgradesInProgress += r.StateManager.GetUpgradesInProgress(state)
+		upgradesDone += r.StateManager.GetUpgradesDone(state)
+		upgradesAvailable += r.StateManager.GetUpgradesAvailable(state, upgradePolicy.MaxParallelUpgrades, maxUnavailable)
+		upgradesFailed += r.StateManager.GetUpgradesFailed(state)
+		upgradesPending += r.StateManager.GetUpgradesPending(state)
+
+		// We want to skip the operator itself during the drain because the upgrade process might hang
+		// if the operator is evicted and can't be rescheduled to any other node, e.g. in a single-node cluster.
+		// It's safe to do because the goal of the node draining during the upgrade is to
+		// evict pods that might use the driver, and the operator does not use it in its own pod.
+		// The selector is appended on a private copy of the drain spec: the policy may still point
+		// into the listed NVIDIADriver object, which the reconcile loop must not modify.
+		drainSpec := *upgradePolicy.DrainSpec
+		if drainSpec.PodSelector == "" {
+			drainSpec.PodSelector = UpgradeSkipDrainLabelSelector
+		} else {
+			drainSpec.PodSelector = fmt.Sprintf("%s,%s", drainSpec.PodSelector, UpgradeSkipDrainLabelSelector)
+		}
+		upgradePolicy.DrainSpec = &drainSpec
+
+		reqLogger.Info("Applying upgrade policy for NVIDIADriver", "name", nvd.Name)
+		if err := r.StateManager.ApplyState(ctx, state, upgradePolicy); err != nil {
+			r.Log.Error(err, "Failed to apply cluster upgrade state for NVIDIADriver", "name", nvd.Name)
+			return ctrl.Result{}, err
+		}
+	}
+
+	// Capture aggregate metrics from all NVIDIADriver CRs processed. This should provide
+	// a cluster-wide view of driver daemonset upgrades.
+	r.OperatorMetrics.upgradesInProgress.Set(float64(upgradesInProgress))
+	r.OperatorMetrics.upgradesDone.Set(float64(upgradesDone))
+	r.OperatorMetrics.upgradesAvailable.Set(float64(upgradesAvailable))
+	r.OperatorMetrics.upgradesFailed.Set(float64(upgradesFailed))
+	r.OperatorMetrics.upgradesPending.Set(float64(upgradesPending))
+
+	// In some cases if node state changes fail to apply, upgrade process
+	// might become stuck until the new reconcile loop is scheduled.
+	// Since node/ds/clusterpolicy updates from outside of the upgrade flow
+	// are not guaranteed, for safety reconcile loop should be requeued every few minutes.
+	return ctrl.Result{Requeue: true, RequeueAfter: plannedRequeueInterval}, nil
+}
+
 // removeNodeUpgradeStateLabels loops over nodes in the cluster and removes "nvidia.com/gpu-driver-upgrade-state"
 // It is used for cleanup when autoUpgrade feature gets disabled
 func (r *UpgradeReconciler) removeNodeUpgradeStateLabels(ctx context.Context) error {
@@ -219,6 +353,32 @@ func (r *UpgradeReconciler) removeNodeUpgradeStateLabels(ctx context.Context) er
 	return nil
 }
 
+// removeNodeUpgradeStateLabelsForNVD removes the upgrade-state label from all nodes owned by
+// the given NVIDIADriver CR. It is used for cleanup when autoUpgrade is disabled for that CR.
+func (r *UpgradeReconciler) removeNodeUpgradeStateLabelsForNVD(ctx context.Context, nvdName string) error {
+	r.Log.Info("Resetting node upgrade labels for NVIDIADriver", "name", nvdName)
+
+	nodeList := &corev1.NodeList{}
+	if err := r.List(ctx, nodeList, client.MatchingLabels{gpuconsts.NVIDIADriverOwnerLabel: nvdName}); err != nil {
+		r.Log.Error(err, "Failed to list nodes for NVIDIADriver", "name", nvdName)
+		return err
+	}
+
+	upgradeStateLabel := upgrade.GetUpgradeStateLabelKey()
+	for i := range nodeList.Items {
+		node := &nodeList.Items[i] // index the slice: no per-node copy, no pointer to the range variable
+		if _, present := node.Labels[upgradeStateLabel]; !present {
+			continue
+		}
+		delete(node.Labels, upgradeStateLabel)
+		if err := r.Update(ctx, node); err != nil {
+			r.Log.Error(err, "Failed to reset upgrade state label from node", "node", node.Name)
+			return err
+		}
+	}
+	return nil
+}
+
 // SetupWithManager sets up the controller with the Manager.
// //nolint:dupl diff --git a/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml b/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml index 62336ab570..0b858cd192 100644 --- a/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml @@ -953,6 +953,107 @@ spec: type: string type: object type: array + upgradePolicy: + description: UpgradePolicy allows to control automatic upgrade of + the driver on nodes + properties: + autoUpgrade: + default: true + description: |- + AutoUpgrade is a switch for automatic upgrade feature. + If set to false all other options are ignored. + type: boolean + drain: + description: DrainSpec describes configuration for node drain + during automatic upgrade + properties: + deleteEmptyDir: + default: false + description: |- + DeleteEmptyDir indicates if should continue even if there are pods using emptyDir + (local data that will be deleted when the node is drained) + type: boolean + enable: + default: false + description: Enable indicates if node draining is allowed + during upgrade + type: boolean + force: + default: false + description: Force indicates if force draining is allowed + type: boolean + podSelector: + description: |- + PodSelector specifies a label selector to filter pods on the node that need to be drained + For more details on label selectors, see: + https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + type: string + timeoutSeconds: + default: 300 + description: TimeoutSecond specifies the length of time in + seconds to wait before giving up drain, zero means infinite + minimum: 0 + type: integer + type: object + maxParallelUpgrades: + default: 1 + description: |- + MaxParallelUpgrades indicates how many nodes can be upgraded in parallel. + 0 means no limit, all nodes will be upgraded in parallel. 
+ minimum: 0 + type: integer + maxUnavailable: + anyOf: + - type: integer + - type: string + default: 25% + description: |- + MaxUnavailable is the maximum number of nodes with the driver installed, that can be unavailable during the upgrade. + Value can be an absolute number (ex: 5) or a percentage of total nodes at the start of upgrade (ex: 10%). + Absolute number is calculated from percentage by rounding up. + By default, a fixed value of 25% is used. + x-kubernetes-int-or-string: true + podDeletion: + description: PodDeletionSpec describes configuration for deletion + of pods using special resources during automatic upgrade + properties: + deleteEmptyDir: + default: false + description: |- + DeleteEmptyDir indicates if should continue even if there are pods using emptyDir + (local data that will be deleted when the pod is deleted) + type: boolean + force: + default: false + description: Force indicates if force deletion is allowed + type: boolean + timeoutSeconds: + default: 300 + description: |- + TimeoutSecond specifies the length of time in seconds to wait before giving up on pod termination, zero means + infinite + minimum: 0 + type: integer + type: object + waitForCompletion: + description: WaitForCompletionSpec describes the configuration + for waiting on job completions + properties: + podSelector: + description: |- + PodSelector specifies a label selector for the pods to wait for completion + For more details on label selectors, see: + https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + type: string + timeoutSeconds: + default: 0 + description: |- + TimeoutSecond specifies the length of time in seconds to wait before giving up on pod termination, zero means + infinite + minimum: 0 + type: integer + type: object + type: object useOpenKernelModules: description: |- Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. 
diff --git a/deployments/gpu-operator/templates/nvidiadriver.yaml b/deployments/gpu-operator/templates/nvidiadriver.yaml
index ecf656e841..31a071bae6 100644
--- a/deployments/gpu-operator/templates/nvidiadriver.yaml
+++ b/deployments/gpu-operator/templates/nvidiadriver.yaml
@@ -33,6 +33,29 @@ spec:
   {{- if .Values.driver.readinessProbe }}
   readinessProbe: {{ toYaml .Values.driver.readinessProbe | nindent 4 }}
   {{- end }}
+  {{- if .Values.driver.upgradePolicy }}
+  upgradePolicy:
+    autoUpgrade: {{ .Values.driver.upgradePolicy.autoUpgrade | default false }}
+    maxParallelUpgrades: {{ .Values.driver.upgradePolicy.maxParallelUpgrades | default 0 }}
+    maxUnavailable: {{ .Values.driver.upgradePolicy.maxUnavailable | default "25%" }}
+    waitForCompletion:
+      timeoutSeconds: {{ .Values.driver.upgradePolicy.waitForCompletion.timeoutSeconds }}
+      {{- if .Values.driver.upgradePolicy.waitForCompletion.podSelector }}
+      podSelector: {{ .Values.driver.upgradePolicy.waitForCompletion.podSelector | quote }}
+      {{- end }}
+    podDeletion:
+      force: {{ .Values.driver.upgradePolicy.gpuPodDeletion.force | default false }}
+      timeoutSeconds: {{ .Values.driver.upgradePolicy.gpuPodDeletion.timeoutSeconds }}
+      deleteEmptyDir: {{ .Values.driver.upgradePolicy.gpuPodDeletion.deleteEmptyDir | default false }}
+    drain:
+      enable: {{ .Values.driver.upgradePolicy.drain.enable | default false }}
+      force: {{ .Values.driver.upgradePolicy.drain.force | default false }}
+      {{- if .Values.driver.upgradePolicy.drain.podSelector }}
+      podSelector: {{ .Values.driver.upgradePolicy.drain.podSelector | quote }}
+      {{- end }}
+      timeoutSeconds: {{ .Values.driver.upgradePolicy.drain.timeoutSeconds }}
+      deleteEmptyDir: {{ .Values.driver.upgradePolicy.drain.deleteEmptyDir | default false }}
+  {{- end }}
   rdma:
     enabled: {{ .Values.driver.rdma.enabled }}
     useHostMofed: {{ .Values.driver.rdma.useHostMofed }}