Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions api/nvidia/v1alpha1/nvidiadriver_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ import (
"golang.org/x/mod/semver"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"

upgrade_v1alpha1 "github.com/NVIDIA/k8s-operator-libs/api/upgrade/v1alpha1"

"github.com/NVIDIA/gpu-operator/internal/consts"
"github.com/NVIDIA/gpu-operator/internal/image"
Expand Down Expand Up @@ -173,6 +176,11 @@ type NVIDIADriverSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Name of the Kubernetes Secret with secret environment variables for the NVIDIA Driver"
SecretEnv string `json:"secretEnv,omitempty"`

// UpgradePolicy allows to control automatic upgrade of the driver on nodes
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Driver Upgrade Policy"
UpgradePolicy *DriverUpgradePolicySpec `json:"upgradePolicy,omitempty"`

// +kubebuilder:validation:Optional
// NodeSelector specifies a selector for installation of NVIDIA driver
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
Expand Down Expand Up @@ -780,3 +788,112 @@ func (l *DriverLicensingConfigSpec) IsNLSEnabled() bool {
}
return *l.NLSEnabled
}

Comment thread
rahulait marked this conversation as resolved.
// DriverUpgradePolicySpec describes policy configuration for automatic upgrades of the driver.
type DriverUpgradePolicySpec struct {
// AutoUpgrade is a switch for automatic upgrade feature.
// If set to false all other options are ignored.
// +optional
// +kubebuilder:default=true
AutoUpgrade bool `json:"autoUpgrade,omitempty"`
// MaxParallelUpgrades indicates how many nodes can be upgraded in parallel.
// 0 means no limit, all nodes will be upgraded in parallel.
// +optional
// +kubebuilder:default=1
// +kubebuilder:validation:Minimum=0
MaxParallelUpgrades int `json:"maxParallelUpgrades,omitempty"`
// MaxUnavailable is the maximum number of nodes with the driver installed, that can be unavailable during the upgrade.
// Value can be an absolute number (ex: 5) or a percentage of total nodes at the start of upgrade (ex: 10%).
// Absolute number is calculated from percentage by rounding up.
// By default, a fixed value of 25% is used.
// +optional
// +kubebuilder:default="25%"
MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"`
// PodDeletion describes configuration for deletion of pods using special resources during automatic upgrade.
PodDeletion *PodDeletionSpec `json:"podDeletion,omitempty"`
// WaitForCompletion describes the configuration for waiting on job completions.
WaitForCompletion *WaitForCompletionSpec `json:"waitForCompletion,omitempty"`
// DrainSpec describes configuration for node drain during automatic upgrade.
DrainSpec *DrainSpec `json:"drain,omitempty"`
}

// Aliases for the upgrade policy sub-specs declared in the upgrade_v1alpha1 package.
type (
	// PodDeletionSpec is an alias for upgrade_v1alpha1.PodDeletionSpec.
	PodDeletionSpec = upgrade_v1alpha1.PodDeletionSpec
	// WaitForCompletionSpec is an alias for upgrade_v1alpha1.WaitForCompletionSpec.
	WaitForCompletionSpec = upgrade_v1alpha1.WaitForCompletionSpec
	// DrainSpec is an alias for upgrade_v1alpha1.DrainSpec.
	DrainSpec = upgrade_v1alpha1.DrainSpec
)

// GetUpgradePolicyWithDefaults returns the upgrade policy for this driver
// with default values applied for any unset fields.
func (s *NVIDIADriverSpec) GetUpgradePolicyWithDefaults() *upgrade_v1alpha1.DriverUpgradePolicySpec {
if s.UpgradePolicy == nil {
return getDefaultUpgradePolicySpec()
}

Comment thread
rahulait marked this conversation as resolved.
result := &upgrade_v1alpha1.DriverUpgradePolicySpec{
AutoUpgrade: s.UpgradePolicy.AutoUpgrade,
MaxParallelUpgrades: s.UpgradePolicy.MaxParallelUpgrades,
}

if s.UpgradePolicy.MaxUnavailable != nil {
result.MaxUnavailable = s.UpgradePolicy.MaxUnavailable
} else {
result.MaxUnavailable = getDefaultMaxUnavailable()
}

if s.UpgradePolicy.PodDeletion != nil {
result.PodDeletion = s.UpgradePolicy.PodDeletion
} else {
result.PodDeletion = getDefaultPodCompletionSpec()
}

if s.UpgradePolicy.WaitForCompletion != nil {
result.WaitForCompletion = s.UpgradePolicy.WaitForCompletion
} else {
result.WaitForCompletion = getDefaultWaitForCompletionSpec()
}

if s.UpgradePolicy.DrainSpec != nil {
result.DrainSpec = s.UpgradePolicy.DrainSpec
} else {
result.DrainSpec = getDefaultDrainSpec()
}

return result
}

// getDefaultUpgradePolicySpec builds a fresh upgrade policy with every field
// set to its default: auto-upgrade enabled, one node upgraded at a time, and
// the default MaxUnavailable, PodDeletion, WaitForCompletion and DrainSpec values.
func getDefaultUpgradePolicySpec() *upgrade_v1alpha1.DriverUpgradePolicySpec {
	defaults := new(upgrade_v1alpha1.DriverUpgradePolicySpec)

	defaults.AutoUpgrade = true
	defaults.MaxParallelUpgrades = 1
	defaults.MaxUnavailable = getDefaultMaxUnavailable()
	defaults.PodDeletion = getDefaultPodCompletionSpec()
	defaults.WaitForCompletion = getDefaultWaitForCompletionSpec()
	defaults.DrainSpec = getDefaultDrainSpec()

	return defaults
}

// getDefaultMaxUnavailable returns a pointer to a newly allocated
// IntOrString holding the default MaxUnavailable value, the string "25%".
func getDefaultMaxUnavailable() *intstr.IntOrString {
	maxUnavailable := new(intstr.IntOrString)
	*maxUnavailable = intstr.FromString("25%")
	return maxUnavailable
}

// getDefaultPodCompletionSpec returns the default PodDeletionSpec:
// a TimeoutSecond of 300 with Force and DeleteEmptyDir both false.
func getDefaultPodCompletionSpec() *PodDeletionSpec {
	// Force and DeleteEmptyDir keep their zero value (false).
	var spec PodDeletionSpec
	spec.TimeoutSecond = 300
	return &spec
}

// getDefaultWaitForCompletionSpec returns the default WaitForCompletionSpec:
// an empty PodSelector and a TimeoutSecond of 0.
func getDefaultWaitForCompletionSpec() *WaitForCompletionSpec {
	// Both defaults coincide with the zero value, so a zeroed struct suffices.
	return new(WaitForCompletionSpec)
}

// getDefaultDrainSpec returns the default DrainSpec: draining disabled,
// no forcing, an empty PodSelector, DeleteEmptyDir false and a
// TimeoutSecond of 300.
func getDefaultDrainSpec() *DrainSpec {
	// Enable, Force, PodSelector and DeleteEmptyDir keep their zero values.
	var spec DrainSpec
	spec.TimeoutSecond = 300
	return &spec
}
41 changes: 41 additions & 0 deletions api/nvidia/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,25 @@ metadata:
"version": "sha256:3d7ae961d7bce5e193885aa99e91ba87421bccc3d187cd9997083faf021208be",
"nodeSelector": {},
"manager": {},
"upgradePolicy": {
"autoUpgrade": true,
"drain": {
"deleteEmptyDir": false,
"enable": false,
"force": false,
"timeoutSeconds": 300
},
"maxParallelUpgrades": 1,
"maxUnavailable": "25%",
"podDeletion": {
"deleteEmptyDir": false,
"force": false,
"timeoutSeconds": 300
},
"waitForCompletion": {
"timeoutSeconds": 0
}
},
"repoConfig": {
"name": ""
},
Expand Down
101 changes: 101 additions & 0 deletions bundle/manifests/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -953,6 +953,107 @@ spec:
type: string
type: object
type: array
upgradePolicy:
description: UpgradePolicy allows to control automatic upgrade of
the driver on nodes
properties:
autoUpgrade:
default: true
description: |-
AutoUpgrade is a switch for automatic upgrade feature.
If set to false all other options are ignored.
type: boolean
drain:
description: DrainSpec describes configuration for node drain
during automatic upgrade
properties:
deleteEmptyDir:
default: false
description: |-
DeleteEmptyDir indicates if should continue even if there are pods using emptyDir
(local data that will be deleted when the node is drained)
type: boolean
enable:
default: false
description: Enable indicates if node draining is allowed
during upgrade
type: boolean
force:
default: false
description: Force indicates if force draining is allowed
type: boolean
podSelector:
description: |-
PodSelector specifies a label selector to filter pods on the node that need to be drained
For more details on label selectors, see:
https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors
type: string
timeoutSeconds:
default: 300
description: TimeoutSecond specifies the length of time in
seconds to wait before giving up drain, zero means infinite
minimum: 0
type: integer
type: object
maxParallelUpgrades:
default: 1
description: |-
MaxParallelUpgrades indicates how many nodes can be upgraded in parallel.
0 means no limit, all nodes will be upgraded in parallel.
minimum: 0
type: integer
maxUnavailable:
anyOf:
- type: integer
- type: string
default: 25%
description: |-
MaxUnavailable is the maximum number of nodes with the driver installed, that can be unavailable during the upgrade.
Value can be an absolute number (ex: 5) or a percentage of total nodes at the start of upgrade (ex: 10%).
Absolute number is calculated from percentage by rounding up.
By default, a fixed value of 25% is used.
x-kubernetes-int-or-string: true
podDeletion:
description: PodDeletionSpec describes configuration for deletion
of pods using special resources during automatic upgrade
properties:
deleteEmptyDir:
default: false
description: |-
DeleteEmptyDir indicates if should continue even if there are pods using emptyDir
(local data that will be deleted when the pod is deleted)
type: boolean
force:
default: false
description: Force indicates if force deletion is allowed
type: boolean
timeoutSeconds:
default: 300
description: |-
TimeoutSecond specifies the length of time in seconds to wait before giving up on pod termination, zero means
infinite
minimum: 0
type: integer
type: object
waitForCompletion:
description: WaitForCompletionSpec describes the configuration
for waiting on job completions
properties:
podSelector:
description: |-
PodSelector specifies a label selector for the pods to wait for completion
For more details on label selectors, see:
https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors
type: string
timeoutSeconds:
default: 0
description: |-
TimeoutSecond specifies the length of time in seconds to wait before giving up on pod termination, zero means
infinite
minimum: 0
type: integer
type: object
type: object
useOpenKernelModules:
description: |-
Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead.
Expand Down
Loading
Loading