Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cmd/gpu-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,16 @@ func main() {
setupLog.Error(err, "unable to create controller", "controller", "NVIDIADriver")
os.Exit(1)
}

if err = (&controllers.NodeLabelingReconciler{
Namespace: operatorNamespace,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: ctrl.Log.WithName("controllers").WithName("NodeLabeling"),
}).SetupWithManager(ctx, mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "NodeLabeling")
os.Exit(1)
}
// +kubebuilder:scaffold:builder
if err := mgr.AddHealthzCheck("health", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
Expand Down
11 changes: 10 additions & 1 deletion config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ rules:
- endpoints
- events
- namespaces
- nodes
- persistentvolumeclaims
- pods
- pods/eviction
Expand All @@ -27,6 +26,16 @@ rules:
- patch
- update
- watch
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
- patch
- update
- watch
- apiGroups:
- apiextensions.k8s.io
resources:
Expand Down
13 changes: 6 additions & 7 deletions controllers/clusterpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ type ClusterPolicyReconciler struct {
// +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=use,resourceNames=privileged
// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles;clusterrolebindings;roles;rolebindings,verbs=*
// +kubebuilder:rbac:groups="",resources=namespaces;serviceaccounts;pods;pods/eviction;services;services/finalizers;endpoints,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=persistentvolumeclaims;events;configmaps;secrets;nodes,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=persistentvolumeclaims;events;configmaps;secrets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
// +kubebuilder:rbac:groups=apps,resources=deployments;daemonsets;replicasets;statefulsets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=apps,resources=controllerrevisions,verbs=get;list;watch
// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors;prometheusrules,verbs=get;list;watch;create;update;patch;delete
Expand Down Expand Up @@ -281,9 +282,9 @@ func addWatchNewGPUNode(r *ClusterPolicyReconciler, c controller.Controller, mgr
oldLabels := e.ObjectOld.GetLabels()
nodeName := e.ObjectNew.GetName()

gpuCommonLabelMissing := hasGPULabels(newLabels) && !hasCommonGPULabel(newLabels)
// Trigger when NodeLabelingReconciler sets gpu.present=true on a new GPU node.
gpuCommonLabelAdded := !hasCommonGPULabel(oldLabels) && hasCommonGPULabel(newLabels)
gpuCommonLabelOutdated := !hasGPULabels(newLabels) && hasCommonGPULabel(newLabels)
migManagerLabelMissing := hasMIGCapableGPU(newLabels) && !hasMIGManagerLabel(newLabels)
commonOperandsLabelChanged := hasOperandsDisabled(oldLabels) != hasOperandsDisabled(newLabels)

oldGPUWorkloadConfig, _ := getWorkloadConfig(oldLabels, true)
Expand All @@ -294,19 +295,17 @@ func addWatchNewGPUNode(r *ClusterPolicyReconciler, c controller.Controller, mgr
newOSTreeLabel := newLabels[nfdOSTreeVersionLabelKey]
osTreeLabelChanged := oldOSTreeLabel != newOSTreeLabel

needsUpdate := gpuCommonLabelMissing ||
needsUpdate := gpuCommonLabelAdded ||
gpuCommonLabelOutdated ||
migManagerLabelMissing ||
commonOperandsLabelChanged ||
gpuWorkloadConfigLabelChanged ||
osTreeLabelChanged

if needsUpdate {
r.Log.Info("Node needs an update",
"name", nodeName,
"gpuCommonLabelMissing", gpuCommonLabelMissing,
"gpuCommonLabelAdded", gpuCommonLabelAdded,
"gpuCommonLabelOutdated", gpuCommonLabelOutdated,
"migManagerLabelMissing", migManagerLabelMissing,
"commonOperandsLabelChanged", commonOperandsLabelChanged,
"gpuWorkloadConfigLabelChanged", gpuWorkloadConfigLabelChanged,
"osTreeLabelChanged", osTreeLabelChanged,
Expand Down
Loading