diff --git a/terraform/eks/daemon/otel/main.tf b/terraform/eks/daemon/otel/main.tf
index 22b3d5de..edb8964b 100644
--- a/terraform/eks/daemon/otel/main.tf
+++ b/terraform/eks/daemon/otel/main.tf
@@ -347,9 +347,35 @@ resource "null_resource" "ksm_replicaset" {
 
-# --- Test runner ---
+# --- Apply test taints to first node in standard node group ---
+#
+# Keep the taint list in sync with test/otel/standard/taints_test.go. The
+# value-less "empty" taint exists so the test for skipped empty-value taints
+# has something to check.
 
+resource "null_resource" "apply_test_taints" {
+  depends_on = [null_resource.restart_pods, null_resource.kubectl]
+  triggers   = { always_run = timestamp() }
+  provisioner "local-exec" {
+    command = <<-EOT
+      # Abort on the first failing command; without this only the exit status
+      # of the final echo would decide whether the provisioner succeeded.
+      set -eu
+      NODE=$(kubectl get nodes -l "node.kubernetes.io/instance-type=${var.instance_type}" -o jsonpath='{.items[0].metadata.name}')
+      kubectl taint nodes "$NODE" \
+        ci-test.example.com/dedicated=gpu:NoSchedule \
+        ci-test.example.com/team=ml:NoSchedule \
+        ci-test.example.com/empty:NoSchedule \
+        --overwrite
+      echo "Applied test taints to $NODE"
+    EOT
+  }
+}
+
+# --- Test runner ---
+
 resource "null_resource" "validator" {
   depends_on = [
     null_resource.restart_pods,
+    null_resource.apply_test_taints,
     kubernetes_deployment_v1.nginx_test,
   ]
 
diff --git a/test/otel/standard/taints_test.go b/test/otel/standard/taints_test.go
new file mode 100644
index 00000000..a152507e
--- /dev/null
+++ b/test/otel/standard/taints_test.go
@@ -0,0 +1,131 @@
+//go:build integration
+
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+package standard
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+// Test taints applied to the first node by null_resource.apply_test_taints in
+// terraform/eks/daemon/otel/main.tf:
+//
+//	ci-test.example.com/dedicated=gpu:NoSchedule
+//	ci-test.example.com/team=ml:NoSchedule
+//	ci-test.example.com/empty:NoSchedule (no value)
+//
+// Keep this list and the constants below in sync with that resource.
+
+const (
+	taintKeyDedicated = "k8s.node.taint.ci-test.example.com/dedicated"
+	taintKeyTeam      = "k8s.node.taint.ci-test.example.com/team"
+	taintKeyEmpty     = "k8s.node.taint.ci-test.example.com/empty"
+	taintPrefix       = "k8s.node.taint.ci-test.example.com/"
+)
+
+// TestTaintsAppearOnDaemonSetMetrics requires at least one
+// node_cpu_seconds_total series whose resource attributes carry the
+// dedicated=gpu test taint.
+func TestTaintsAppearOnDaemonSetMetrics(t *testing.T) {
+	results, err := queryCache.Get(context.Background(), "node_cpu_seconds_total")
+	require.NoError(t, err, "querying node_cpu_seconds_total")
+	require.NotEmpty(t, results, "node_cpu_seconds_total not available")
+
+	var found bool
+	for _, r := range results {
+		if r.Labels.Resource[taintKeyDedicated] == "gpu" {
+			found = true
+			break
+		}
+	}
+	require.True(t, found, "expected at least one node_cpu_seconds_total series with %s=gpu", taintKeyDedicated)
+}
+
+// TestTaintsEmptyValueSkipped requires that no series from the tainted node
+// (identified by dedicated=gpu) has an attribute for the value-less taint.
+func TestTaintsEmptyValueSkipped(t *testing.T) {
+	results, err := queryCache.Get(context.Background(), "node_cpu_seconds_total")
+	require.NoError(t, err, "querying node_cpu_seconds_total")
+	require.NotEmpty(t, results, "node_cpu_seconds_total not available")
+
+	var tainted int
+	for _, r := range results {
+		if r.Labels.Resource[taintKeyDedicated] != "gpu" {
+			continue
+		}
+		tainted++
+		_, hasEmpty := r.Labels.Resource[taintKeyEmpty]
+		require.False(t, hasEmpty, "empty-value taint %s should NOT be present", taintKeyEmpty)
+	}
+	if tainted == 0 {
+		t.Fatal("no tainted series found to verify empty-value taint absence")
+	}
+}
+
+// TestTaintsAbsentOnUntaintedNodes requires at least one series that has no
+// resource attribute under the test taint prefix.
+func TestTaintsAbsentOnUntaintedNodes(t *testing.T) {
+	results, err := queryCache.Get(context.Background(), "node_cpu_seconds_total")
+	require.NoError(t, err, "querying node_cpu_seconds_total")
+	require.NotEmpty(t, results, "node_cpu_seconds_total not available")
+
+	var untainted int
+	for _, r := range results {
+		hasTaint := false
+		for k := range r.Labels.Resource {
+			if strings.HasPrefix(k, taintPrefix) {
+				hasTaint = true
+				break
+			}
+		}
+		if !hasTaint {
+			untainted++
+		}
+	}
+	require.Greater(t, untainted, 0, "expected at least one series from untainted node with no %s* attrs", taintPrefix)
+}
+
+// TestTaintsAppearOnClusterScraperMetrics requires at least one kube_node_info
+// series whose resource attributes carry the dedicated=gpu test taint.
+func TestTaintsAppearOnClusterScraperMetrics(t *testing.T) {
+	results, err := queryCache.Get(context.Background(), "kube_node_info")
+	require.NoError(t, err, "querying kube_node_info")
+	require.NotEmpty(t, results, "kube_node_info not available")
+
+	var found bool
+	for _, r := range results {
+		if r.Labels.Resource[taintKeyDedicated] == "gpu" {
+			found = true
+			break
+		}
+	}
+	require.True(t, found, "expected at least one kube_node_info series with %s=gpu (cluster-scraper)", taintKeyDedicated)
+}
+
+// TestTaintsMultipleOnSameNode requires every series from the tainted node
+// (identified by dedicated=gpu) to also carry the team=ml test taint.
+func TestTaintsMultipleOnSameNode(t *testing.T) {
+	results, err := queryCache.Get(context.Background(), "node_cpu_seconds_total")
+	require.NoError(t, err, "querying node_cpu_seconds_total")
+	require.NotEmpty(t, results, "node_cpu_seconds_total not available")
+
+	var tainted int
+	for _, r := range results {
+		if r.Labels.Resource[taintKeyDedicated] != "gpu" {
+			continue
+		}
+		tainted++
+		val, hasTeam := r.Labels.Resource[taintKeyTeam]
+		require.True(t, hasTeam, "tainted node should have both test taints present")
+		require.Equal(t, "ml", val)
+	}
+	if tainted == 0 {
+		t.Fatal("no tainted series found to verify multiple taints")
+	}
+}