diff --git a/cmd/main.go b/cmd/main.go index 173c59058..7bde7f979 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -83,6 +83,8 @@ func main() { // nolint: gocyclo eventPort int eventURL string eventProtocol string + redfishMetricLabelsFromBMC string + redfishMetricLabelsFromServer string registryClientTimeout time.Duration registryDataMaxAge time.Duration registryResyncInterval time.Duration @@ -143,6 +145,16 @@ func main() { // nolint: gocyclo flag.IntVar(&eventPort, "event-port", 10001, "The port to use for the server events endpoint for alerts and metrics.") flag.StringVar(&eventProtocol, "event-protocol", "http", "The protocol to use for the server events endpoint for alerts and metrics.") + flag.StringVar(&redfishMetricLabelsFromBMC, "redfish-metric-labels-from-bmc", "", + "Comma-separated list of 'kubernetes-label-key=prometheus-label-name' pairs. "+ + "Each pair adds an additional label dimension to Redfish telemetry metrics, "+ + "sourced from the matching label on the BMC resource. "+ + "Example: topology.kubernetes.io/region=region,topology.kubernetes.io/zone=zone") + flag.StringVar(&redfishMetricLabelsFromServer, "redfish-metric-labels-from-server", "", + "Comma-separated list of 'kubernetes-label-key=prometheus-label-name' pairs. "+ + "Each pair adds an additional label dimension to Redfish telemetry metrics, "+ + "sourced from the matching label on the Server resource linked via spec.bmcRef.name. 
"+ + "Example: metadata.metal.ironcore.dev/location=location,metadata.metal.ironcore.dev/rack=rack") flag.StringVar(&probeImage, "probe-image", "", "Image for the first boot probing of a Server.") flag.StringVar(&probeOSImage, "probe-os-image", "", "OS image for the first boot probing of a Server.") flag.StringVar(&managerNamespace, "manager-namespace", "default", "Namespace the manager is running in.") @@ -708,9 +720,21 @@ func main() { // nolint: gocyclo } if eventURL != "" { + bmcLabelMappings, err := serverevents.ParseLabelMappings(redfishMetricLabelsFromBMC) + if err != nil { + setupLog.Error(err, "Invalid --redfish-metric-labels-from-bmc") + os.Exit(1) + } + serverLabelMappings, err := serverevents.ParseLabelMappings(redfishMetricLabelsFromServer) + if err != nil { + setupLog.Error(err, "Invalid --redfish-metric-labels-from-server") + os.Exit(1) + } if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { setupLog.Info("starting event server for alerts and metrics", "EventURL", eventURL) - eventServer := serverevents.NewServer(setupLog, fmt.Sprintf(":%d", eventPort)) + eventServer := serverevents.NewServer( + setupLog, fmt.Sprintf(":%d", eventPort), mgr.GetClient(), bmcLabelMappings, serverLabelMappings, + ) if err := eventServer.Start(ctx); err != nil { return fmt.Errorf("unable to start event server: %w", err) } diff --git a/dist/chart/templates/_helpers.tpl b/dist/chart/templates/_helpers.tpl index cd82641a8..c4db4a17f 100644 --- a/dist/chart/templates/_helpers.tpl +++ b/dist/chart/templates/_helpers.tpl @@ -48,3 +48,17 @@ app.kubernetes.io/instance: {{ .Release.Name }} $hasValidating = true }}{{- end }} {{- end }} {{ $hasValidating }}}}{{- end }} + +{{/* +chart.redfishLabelFlag renders a single CLI flag string from a map of +kubernetes-label-key -> prometheus-label-name entries. 
+ +Usage: {{ include "chart.redfishLabelFlag" (dict "flag" "redfish-metric-labels-from-bmc" "map" .Values.redfishLabels.bmc) }} +*/}} +{{- define "chart.redfishLabelFlag" -}} +{{- $pairs := list -}} +{{- range $k, $v := .map -}} + {{- $pairs = append $pairs (printf "%s=%s" $k $v) -}} +{{- end -}} +{{- printf "--%s=%s" .flag (join "," ($pairs | sortAlpha)) -}} +{{- end }} diff --git a/dist/chart/templates/manager/manager.yaml b/dist/chart/templates/manager/manager.yaml index d6125e3a9..c5f694d22 100644 --- a/dist/chart/templates/manager/manager.yaml +++ b/dist/chart/templates/manager/manager.yaml @@ -34,6 +34,12 @@ spec: {{- range .Values.controllerManager.manager.args }} - {{ . }} {{- end }} + {{- if .Values.redfishLabels.bmc }} + - {{ include "chart.redfishLabelFlag" (dict "flag" "redfish-metric-labels-from-bmc" "map" .Values.redfishLabels.bmc) | quote }} + {{- end }} + {{- if .Values.redfishLabels.server }} + - {{ include "chart.redfishLabelFlag" (dict "flag" "redfish-metric-labels-from-server" "map" .Values.redfishLabels.server) | quote }} + {{- end }} command: - /manager image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag }} diff --git a/dist/chart/values.yaml b/dist/chart/values.yaml index b2830ee6f..ec7d83d5f 100644 --- a/dist/chart/values.yaml +++ b/dist/chart/values.yaml @@ -67,6 +67,23 @@ crd: # (Certificates, Issuers, ...) due to garbage collection. keep: true +# [REDFISH LABEL ENRICHMENT]: Optional label enrichment for Redfish telemetry metrics. +# Define mappings from Kubernetes resource label keys to Prometheus label names. +# Mapped labels are appended as additional dimensions to redfish_monitor_reading and +# redfish_event_alert_total metrics. Leave empty ({}) to disable enrichment. +redfishLabels: + # Labels sourced from the BMC resource (matched by hostname == BMC resource name). 
+ bmc: {} + # Example: + # topology.kubernetes.io/region: region + # topology.kubernetes.io/zone: zone + + # Labels sourced from the Server resource linked via spec.bmcRef.name. + server: {} + # Example: + # metadata.metal.ironcore.dev/location: location + # metadata.metal.ironcore.dev/rack: rack + # [METRICS]: Set to true to generate manifests for exporting metrics. # To disable metrics export set false, and ensure that the # ControllerManager argument "--metrics-bind-address=:8443" is removed. @@ -101,4 +118,3 @@ ignition: # Template content that can be customized - this will be created as a ConfigMap # and mounted to override the default template # template: | - \ No newline at end of file diff --git a/docs/observability/metrics.md b/docs/observability/metrics.md index 54431c8c1..c07e3dbf7 100644 --- a/docs/observability/metrics.md +++ b/docs/observability/metrics.md @@ -281,6 +281,97 @@ rate(metal_server_reconciliation_total{result=~"error_.*"}[5m]) count(metal_server_state{state="Available"} == 1) ``` +## Redfish Telemetry Metrics + +When the operator is configured with an event URL (`--event-url`), it subscribes to Redfish MetricReport and Alert events from each BMC and exposes two additional metrics. + +### Sensor Readings (`redfish_monitor_reading`) + +**Type:** Gauge +**Description:** Latest sensor value pushed via a Redfish MetricReport event. +**Fixed labels:** +- `hostname`: BMC Kubernetes resource name +- `metric_id`: Redfish metric ID (e.g., `CPU1Temp`) +- `type`: Metric type (e.g., `Temperature`, `Voltage`) +- `unit`: Unit of measure (e.g., `Cel`, `V`) +- `origin_context`: Originating hardware component path + +**Dynamic labels:** Additional label dimensions can be injected from the BMC or Server resource (see [Label Enrichment](#label-enrichment) below). 
+ +**Example values:** +```text +redfish_monitor_reading{hostname="node001-bmc", metric_id="CPU1Temp", type="Temperature", unit="Cel", origin_context="/Chassis/1/Thermal"} 42.5 +redfish_monitor_reading{hostname="node001-bmc", metric_id="FanSpeed1", type="Rotational", unit="RPM", origin_context="/Chassis/1/Thermal"} 3200 +``` + +**Use cases:** +- Alert on thermal readings exceeding thresholds: `redfish_monitor_reading{type="Temperature"} > 80` +- Track fan speeds: `redfish_monitor_reading{type="Rotational"}` +- Compare readings across regions or racks when enriched with topology labels + +### Alert Event Counter (`redfish_event_alert_total`) + +**Type:** Counter +**Description:** Total count of Redfish alert/event messages received from each BMC. +**Fixed labels:** +- `hostname`: BMC Kubernetes resource name +- `severity`: Event severity (e.g., `OK`, `Warning`, `Critical`) +- `message_id`: Redfish MessageId (e.g., `Alert.1.0.ResourceStatusChangedOK`) +- `component`: Originating hardware component + +**Dynamic labels:** Same enrichment as `redfish_monitor_reading`. + +**Example values:** +```text +redfish_event_alert_total{hostname="node001-bmc", severity="Warning", message_id="ThermalEvents.1.0.TemperatureAboveUpperCautionThreshold", component="/Chassis/1/Thermal/CPU1Temp"} 3 +redfish_event_alert_total{hostname="node001-bmc", severity="OK", message_id="Alert.1.0.ResourceStatusChangedOK", component="/Systems/1"} 12 +``` + +**Use cases:** +- Alert on sustained critical events: `increase(redfish_event_alert_total{severity="Critical"}[5m]) > 0` +- Track warning frequency per host: `rate(redfish_event_alert_total{severity="Warning"}[1h])` + +### Label Enrichment + +When managing a large number of servers, it is often necessary to filter dashboard panels and alert rules by topology or location (e.g., region, availability zone, rack). 
Both Redfish metrics support optional dynamic label dimensions sourced from Kubernetes resources for exactly this purpose — enabling operators to slice telemetry by any organisational dimension without modifying the operator itself. + +This is configured via two CLI flags: + +| Flag | Source resource | Match key | +|------|----------------|-----------| +| `--redfish-metric-labels-from-bmc` | `BMC` resource | resource name == `hostname` label | +| `--redfish-metric-labels-from-server` | `Server` resource | `spec.bmcRef.name` == `hostname` label | + +**Flag format:** `kubernetes-label-key=prometheus-label-name,...` + +**Example:** +```bash +--redfish-metric-labels-from-bmc=topology.kubernetes.io/region=region,topology.kubernetes.io/zone=zone +--redfish-metric-labels-from-server=metadata.metal.ironcore.dev/location=location,metadata.metal.ironcore.dev/rack=rack +``` + +When configured, every Redfish metric gains the extra label columns. If a label key is missing from the resource, the value is emitted as an empty string — missing labels never block metric emission. The same applies when the BMC resource cannot be found, or when the number of Server resources referencing the BMC is not exactly one. + +Labels are read from the controller-runtime informer cache, which is watch-based and eventually consistent with the cluster state. There is no TTL — label changes on BMC or Server resources become visible as soon as the corresponding watch event has been processed. 
+ +#### Helm chart configuration + +```yaml +redfishLabels: + bmc: + topology.kubernetes.io/region: region + topology.kubernetes.io/zone: zone + server: + metadata.metal.ironcore.dev/location: location + metadata.metal.ironcore.dev/rack: rack +``` + +#### Example enriched output + +```text +redfish_monitor_reading{hostname="node001-bmc", metric_id="CPU1Temp", type="Temperature", unit="Cel", origin_context="/Chassis/1/Thermal", region="eu-de-1", zone="eu-de-1a", location="building-b", rack="row3-rack7"} 42.5 +``` + ## Implementation Details ### Metric Collection Strategy diff --git a/internal/serverevents/metrics.go b/internal/serverevents/metrics.go index 18bc318a7..b4c3aa76c 100644 --- a/internal/serverevents/metrics.go +++ b/internal/serverevents/metrics.go @@ -4,15 +4,68 @@ package serverevents import ( + "context" + "fmt" + "regexp" "strconv" "strings" "sync" "time" + metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1" "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/metrics" ) +// LabelMapping maps a Kubernetes resource label key to a Prometheus label name. +type LabelMapping struct { + K8sKey string + PromLabel string +} + +// promLabelPattern is the set of valid Prometheus label name characters. +var promLabelPattern = regexp.MustCompile(`^[a-zA-Z_][a-zA-Z0-9_]*$`) + +// ParseLabelMappings parses a comma-separated list of "kubernetes-label-key=prometheus-label-name" +// pairs into a []LabelMapping. +// +// Format: "some.domain/key=prom_label,other.domain/key2=prom_label2" +// +// Rules: +// - Empty string returns nil (no mappings, valid). +// - Each token must contain exactly one "=". +// - The Prometheus label name must match [a-zA-Z_][a-zA-Z0-9_]*. +// - Whitespace is trimmed from both sides of each token and each part. 
+func ParseLabelMappings(s string) ([]LabelMapping, error) { + s = strings.TrimSpace(s) + if s == "" { + return nil, nil + } + tokens := strings.Split(s, ",") + mappings := make([]LabelMapping, 0, len(tokens)) + for _, token := range tokens { + token = strings.TrimSpace(token) + if token == "" { + continue + } + parts := strings.SplitN(token, "=", 2) + if len(parts) != 2 { + return nil, fmt.Errorf("invalid label mapping %q: must be 'kubernetes-label=prometheus-label'", token) + } + k8sKey := strings.TrimSpace(parts[0]) + promLabel := strings.TrimSpace(parts[1]) + if k8sKey == "" { + return nil, fmt.Errorf("invalid label mapping %q: Kubernetes label key must not be empty", token) + } + if !promLabelPattern.MatchString(promLabel) { + return nil, fmt.Errorf("invalid label mapping %q: Prometheus label name %q must match [a-zA-Z_][a-zA-Z0-9_]*", token, promLabel) + } + mappings = append(mappings, LabelMapping{K8sKey: k8sKey, PromLabel: promLabel}) + } + return mappings, nil +} + type MetricEntry struct { MetricID string Value float64 @@ -29,6 +82,11 @@ type RedfishEventCollector struct { mux sync.RWMutex sensorDesc *prometheus.Desc alertDesc *prometheus.Desc + + k8sClient client.Client + bmcMappings []LabelMapping + serverMappings []LabelMapping + allLabelCount int } type EventKey struct { @@ -39,20 +97,37 @@ type EventKey struct { } // NewRedfishEventCollector initializes a new RedfishEventCollector and registers it with Prometheus. -func NewRedfishEventCollector() *RedfishEventCollector { +// +// bmcMappings and serverMappings define which Kubernetes resource labels are propagated to Redfish +// metrics as additional Prometheus label dimensions. Pass nil for either to disable enrichment from +// that resource. The k8sClient is used to look up the resources at runtime; pass nil to disable all +// enrichment (e.g. in tests or standalone tooling). 
+func NewRedfishEventCollector(k8sClient client.Client, bmcMappings, serverMappings []LabelMapping) *RedfishEventCollector { + allLabelCount := len(bmcMappings) + len(serverMappings) + allLabels := make([]string, 0, allLabelCount) + for _, m := range bmcMappings { + allLabels = append(allLabels, m.PromLabel) + } + for _, m := range serverMappings { + allLabels = append(allLabels, m.PromLabel) + } c := &RedfishEventCollector{ - lastReadings: make(map[string]MetricEntry), - alertCounts: make(map[EventKey]uint64), + lastReadings: make(map[string]MetricEntry), + alertCounts: make(map[EventKey]uint64), + k8sClient: k8sClient, + bmcMappings: bmcMappings, + serverMappings: serverMappings, + allLabelCount: allLabelCount, sensorDesc: prometheus.NewDesc( "redfish_monitor_reading", "Latest value pushed via Redfish MetricReport event", - []string{"hostname", "metric_id", "type", "unit", "origin_context"}, + append([]string{"hostname", "metric_id", "type", "unit", "origin_context"}, allLabels...), nil, ), alertDesc: prometheus.NewDesc( "redfish_event_alert_total", "Total count of Redfish alerts/events received", - []string{"hostname", "severity", "message_id", "component"}, + append([]string{"hostname", "severity", "message_id", "component"}, allLabels...), nil, ), } @@ -60,6 +135,39 @@ func NewRedfishEventCollector() *RedfishEventCollector { return c } +// getLabels returns the enrichment label values for the given hostname by reading from the +// controller-runtime informer cache. Reads are local in-memory operations — the informer cache +// is watch-based and always reflects the current cluster state without making API server calls. +// Returns empty strings for any label that cannot be resolved. 
+func (c *RedfishEventCollector) getLabels(hostname string) []string { + vals := make([]string, c.allLabelCount) + if c.k8sClient == nil || c.allLabelCount == 0 { + return vals + } + ctx := context.Background() + + // --- BMC labels --- + if len(c.bmcMappings) > 0 { + bmc := &metalv1alpha1.BMC{} + if err := c.k8sClient.Get(ctx, client.ObjectKey{Name: hostname}, bmc); err == nil { + for i, m := range c.bmcMappings { + vals[i] = bmc.Labels[m.K8sKey] + } + } + } + + // --- Server labels (looked up via spec.bmcRef.name field index) --- + if len(c.serverMappings) > 0 { + serverList := &metalv1alpha1.ServerList{} + if err := c.k8sClient.List(ctx, serverList, client.MatchingFields{"spec.bmcRef.name": hostname}); err == nil && len(serverList.Items) == 1 { + for i, m := range c.serverMappings { + vals[len(c.bmcMappings)+i] = serverList.Items[0].Labels[m.K8sKey] + } + } + } + return vals +} + // UpdateFromMetricsReport processes incoming MetricReport events and updates the internal state. func (c *RedfishEventCollector) UpdateFromMetricsReport(hostname string, report MetricsReport) { c.mux.Lock() @@ -101,7 +209,7 @@ func (c *RedfishEventCollector) UpdateFromEvent(hostname string, data EventData) c.mux.Lock() defer c.mux.Unlock() - events := data.GetEvents() // Use new method to get events from either field + events := data.GetEvents() for _, event := range events { // Determine the component from the URI (e.g., .../Sensors/Fan1 -> Fan1) component := "system" @@ -118,7 +226,6 @@ func (c *RedfishEventCollector) UpdateFromEvent(hostname string, data EventData) } c.alertCounts[key]++ } - } // Describe and Collect implement the prometheus.Collector interface to expose metrics. 
@@ -135,26 +242,27 @@ func (c *RedfishEventCollector) Collect(ch chan<- prometheus.Metric) { if time.Since(data.Timestamp) > 10*time.Minute { continue } + labelValues := append( + []string{data.Source, data.MetricID, data.Type, data.Unit, data.OriginContext}, + c.getLabels(data.Source)..., + ) ch <- prometheus.MustNewConstMetric( c.sensorDesc, prometheus.GaugeValue, data.Value, - data.Source, - data.MetricID, - data.Type, - data.Unit, - data.OriginContext, + labelValues..., ) } for key, count := range c.alertCounts { + labelValues := append( + []string{key.Source, key.Severity, key.EventID, key.Component}, + c.getLabels(key.Source)..., + ) ch <- prometheus.MustNewConstMetric( c.alertDesc, prometheus.CounterValue, float64(count), - key.Source, - key.Severity, - key.EventID, - key.Component, + labelValues..., ) } } diff --git a/internal/serverevents/server.go b/internal/serverevents/server.go index 932fc8e44..70f0c18a5 100644 --- a/internal/serverevents/server.go +++ b/internal/serverevents/server.go @@ -14,6 +14,7 @@ import ( "time" "github.com/go-logr/logr" + "sigs.k8s.io/controller-runtime/pkg/client" ) type Server struct { @@ -71,13 +72,17 @@ type Event struct { OriginOfCondition string `json:"OriginOfCondition"` } -func NewServer(log logr.Logger, addr string) *Server { +// NewServer creates a new event server. bmcMappings and serverMappings define which Kubernetes +// resource labels are propagated to Redfish metrics as additional Prometheus label dimensions; +// pass nil for either to disable enrichment from that resource. The k8sClient is used to look up +// the resources at runtime; pass nil to disable all enrichment. 
+func NewServer(log logr.Logger, addr string, k8sClient client.Client, bmcMappings, serverMappings []LabelMapping) *Server { mux := http.NewServeMux() server := &Server{ addr: addr, mux: mux, log: log, - collector: NewRedfishEventCollector(), + collector: NewRedfishEventCollector(k8sClient, bmcMappings, serverMappings), } server.routes() return server diff --git a/test/serverevents/main.go b/test/serverevents/main.go index a9c6b05fa..5f9be3b77 100644 --- a/test/serverevents/main.go +++ b/test/serverevents/main.go @@ -28,7 +28,7 @@ func main() { ctx := ctrl.SetupSignalHandler() setupLog.Info("starting serverevent agent") - server := serverevents.NewServer(setupLog, ":8888") + server := serverevents.NewServer(setupLog, ":8888", nil, nil, nil) if err := server.Start(ctx); err != nil { setupLog.Error(err, "problem running telemetry server")