diff --git a/cmd/main.go b/cmd/main.go index fc3c9b1b8..9b42905d7 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -652,10 +652,10 @@ func main() { os.Exit(1) } + syncerMonitor := commitments.NewSyncerMonitor() + must.Succeed(metrics.Registry.Register(syncerMonitor)) if slices.Contains(mainConfig.EnabledTasks, "commitments-sync-task") { setupLog.Info("starting commitments syncer") - syncerMonitor := commitments.NewSyncerMonitor() - must.Succeed(metrics.Registry.Register(syncerMonitor)) syncer := commitments.NewSyncer(multiclusterClient, syncerMonitor) syncerConfig := conf.GetConfigOrDie[commitments.SyncerConfig]() syncerConfig.ApplyDefaults() diff --git a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go index 1949186a6..c304369df 100644 --- a/internal/scheduling/reservations/commitments/syncer.go +++ b/internal/scheduling/reservations/commitments/syncer.go @@ -149,7 +149,7 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo "expectedUnit", expectedUnit, "smallestFlavorMemoryMB", flavorGroup.SmallestFlavor.MemoryMB) if s.monitor != nil { - s.monitor.RecordUnitMismatch(flavorGroupName) + s.monitor.RecordCommitmentSkipped(SkipReasonUnitMismatch) } // Track skipped commitment so its existing CRDs won't be deleted if commitment.UUID != "" { diff --git a/internal/scheduling/reservations/commitments/syncer_monitor.go b/internal/scheduling/reservations/commitments/syncer_monitor.go index efa25b0a4..e597f5884 100644 --- a/internal/scheduling/reservations/commitments/syncer_monitor.go +++ b/internal/scheduling/reservations/commitments/syncer_monitor.go @@ -109,11 +109,6 @@ func (m *SyncerMonitor) RecordCommitmentSkipped(reason string) { m.commitmentsSkipped.WithLabelValues(reason).Inc() } -// RecordUnitMismatch records a unit mismatch skip (convenience method). -func (m *SyncerMonitor) RecordUnitMismatch(_ string) { - m.commitmentsSkipped.WithLabelValues(SkipReasonUnitMismatch).Inc() -} - // RecordReservationsCreated records reservations created. func (m *SyncerMonitor) RecordReservationsCreated(count int) { m.reservationsCreated.Add(float64(count)) diff --git a/internal/scheduling/reservations/commitments/syncer_monitor_test.go b/internal/scheduling/reservations/commitments/syncer_monitor_test.go new file mode 100644 index 000000000..2e9c65029 --- /dev/null +++ b/internal/scheduling/reservations/commitments/syncer_monitor_test.go @@ -0,0 +1,108 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" +) + +// TestSyncerMonitor_MetricsRegistration verifies that all syncer metrics are +// present at zero immediately after registration, without any increments. +// This matters because alert rules reference these metrics and will warn +// "metric missing" if they don't appear before the first sync run. +func TestSyncerMonitor_MetricsRegistration(t *testing.T) { + registry := prometheus.NewRegistry() + monitor := NewSyncerMonitor() + + if err := registry.Register(monitor); err != nil { + t.Fatalf("failed to register monitor: %v", err) + } + + families, err := registry.Gather() + if err != nil { + t.Fatalf("failed to gather metrics: %v", err) + } + + gathered := make(map[string]*dto.MetricFamily) + for _, f := range families { + gathered[f.GetName()] = f + } + + cases := []struct { + name string + metricType dto.MetricType + }{ + {"cortex_committed_resource_syncer_runs_total", dto.MetricType_COUNTER}, + {"cortex_committed_resource_syncer_errors_total", dto.MetricType_COUNTER}, + {"cortex_committed_resource_syncer_commitments_total", dto.MetricType_COUNTER}, + {"cortex_committed_resource_syncer_commitments_processed_total", dto.MetricType_COUNTER}, + {"cortex_committed_resource_syncer_commitments_skipped_total", dto.MetricType_COUNTER}, + {"cortex_committed_resource_syncer_reservations_created_total", dto.MetricType_COUNTER}, + {"cortex_committed_resource_syncer_reservations_deleted_total", dto.MetricType_COUNTER}, + {"cortex_committed_resource_syncer_reservations_repaired_total", dto.MetricType_COUNTER}, + } + + for _, tc := range cases { + f, ok := gathered[tc.name] + if !ok { + t.Errorf("metric %q missing after registration (no increments needed)", tc.name) + continue + } + if f.GetType() != tc.metricType { + t.Errorf("metric %q: expected type %v, got %v", tc.name, tc.metricType, f.GetType()) + } + } +} + +// TestSyncerMonitor_SkipReasonsPreInitialized verifies that all skip reason +// label combinations are present at zero before any commitment is skipped. +// This prevents "metric missing" warnings for the skipped_total CounterVec. +func TestSyncerMonitor_SkipReasonsPreInitialized(t *testing.T) { + registry := prometheus.NewRegistry() + monitor := NewSyncerMonitor() + + if err := registry.Register(monitor); err != nil { + t.Fatalf("failed to register monitor: %v", err) + } + + families, err := registry.Gather() + if err != nil { + t.Fatalf("failed to gather metrics: %v", err) + } + + var skippedFamily *dto.MetricFamily + for _, f := range families { + if f.GetName() == "cortex_committed_resource_syncer_commitments_skipped_total" { + skippedFamily = f + break + } + } + if skippedFamily == nil { + t.Fatal("cortex_committed_resource_syncer_commitments_skipped_total missing after registration") + } + + presentReasons := make(map[string]bool) + for _, m := range skippedFamily.Metric { + for _, l := range m.Label { + if l.GetName() == "reason" { + presentReasons[l.GetValue()] = true + } + } + } + + for _, reason := range []string{ + SkipReasonUnitMismatch, + SkipReasonUnknownFlavorGroup, + SkipReasonInvalidResource, + SkipReasonEmptyUUID, + SkipReasonNonCompute, + } { + if !presentReasons[reason] { + t.Errorf("skip reason %q not pre-initialized in commitments_skipped_total", reason) + } + } +}