From 081184cd4de32a027fd894bd4e0279264929cfa2 Mon Sep 17 00:00:00 2001 From: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:11:28 +0100 Subject: [PATCH 01/55] Remove filter packed virtqueue (#565) It was discussed that we probably don't need this filter in the future. According to internal sources the filter was introduced to support the transition of old images to new images needing this trait. Since, now, all hosts provide this trait, it's unnecessary to keep this filter. --- .../cortex-nova/templates/pipelines_kvm.yaml | 10 - .../filters/filter_packed_virtqueue.go | 60 --- .../filters/filter_packed_virtqueue_test.go | 510 ------------------ 3 files changed, 580 deletions(-) delete mode 100644 internal/scheduling/nova/plugins/filters/filter_packed_virtqueue.go delete mode 100644 internal/scheduling/nova/plugins/filters/filter_packed_virtqueue_test.go diff --git a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml index 815fed441..a17f75f5b 100644 --- a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml @@ -151,11 +151,6 @@ spec: `domain_name` scheduler hint from the nova request spec. params: - {key: domainNamePrefixes, stringListValue: ["iaas-"]} - - name: filter_packed_virtqueue - description: | - If the flavor extra specs contain the `hw:virtio_packed_ring` key, or the - image properties contain the `hw_virtio_packed_ring` key, this step will - filter out hosts that do not have the `COMPUTE_NET_VIRTIO_PACKED` trait. - name: filter_allowed_projects description: | This step filters hosts based on allowed projects defined in the @@ -282,11 +277,6 @@ spec: `domain_name` scheduler hint from the nova request spec. 
params: - {key: domainNamePrefixes, stringListValue: ["iaas-"]} - - name: filter_packed_virtqueue - description: | - If the flavor extra specs contain the `hw:virtio_packed_ring` key, or the - image properties contain the `hw_virtio_packed_ring` key, this step will - filter out hosts that do not have the `COMPUTE_NET_VIRTIO_PACKED` trait. - name: filter_allowed_projects description: | This step filters hosts based on allowed projects defined in the diff --git a/internal/scheduling/nova/plugins/filters/filter_packed_virtqueue.go b/internal/scheduling/nova/plugins/filters/filter_packed_virtqueue.go deleted file mode 100644 index dac317e59..000000000 --- a/internal/scheduling/nova/plugins/filters/filter_packed_virtqueue.go +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package filters - -import ( - "context" - "log/slog" - "slices" - - api "github.com/cobaltcore-dev/cortex/api/external/nova" - "github.com/cobaltcore-dev/cortex/internal/scheduling/lib" - hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" -) - -type FilterPackedVirtqueueStep struct { - lib.BaseFilter[api.ExternalSchedulerRequest, lib.EmptyFilterWeigherPipelineStepOpts] -} - -// If requested, only get hosts with packed virtqueues. -func (s *FilterPackedVirtqueueStep) Run(traceLog *slog.Logger, request api.ExternalSchedulerRequest) (*lib.FilterWeigherPipelineStepResult, error) { - result := s.IncludeAllHostsFromRequest(request) - // We don't care about the value. - _, reqInSpecs := request.Spec.Data.Flavor.Data.ExtraSpecs["hw:virtio_packed_ring"] - _, reqInProps := request.Spec.Data.Image.Data.Properties.Data["hw_virtio_packed_ring"] - if !reqInSpecs && !reqInProps { - traceLog.Info("no request for packed virtqueues, skipping filter") - return result, nil // No packed virtqueue requested, nothing to filter. 
- } - - hvs := &hv1.HypervisorList{} - if err := s.Client.List(context.Background(), hvs); err != nil { - traceLog.Error("failed to list hypervisors", "error", err) - return nil, err - } - hvsWithTrait := make(map[string]struct{}) - for _, hv := range hvs.Items { - traits := hv.Status.Traits - traits = append(traits, hv.Spec.CustomTraits...) - if !slices.Contains(traits, "COMPUTE_NET_VIRTIO_PACKED") { - continue - } - hvsWithTrait[hv.Name] = struct{}{} - } - - traceLog.Info("hosts with packed virtqueues", "hosts", hvsWithTrait) - for host := range result.Activations { - if _, ok := hvsWithTrait[host]; ok { - traceLog.Info("host has packed virtqueues, keeping", "host", host) - continue - } - delete(result.Activations, host) - traceLog.Info("filtering host without packed virtqueues", "host", host) - } - return result, nil -} - -func init() { - Index["filter_packed_virtqueue"] = func() NovaFilter { return &FilterPackedVirtqueueStep{} } -} diff --git a/internal/scheduling/nova/plugins/filters/filter_packed_virtqueue_test.go b/internal/scheduling/nova/plugins/filters/filter_packed_virtqueue_test.go deleted file mode 100644 index 82b68da81..000000000 --- a/internal/scheduling/nova/plugins/filters/filter_packed_virtqueue_test.go +++ /dev/null @@ -1,510 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package filters - -import ( - "log/slog" - "testing" - - api "github.com/cobaltcore-dev/cortex/api/external/nova" - hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestFilterPackedVirtqueueStep_Run(t *testing.T) { - scheme, err := hv1.SchemeBuilder.Build() - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - hvs := []client.Object{ - &hv1.Hypervisor{ - ObjectMeta: v1.ObjectMeta{ - Name: "host1", - }, - Status: hv1.HypervisorStatus{ - Traits: 
[]string{"COMPUTE_NET_VIRTIO_PACKED"}, - }, - }, - &hv1.Hypervisor{ - ObjectMeta: v1.ObjectMeta{ - Name: "host2", - }, - Status: hv1.HypervisorStatus{ - Traits: []string{"COMPUTE_NET_VIRTIO_PACKED", "SOME_OTHER_TRAIT"}, - }, - }, - &hv1.Hypervisor{ - ObjectMeta: v1.ObjectMeta{ - Name: "host3", - }, - Status: hv1.HypervisorStatus{ - Traits: []string{"SOME_OTHER_TRAIT"}, - }, - }, - &hv1.Hypervisor{ - ObjectMeta: v1.ObjectMeta{ - Name: "host4", - }, - Status: hv1.HypervisorStatus{ - Traits: []string{}, - }, - }, - } - - tests := []struct { - name string - request api.ExternalSchedulerRequest - expectedHosts []string - filteredHosts []string - }{ - { - name: "No packed virtqueue requested - all hosts pass", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{}, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host2"}, - {ComputeHost: "host3"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: []string{"host1", "host2", "host3", "host4"}, - filteredHosts: []string{}, - }, - { - name: "Packed virtqueue requested in flavor extra specs", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host2"}, - {ComputeHost: "host3"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: 
[]string{"host1", "host2"}, - filteredHosts: []string{"host3", "host4"}, - }, - { - name: "Packed virtqueue requested in image properties", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{}, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{ - "hw_virtio_packed_ring": "true", - }, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host2"}, - {ComputeHost: "host3"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: []string{"host1", "host2"}, - filteredHosts: []string{"host3", "host4"}, - }, - { - name: "Packed virtqueue requested in both flavor and image", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{ - "hw_virtio_packed_ring": "true", - }, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host3"}, - }, - }, - expectedHosts: []string{"host1"}, - filteredHosts: []string{"host3"}, - }, - { - name: "Packed virtqueue with false value in flavor - still triggers filter", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "false", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - 
}, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host2"}, - {ComputeHost: "host3"}, - }, - }, - expectedHosts: []string{"host1", "host2"}, - filteredHosts: []string{"host3"}, - }, - { - name: "Packed virtqueue with empty value in image - still triggers filter", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{}, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{ - "hw_virtio_packed_ring": "", - }, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: []string{"host1"}, - filteredHosts: []string{"host4"}, - }, - { - name: "No hosts with trait - all filtered", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host3"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: []string{}, - filteredHosts: []string{"host3", "host4"}, - }, - { - name: "All hosts have trait", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - 
Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host2"}, - }, - }, - expectedHosts: []string{"host1", "host2"}, - filteredHosts: []string{}, - }, - { - name: "Empty host list with packed virtqueue requested", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{}, - }, - expectedHosts: []string{}, - filteredHosts: []string{}, - }, - { - name: "Empty host list without packed virtqueue requested", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{}, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{}, - }, - expectedHosts: []string{}, - filteredHosts: []string{}, - }, - { - name: "Host not in database with packed virtqueue requested", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host-unknown"}, - }, - }, - expectedHosts: []string{"host1"}, - filteredHosts: []string{"host-unknown"}, - }, - 
{ - name: "Packed virtqueue with additional extra specs", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - "hw:cpu_policy": "dedicated", - "hw:mem_page_size": "large", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host3"}, - }, - }, - expectedHosts: []string{"host1"}, - filteredHosts: []string{"host3"}, - }, - { - name: "Mixed hosts with and without trait", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host2"}, - {ComputeHost: "host3"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: []string{"host1", "host2"}, - filteredHosts: []string{"host3", "host4"}, - }, - { - name: "Image property with additional properties", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{}, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{ - "hw_virtio_packed_ring": "true", - "hw_disk_bus": "virtio", - "hw_vif_model": "virtio", - }, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ 
- {ComputeHost: "host2"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: []string{"host2"}, - filteredHosts: []string{"host4"}, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - step := &FilterPackedVirtqueueStep{} - step.Client = fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(hvs...). - Build() - - result, err := step.Run(slog.Default(), tt.request) - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - // Check expected hosts are present - for _, host := range tt.expectedHosts { - if _, ok := result.Activations[host]; !ok { - t.Errorf("expected host %s to be present in activations", host) - } - } - - // Check filtered hosts are not present - for _, host := range tt.filteredHosts { - if _, ok := result.Activations[host]; ok { - t.Errorf("expected host %s to be filtered out", host) - } - } - - // Check total count - if len(result.Activations) != len(tt.expectedHosts) { - t.Errorf("expected %d hosts, got %d", len(tt.expectedHosts), len(result.Activations)) - } - }) - } -} From 03031a865348e80911979099c7da3f9032bd8a35 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 10 Mar 2026 12:21:28 +0000 Subject: [PATCH 02/55] Bump cortex chart appVersions to sha-081184cd [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 684bbd759..f4ef6e2bb 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.25 -appVersion: "sha-349d742b" +appVersion: "sha-081184cd" icon: "https://example.com/icon.png" dependencies: [] From 3ee76b67b87efdb7269f91239cccc7fceff1bcd4 Mon Sep 17 00:00:00 2001 From: mblos <156897072+mblos@users.noreply.github.com> Date: Thu, 12 Mar 2026 11:52:45 +0100 Subject: [PATCH 03/55] Add basic committed resource functionality with reservations (#566) This adds basic committed resource (CR) functionality: - update reservations with AZ - reservation label - flavor group knowledge - knowledge with contentChanged - commitments syncer works with flavor group CRs - commitments API endpoint for batch update of CRs - commitments API version endpoint - skeletons for capacity endpoint --- CODEOWNERS | 2 +- api/v1alpha1/knowledge_types.go | 6 + api/v1alpha1/reservation_types.go | 20 +- api/v1alpha1/zz_generated.deepcopy.go | 1 + cmd/main.go | 12 +- go.mod | 5 +- helm/bundles/cortex-cinder/values.yaml | 2 +- helm/bundles/cortex-crds/values.yaml | 2 +- helm/bundles/cortex-ironcore/values.yaml | 2 +- helm/bundles/cortex-manila/values.yaml | 2 +- .../cortex-nova/templates/knowledges_kvm.yaml | 17 + helm/bundles/cortex-nova/values.yaml | 7 +- helm/bundles/cortex-pods/values.yaml | 2 +- .../files/crds/cortex.cloud_knowledges.yaml | 9 + .../files/crds/cortex.cloud_reservations.yaml | 6 +- internal/knowledge/extractor/controller.go | 20 + .../plugins/compute/flavor_groups.go | 154 +++++ .../plugins/compute/flavor_groups.sql | 17 + .../plugins/compute/flavor_groups_test.go | 273 +++++++++ .../extractor/supported_extractors.go | 1 + .../reservations/commitments/api.go | 33 ++ .../commitments/api_change_commitments.go | 353 ++++++++++++ .../api_change_commitments_test.go | 246 ++++++++ .../reservations/commitments/api_info.go | 117 ++++ .../reservations/commitments/api_info_test.go | 78 +++ .../commitments/api_report_capacity.go | 61 ++ .../commitments/api_report_capacity_test.go | 285 +++++++++ .../reservations/commitments/capacity.go | 124 
++++ .../reservations/commitments/client.go | 125 +--- .../reservations/commitments/client_test.go | 400 ------------- .../commitments/reservation_manager.go | 310 ++++++++++ .../commitments/reservation_manager_test.go | 540 ++++++++++++++++++ .../reservations/commitments/state.go | 202 +++++++ .../reservations/commitments/state_test.go | 252 ++++++++ .../reservations/commitments/syncer.go | 303 ++++------ .../reservations/commitments/syncer_test.go | 243 +++++--- .../reservations/commitments/utils.go | 46 ++ .../reservations/commitments/utils_test.go | 84 +++ .../reservations/controller/client.go | 135 ----- .../reservations/controller/client_test.go | 23 - .../reservations/controller/controller.go | 328 +++++++++-- .../controller/controller_test.go | 91 ++- .../scheduling/reservations/flavor_groups.go | 74 +++ 43 files changed, 3990 insertions(+), 1023 deletions(-) create mode 100644 internal/knowledge/extractor/plugins/compute/flavor_groups.go create mode 100644 internal/knowledge/extractor/plugins/compute/flavor_groups.sql create mode 100644 internal/knowledge/extractor/plugins/compute/flavor_groups_test.go create mode 100644 internal/scheduling/reservations/commitments/api.go create mode 100644 internal/scheduling/reservations/commitments/api_change_commitments.go create mode 100644 internal/scheduling/reservations/commitments/api_change_commitments_test.go create mode 100644 internal/scheduling/reservations/commitments/api_info.go create mode 100644 internal/scheduling/reservations/commitments/api_info_test.go create mode 100644 internal/scheduling/reservations/commitments/api_report_capacity.go create mode 100644 internal/scheduling/reservations/commitments/api_report_capacity_test.go create mode 100644 internal/scheduling/reservations/commitments/capacity.go create mode 100644 internal/scheduling/reservations/commitments/reservation_manager.go create mode 100644 internal/scheduling/reservations/commitments/reservation_manager_test.go create mode 100644 
internal/scheduling/reservations/commitments/state.go create mode 100644 internal/scheduling/reservations/commitments/state_test.go create mode 100644 internal/scheduling/reservations/commitments/utils.go create mode 100644 internal/scheduling/reservations/commitments/utils_test.go delete mode 100644 internal/scheduling/reservations/controller/client.go delete mode 100644 internal/scheduling/reservations/controller/client_test.go create mode 100644 internal/scheduling/reservations/flavor_groups.go diff --git a/CODEOWNERS b/CODEOWNERS index 23f370f22..f6e423ecb 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* arno.uhlig@sap.com julius.clausnitzer@sap.com malte.viering@sap.com marcel.bloecher@sap.com markus.wieland@sap.com p.matthes@sap.com \ No newline at end of file +* arno.uhlig@sap.com julius.clausnitzer@sap.com malte.viering@sap.com marcel.gute@sap.com markus.wieland@sap.com p.matthes@sap.com \ No newline at end of file diff --git a/api/v1alpha1/knowledge_types.go b/api/v1alpha1/knowledge_types.go index d90f76565..504b30449 100644 --- a/api/v1alpha1/knowledge_types.go +++ b/api/v1alpha1/knowledge_types.go @@ -93,6 +93,11 @@ type KnowledgeStatus struct { // +kubebuilder:validation:Optional LastExtracted metav1.Time `json:"lastExtracted"` + // When the extracted knowledge content last changed. + // Updated only when the Raw data actually changes, not on every reconcile. + // +kubebuilder:validation:Optional + LastContentChange metav1.Time `json:"lastContentChange,omitempty"` + // The raw data behind the extracted knowledge, e.g. a list of features. 
// +kubebuilder:validation:Optional Raw runtime.RawExtension `json:"raw"` @@ -111,6 +116,7 @@ type KnowledgeStatus struct { // +kubebuilder:printcolumn:name="Domain",type="string",JSONPath=".spec.schedulingDomain" // +kubebuilder:printcolumn:name="Created",type="date",JSONPath=".metadata.creationTimestamp" // +kubebuilder:printcolumn:name="Extracted",type="date",JSONPath=".status.lastExtracted" +// +kubebuilder:printcolumn:name="Changed",type="date",JSONPath=".status.lastContentChange" // +kubebuilder:printcolumn:name="Recency",type="string",JSONPath=".spec.recency" // +kubebuilder:printcolumn:name="Features",type="integer",JSONPath=".status.rawLength" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" diff --git a/api/v1alpha1/reservation_types.go b/api/v1alpha1/reservation_types.go index ed8e42f43..df3ad473e 100644 --- a/api/v1alpha1/reservation_types.go +++ b/api/v1alpha1/reservation_types.go @@ -21,6 +21,20 @@ const ( ReservationTypeFailover ReservationType = "FailoverReservation" ) +// Label keys for Reservation metadata. +// Labels follow Kubernetes naming conventions using reverse-DNS notation +const ( + // ===== Common Reservation Labels ===== + + // LabelReservationType identifies the type of reservation. + // This label is present on all reservations to enable type-based filtering. + LabelReservationType = "reservations.cortex.sap.com/type" + + // Reservation type label values + ReservationTypeLabelCommittedResource = "committed-resource" + ReservationTypeLabelFailover = "failover" +) + // CommittedResourceAllocation represents a workload's assignment to a committed resource reservation slot. // The workload could be a VM (Nova/IronCore), Pod (Kubernetes), or other resource. 
type CommittedResourceAllocation struct { @@ -79,6 +93,10 @@ type ReservationSpec struct { // +kubebuilder:validation:Optional SchedulingDomain string `json:"schedulingDomain,omitempty"` + // AvailabilityZone specifies the availability zone for this reservation, if restricted to a specific AZ. + // +kubebuilder:validation:Optional + AvailabilityZone string `json:"availabilityZone,omitempty"` + // Resources to reserve for this instance. // +kubebuilder:validation:Optional Resources map[string]resource.Quantity `json:"resources,omitempty"` @@ -166,7 +184,7 @@ type ReservationStatus struct { // +kubebuilder:object:root=true // +kubebuilder:subresource:status // +kubebuilder:resource:scope=Cluster -// +kubebuilder:printcolumn:name="Type",type="string",JSONPath=".spec.type" +// +kubebuilder:printcolumn:name="Type",type="string",JSONPath=".metadata.labels['reservations\\.cortex\\.sap\\.com/type']" // +kubebuilder:printcolumn:name="Host",type="string",JSONPath=".status.host" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 5a756e045..564f30cac 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -833,6 +833,7 @@ func (in *KnowledgeSpec) DeepCopy() *KnowledgeSpec { func (in *KnowledgeStatus) DeepCopyInto(out *KnowledgeStatus) { *out = *in in.LastExtracted.DeepCopyInto(&out.LastExtracted) + in.LastContentChange.DeepCopyInto(&out.LastContentChange) in.Raw.DeepCopyInto(&out.Raw) if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions diff --git a/cmd/main.go b/cmd/main.go index 4e4865567..43ae63f9c 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -309,6 +309,10 @@ func main() { httpAPIConf := conf.GetConfigOrDie[nova.HTTPAPIConfig]() nova.NewAPI(httpAPIConf, filterWeigherController).Init(mux) + // Initialize commitments API for LIQUID interface + commitmentsAPI := 
commitments.NewAPI(multiclusterClient) + commitmentsAPI.Init(mux) + // Detector pipeline controller setup. novaClient := nova.NewNovaClient() novaClientConfig := conf.GetConfigOrDie[nova.NovaClientConfig]() @@ -456,11 +460,11 @@ func main() { monitor := reservationscontroller.NewControllerMonitor(multiclusterClient) metrics.Registry.MustRegister(&monitor) reservationsControllerConfig := conf.GetConfigOrDie[reservationscontroller.Config]() + if err := (&reservationscontroller.ReservationReconciler{ - Client: multiclusterClient, - Scheme: mgr.GetScheme(), - Conf: reservationsControllerConfig, - HypervisorClient: reservationscontroller.NewHypervisorClient(), + Client: multiclusterClient, + Scheme: mgr.GetScheme(), + Conf: reservationsControllerConfig, }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Reservation") os.Exit(1) diff --git a/go.mod b/go.mod index 245513cb3..f6b914718 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/go-gorp/gorp v2.2.0+incompatible github.com/gophercloud/gophercloud/v2 v2.10.0 github.com/ironcore-dev/ironcore v0.2.4 + github.com/majewsky/gg v1.5.0 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 github.com/sapcc/go-bits v0.0.0-20260226170120-c20f89b66c3c @@ -36,7 +37,7 @@ require ( github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect - github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/logr v1.4.3 github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.22.1 // indirect @@ -71,7 +72,7 @@ require ( github.com/poy/onpar v0.3.5 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/procfs v0.17.0 // indirect - github.com/sapcc/go-api-declarations v1.20.2 // indirect + github.com/sapcc/go-api-declarations v1.20.2 
github.com/sirupsen/logrus v1.9.3 // indirect github.com/spf13/cobra v1.10.1 // indirect github.com/spf13/pflag v1.0.10 // indirect diff --git a/helm/bundles/cortex-cinder/values.yaml b/helm/bundles/cortex-cinder/values.yaml index f002fc58b..b01656205 100644 --- a/helm/bundles/cortex-cinder/values.yaml +++ b/helm/bundles/cortex-cinder/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" diff --git a/helm/bundles/cortex-crds/values.yaml b/helm/bundles/cortex-crds/values.yaml index 2033e435c..bf072086c 100644 --- a/helm/bundles/cortex-crds/values.yaml +++ b/helm/bundles/cortex-crds/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" diff --git a/helm/bundles/cortex-ironcore/values.yaml b/helm/bundles/cortex-ironcore/values.yaml index 2f885c7a5..82e490585 100644 --- a/helm/bundles/cortex-ironcore/values.yaml +++ b/helm/bundles/cortex-ironcore/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" diff --git a/helm/bundles/cortex-manila/values.yaml b/helm/bundles/cortex-manila/values.yaml index cc341a112..50d16352e 100644 --- a/helm/bundles/cortex-manila/values.yaml +++ b/helm/bundles/cortex-manila/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" 
diff --git a/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml b/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml index f2181fe96..6b3d9fcbc 100644 --- a/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml @@ -2,6 +2,23 @@ --- apiVersion: cortex.cloud/v1alpha1 kind: Knowledge +metadata: + name: flavor-groups +spec: + schedulingDomain: nova + extractor: + name: flavor_groups + recency: "5m" + description: | + This knowledge extracts flavor groups from Nova flavors based on the + hw_version extra_spec. It identifies all flavors belonging to each group + and determines the largest flavor for reservation slot sizing. + dependencies: + datasources: + - name: nova-flavors +--- +apiVersion: cortex.cloud/v1alpha1 +kind: Knowledge metadata: name: kvm-libvirt-domain-cpu-steal-pct spec: diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index b2dbba788..200ba3ff3 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" @@ -114,8 +114,12 @@ cortex-scheduling-controllers: - nova-pipeline-controllers - nova-deschedulings-executor - explanation-controller + - reservations-controller enabledTasks: - nova-decisions-cleanup-task + # Endpoints configuration for reservations controller + endpoints: + novaExternalScheduler: "http://localhost:8080/scheduler/nova/external" cortex-knowledge-controllers: <<: *cortex @@ -134,7 +138,6 @@ cortex-knowledge-controllers: - datasource-controllers - knowledge-controllers - kpis-controller - - reservations-controller enabledTasks: - commitments-sync-task diff --git a/helm/bundles/cortex-pods/values.yaml b/helm/bundles/cortex-pods/values.yaml index 
b7aab8a6d..4c381f539 100644 --- a/helm/bundles/cortex-pods/values.yaml +++ b/helm/bundles/cortex-pods/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" diff --git a/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml b/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml index 0ac596bc2..2e3891ffa 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml @@ -24,6 +24,9 @@ spec: - jsonPath: .status.lastExtracted name: Extracted type: date + - jsonPath: .status.lastContentChange + name: Changed + type: date - jsonPath: .spec.recency name: Recency type: string @@ -248,6 +251,12 @@ spec: - type type: object type: array + lastContentChange: + description: |- + When the extracted knowledge content last changed. + Updated only when the Raw data actually changes, not on every reconcile. + format: date-time + type: string lastExtracted: description: When the knowledge was last successfully extracted. 
format: date-time diff --git a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml index 5d341cdf6..915e5677e 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml @@ -15,7 +15,7 @@ spec: scope: Cluster versions: - additionalPrinterColumns: - - jsonPath: .spec.type + - jsonPath: .metadata.labels['reservations\.cortex\.sap\.com/type'] name: Type type: string - jsonPath: .status.host @@ -49,6 +49,10 @@ spec: spec: description: spec defines the desired state of Reservation properties: + availabilityZone: + description: AvailabilityZone specifies the availability zone for + this reservation, if restricted to a specific AZ. + type: string committedResourceReservation: description: |- CommittedResourceReservation contains fields specific to committed resource reservations. diff --git a/internal/knowledge/extractor/controller.go b/internal/knowledge/extractor/controller.go index cd4f63972..3dd511b39 100644 --- a/internal/knowledge/extractor/controller.go +++ b/internal/knowledge/extractor/controller.go @@ -5,6 +5,8 @@ package extractor import ( "context" + "encoding/json" + "reflect" "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" @@ -202,9 +204,27 @@ func (r *KnowledgeReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( Reason: "KnowledgeExtracted", Message: "knowledge extracted successfully", }) + + // Check if content actually changed by comparing deserialized data structures. + // This avoids false positives from JSON serialization non-determinism (e.g., map key ordering). 
+ contentChanged := true + if len(knowledge.Status.Raw.Raw) > 0 { + var oldData, newData interface{} + if err := json.Unmarshal(knowledge.Status.Raw.Raw, &oldData); err == nil { + if err := json.Unmarshal(raw.Raw, &newData); err == nil { + contentChanged = !reflect.DeepEqual(oldData, newData) + } + } + } + knowledge.Status.Raw = raw knowledge.Status.LastExtracted = metav1.NewTime(time.Now()) knowledge.Status.RawLength = len(features) + + if contentChanged { + log.Info("content of knowledge has changed", "name", knowledge.Name) + knowledge.Status.LastContentChange = metav1.NewTime(time.Now()) + } patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, knowledge, patch); err != nil { log.Error(err, "failed to patch knowledge status") diff --git a/internal/knowledge/extractor/plugins/compute/flavor_groups.go b/internal/knowledge/extractor/plugins/compute/flavor_groups.go new file mode 100644 index 000000000..d5c47cf2a --- /dev/null +++ b/internal/knowledge/extractor/plugins/compute/flavor_groups.go @@ -0,0 +1,154 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package compute + +import ( + _ "embed" + "encoding/json" + "errors" + "sort" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins" + ctrl "sigs.k8s.io/controller-runtime" +) + +// FlavorInGroup represents a single flavor within a flavor group. +type FlavorInGroup struct { + Name string `json:"name"` + VCPUs uint64 `json:"vcpus"` + MemoryMB uint64 `json:"memoryMB"` + DiskGB uint64 `json:"diskGB"` + EphemeralGB uint64 `json:"ephemeralGB,omitempty"` + ExtraSpecs map[string]string `json:"extraSpecs,omitempty"` +} + +// FlavorGroupFeature represents a flavor group with all its member flavors. +// This is the feature that gets stored in the Knowledge CRD. 
+type FlavorGroupFeature struct { + // Name of the flavor group (from hw_version extra_spec) + Name string `json:"name"` + + // All flavors belonging to this group + Flavors []FlavorInGroup `json:"flavors"` + + // The largest flavor in the group (used for reservation slot sizing) + LargestFlavor FlavorInGroup `json:"largestFlavor"` + + // The smallest flavor in the group (used for CR size quantification) + SmallestFlavor FlavorInGroup `json:"smallestFlavor"` +} + +// flavorRow represents a row from the SQL query. +type flavorRow struct { + Name string `db:"name"` + VCPUs uint64 `db:"vcpus"` + MemoryMB uint64 `db:"memory_mb"` + DiskGB uint64 `db:"disk"` + EphemeralGB uint64 `db:"ephemeral"` + ExtraSpecs string `db:"extra_specs"` +} + +// FlavorGroupExtractor extracts flavor group information from the database. +type FlavorGroupExtractor struct { + // Common base for all extractors that provides standard functionality. + plugins.BaseExtractor[ + struct{}, // No options passed through yaml config + FlavorGroupFeature, // Feature model + ] +} + +//go:embed flavor_groups.sql +var flavorGroupsQuery string + +var flavorGroupLog = ctrl.Log.WithName("flavor_group_extractor") + +// Extract flavor groups from the database. 
+func (e *FlavorGroupExtractor) Extract() ([]plugins.Feature, error) { + if e.DB == nil { + return nil, errors.New("database connection is not initialized") + } + + // Query all flavors from database + var rows []flavorRow + if _, err := e.DB.Select(&rows, flavorGroupsQuery); err != nil { + flavorGroupLog.Error(err, "failed to query flavors") + return nil, err + } + + // Group flavors by flavorGroupIdentifierName + groupMap := make(map[string][]FlavorInGroup) + + for _, row := range rows { + // Parse extra_specs JSON + var extraSpecs map[string]string + if row.ExtraSpecs != "" { + if err := json.Unmarshal([]byte(row.ExtraSpecs), &extraSpecs); err != nil { + flavorGroupLog.Info("failed to parse extra_specs for flavor", "flavor", row.Name, "error", err) + continue + } + } + + hwVersion, exists := extraSpecs["quota:hw_version"] + if !exists || hwVersion == "" { + flavorGroupLog.Info("flavor missing hw_version extra_spec", "flavor", row.Name) + continue + } + + // Add flavor to its group + flavor := FlavorInGroup{ + Name: row.Name, + VCPUs: row.VCPUs, + MemoryMB: row.MemoryMB, + DiskGB: row.DiskGB, + EphemeralGB: row.EphemeralGB, + ExtraSpecs: extraSpecs, + } + groupMap[hwVersion] = append(groupMap[hwVersion], flavor) + } + + // Convert map to features + features := make([]FlavorGroupFeature, 0, len(groupMap)) + for groupName, flavors := range groupMap { + if len(flavors) == 0 { + continue + } + + // Sort flavors by size descending (largest first), tie break by name for consistent ordering + sort.Slice(flavors, func(i, j int) bool { + if flavors[i].MemoryMB != flavors[j].MemoryMB { + return flavors[i].MemoryMB > flavors[j].MemoryMB + } + if flavors[i].VCPUs != flavors[j].VCPUs { + return flavors[i].VCPUs > flavors[j].VCPUs + } + return flavors[i].Name < flavors[j].Name + }) + + largest := flavors[0] + smallest := flavors[len(flavors)-1] + + flavorGroupLog.Info("identified largest and smallest flavors", + "groupName", groupName, + "largestFlavor", largest.Name, + 
"largestMemoryMB", largest.MemoryMB, + "largestVCPUs", largest.VCPUs, + "smallestFlavor", smallest.Name, + "smallestMemoryMB", smallest.MemoryMB, + "smallestVCPUs", smallest.VCPUs) + + features = append(features, FlavorGroupFeature{ + Name: groupName, + Flavors: flavors, + LargestFlavor: largest, + SmallestFlavor: smallest, + }) + } + + // Sort features by group name for consistent ordering + sort.Slice(features, func(i, j int) bool { + return features[i].Name < features[j].Name + }) + + return e.Extracted(features) +} diff --git a/internal/knowledge/extractor/plugins/compute/flavor_groups.sql b/internal/knowledge/extractor/plugins/compute/flavor_groups.sql new file mode 100644 index 000000000..0905e0b7d --- /dev/null +++ b/internal/knowledge/extractor/plugins/compute/flavor_groups.sql @@ -0,0 +1,17 @@ +-- Copyright SAP SE +-- SPDX-License-Identifier: Apache-2.0 + +-- Query to extract flavor groups from the openstack_flavors_v2 table +-- Groups flavors by their hw_version extra_spec (or flavor name prefix as workaround) +-- Filters to only include KVM flavors (QEMU and Cloud-Hypervisor) +SELECT + name, + vcpus, + ram as memory_mb, + disk, + ephemeral, + extra_specs +FROM openstack_flavors_v2 +WHERE LOWER(extra_specs) LIKE '%"capabilities:hypervisor_type":"qemu"%' + OR LOWER(extra_specs) LIKE '%"capabilities:hypervisor_type":"ch"%' +ORDER BY name; diff --git a/internal/knowledge/extractor/plugins/compute/flavor_groups_test.go b/internal/knowledge/extractor/plugins/compute/flavor_groups_test.go new file mode 100644 index 000000000..becccadd0 --- /dev/null +++ b/internal/knowledge/extractor/plugins/compute/flavor_groups_test.go @@ -0,0 +1,273 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package compute + +import ( + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + testlibDB 
"github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" +) + +func TestFlavorGroupExtractor_Extract(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + defer dbEnv.Close() + testDB := db.DB{DbMap: dbEnv.DbMap} + + // Setup test data - create flavors table + if err := testDB.CreateTable( + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatal(err) + } + + // Insert test flavors with quota:hw_version in extra_specs + // Mix of KVM flavors (should be included) and VMware flavors (should be excluded) + flavors := []any{ + &nova.Flavor{ + ID: "1", + Name: "hana_c30_m480_v2", + VCPUs: 30, + RAM: 491520, // 480GB in MB + Disk: 100, + Ephemeral: 0, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","hw:cpu_policy":"dedicated","quota:hw_version":"v2"}`, + }, + &nova.Flavor{ + ID: "2", + Name: "hana_c60_m960_v2", + VCPUs: 60, + RAM: 983040, // 960GB in MB + Disk: 100, + Ephemeral: 0, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","hw:cpu_policy":"dedicated","quota:hw_version":"v2"}`, + }, + &nova.Flavor{ + ID: "3", + Name: "hana_c240_m3840_v2", + VCPUs: 240, + RAM: 3932160, // 3840GB in MB + Disk: 100, + Ephemeral: 0, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","hw:cpu_policy":"dedicated","hw:numa_nodes":"4","quota:hw_version":"v2"}`, + }, + &nova.Flavor{ + ID: "4", + Name: "gp_c8_m32_v2", + VCPUs: 8, + RAM: 32768, // 32GB in MB + Disk: 50, + Ephemeral: 10, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","quota:hw_version":"v2"}`, + }, + &nova.Flavor{ + ID: "5", + Name: "gp_c16_m64_v2", + VCPUs: 16, + RAM: 65536, // 64GB in MB + Disk: 50, + Ephemeral: 20, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","quota:hw_version":"v2"}`, + }, + // VMware flavor - should be excluded from results (filtered by SQL query) + &nova.Flavor{ + ID: "6", + Name: "vmwa_c32_m512_v1", + VCPUs: 32, + RAM: 524288, // 512GB in MB + Disk: 200, + Ephemeral: 0, + ExtraSpecs: `{"capabilities:hypervisor_type":"VMware vCenter 
Server","quota:hw_version":"v1"}`, + }, + // Cloud-Hypervisor flavor - should be included (case insensitive) + &nova.Flavor{ + ID: "7", + Name: "gp_c4_m16_ch", + VCPUs: 4, + RAM: 16384, // 16GB in MB + Disk: 25, + Ephemeral: 5, + ExtraSpecs: `{"capabilities:hypervisor_type":"CH","quota:hw_version":"ch"}`, + }, + // Corner case: Same memory as gp_c8_m32_v2 but MORE VCPUs (should come first) + &nova.Flavor{ + ID: "8", + Name: "gp_c12_m32_v2", + VCPUs: 12, + RAM: 32768, // 32GB in MB - same as gp_c8_m32_v2 + Disk: 50, + Ephemeral: 10, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","quota:hw_version":"v2"}`, + }, + // Corner case: Same memory AND same VCPUs as gp_c12_m32_v2 (tests name sorting) + &nova.Flavor{ + ID: "9", + Name: "gp_c12_m32_alt", + VCPUs: 12, + RAM: 32768, // 32GB in MB + Disk: 50, + Ephemeral: 10, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","quota:hw_version":"v2"}`, + }, + } + + if err := testDB.Insert(flavors...); err != nil { + t.Fatal(err) + } + + // Create and run extractor + extractor := &FlavorGroupExtractor{} + config := v1alpha1.KnowledgeSpec{} + if err := extractor.Init(&testDB, nil, config); err != nil { + t.Fatal(err) + } + + features, err := extractor.Extract() + if err != nil { + t.Fatal(err) + } + + // Verify results - should be 2 groups (v2 and ch based on hw_version) + // VMware flavor should be filtered out, Cloud-Hypervisor should be included + if len(features) != 2 { + t.Fatalf("expected 2 flavor groups, got %d", len(features)) + } + + // Convert to typed features for easier testing + var v2Group, chGroup *FlavorGroupFeature + for _, f := range features { + fg := f.(FlavorGroupFeature) + switch fg.Name { + case "v2": + v2Group = &fg + case "ch": + chGroup = &fg + } + } + + // Verify v2 group (contains both HANA and general purpose flavors) + if v2Group == nil { + t.Fatal("v2 group not found") + } + if len(v2Group.Flavors) != 7 { + t.Errorf("expected 7 flavors in v2 group (3 HANA + 4 general purpose), got %d", 
len(v2Group.Flavors)) + } + // Largest flavor in v2 group should be hana_c240_m3840_v2 (highest memory) + if v2Group.LargestFlavor.Name != "hana_c240_m3840_v2" { + t.Errorf("expected largest flavor to be hana_c240_m3840_v2, got %s", v2Group.LargestFlavor.Name) + } + if v2Group.LargestFlavor.VCPUs != 240 { + t.Errorf("expected largest flavor VCPUs to be 240, got %d", v2Group.LargestFlavor.VCPUs) + } + if v2Group.LargestFlavor.MemoryMB != 3932160 { + t.Errorf("expected largest flavor memory to be 3932160 MB, got %d", v2Group.LargestFlavor.MemoryMB) + } + if v2Group.LargestFlavor.DiskGB != 100 { + t.Errorf("expected largest flavor disk to be 100 GB, got %d", v2Group.LargestFlavor.DiskGB) + } + if v2Group.LargestFlavor.ExtraSpecs == nil { + t.Error("expected largest flavor to have extra_specs") + } + if v2Group.LargestFlavor.ExtraSpecs["hw:numa_nodes"] != "4" { + t.Errorf("expected largest flavor to have hw:numa_nodes=4, got %s", v2Group.LargestFlavor.ExtraSpecs["hw:numa_nodes"]) + } + if v2Group.LargestFlavor.ExtraSpecs["quota:hw_version"] != "v2" { + t.Errorf("expected largest flavor to have quota:hw_version=v2, got %s", v2Group.LargestFlavor.ExtraSpecs["quota:hw_version"]) + } + + // Verify smallest flavor in v2 group should be gp_c4_m16_ch is NOT in v2, so it's gp_c8_m32_v2 (lowest memory among v2 flavors) + if v2Group.SmallestFlavor.Name != "gp_c8_m32_v2" { + t.Errorf("expected smallest flavor to be gp_c8_m32_v2, got %s", v2Group.SmallestFlavor.Name) + } + if v2Group.SmallestFlavor.MemoryMB != 32768 { + t.Errorf("expected smallest flavor memory to be 32768 MB, got %d", v2Group.SmallestFlavor.MemoryMB) + } + if v2Group.SmallestFlavor.VCPUs != 8 { + t.Errorf("expected smallest flavor VCPUs to be 8, got %d", v2Group.SmallestFlavor.VCPUs) + } + + // Verify Cloud-Hypervisor group + if chGroup == nil { + t.Fatal("ch group not found") + } + if len(chGroup.Flavors) != 1 { + t.Errorf("expected 1 flavor in ch group, got %d", len(chGroup.Flavors)) + } + if 
chGroup.LargestFlavor.Name != "gp_c4_m16_ch" { + t.Errorf("expected largest flavor to be gp_c4_m16_ch, got %s", chGroup.LargestFlavor.Name) + } + if chGroup.LargestFlavor.ExtraSpecs["quota:hw_version"] != "ch" { + t.Errorf("expected ch flavor to have quota:hw_version=ch, got %s", chGroup.LargestFlavor.ExtraSpecs["quota:hw_version"]) + } + + // Verify smallest flavor in ch group (only has 1 flavor, so same as largest) + if chGroup.SmallestFlavor.Name != "gp_c4_m16_ch" { + t.Errorf("expected smallest flavor to be gp_c4_m16_ch, got %s", chGroup.SmallestFlavor.Name) + } + + // Generic check: Verify all flavor groups have correctly ordered flavors + // Flavors must be sorted descending by memory (largest first), with VCPUs as tiebreaker + for _, f := range features { + fg := f.(FlavorGroupFeature) + + // Check that flavors are sorted in descending order + for i := range len(fg.Flavors) - 1 { + current := fg.Flavors[i] + next := fg.Flavors[i+1] + + // Primary sort: memory descending + if current.MemoryMB < next.MemoryMB { + t.Errorf("Flavors in group %s not sorted by memory: %s (%d MB) should come after %s (%d MB)", + fg.Name, current.Name, current.MemoryMB, next.Name, next.MemoryMB) + } + + // Secondary sort: if memory equal, VCPUs descending + if current.MemoryMB == next.MemoryMB && current.VCPUs < next.VCPUs { + t.Errorf("Flavors in group %s with equal memory not sorted by VCPUs: %s (%d VCPUs) should come after %s (%d VCPUs)", + fg.Name, current.Name, current.VCPUs, next.Name, next.VCPUs) + } + } + + // Verify LargestFlavor matches the first flavor in sorted list + if len(fg.Flavors) > 0 && fg.LargestFlavor.Name != fg.Flavors[0].Name { + t.Errorf("Group %s: LargestFlavor (%s) doesn't match first flavor in sorted list (%s)", + fg.Name, fg.LargestFlavor.Name, fg.Flavors[0].Name) + } + + // Verify SmallestFlavor matches the last flavor in sorted list + if len(fg.Flavors) > 0 && fg.SmallestFlavor.Name != fg.Flavors[len(fg.Flavors)-1].Name { + t.Errorf("Group %s: 
SmallestFlavor (%s) doesn't match last flavor in sorted list (%s)", + fg.Name, fg.SmallestFlavor.Name, fg.Flavors[len(fg.Flavors)-1].Name) + } + } + + // Verify that VMware flavor was filtered out + for _, f := range features { + fg := f.(FlavorGroupFeature) + for _, flavor := range fg.Flavors { + if flavor.Name == "vmwa_c32_m512_v1" { + t.Errorf("VMware flavor should have been filtered out but was found in group %s", fg.Name) + } + } + } + + // Verify that Cloud-Hypervisor flavor was included in ch group + foundCH := false + for _, flavor := range chGroup.Flavors { + if flavor.Name == "gp_c4_m16_ch" { + foundCH = true + if flavor.ExtraSpecs["capabilities:hypervisor_type"] != "CH" { + t.Errorf("expected CH hypervisor_type, got %s", flavor.ExtraSpecs["capabilities:hypervisor_type"]) + } + if flavor.ExtraSpecs["quota:hw_version"] != "ch" { + t.Errorf("expected quota:hw_version=ch, got %s", flavor.ExtraSpecs["quota:hw_version"]) + } + } + } + if !foundCH { + t.Error("Cloud-Hypervisor flavor should have been included but was not found") + } +} diff --git a/internal/knowledge/extractor/supported_extractors.go b/internal/knowledge/extractor/supported_extractors.go index 684697928..6f1cb2fd2 100644 --- a/internal/knowledge/extractor/supported_extractors.go +++ b/internal/knowledge/extractor/supported_extractors.go @@ -23,6 +23,7 @@ var supportedExtractors = map[string]plugins.FeatureExtractor{ "host_az_extractor": &compute.HostAZExtractor{}, "host_pinned_projects_extractor": &compute.HostPinnedProjectsExtractor{}, "sap_host_details_extractor": &compute.HostDetailsExtractor{}, + "flavor_groups": &compute.FlavorGroupExtractor{}, "netapp_storage_pool_cpu_usage_extractor": &storage.StoragePoolCPUUsageExtractor{}, } diff --git a/internal/scheduling/reservations/commitments/api.go b/internal/scheduling/reservations/commitments/api.go new file mode 100644 index 000000000..ba83e2ab8 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api.go @@ -0,0 +1,33 @@ +// 
Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "net/http" + "sync" + + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// HTTPAPI implements Limes LIQUID commitment validation endpoints. +type HTTPAPI struct { + client client.Client + // Mutex to serialize change-commitments requests + changeMutex sync.Mutex +} + +func NewAPI(client client.Client) *HTTPAPI { + return &HTTPAPI{ + client: client, + } +} + +func (api *HTTPAPI) Init(mux *http.ServeMux) { + mux.HandleFunc("/v1/change-commitments", api.HandleChangeCommitments) + // mux.HandleFunc("/v1/report-capacity", api.HandleReportCapacity) + mux.HandleFunc("/v1/info", api.HandleInfo) +} + +var commitmentApiLog = ctrl.Log.WithName("commitment_api") diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go new file mode 100644 index 000000000..3134b3b9d --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -0,0 +1,353 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/go-logr/logr" + . 
"github.com/majewsky/gg/option" + "github.com/sapcc/go-api-declarations/liquid" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + // watchTimeout is how long to wait for all reservations to become ready + watchTimeout = 20 * time.Second + + // pollInterval is how frequently to poll reservation status + pollInterval = 1 * time.Second +) + +// implements POST /v1/change-commitments from Limes LIQUID API: +// See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid +// +// This endpoint handles commitment changes by creating/updating/deleting Reservation CRDs based on the commitment lifecycle. +// A request may contain multiple commitment changes which are processed in a single transaction. If any change fails, all changes are rolled back. 
+func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Request) { + // Serialize all change-commitments requests + api.changeMutex.Lock() + defer api.changeMutex.Unlock() + + // Extract or generate request ID for tracing + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = fmt.Sprintf("req-%d", time.Now().UnixNano()) + } + log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/change-commitments") + + // Only accept POST method + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + // Parse request body + var req liquid.CommitmentChangeRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + log.Error(err, "invalid request body") + http.Error(w, "Invalid request body: "+err.Error(), http.StatusBadRequest) + return + } + + log.Info("received change commitments request", "affectedProjects", len(req.ByProject), "dryRun", req.DryRun, "availabilityZone", req.AZ) + + // Initialize response + resp := liquid.CommitmentChangeResponse{} + + // Check for dry run -> early reject, not supported yet + if req.DryRun { + resp.RejectionReason = "Dry run not supported yet" + log.Info("rejecting dry run request") + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(resp); err != nil { + return + } + return + } + + // Process commitment changes + // For now, we'll implement a simplified path that checks capacity for immediate start CRs + if err := api.processCommitmentChanges(w, log, req, &resp); err != nil { + // Error already written to response by processCommitmentChanges + return + } + + // Return response + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(resp); err != nil { + return + } +} + +func (api *HTTPAPI) processCommitmentChanges(w http.ResponseWriter, log logr.Logger, req 
liquid.CommitmentChangeRequest, resp *liquid.CommitmentChangeResponse) error { + ctx := context.Background() + manager := NewReservationManager(api.client) + requireRollback := false + log.Info("processing commitment change request", "availabilityZone", req.AZ, "dryRun", req.DryRun, "affectedProjects", len(req.ByProject)) + + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: api.client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + log.Info("failed to get flavor groups from knowledge extractor", "error", err) + resp.RejectionReason = "caches not ready" + retryTime := time.Now().Add(1 * time.Minute) + resp.RetryAt = Some(retryTime) + return nil + } + + // Validate InfoVersion from request matches current version (= last content change of flavor group knowledge) + var currentVersion int64 = -1 + if knowledgeCRD, err := knowledge.Get(ctx); err == nil && knowledgeCRD != nil && !knowledgeCRD.Status.LastContentChange.IsZero() { + currentVersion = knowledgeCRD.Status.LastContentChange.Unix() + } + + if req.InfoVersion != currentVersion { + log.Info("version mismatch in commitment change request", + "requestVersion", req.InfoVersion, + "currentVersion", currentVersion) + http.Error(w, fmt.Sprintf("Version mismatch: request version %d, current version %d. 
Please refresh and retry.", + req.InfoVersion, currentVersion), http.StatusConflict) + return errors.New("version mismatch") + } + + statesBefore := make(map[string]*CommitmentState) // map of commitmentID to existing state for rollback + var reservationsToWatch []v1alpha1.Reservation + + if req.DryRun { + resp.RejectionReason = "Dry run not supported yet" + return nil + } + +ProcessLoop: + for projectID, projectChanges := range req.ByProject { + for resourceName, resourceChanges := range projectChanges.ByResource { + // Validate resource name pattern (instances_group_*) + flavorGroupName, err := getFlavorGroupNameFromResource(string(resourceName)) + if err != nil { + resp.RejectionReason = fmt.Sprintf("project with unknown resource name %s: %v", projectID, err) + requireRollback = true + break ProcessLoop + } + + // Verify flavor group exists in Knowledge CRDs + flavorGroup, flavorGroupExists := flavorGroups[flavorGroupName] + if !flavorGroupExists { + resp.RejectionReason = "flavor group not found: " + flavorGroupName + requireRollback = true + break ProcessLoop + } + + for _, commitment := range resourceChanges.Commitments { + // Additional per-commitment validation if needed + log.Info("processing commitment change", "commitmentUUID", commitment.UUID, "projectID", projectID, "resourceName", resourceName, "oldStatus", commitment.OldStatus.UnwrapOr("none"), "newStatus", commitment.NewStatus.UnwrapOr("none")) + + // TODO add domain + + // List all committed resource reservations, then filter by name prefix + var all_reservations v1alpha1.ReservationList + if err := api.client.List(ctx, &all_reservations, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + resp.RejectionReason = fmt.Sprintf("failed to list reservations for commitment %s: %v", commitment.UUID, err) + requireRollback = true + break ProcessLoop + } + + // Filter by name prefix to find reservations for this commitment + namePrefix 
:= fmt.Sprintf("commitment-%s-", string(commitment.UUID)) + var existing_reservations v1alpha1.ReservationList + for _, res := range all_reservations.Items { + if len(res.Name) >= len(namePrefix) && res.Name[:len(namePrefix)] == namePrefix { + existing_reservations.Items = append(existing_reservations.Items, res) + } + } + + var stateBefore *CommitmentState + if len(existing_reservations.Items) == 0 { + stateBefore = &CommitmentState{ + CommitmentUUID: string(commitment.UUID), + ProjectID: string(projectID), + FlavorGroupName: flavorGroupName, + TotalMemoryBytes: 0, + } + } else { + stateBefore, err = FromReservations(existing_reservations.Items) + if err != nil { + resp.RejectionReason = fmt.Sprintf("failed to get existing state for commitment %s: %v", commitment.UUID, err) + requireRollback = true + break ProcessLoop + } + } + statesBefore[string(commitment.UUID)] = stateBefore + + // get desired state + stateDesired, err := FromChangeCommitmentTargetState(commitment, string(projectID), flavorGroupName, flavorGroup, string(req.AZ)) + if err != nil { + resp.RejectionReason = fmt.Sprintf("failed to get desired state for commitment %s: %v", commitment.UUID, err) + requireRollback = true + break ProcessLoop + } + + log.Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldState", stateBefore, "desiredState", stateDesired) + + touchedReservations, deletedReservations, err := manager.ApplyCommitmentState(ctx, log, stateDesired, flavorGroups, "changeCommitmentsApi") + if err != nil { + resp.RejectionReason = fmt.Sprintf("failed to apply commitment state for commitment %s: %v", commitment.UUID, err) + requireRollback = true + break ProcessLoop + } + log.Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(touchedReservations), "deletedReservations", len(deletedReservations)) + reservationsToWatch = append(reservationsToWatch, touchedReservations...) 
+ } + } + } + + // TODO make the rollback defer safe + if !requireRollback { + log.Info("applied commitment changes, now watching for reservation readiness", "reservationsToWatch", len(reservationsToWatch)) + + time_start := time.Now() + + if err := watchReservationsUntilReady(ctx, log, api.client, reservationsToWatch, watchTimeout); err != nil { + log.Info("reservations failed to become ready, initiating rollback", + "reason", err.Error()) + resp.RejectionReason = fmt.Sprintf("Not all reservations can be fulfilled: %v", err) + requireRollback = true + } + + log.Info("finished watching reservation", "totalSchedulingTimeSeconds", time.Since(time_start).Seconds()) + } + + if requireRollback { + log.Info("rollback of commitment changes") + for commitmentUUID, state := range statesBefore { + // Rollback to statesBefore for this commitment + log.Info("applying rollback for commitment", "commitmentUUID", commitmentUUID, "stateBefore", state) + _, _, err := manager.ApplyCommitmentState(ctx, log, state, flavorGroups, "changeCommitmentsApiRollback") + if err != nil { + log.Info("failed to apply rollback state for commitment", "commitmentUUID", commitmentUUID, "error", err) + // continue with best effort rollback for other projects + } + } + + log.Info("finished applying rollbacks for commitment changes", "reasonOfRollback", resp.RejectionReason) + + // TODO improve human-readable reasoning based on actual failure, i.e. polish resp.RejectionReason + return nil + } + + log.Info("commitment changes accepted") + if resp.RejectionReason != "" { + log.Info("unexpected non-empty rejection reason without rollback", "reason", resp.RejectionReason) + resp.RejectionReason = "" + } + return nil +} + +// watchReservationsUntilReady polls until all reservations reach Ready=True or timeout. 
+func watchReservationsUntilReady( + ctx context.Context, + log logr.Logger, + k8sClient client.Client, + reservations []v1alpha1.Reservation, + timeout time.Duration, +) error { + + if len(reservations) == 0 { + return nil + } + + deadline := time.Now().Add(timeout) + + for { + if time.Now().After(deadline) { + return fmt.Errorf("timeout after %v waiting for reservations to become ready", timeout) + } + + allReady := true + var notReadyReasons []string + + for _, res := range reservations { + // Fetch current state + var current v1alpha1.Reservation + nn := types.NamespacedName{ + Name: res.Name, + Namespace: res.Namespace, + } + + if err := k8sClient.Get(ctx, nn, ¤t); err != nil { + if apierrors.IsNotFound(err) { + // Reservation is still in process of being created + allReady = false + continue + } + return fmt.Errorf("failed to get reservation %s: %w", res.Name, err) + } + + // Check Ready condition + readyCond := meta.FindStatusCondition( + current.Status.Conditions, + v1alpha1.ReservationConditionReady, + ) + + if readyCond == nil { + // Condition not set yet, keep waiting + allReady = false + notReadyReasons = append(notReadyReasons, + res.Name+": condition not set") + continue + } + + switch readyCond.Status { + case metav1.ConditionTrue: + // This reservation is ready + continue + case metav1.ConditionFalse: + // Explicit failure - stop immediately + return fmt.Errorf("reservation %s failed: %s (reason: %s)", + res.Name, readyCond.Message, readyCond.Reason) + case metav1.ConditionUnknown: + // Still processing + allReady = false + notReadyReasons = append(notReadyReasons, + fmt.Sprintf("%s: %s", res.Name, readyCond.Message)) + } + } + + if allReady { + log.Info("all reservations are ready", + "count", len(reservations)) + return nil + } + + // Log progress + log.Info("waiting for reservations to become ready", + "notReady", len(notReadyReasons), + "total", len(reservations), + "timeRemaining", time.Until(deadline).Round(time.Second)) + + // Wait before 
next poll + select { + case <-time.After(pollInterval): + // Continue polling + case <-ctx.Done(): + return fmt.Errorf("context cancelled while waiting for reservations: %w", ctx.Err()) + } + } +} diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_test.go new file mode 100644 index 000000000..c4703c4a1 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_change_commitments_test.go @@ -0,0 +1,246 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "bytes" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/sapcc/go-api-declarations/liquid" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// TODO refactor with proper integration tests + +func TestHandleChangeCommitments_VersionMismatch(t *testing.T) { + // Create a fake Kubernetes client with a Knowledge CRD + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + // Create a Knowledge CRD with a specific version timestamp and flavor groups + knowledgeTimestamp := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC) + flavorGroup := createTestFlavorGroup() + + // Box the features using the Knowledge API + rawExt, err := v1alpha1.BoxFeatureList([]compute.FlavorGroupFeature{flavorGroup}) + if err != nil { + t.Fatalf("failed to box feature list: %v", err) + } + + knowledge := &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flavor-groups", + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor-groups", + }, + }, + 
Status: v1alpha1.KnowledgeStatus{ + LastContentChange: metav1.Time{Time: knowledgeTimestamp}, + Raw: rawExt, + RawLength: 1, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + Reason: "Ready", + }, + }, + }, + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge). + WithStatusSubresource(knowledge). + Build() + + api := &HTTPAPI{ + client: k8sClient, + } + + // Create request JSON with mismatched version + requestJSON := `{ + "az": "az-a", + "dryRun": false, + "infoVersion": 12345, + "byProject": {} + }` + + req := httptest.NewRequest(http.MethodPost, "/v1/change-commitments", bytes.NewReader([]byte(requestJSON))) + req.Header.Set("Content-Type", "application/json") + + w := httptest.NewRecorder() + + // Call the handler + api.HandleChangeCommitments(w, req) + + // Check response + resp := w.Result() + defer resp.Body.Close() + + // Verify HTTP 409 Conflict status + if resp.StatusCode != http.StatusConflict { + t.Errorf("expected status code %d (Conflict), got %d", http.StatusConflict, resp.StatusCode) + } + + // Verify Content-Type is text/plain (set by http.Error) + contentType := resp.Header.Get("Content-Type") + if contentType != "text/plain; charset=utf-8" { + t.Errorf("expected Content-Type 'text/plain; charset=utf-8', got %q", contentType) + } + + // Verify error message contains version information + var responseBody bytes.Buffer + if _, err = responseBody.ReadFrom(resp.Body); err != nil { + t.Fatalf("failed to read response body: %v", err) + } + + bodyStr := responseBody.String() + if !bytes.Contains([]byte(bodyStr), []byte("Version mismatch")) { + t.Errorf("expected response to contain 'Version mismatch', got: %s", bodyStr) + } + if !bytes.Contains([]byte(bodyStr), []byte("12345")) { + t.Errorf("expected response to contain request version '12345', got: %s", bodyStr) + } +} +func TestHandleChangeCommitments_DryRun(t *testing.T) { + scheme := 
runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + Build() + + api := &HTTPAPI{ + client: k8sClient, + } + + // Create dry run request JSON + requestJSON := `{ + "az": "az-a", + "dryRun": true, + "infoVersion": 12345, + "byProject": {} + }` + + req := httptest.NewRequest(http.MethodPost, "/v1/change-commitments", bytes.NewReader([]byte(requestJSON))) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + + api.HandleChangeCommitments(w, req) + + resp := w.Result() + defer resp.Body.Close() + + // Dry run should return 200 OK with rejection reason + if resp.StatusCode != http.StatusOK { + t.Errorf("expected status code %d (OK), got %d", http.StatusOK, resp.StatusCode) + } + + // Verify response is JSON + contentType := resp.Header.Get("Content-Type") + if contentType != "application/json" { + t.Errorf("expected Content-Type 'application/json', got %q", contentType) + } + + // Parse response + var response liquid.CommitmentChangeResponse + if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + + if response.RejectionReason != "Dry run not supported yet" { + t.Errorf("expected rejection reason 'Dry run not supported yet', got %q", response.RejectionReason) + } +} + +func TestProcessCommitmentChanges_KnowledgeNotReady(t *testing.T) { + // Test when flavor groups knowledge is not available + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + // No Knowledge CRD created - simulates knowledge not ready + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + api := &HTTPAPI{ + client: k8sClient, + } + + requestJSON := `{ + "az": "az-a", + "dryRun": false, + "infoVersion": 12345, + "byProject": {} + }` + + req := httptest.NewRequest(http.MethodPost, "/v1/change-commitments", bytes.NewReader([]byte(requestJSON))) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + + api.HandleChangeCommitments(w, req) + + resp := w.Result() + defer resp.Body.Close() + + // Should return 200 OK with rejection reason + if resp.StatusCode != http.StatusOK { + t.Errorf("expected status code %d (OK), got %d", http.StatusOK, resp.StatusCode) + } + + var response liquid.CommitmentChangeResponse + if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + + if response.RejectionReason != "caches not ready" { + t.Errorf("expected rejection reason 'caches not ready', got %q", response.RejectionReason) + } + + if response.RetryAt.IsNone() { + t.Error("expected RetryAt to be set") + } +} + +// Helper function to create a minimal flavor group for testing +func createTestFlavorGroup() compute.FlavorGroupFeature { + return compute.FlavorGroupFeature{ + Name: "test_group", + Flavors: []compute.FlavorInGroup{ + { + Name: "test.small", + MemoryMB: 8192, + VCPUs: 2, + DiskGB: 40, + ExtraSpecs: map[string]string{ + "quota:separate": "true", + }, + }, + }, + SmallestFlavor: compute.FlavorInGroup{ + Name: "test.small", + MemoryMB: 8192, + VCPUs: 2, + DiskGB: 40, + }, + } +} diff --git a/internal/scheduling/reservations/commitments/api_info.go b/internal/scheduling/reservations/commitments/api_info.go new file mode 100644 index 000000000..db02dd708 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_info.go @@ -0,0 +1,117 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "strings" + "time" + + 
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/go-logr/logr" + liquid "github.com/sapcc/go-api-declarations/liquid" +) + +// handles GET /v1/info requests from Limes: +// See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid +func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { + // Extract or generate request ID for tracing + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = fmt.Sprintf("req-%d", time.Now().UnixNano()) + } + log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/info") + + // Only accept GET method + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + log.V(1).Info("processing info request") + + // Build info response + info, err := api.buildServiceInfo(r.Context(), log) + if err != nil { + // Use Info level for expected conditions like knowledge not being ready yet + log.Info("service info not available yet", "error", err.Error()) + http.Error(w, "Service temporarily unavailable: "+err.Error(), + http.StatusServiceUnavailable) + return + } + + // Return response + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(info); err != nil { + log.Error(err, "failed to encode service info") + return + } +} + +// buildServiceInfo constructs the ServiceInfo response with metadata for all flavor groups. 
+func (api *HTTPAPI) buildServiceInfo(ctx context.Context, log logr.Logger) (liquid.ServiceInfo, error) { + // Get all flavor groups from Knowledge CRDs + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: api.client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + // Return -1 as version when knowledge is not ready + return liquid.ServiceInfo{ + Version: -1, + Resources: make(map[liquid.ResourceName]liquid.ResourceInfo), + }, err + } + + // Build resources map + resources := make(map[liquid.ResourceName]liquid.ResourceInfo) + for groupName, groupData := range flavorGroups { + resourceName := liquid.ResourceName("ram_" + groupName) + + flavorNames := make([]string, 0, len(groupData.Flavors)) + for _, flavor := range groupData.Flavors { + flavorNames = append(flavorNames, flavor.Name) + } + displayName := fmt.Sprintf( + "multiples of %d MiB (usable by: %s)", + groupData.SmallestFlavor.MemoryMB, + strings.Join(flavorNames, ", "), + ) + + resources[resourceName] = liquid.ResourceInfo{ + DisplayName: displayName, + Unit: liquid.UnitNone, // Countable: multiples of smallest flavor instances + Topology: liquid.AZAwareTopology, // Commitments are per-AZ + NeedsResourceDemand: false, // Capacity planning out of scope for now + HasCapacity: true, // We report capacity via /v1/report-capacity + HasQuota: false, // No quota enforcement as of now + HandlesCommitments: true, // We handle commitment changes via /v1/change-commitments + } + + log.V(1).Info("registered flavor group resource", + "resourceName", resourceName, + "flavorGroup", groupName, + "displayName", displayName, + "smallestFlavor", groupData.SmallestFlavor.Name, + "smallestRamMB", groupData.SmallestFlavor.MemoryMB) + } + + // Get last content changed from flavor group knowledge and treat it as version + var version int64 = -1 + if knowledgeCRD, err := knowledge.Get(ctx); err == nil && knowledgeCRD != nil && !knowledgeCRD.Status.LastContentChange.IsZero() { + version = 
knowledgeCRD.Status.LastContentChange.Unix() + } + + log.Info("built service info", + "resourceCount", len(resources), + "version", version) + + return liquid.ServiceInfo{ + Version: version, + Resources: resources, + }, nil +} diff --git a/internal/scheduling/reservations/commitments/api_info_test.go b/internal/scheduling/reservations/commitments/api_info_test.go new file mode 100644 index 000000000..71c560c19 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_info_test.go @@ -0,0 +1,78 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "net/http" + "net/http/httptest" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestHandleInfo_KnowledgeNotReady(t *testing.T) { + // Test when flavor groups knowledge is not available + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + // No Knowledge CRD created - simulates knowledge not ready + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + api := &HTTPAPI{ + client: k8sClient, + } + + req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody) + w := httptest.NewRecorder() + + api.HandleInfo(w, req) + + resp := w.Result() + defer resp.Body.Close() + + // Should return 503 Service Unavailable when knowledge is not ready + if resp.StatusCode != http.StatusServiceUnavailable { + t.Errorf("expected status code %d (Service Unavailable), got %d", http.StatusServiceUnavailable, resp.StatusCode) + } + + // Verify Content-Type is text/plain (set by http.Error) + contentType := resp.Header.Get("Content-Type") + if contentType != "text/plain; charset=utf-8" { + t.Errorf("expected Content-Type 'text/plain; charset=utf-8', got %q", contentType) + } +} + +func TestHandleInfo_MethodNotAllowed(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + Build() + + api := &HTTPAPI{ + client: k8sClient, + } + + // Use POST instead of GET + req := httptest.NewRequest(http.MethodPost, "/v1/info", http.NoBody) + w := httptest.NewRecorder() + + api.HandleInfo(w, req) + + resp := w.Result() + defer resp.Body.Close() + + if resp.StatusCode != http.StatusMethodNotAllowed { + t.Errorf("expected status code %d (Method Not Allowed), got %d", http.StatusMethodNotAllowed, resp.StatusCode) + } +} diff --git a/internal/scheduling/reservations/commitments/api_report_capacity.go b/internal/scheduling/reservations/commitments/api_report_capacity.go new file mode 100644 index 000000000..0ec1f5e7d --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_report_capacity.go @@ -0,0 +1,61 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "encoding/json" + "fmt" + "net/http" + "time" + + "github.com/sapcc/go-api-declarations/liquid" +) + +// handles POST /v1/report-capacity requests from Limes: +// See: 
https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid +// Reports available capacity across all flavor group resources. Note: the unit is specified in the Info API response as a multiple of the smallest memory resource unit within the flavor group. +func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) { + // Extract or generate request ID for tracing + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = fmt.Sprintf("req-%d", time.Now().UnixNano()) + } + log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/report-capacity") + + // Only accept POST method + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + log.V(1).Info("processing report capacity request") + + // Parse request body (may be empty or contain ServiceCapacityRequest) + var req liquid.ServiceCapacityRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + // Empty body is acceptable for capacity reports + req = liquid.ServiceCapacityRequest{} + } + + // Calculate capacity + calculator := NewCapacityCalculator(api.client) + report, err := calculator.CalculateCapacity(r.Context()) + if err != nil { + log.Error(err, "failed to calculate capacity") + http.Error(w, "Failed to calculate capacity: "+err.Error(), + http.StatusInternalServerError) + return + } + + log.Info("calculated capacity report", "resourceCount", len(report.Resources)) + + // Return response + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(report); err != nil { + log.Error(err, "failed to encode capacity report") + return + } +} diff --git a/internal/scheduling/reservations/commitments/api_report_capacity_test.go b/internal/scheduling/reservations/commitments/api_report_capacity_test.go new file mode 100644 index 
000000000..76140e218 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_report_capacity_test.go @@ -0,0 +1,285 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/sapcc/go-api-declarations/liquid" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" +) + +func TestHandleReportCapacity(t *testing.T) { + // Setup fake client + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create empty flavor groups knowledge so capacity calculation doesn't fail + emptyKnowledge := createEmptyFlavorGroupKnowledge() + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(emptyKnowledge). + Build() + + api := NewAPI(fakeClient) + + tests := []struct { + name string + method string + body interface{} + expectedStatus int + checkResponse func(*testing.T, *liquid.ServiceCapacityReport) + }{ + { + name: "POST request succeeds", + method: http.MethodPost, + body: liquid.ServiceCapacityRequest{}, + expectedStatus: http.StatusOK, + checkResponse: func(t *testing.T, resp *liquid.ServiceCapacityReport) { + // Resources may be nil or empty for empty capacity + if len(resp.Resources) != 0 { + t.Errorf("Expected empty or nil Resources, got %d resources", len(resp.Resources)) + } + }, + }, + { + name: "POST with empty body succeeds", + method: http.MethodPost, + body: nil, + expectedStatus: http.StatusOK, + checkResponse: func(t *testing.T, resp *liquid.ServiceCapacityReport) { + // Resources may be nil or empty for empty capacity + if len(resp.Resources) != 0 { + t.Errorf("Expected empty or nil Resources, got %d resources", len(resp.Resources)) + } + }, + }, + { + name: "GET request fails", + method: 
http.MethodGet, + body: nil, + expectedStatus: http.StatusMethodNotAllowed, + checkResponse: nil, + }, + { + name: "PUT request fails", + method: http.MethodPut, + body: nil, + expectedStatus: http.StatusMethodNotAllowed, + checkResponse: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create request + var req *http.Request + if tt.body != nil { + bodyBytes, err := json.Marshal(tt.body) + if err != nil { + t.Fatal(err) + } + req = httptest.NewRequest(tt.method, "/v1/report-capacity", bytes.NewReader(bodyBytes)) + } else { + req = httptest.NewRequest(tt.method, "/v1/report-capacity", http.NoBody) + } + req = req.WithContext(context.Background()) + + // Create response recorder + rr := httptest.NewRecorder() + + // Call handler + api.HandleReportCapacity(rr, req) + + // Check status code + if rr.Code != tt.expectedStatus { + t.Errorf("Expected status %d, got %d", tt.expectedStatus, rr.Code) + } + + // Check response if applicable + if tt.checkResponse != nil && rr.Code == http.StatusOK { + var resp liquid.ServiceCapacityReport + if err := json.NewDecoder(rr.Body).Decode(&resp); err != nil { + t.Fatalf("Failed to decode response: %v", err) + } + tt.checkResponse(t, &resp) + } + }) + } +} + +func TestCapacityCalculator(t *testing.T) { + // Setup fake client with Knowledge CRD + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + t.Run("CalculateCapacity returns error when no flavor groups knowledge exists", func(t *testing.T) { + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + calculator := NewCapacityCalculator(fakeClient) + _, err := calculator.CalculateCapacity(context.Background()) + if err == nil { + t.Fatal("Expected error when flavor groups knowledge doesn't exist, got nil") + } + if !strings.Contains(err.Error(), "not found") { + t.Errorf("Expected 'not found' error, got: %v", err) + } + }) + + t.Run("CalculateCapacity returns empty report when flavor groups knowledge exists but is empty", func(t *testing.T) { + // Create empty flavor groups knowledge + emptyKnowledge := createEmptyFlavorGroupKnowledge() + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(emptyKnowledge). + Build() + + calculator := NewCapacityCalculator(fakeClient) + report, err := calculator.CalculateCapacity(context.Background()) + if err != nil { + t.Fatalf("Expected no error, got: %v", err) + } + + if report.Resources == nil { + t.Error("Expected Resources map to be initialized") + } + + if len(report.Resources) != 0 { + t.Errorf("Expected 0 resources, got %d", len(report.Resources)) + } + }) + + t.Run("CalculateCapacity returns empty perAZ when no HostDetails exist", func(t *testing.T) { + // Create a flavor group knowledge without host details + flavorGroupKnowledge := createTestFlavorGroupKnowledge(t, "test-group") + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(flavorGroupKnowledge). 
+ Build() + + calculator := NewCapacityCalculator(fakeClient) + report, err := calculator.CalculateCapacity(context.Background()) + if err != nil { + t.Fatalf("Expected no error, got: %v", err) + } + + if len(report.Resources) != 1 { + t.Fatalf("Expected 1 resource, got %d", len(report.Resources)) + } + + resource := report.Resources[liquid.ResourceName("ram_test-group")] + if resource == nil { + t.Fatal("Expected ram_test-group resource to exist") + } + + // Should have empty perAZ map when no host details + if len(resource.PerAZ) != 0 { + t.Errorf("Expected 0 AZs, got %d", len(resource.PerAZ)) + } + }) +} + +// createEmptyFlavorGroupKnowledge creates an empty flavor groups Knowledge CRD +func createEmptyFlavorGroupKnowledge() *v1alpha1.Knowledge { + // Box empty array properly + emptyFeatures := []map[string]interface{}{} + raw, err := v1alpha1.BoxFeatureList(emptyFeatures) + if err != nil { + panic(err) // Should never happen for empty slice + } + + return &v1alpha1.Knowledge{ + ObjectMeta: v1.ObjectMeta{ + Name: "flavor-groups", + // No namespace - Knowledge is cluster-scoped + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor_groups", + }, + }, + Status: v1alpha1.KnowledgeStatus{ + Conditions: []v1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: "True", + }, + }, + Raw: raw, + }, + } +} + +// createTestFlavorGroupKnowledge creates a test Knowledge CRD with flavor group data +func createTestFlavorGroupKnowledge(t *testing.T, groupName string) *v1alpha1.Knowledge { + t.Helper() + + features := []map[string]interface{}{ + { + "name": groupName, + "flavors": []map[string]interface{}{ + { + "name": "test_c8_m32", + "vcpus": 8, + "memoryMB": 32768, + "diskGB": 50, + }, + }, + "largestFlavor": map[string]interface{}{ + "name": "test_c8_m32", + "vcpus": 8, + "memoryMB": 32768, + "diskGB": 50, + }, + }, + } + + // Use BoxFeatureList to properly format 
the features + raw, err := v1alpha1.BoxFeatureList(features) + if err != nil { + t.Fatal(err) + } + + return &v1alpha1.Knowledge{ + ObjectMeta: v1.ObjectMeta{ + Name: "flavor-groups", + // No namespace - Knowledge is cluster-scoped + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor_groups", + }, + }, + Status: v1alpha1.KnowledgeStatus{ + Conditions: []v1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: "True", + }, + }, + Raw: raw, + }, + } +} diff --git a/internal/scheduling/reservations/commitments/capacity.go b/internal/scheduling/reservations/commitments/capacity.go new file mode 100644 index 000000000..04ad177e1 --- /dev/null +++ b/internal/scheduling/reservations/commitments/capacity.go @@ -0,0 +1,124 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "fmt" + "sort" + + "github.com/sapcc/go-api-declarations/liquid" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" +) + +// CapacityCalculator computes capacity reports for Limes LIQUID API. +type CapacityCalculator struct { + client client.Client +} + +func NewCapacityCalculator(client client.Client) *CapacityCalculator { + return &CapacityCalculator{client: client} +} + +// CalculateCapacity computes per-AZ capacity for all flavor groups. 
+func (c *CapacityCalculator) CalculateCapacity(ctx context.Context) (liquid.ServiceCapacityReport, error) { + // Get all flavor groups from Knowledge CRDs + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: c.client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + return liquid.ServiceCapacityReport{}, fmt.Errorf("failed to get flavor groups: %w", err) + } + + // Build capacity report per flavor group + report := liquid.ServiceCapacityReport{ + Resources: make(map[liquid.ResourceName]*liquid.ResourceCapacityReport), + } + + for groupName, groupData := range flavorGroups { + // Resource name follows pattern: ram_ + resourceName := liquid.ResourceName("ram_" + groupName) + + // Calculate per-AZ capacity and usage + azCapacity, err := c.calculateAZCapacity(ctx, groupName, groupData) + if err != nil { + return liquid.ServiceCapacityReport{}, fmt.Errorf("failed to calculate capacity for %s: %w", groupName, err) + } + + report.Resources[resourceName] = &liquid.ResourceCapacityReport{ + PerAZ: azCapacity, + } + } + + return report, nil +} + +func (c *CapacityCalculator) calculateAZCapacity( + ctx context.Context, + _ string, // groupName - reserved for future use + _ compute.FlavorGroupFeature, // groupData - reserved for future use +) (map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport, error) { + // Get list of availability zones from HostDetails Knowledge + azs, err := c.getAvailabilityZones(ctx) + if err != nil { + return nil, fmt.Errorf("failed to get availability zones: %w", err) + } + + // Create report entry for each AZ with empty capacity/usage + // Capacity and Usage are left unset (zero value of option.Option[uint64]) + // This signals to Limes: "These AZs exist, but capacity/usage not yet calculated" + result := make(map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport) + for _, az := range azs { + result[liquid.AvailabilityZone(az)] = &liquid.AZResourceCapacityReport{ + // Both Capacity and 
Usage left unset (empty optional values) + // TODO: Calculate actual capacity from Reservation CRDs or host resources + // TODO: Calculate actual usage from VM allocations + } + } + + return result, nil +} + +func (c *CapacityCalculator) getAvailabilityZones(ctx context.Context) ([]string, error) { + // List all Knowledge CRDs to find host-details knowledge + var knowledgeList v1alpha1.KnowledgeList + if err := c.client.List(ctx, &knowledgeList); err != nil { + return nil, fmt.Errorf("failed to list Knowledge CRDs: %w", err) + } + + // Find host-details knowledge and extract AZs + azSet := make(map[string]struct{}) + for _, knowledge := range knowledgeList.Items { + // Look for host-details extractor + if knowledge.Spec.Extractor.Name != "host_details" { + continue + } + + // Parse features from Raw data + features, err := v1alpha1.UnboxFeatureList[compute.HostDetails](knowledge.Status.Raw) + if err != nil { + // Skip if we can't parse this knowledge + continue + } + + // Collect unique AZ names + for _, feature := range features { + if feature.AvailabilityZone != "" { + azSet[feature.AvailabilityZone] = struct{}{} + } + } + } + + // Convert set to sorted slice + azs := make([]string, 0, len(azSet)) + for az := range azSet { + azs = append(azs, az) + } + sort.Strings(azs) + + return azs, nil +} diff --git a/internal/scheduling/reservations/commitments/client.go b/internal/scheduling/reservations/commitments/client.go index 31e79c5b0..2e5585c99 100644 --- a/internal/scheduling/reservations/commitments/client.go +++ b/internal/scheduling/reservations/commitments/client.go @@ -14,11 +14,10 @@ import ( "github.com/cobaltcore-dev/cortex/pkg/keystone" "github.com/cobaltcore-dev/cortex/pkg/sso" "github.com/gophercloud/gophercloud/v2" - "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/flavors" - "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/servers" "github.com/gophercloud/gophercloud/v2/openstack/identity/v3/projects" 
"github.com/sapcc/go-bits/jobloop" "github.com/sapcc/go-bits/must" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -28,13 +27,8 @@ type CommitmentsClient interface { Init(ctx context.Context, client client.Client, conf SyncerConfig) error // List all projects to resolve commitments. ListProjects(ctx context.Context) ([]Project, error) - // List all flavors by their name to resolve instance commitments. - ListFlavorsByName(ctx context.Context) (map[string]Flavor, error) // List all commitments with resolved metadata (e.g. project, flavor, ...). ListCommitmentsByID(ctx context.Context, projects ...Project) (map[string]Commitment, error) - // List all servers for the given projects from nova. - // The result is a map from project ID to the list of servers. - ListServersByProjectID(ctx context.Context, projects ...Project) (map[string][]Server, error) } // Commitments client fetching commitments from openstack services. @@ -49,14 +43,13 @@ type commitmentsClient struct { limes *gophercloud.ServiceClient } -// Create a new commitments client. -// By default, this client will fetch commitments from the limes API. func NewCommitmentsClient() CommitmentsClient { return &commitmentsClient{} } -// Init the client. 
func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf SyncerConfig) error { + log := ctrl.Log.WithName("CommitmentClient") + var authenticatedHTTP = http.DefaultClient if conf.SSOSecretRef != nil { var err error @@ -79,7 +72,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf Type: "identity", Availability: "public", })) - syncLog.Info("using identity endpoint", "url", url) + log.Info("using identity endpoint", "url", url) c.keystone = &gophercloud.ServiceClient{ ProviderClient: c.provider, Endpoint: url, @@ -91,7 +84,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf Type: "compute", Availability: "public", })) - syncLog.Info("using nova endpoint", "url", url) + log.Info("using nova endpoint", "url", url) c.nova = &gophercloud.ServiceClient{ ProviderClient: c.provider, Endpoint: url, @@ -104,7 +97,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf Type: "resources", Availability: "public", })) - syncLog.Info("using limes endpoint", "url", url) + log.Info("using limes endpoint", "url", url) c.limes = &gophercloud.ServiceClient{ ProviderClient: c.provider, Endpoint: url, @@ -113,32 +106,10 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf return nil } -// Get all Nova flavors by their name to resolve instance commitments. -func (c *commitmentsClient) ListFlavorsByName(ctx context.Context) (map[string]Flavor, error) { - syncLog.Info("fetching all flavors from nova") - flo := flavors.ListOpts{AccessType: flavors.AllAccess} - pages, err := flavors.ListDetail(c.nova, flo).AllPages(ctx) - if err != nil { - return nil, err - } - // Parse the json data into our custom model. 
- var data = &struct { - Flavors []Flavor `json:"flavors"` - }{} - if err := pages.(flavors.FlavorPage).ExtractInto(data); err != nil { - return nil, err - } - syncLog.Info("fetched flavors from nova", "count", len(data.Flavors)) - flavorsByName := make(map[string]Flavor, len(data.Flavors)) - for _, flavor := range data.Flavors { - flavorsByName[flavor.Name] = flavor - } - return flavorsByName, nil -} - -// Get all projects from Keystone to resolve commitments. func (c *commitmentsClient) ListProjects(ctx context.Context) ([]Project, error) { - syncLog.Info("fetching projects from keystone") + log := ctrl.Log.WithName("CommitmentClient") + + log.V(1).Info("fetching projects from keystone") allPages, err := projects.List(c.keystone, nil).AllPages(ctx) if err != nil { return nil, err @@ -149,14 +120,15 @@ func (c *commitmentsClient) ListProjects(ctx context.Context) ([]Project, error) if err := allPages.(projects.ProjectPage).ExtractInto(data); err != nil { return nil, err } - syncLog.Info("fetched projects from keystone", "count", len(data.Projects)) + log.V(1).Info("fetched projects from keystone", "count", len(data.Projects)) return data.Projects, nil } -// Get all available commitments from limes + keystone + nova. -// This function fetches the commitments for each project in parallel. +// ListCommitmentsByID fetches commitments for all projects in parallel. func (c *commitmentsClient) ListCommitmentsByID(ctx context.Context, projects ...Project) (map[string]Commitment, error) { - syncLog.Info("fetching commitments from limes", "projects", len(projects)) + log := ctrl.Log.WithName("CommitmentClient") + + log.V(1).Info("fetching commitments from limes", "projects", len(projects)) commitmentsMutex := gosync.Mutex{} commitments := make(map[string]Commitment) var wg gosync.WaitGroup @@ -189,15 +161,14 @@ func (c *commitmentsClient) ListCommitmentsByID(ctx context.Context, projects .. // Return the first error encountered, if any. 
for err := range errChan { if err != nil { - syncLog.Error(err, "failed to resolve commitments") + log.Error(err, "failed to resolve commitments") return nil, err } } - syncLog.Info("resolved commitments from limes", "count", len(commitments)) + log.V(1).Info("resolved commitments from limes", "count", len(commitments)) return commitments, nil } -// Resolve the commitments for the given project. func (c *commitmentsClient) listCommitments(ctx context.Context, project Project) ([]Commitment, error) { url := c.limes.Endpoint + "v1" + "/domains/" + project.DomainID + @@ -232,67 +203,3 @@ func (c *commitmentsClient) listCommitments(ctx context.Context, project Project } return commitments, nil } - -// Get all servers for the given project ids from nova. -// The result is a map from project ID to the list of servers. -func (c *commitmentsClient) ListServersByProjectID(ctx context.Context, projects ...Project) (map[string][]Server, error) { - syncLog.Info("fetching servers from nova") - serversByProject := make(map[string][]Server, len(projects)) - var mu gosync.Mutex - var wg gosync.WaitGroup - ctx, cancel := context.WithCancel(ctx) - defer cancel() - // Channel to communicate errors from goroutines. - errChan := make(chan error, len(projects)) - for _, project := range projects { - wg.Go(func() { - servers, err := c.listServersForProject(ctx, project) - if err != nil { - errChan <- err - cancel() - return - } - mu.Lock() - serversByProject[project.ID] = servers - mu.Unlock() - }) - time.Sleep(jobloop.DefaultJitter(50 * time.Millisecond)) // Don't overload the API. - } - // Wait for all goroutines to finish and close the error channel. - go func() { - wg.Wait() - close(errChan) - }() - // Return the first error encountered, if any. 
- for err := range errChan { - if err != nil { - syncLog.Error(err, "failed to fetch servers") - return nil, err - } - } - syncLog.Info("fetched servers from nova", "projects", len(serversByProject)) - return serversByProject, nil -} - -// Get all servers for the given project id from nova. -func (c *commitmentsClient) listServersForProject(ctx context.Context, project Project) ([]Server, error) { - lo := servers.ListOpts{ - // AllTenants must be set to fetch servers from other projects - // than the one we are authenticated with. - AllTenants: true, - TenantID: project.ID, - } - pages, err := servers.List(c.nova, lo).AllPages(ctx) - if err != nil { - return nil, err - } - // Parse the json data into our custom model. - var data = &struct { - Servers []Server `json:"servers"` - }{} - if err := pages.(servers.ServerPage).ExtractInto(data); err != nil { - return nil, err - } - syncLog.Info("fetched servers for project", "project", project.ID, "count", len(data.Servers)) - return data.Servers, nil -} diff --git a/internal/scheduling/reservations/commitments/client_test.go b/internal/scheduling/reservations/commitments/client_test.go index f3a1d0a8f..be2d66ff9 100644 --- a/internal/scheduling/reservations/commitments/client_test.go +++ b/internal/scheduling/reservations/commitments/client_test.go @@ -8,7 +8,6 @@ import ( "encoding/json" "net/http" "net/http/httptest" - "reflect" "strings" "testing" "time" @@ -127,134 +126,6 @@ func TestCommitmentsClient_ListProjects_Error(t *testing.T) { } } -func TestCommitmentsClient_ListFlavorsByName(t *testing.T) { - // Mock server for Nova compute service - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if strings.Contains(r.URL.Path, "/flavors/detail") { - // Return raw JSON string as the gophercloud pages expect - w.Header().Set("Content-Type", "application/json") - _, err := w.Write([]byte(`{ - "flavors": [ - { - "id": "flavor1", - "name": "m1.small", - "ram": 2048, - "vcpus": 1, 
- "disk": 20, - "rxtx_factor": 1.0, - "os-flavor-access:is_public": true, - "OS-FLV-EXT-DATA:ephemeral": 0, - "description": "Small flavor", - "extra_specs": {"hw:cpu_policy": "shared"} - }, - { - "id": "flavor2", - "name": "m1.medium", - "ram": 4096, - "vcpus": 2, - "disk": 40, - "rxtx_factor": 1.0, - "os-flavor-access:is_public": true, - "OS-FLV-EXT-DATA:ephemeral": 0, - "description": "Medium flavor", - "extra_specs": {"hw:cpu_policy": "dedicated"} - } - ] - }`)) - if err != nil { - t.Fatalf("failed to write response: %v", err) - } - return - } - http.NotFound(w, r) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - Microversion: "2.61", - }, - } - - ctx := context.Background() - flavorsByName, err := client.ListFlavorsByName(ctx) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - expectedFlavors := map[string]Flavor{ - "m1.small": { - ID: "flavor1", - Name: "m1.small", - RAM: 2048, - VCPUs: 1, - Disk: 20, - RxTxFactor: 1.0, - IsPublic: true, - Ephemeral: 0, - Description: "Small flavor", - ExtraSpecs: map[string]string{"hw:cpu_policy": "shared"}, - }, - "m1.medium": { - ID: "flavor2", - Name: "m1.medium", - RAM: 4096, - VCPUs: 2, - Disk: 40, - RxTxFactor: 1.0, - IsPublic: true, - Ephemeral: 0, - Description: "Medium flavor", - ExtraSpecs: map[string]string{"hw:cpu_policy": "dedicated"}, - }, - } - - if len(flavorsByName) != len(expectedFlavors) { - t.Fatalf("expected %d flavors, got %d", len(expectedFlavors), len(flavorsByName)) - } - - for name, expected := range expectedFlavors { - actual, exists := flavorsByName[name] - if !exists { - t.Errorf("expected flavor %s to exist", name) - continue - } - if !reflect.DeepEqual(actual, expected) { - t.Errorf("flavor %s: expected %+v, got %+v", name, expected, actual) - } - } -} - -func TestCommitmentsClient_ListFlavorsByName_Error(t 
*testing.T) { - // Mock server that returns an error - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "Service Unavailable", http.StatusServiceUnavailable) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - }, - } - - ctx := context.Background() - flavors, err := client.ListFlavorsByName(ctx) - if err == nil { - t.Fatal("expected error, got nil") - } - if flavors != nil { - t.Errorf("expected nil flavors, got %+v", flavors) - } -} - func TestCommitmentsClient_ListCommitmentsByID(t *testing.T) { // Mock server for Limes service server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -342,147 +213,6 @@ func TestCommitmentsClient_ListCommitmentsByID(t *testing.T) { } } -func TestCommitmentsClient_ListCommitmentsByID_Error(t *testing.T) { - // Mock server that returns an error - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "Unauthorized", http.StatusUnauthorized) - })) - defer server.Close() - - client := &commitmentsClient{ - limes: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - TokenID: "test-token", - }, - Endpoint: server.URL + "/", - }, - } - - projects := []Project{ - {ID: "project1", DomainID: "domain1"}, - } - - ctx := context.Background() - commitments, err := client.ListCommitmentsByID(ctx, projects...) 
- if err == nil { - t.Fatal("expected error, got nil") - } - if commitments != nil { - t.Errorf("expected nil commitments, got %+v", commitments) - } -} - -func TestCommitmentsClient_ListServersByProjectID(t *testing.T) { - // Mock server for Nova compute service - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if strings.Contains(r.URL.Path, "/servers/detail") { - // Parse query parameters to determine which project - tenantID := r.URL.Query().Get("tenant_id") - - // Return raw JSON string as the gophercloud pages expect - w.Header().Set("Content-Type", "application/json") - if tenantID == "project1" { - if _, err := w.Write([]byte(`{ - "servers": [ - { - "id": "server1", - "name": "test-server-1", - "status": "ACTIVE", - "tenant_id": "project1", - "flavor": {"original_name": "m1.small"} - } - ] - }`)); err != nil { - t.Fatalf("failed to write response: %v", err) - } - } else { - if _, err := w.Write([]byte(`{"servers": []}`)); err != nil { - t.Fatalf("failed to write response: %v", err) - } - } - return - } - http.NotFound(w, r) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - }, - } - - projects := []Project{ - {ID: "project1", Name: "Test Project 1"}, - {ID: "project2", Name: "Test Project 2"}, - } - - ctx := context.Background() - serversByProject, err := client.ListServersByProjectID(ctx, projects...) 
- if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if len(serversByProject) != 2 { - t.Fatalf("expected 2 project entries, got %d", len(serversByProject)) - } - - // Check project1 has 1 server - servers1, exists := serversByProject["project1"] - if !exists { - t.Fatal("expected project1 to exist in results") - } - if len(servers1) != 1 { - t.Fatalf("expected 1 server for project1, got %d", len(servers1)) - } - if servers1[0].ID != "server1" { - t.Errorf("expected server ID server1, got %s", servers1[0].ID) - } - - // Check project2 has 0 servers - servers2, exists := serversByProject["project2"] - if !exists { - t.Fatal("expected project2 to exist in results") - } - if len(servers2) != 0 { - t.Fatalf("expected 0 servers for project2, got %d", len(servers2)) - } -} - -func TestCommitmentsClient_ListServersByProjectID_Error(t *testing.T) { - // Mock server that returns an error - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "Forbidden", http.StatusForbidden) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - }, - } - - projects := []Project{ - {ID: "project1"}, - } - - ctx := context.Background() - servers, err := client.ListServersByProjectID(ctx, projects...) 
- if err == nil { - t.Fatal("expected error, got nil") - } - if servers != nil { - t.Errorf("expected nil servers, got %+v", servers) - } -} - func TestCommitmentsClient_listCommitments(t *testing.T) { // Mock server for Limes service server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -625,136 +355,6 @@ func TestCommitmentsClient_listCommitments_JSONError(t *testing.T) { } } -func TestCommitmentsClient_listServersForProject(t *testing.T) { - // Mock server for Nova compute service - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if !strings.Contains(r.URL.Path, "/servers/detail") { - http.NotFound(w, r) - return - } - - // Verify query parameters - query := r.URL.Query() - if query.Get("all_tenants") != "true" { - t.Errorf("expected all_tenants=true, got %s", query.Get("all_tenants")) - } - if query.Get("tenant_id") != "test-project" { - t.Errorf("expected tenant_id=test-project, got %s", query.Get("tenant_id")) - } - - // Return raw JSON string as the gophercloud pages expect - w.Header().Set("Content-Type", "application/json") - if _, err := w.Write([]byte(`{ - "servers": [ - { - "id": "server1", - "name": "test-server", - "status": "ACTIVE", - "tenant_id": "test-project", - "flavor": {"original_name": "m1.small"} - }, - { - "id": "server2", - "name": "another-server", - "status": "ACTIVE", - "tenant_id": "test-project", - "flavor": {"original_name": "m1.medium"} - } - ] - }`)); err != nil { - t.Fatalf("failed to write response: %v", err) - } - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - }, - } - - project := Project{ - ID: "test-project", - Name: "Test Project", - } - - ctx := context.Background() - servers, err := client.listServersForProject(ctx, project) - if err != nil { - t.Fatalf("unexpected 
error: %v", err) - } - - if len(servers) != 2 { - t.Fatalf("expected 2 servers, got %d", len(servers)) - } - - expectedServers := []Server{ - { - ID: "server1", - Name: "test-server", - Status: "ACTIVE", - TenantID: "test-project", - FlavorName: "m1.small", - }, - { - ID: "server2", - Name: "another-server", - Status: "ACTIVE", - TenantID: "test-project", - FlavorName: "m1.medium", - }, - } - - for i, expected := range expectedServers { - if servers[i].ID != expected.ID { - t.Errorf("server %d: expected ID %s, got %s", i, expected.ID, servers[i].ID) - } - if servers[i].Name != expected.Name { - t.Errorf("server %d: expected Name %s, got %s", i, expected.Name, servers[i].Name) - } - if servers[i].Status != expected.Status { - t.Errorf("server %d: expected Status %s, got %s", i, expected.Status, servers[i].Status) - } - if servers[i].TenantID != expected.TenantID { - t.Errorf("server %d: expected TenantID %s, got %s", i, expected.TenantID, servers[i].TenantID) - } - if servers[i].FlavorName != expected.FlavorName { - t.Errorf("server %d: expected FlavorName %s, got %s", i, expected.FlavorName, servers[i].FlavorName) - } - } -} - -func TestCommitmentsClient_listServersForProject_Error(t *testing.T) { - // Mock server that returns an error - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "Internal Server Error", http.StatusInternalServerError) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL, - }, - } - - project := Project{ID: "test-project"} - - ctx := context.Background() - servers, err := client.listServersForProject(ctx, project) - if err == nil { - t.Fatal("expected error, got nil") - } - if servers != nil { - t.Errorf("expected nil servers, got %+v", servers) - } -} - func TestCommitmentsClient_ContextCancellation(t *testing.T) { // Test 
context cancellation handling slowServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go new file mode 100644 index 000000000..350de7e8c --- /dev/null +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -0,0 +1,310 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "fmt" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/go-logr/logr" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// ReservationManager handles CRUD operations for Reservation CRDs. +type ReservationManager struct { + client.Client +} + +func NewReservationManager(k8sClient client.Client) *ReservationManager { + return &ReservationManager{ + Client: k8sClient, + } +} + +// ApplyCommitmentState synchronizes Reservation CRDs to match the desired commitment state. +// This function performs CRUD operations (create/update/delete) on reservation slots to align +// with the capacity specified in desiredState. +// +// Entry points: +// - from Syncer - periodic sync with Limes state +// - from API ChangeCommitmentsHandler - batch processing of commitment changes +// +// The function is idempotent and handles: +// - Repairing inconsistent slots (wrong flavor group/project) +// - Creating new reservation slots when capacity increases +// - Deleting unused/excess slots when capacity decreases +// - Syncing reservation metadata for all remaining slots +// +// Returns touched reservations (created/updated) and removed reservations for caller tracking. 
+func (m *ReservationManager) ApplyCommitmentState( + ctx context.Context, + log logr.Logger, + desiredState *CommitmentState, + flavorGroups map[string]compute.FlavorGroupFeature, + creator string, +) (touchedReservations, removedReservations []v1alpha1.Reservation, err error) { + + log = log.WithName("ReservationManager") + + // Phase 1: List and filter existing reservations for this commitment + var allReservations v1alpha1.ReservationList + if err := m.List(ctx, &allReservations, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + return nil, nil, fmt.Errorf("failed to list reservations: %w", err) + } + + // Filter by name prefix to find reservations for this commitment + namePrefix := fmt.Sprintf("commitment-%s-", desiredState.CommitmentUUID) + var existing []v1alpha1.Reservation + for _, res := range allReservations.Items { + if len(res.Name) >= len(namePrefix) && res.Name[:len(namePrefix)] == namePrefix { + existing = append(existing, res) + } + } + + // Phase 2: Calculate memory delta (desired - current) + flavorGroup, exists := flavorGroups[desiredState.FlavorGroupName] + + if !exists { + return nil, nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName) + } + deltaMemoryBytes := desiredState.TotalMemoryBytes + for _, res := range existing { + memoryQuantity := res.Spec.Resources["memory"] + deltaMemoryBytes -= memoryQuantity.Value() + } + + log.Info("applying commitment state", + "commitmentUUID", desiredState.CommitmentUUID, + "desiredMemoryBytes", desiredState.TotalMemoryBytes, + "deltaMemoryBytes", deltaMemoryBytes, + "existingSlots", len(existing), + ) + + nextSlotIndex := GetNextSlotIndex(existing) + + // Phase 3 (DELETE): Delete inconsistent reservations (wrong flavor group/project) + // They will be recreated with correct metadata in subsequent phases. 
+ var validReservations []v1alpha1.Reservation + for _, res := range existing { + if res.Spec.CommittedResourceReservation.ResourceGroup != desiredState.FlavorGroupName || + res.Spec.CommittedResourceReservation.ProjectID != desiredState.ProjectID { + log.Info("Found a reservation with wrong flavor group or project, delete and recreate afterward", + "commitmentUUID", desiredState.CommitmentUUID, + "name", res.Name, + "expectedFlavorGroup", desiredState.FlavorGroupName, + "actualFlavorGroup", res.Spec.CommittedResourceReservation.ResourceGroup, + "expectedProjectID", desiredState.ProjectID, + "actualProjectID", res.Spec.CommittedResourceReservation.ProjectID) + removedReservations = append(removedReservations, res) + memValue := res.Spec.Resources["memory"] + deltaMemoryBytes += memValue.Value() + + if err := m.Delete(ctx, &res); err != nil { + return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", res.Name, err) + } + } else { + validReservations = append(validReservations, res) + } + } + existing = validReservations + + // Phase 4 (DELETE): Remove reservations (capacity decreased) + for deltaMemoryBytes < 0 && len(existing) > 0 { + // prefer unused reservation slot or simply remove last one + var reservationToDelete *v1alpha1.Reservation + for i, res := range existing { + if len(res.Spec.CommittedResourceReservation.Allocations) == 0 { + reservationToDelete = &res + existing = append(existing[:i], existing[i+1:]...) 
// remove from existing list + break + } + } + if reservationToDelete == nil { + reservationToDelete = &existing[len(existing)-1] + existing = existing[:len(existing)-1] // remove from existing list + } + removedReservations = append(removedReservations, *reservationToDelete) + memValue := reservationToDelete.Spec.Resources["memory"] + deltaMemoryBytes += memValue.Value() + + log.Info("deleting reservation", + "commitmentUUID", desiredState.CommitmentUUID, + "deltaMemoryBytes", deltaMemoryBytes, + "name", reservationToDelete.Name, + "numAllocations", len(reservationToDelete.Spec.CommittedResourceReservation.Allocations), + "memoryBytes", memValue.Value()) + + if err := m.Delete(ctx, reservationToDelete); err != nil { + return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err) + } + } + + // Phase 5 (CREATE): Create new reservations (capacity increased) + for deltaMemoryBytes > 0 { + // Need to create new reservation slots, always prefer largest flavor within the group + // TODO more sophisticated flavor selection, especially with flavors of different cpu/memory ratio + reservation := m.newReservation(desiredState, nextSlotIndex, deltaMemoryBytes, flavorGroup, creator) + touchedReservations = append(touchedReservations, *reservation) + memValue := reservation.Spec.Resources["memory"] + deltaMemoryBytes -= memValue.Value() + + log.Info("creating reservation", + "commitmentUUID", desiredState.CommitmentUUID, + "deltaMemoryBytes", deltaMemoryBytes, + "name", reservation.Name, + "memoryBytes", memValue.Value()) + + if err := m.Create(ctx, reservation); err != nil { + if apierrors.IsAlreadyExists(err) { + return touchedReservations, removedReservations, fmt.Errorf( + "reservation %s already exists (collision detected): %w", + reservation.Name, err) + } + return touchedReservations, removedReservations, fmt.Errorf( + "failed to create reservation slot %d: %w", + nextSlotIndex, err) + } + + 
nextSlotIndex++ + } + + // Phase 6 (UPDATE): Sync metadata for remaining reservations + for i := range existing { + updated, err := m.syncReservationMetadata(ctx, log, &existing[i], desiredState) + if err != nil { + return touchedReservations, removedReservations, err + } + if updated != nil { + touchedReservations = append(touchedReservations, *updated) + } + } + + log.Info("completed commitment state sync", + "commitmentUUID", desiredState.CommitmentUUID, + "totalReservations", len(existing), + "created", len(touchedReservations)-len(existing), + "deleted", len(removedReservations)) + + return touchedReservations, removedReservations, nil +} + +// syncReservationMetadata updates reservation metadata if it differs from desired state. +func (m *ReservationManager) syncReservationMetadata( + ctx context.Context, + log logr.Logger, + reservation *v1alpha1.Reservation, + state *CommitmentState, +) (*v1alpha1.Reservation, error) { + + // if any of AZ, StarTime, EndTime differ from desired state, need to patch + if (state.AvailabilityZone != "" && reservation.Spec.AvailabilityZone != state.AvailabilityZone) || + (state.StartTime != nil && (reservation.Spec.StartTime == nil || !reservation.Spec.StartTime.Time.Equal(*state.StartTime))) || + (state.EndTime != nil && (reservation.Spec.EndTime == nil || !reservation.Spec.EndTime.Time.Equal(*state.EndTime))) { + // Apply patch + log.Info("syncing reservation metadata", + "reservation", reservation.Name, + "availabilityZone", state.AvailabilityZone, + "startTime", state.StartTime, + "endTime", state.EndTime) + + patch := client.MergeFrom(reservation.DeepCopy()) + + if state.AvailabilityZone != "" { + reservation.Spec.AvailabilityZone = state.AvailabilityZone + } + if state.StartTime != nil { + reservation.Spec.StartTime = &metav1.Time{Time: *state.StartTime} + } + if state.EndTime != nil { + reservation.Spec.EndTime = &metav1.Time{Time: *state.EndTime} + } + + if err := m.Patch(ctx, reservation, patch); err != nil { + return 
nil, fmt.Errorf("failed to patch reservation %s: %w", + reservation.Name, err) + } + + return reservation, nil + } else { + return nil, nil // No changes needed + } +} + +func (m *ReservationManager) newReservation( + state *CommitmentState, + slotIndex int, + deltaMemoryBytes int64, + flavorGroup compute.FlavorGroupFeature, + creator string, +) *v1alpha1.Reservation { + + name := fmt.Sprintf("commitment-%s-%d", state.CommitmentUUID, slotIndex) + + // Select first flavor that fits remaining memory (flavors sorted descending by size) + flavorInGroup := flavorGroup.Flavors[len(flavorGroup.Flavors)-1] // default to smallest + memoryBytes := deltaMemoryBytes + cpus := int64(flavorInGroup.VCPUs) //nolint:gosec // VCPUs from flavor specs, realistically bounded + + for _, flavor := range flavorGroup.Flavors { + flavorMemoryBytes := int64(flavor.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory from specs, realistically bounded + if flavorMemoryBytes <= deltaMemoryBytes { + flavorInGroup = flavor + memoryBytes = flavorMemoryBytes + cpus = int64(flavorInGroup.VCPUs) //nolint:gosec // VCPUs from flavor specs, realistically bounded + break + } + } + + spec := v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity( + memoryBytes, + resource.BinarySI, + ), + "cpu": *resource.NewQuantity( + cpus, + resource.DecimalSI, + ), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: state.ProjectID, + DomainID: state.DomainID, + ResourceGroup: state.FlavorGroupName, + ResourceName: flavorInGroup.Name, + Creator: creator, + Allocations: nil, + }, + } + + // Set AvailabilityZone if specified + if state.AvailabilityZone != "" { + spec.AvailabilityZone = state.AvailabilityZone + } + + // Set validity times if specified + if state.StartTime != nil { + spec.StartTime = &metav1.Time{Time: *state.StartTime} + } + if state.EndTime != nil { + 
spec.EndTime = &metav1.Time{Time: *state.EndTime} + } + + return &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: spec, + } +} diff --git a/internal/scheduling/reservations/commitments/reservation_manager_test.go b/internal/scheduling/reservations/commitments/reservation_manager_test.go new file mode 100644 index 000000000..d8cf9c267 --- /dev/null +++ b/internal/scheduling/reservations/commitments/reservation_manager_test.go @@ -0,0 +1,540 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/go-logr/logr" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestApplyCommitmentState_CreatesNewReservations(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state: 3 multiples of smallest flavor (24 GiB) + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 3 * 8192 * 1024 * 1024, + } + + touched, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(removed) != 0 { + t.Errorf("expected 0 removed reservations, got %d", len(removed)) + } + + // Should create reservations to fulfill the commitment + if len(touched) == 0 { + t.Fatal("expected at least one reservation to be created") + } + + // Verify created reservations sum to desired state + totalMemory := int64(0) + for _, res := range touched { + memQuantity := res.Spec.Resources["memory"] + totalMemory += memQuantity.Value() + } + + if totalMemory != desiredState.TotalMemoryBytes { + t.Errorf("expected total memory %d, got %d", desiredState.TotalMemoryBytes, totalMemory) + } +} + +func TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create existing reservations (32 GiB total) + existingReservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: 
"syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, + }, + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(&existingReservations[0], &existingReservations[1]). + Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state: only 8 GiB (need to reduce) + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 8 * 1024 * 1024 * 1024, + } + + _, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Note: May create a new 8GiB reservation while removing the two 16GiB ones + // This is expected behavior based on the slot sizing algorithm + + // Should remove excess reservations + if len(removed) == 0 { + t.Fatal("expected reservations to be removed") + } + + // Verify remaining capacity matches desired state + var remainingList v1alpha1.ReservationList + if err := client.List(context.Background(), &remainingList); err != nil { + t.Fatal(err) + } + + totalMemory := int64(0) + for _, res := range remainingList.Items { + memQuantity := res.Spec.Resources["memory"] + 
totalMemory += memQuantity.Value() + } + + if totalMemory != desiredState.TotalMemoryBytes { + t.Errorf("expected remaining memory %d, got %d", desiredState.TotalMemoryBytes, totalMemory) + } +} + +func TestApplyCommitmentState_PreservesAllocatedReservations(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create reservations: one with allocation, one without + existingReservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{ + "vm-123": {}, // Has allocation + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, // No allocation + }, + }, + }, + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(&existingReservations[0], &existingReservations[1]). 
+ Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state: only 16 GiB (need to reduce by one slot) + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 16 * 1024 * 1024 * 1024, + } + + _, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Should remove the unallocated reservation, not the allocated one + if len(removed) != 1 { + t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + } + + // Verify the removed one had no allocations + if len(removed[0].Spec.CommittedResourceReservation.Allocations) != 0 { + t.Error("expected unallocated reservation to be removed first") + } + + // Verify the allocated reservation still exists + var remainingList v1alpha1.ReservationList + if err := client.List(context.Background(), &remainingList); err != nil { + t.Fatal(err) + } + + if len(remainingList.Items) != 1 { + t.Fatalf("expected 1 remaining reservation, got %d", len(remainingList.Items)) + } + + // Verify the remaining one has the allocation + if len(remainingList.Items[0].Spec.CommittedResourceReservation.Allocations) == 0 { + t.Error("expected allocated reservation to be preserved") + } +} + +func TestApplyCommitmentState_HandlesZeroCapacity(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create existing reservation + existingReservation := v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: 
map[string]resource.Quantity{ + "memory": *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(&existingReservation). + Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state: zero capacity (commitment expired or canceled) + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 0, + } + + touched, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(touched) != 0 { + t.Errorf("expected 0 new reservations, got %d", len(touched)) + } + + // Should remove all reservations + if len(removed) != 1 { + t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + } + + // Verify no reservations remain + var remainingList v1alpha1.ReservationList + if err := client.List(context.Background(), &remainingList); err != nil { + t.Fatal(err) + } + + if len(remainingList.Items) != 0 { + t.Errorf("expected 0 remaining reservations, got %d", len(remainingList.Items)) + } +} + +func TestApplyCommitmentState_FixesWrongFlavorGroup(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create reservation with wrong flavor group + existingReservation := v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: 
v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "wrong-group", // Wrong flavor group + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(&existingReservation). + Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state with correct flavor group + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 8 * 1024 * 1024 * 1024, + } + + touched, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Should remove wrong reservation and create new one + if len(removed) != 1 { + t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + } + + if len(touched) != 1 { + t.Fatalf("expected 1 new reservation, got %d", len(touched)) + } + + // Verify new reservation has correct flavor group + if touched[0].Spec.CommittedResourceReservation.ResourceGroup != "test-group" { + t.Errorf("expected flavor group test-group, got %s", + touched[0].Spec.CommittedResourceReservation.ResourceGroup) + } +} + +func TestApplyCommitmentState_UnknownFlavorGroup(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + manager := NewReservationManager(client) + flavorGroups := map[string]compute.FlavorGroupFeature{} // Empty + + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "unknown-group", + TotalMemoryBytes: 8 * 1024 * 1024 * 1024, + } + + _, _, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err == nil { + t.Fatal("expected error for unknown flavor group, got nil") + } +} + +func TestNewReservation_SelectsAppropriateFlavor(t *testing.T) { + manager := &ReservationManager{} + flavorGroup := testFlavorGroup() + + tests := []struct { + name string + deltaMemory int64 + expectedName string + expectedCores int64 + }{ + { + name: "fits large flavor", + deltaMemory: 32768 * 1024 * 1024, // 32 GiB + expectedName: "large", + expectedCores: 16, + }, + { + name: "fits medium flavor", + deltaMemory: 16384 * 1024 * 1024, // 16 GiB + expectedName: "medium", + expectedCores: 8, + }, + { + name: "fits small flavor", + deltaMemory: 8192 * 1024 * 1024, // 8 GiB + expectedName: "small", + expectedCores: 4, + }, + { + name: "oversized uses largest available flavor", + deltaMemory: 100 * 1024 * 1024 * 1024, // 100 GiB (larger than any flavor) + expectedName: "large", // Will use largest available + expectedCores: 16, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + state := &CommitmentState{ + CommitmentUUID: "test-uuid", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: tt.deltaMemory, + } + + reservation := manager.newReservation( + state, + 0, + tt.deltaMemory, + flavorGroup, + "syncer", + ) + + // Verify flavor selection + if reservation.Spec.CommittedResourceReservation.ResourceName != tt.expectedName { + t.Errorf("expected flavor %s, got %s", + tt.expectedName, + reservation.Spec.CommittedResourceReservation.ResourceName) + } + + // Verify CPU allocation + cpuQuantity := 
reservation.Spec.Resources["cpu"] + if cpuQuantity.Value() != tt.expectedCores { + t.Errorf("expected %d cores, got %d", + tt.expectedCores, cpuQuantity.Value()) + } + }) + } +} diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go new file mode 100644 index 000000000..996efff8e --- /dev/null +++ b/internal/scheduling/reservations/commitments/state.go @@ -0,0 +1,202 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "errors" + "fmt" + "strings" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/sapcc/go-api-declarations/liquid" + ctrl "sigs.k8s.io/controller-runtime" +) + +var stateLog = ctrl.Log.WithName("commitment_state") + +// Limes LIQUID resource naming convention: ram_ +const commitmentResourceNamePrefix = "ram_" + +func getFlavorGroupNameFromResource(resourceName string) (string, error) { + if !strings.HasPrefix(resourceName, commitmentResourceNamePrefix) { + return "", fmt.Errorf("invalid resource name: %s", resourceName) + } + return strings.TrimPrefix(resourceName, commitmentResourceNamePrefix), nil +} + +// CommitmentState represents desired or current commitment resource allocation. 
+type CommitmentState struct { + // CommitmentUUID uniquely identifies this commitment + CommitmentUUID string + // ProjectID is the OpenStack project this commitment belongs to + ProjectID string + // DomainID is the OpenStack domain this commitment belongs to + DomainID string + // FlavorGroupName identifies the flavor group (e.g., "hana_medium_v2") + FlavorGroupName string + // the total memory in bytes across all reservation slots + TotalMemoryBytes int64 + // AvailabilityZone specifies the availability zone for this commitment + AvailabilityZone string + // StartTime is when the commitment becomes active + StartTime *time.Time + // EndTime is when the commitment expires + EndTime *time.Time +} + +// FromCommitment converts Limes commitment to CommitmentState. +func FromCommitment( + commitment Commitment, + flavorGroup compute.FlavorGroupFeature, +) (*CommitmentState, error) { + + flavorGroupName, err := getFlavorGroupNameFromResource(commitment.ResourceName) + if err != nil { + return nil, err + } + + // Calculate total memory from commitment amount (amount = multiples of smallest flavor) + smallestFlavorMemoryBytes := int64(flavorGroup.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory from specs, realistically bounded + totalMemoryBytes := int64(commitment.Amount) * smallestFlavorMemoryBytes //nolint:gosec // commitment amount from Limes API, bounded by quota limits + + // Set start time: use ConfirmedAt if available, otherwise CreatedAt + var startTime *time.Time + if commitment.ConfirmedAt != nil { + t := time.Unix(int64(*commitment.ConfirmedAt), 0) //nolint:gosec // timestamp from Limes API, realistically bounded + startTime = &t + } else { + t := time.Unix(int64(commitment.CreatedAt), 0) //nolint:gosec // timestamp from Limes API, realistically bounded + startTime = &t + } + + // Set end time from ExpiresAt + var endTime *time.Time + if commitment.ExpiresAt > 0 { + t := time.Unix(int64(commitment.ExpiresAt), 0) //nolint:gosec // 
timestamp from Limes API, realistically bounded + endTime = &t + } + + return &CommitmentState{ + CommitmentUUID: commitment.UUID, + ProjectID: commitment.ProjectID, + DomainID: commitment.DomainID, + FlavorGroupName: flavorGroupName, + TotalMemoryBytes: totalMemoryBytes, + AvailabilityZone: commitment.AvailabilityZone, + StartTime: startTime, + EndTime: endTime, + }, nil +} + +// FromChangeCommitmentTargetState converts LIQUID API request to CommitmentState. +func FromChangeCommitmentTargetState( + commitment liquid.Commitment, + projectID string, + flavorGroupName string, + flavorGroup compute.FlavorGroupFeature, + az string, +) (*CommitmentState, error) { + + amountMultiple := uint64(0) + var startTime *time.Time + var endTime *time.Time + + switch commitment.NewStatus.UnwrapOr("none") { + // guaranteed and confirmed commitments are honored with start time now + case liquid.CommitmentStatusGuaranteed, liquid.CommitmentStatusConfirmed: + amountMultiple = commitment.Amount + // Set start time to now for active commitments + now := time.Now() + startTime = &now + } + + // ConfirmBy is ignored for now + // TODO do more sophisticated handling of guaranteed commitments + + // Set end time if not zero (commitments can have no expiry) + if !commitment.ExpiresAt.IsZero() { + endTime = &commitment.ExpiresAt + // check expiry time + if commitment.ExpiresAt.Before(time.Now()) || commitment.ExpiresAt.Equal(time.Now()) { + // commitment is already expired, ignore capacity + amountMultiple = 0 + } + } + + // Flavors are sorted by size descending, so the last one is the smallest + smallestFlavor := flavorGroup.SmallestFlavor + smallestFlavorMemoryBytes := int64(smallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory from specs, realistically bounded + + // Amount represents multiples of the smallest flavor in the group + totalMemoryBytes := int64(amountMultiple) * smallestFlavorMemoryBytes //nolint:gosec // commitment amount from Limes API, bounded by quota 
limits + + return &CommitmentState{ + CommitmentUUID: string(commitment.UUID), + ProjectID: projectID, + FlavorGroupName: flavorGroupName, + TotalMemoryBytes: totalMemoryBytes, + AvailabilityZone: az, + StartTime: startTime, + EndTime: endTime, + }, nil +} + +// FromReservations reconstructs CommitmentState from existing Reservation CRDs. +func FromReservations(reservations []v1alpha1.Reservation) (*CommitmentState, error) { + if len(reservations) == 0 { + return nil, errors.New("no reservations provided") + } + + // Extract commitment metadata from first reservation + first := reservations[0] + if first.Spec.CommittedResourceReservation == nil { + return nil, errors.New("not a committed resource reservation") + } + + state := &CommitmentState{ + CommitmentUUID: extractCommitmentUUID(first.Name), + ProjectID: first.Spec.CommittedResourceReservation.ProjectID, + DomainID: first.Spec.CommittedResourceReservation.DomainID, + FlavorGroupName: first.Spec.CommittedResourceReservation.ResourceGroup, + TotalMemoryBytes: 0, + AvailabilityZone: first.Spec.AvailabilityZone, + } + + if first.Spec.StartTime != nil { + state.StartTime = &first.Spec.StartTime.Time + } + if first.Spec.EndTime != nil { + state.EndTime = &first.Spec.EndTime.Time + } + + // Sum memory across all reservations + for _, res := range reservations { + if res.Spec.CommittedResourceReservation == nil { + return nil, errors.New("unexpected reservation type of reservation " + res.Name) + } + // check if it belongs to the same commitment + if extractCommitmentUUID(res.Name) != state.CommitmentUUID { + return nil, errors.New("reservation " + res.Name + " does not belong to commitment " + state.CommitmentUUID) + } + // check flavor group consistency, ignore if not matching to repair corrupted state in k8s + if res.Spec.CommittedResourceReservation.ResourceGroup != state.FlavorGroupName { + // log message + stateLog.Error(errors.New("inconsistent flavor group in reservation"), + "reservation belongs to same 
commitment but has different flavor group - ignoring reservation for capacity calculation", + "reservationName", res.Name, + "expectedFlavorGroup", state.FlavorGroupName, + "actualFlavorGroup", res.Spec.CommittedResourceReservation.ResourceGroup, + ) + continue + } + + memoryQuantity := res.Spec.Resources["memory"] + memoryBytes := memoryQuantity.Value() + state.TotalMemoryBytes += memoryBytes + } + + return state, nil +} diff --git a/internal/scheduling/reservations/commitments/state_test.go b/internal/scheduling/reservations/commitments/state_test.go new file mode 100644 index 000000000..d8581cec1 --- /dev/null +++ b/internal/scheduling/reservations/commitments/state_test.go @@ -0,0 +1,252 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Test helper: creates a minimal flavor group for testing +func testFlavorGroup() compute.FlavorGroupFeature { + return compute.FlavorGroupFeature{ + Name: "test-group", + Flavors: []compute.FlavorInGroup{ + {Name: "large", VCPUs: 16, MemoryMB: 32768, DiskGB: 100}, + {Name: "medium", VCPUs: 8, MemoryMB: 16384, DiskGB: 50}, + {Name: "small", VCPUs: 4, MemoryMB: 8192, DiskGB: 25}, + }, + SmallestFlavor: compute.FlavorInGroup{ + Name: "small", VCPUs: 4, MemoryMB: 8192, DiskGB: 25, + }, + LargestFlavor: compute.FlavorInGroup{ + Name: "large", VCPUs: 16, MemoryMB: 32768, DiskGB: 100, + }, + } +} + +func TestFromCommitment_CalculatesMemoryCorrectly(t *testing.T) { + flavorGroup := testFlavorGroup() + commitment := Commitment{ + UUID: "test-uuid", + ProjectID: "project-1", + ResourceName: "ram_test-group", + Amount: 5, // 5 multiples of smallest flavor + } + + state, err := FromCommitment(commitment, flavorGroup) + if err != nil { + 
t.Fatalf("unexpected error: %v", err) + } + + // Verify basic fields + if state.CommitmentUUID != "test-uuid" { + t.Errorf("expected UUID test-uuid, got %s", state.CommitmentUUID) + } + if state.ProjectID != "project-1" { + t.Errorf("expected ProjectID project-1, got %s", state.ProjectID) + } + if state.FlavorGroupName != "test-group" { + t.Errorf("expected FlavorGroupName test-group, got %s", state.FlavorGroupName) + } + + // Verify memory calculation: 5 * 8192 MB = 40960 MB = 42949672960 bytes + expectedMemory := int64(5 * 8192 * 1024 * 1024) + if state.TotalMemoryBytes != expectedMemory { + t.Errorf("expected memory %d, got %d", expectedMemory, state.TotalMemoryBytes) + } +} + +func TestFromCommitment_InvalidResourceName(t *testing.T) { + flavorGroup := testFlavorGroup() + commitment := Commitment{ + UUID: "test-uuid", + ProjectID: "project-1", + ResourceName: "invalid_resource_name", // missing "ram_" prefix + Amount: 1, + } + + _, err := FromCommitment(commitment, flavorGroup) + if err == nil { + t.Fatal("expected error for invalid resource name, got nil") + } +} + +func TestFromReservations_SumsMemoryCorrectly(t *testing.T) { + reservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), // 8 GiB + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), // 16 GiB + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + }, + }, + }, + } + + state, err := 
FromReservations(reservations) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Verify fields extracted from first reservation + if state.CommitmentUUID != "abc123" { + t.Errorf("expected UUID abc123, got %s", state.CommitmentUUID) + } + if state.ProjectID != "project-1" { + t.Errorf("expected ProjectID project-1, got %s", state.ProjectID) + } + if state.FlavorGroupName != "test-group" { + t.Errorf("expected FlavorGroupName test-group, got %s", state.FlavorGroupName) + } + + // Verify memory is summed correctly: 8 GiB + 16 GiB = 24 GiB + expectedMemory := int64(24 * 1024 * 1024 * 1024) + if state.TotalMemoryBytes != expectedMemory { + t.Errorf("expected memory %d, got %d", expectedMemory, state.TotalMemoryBytes) + } +} + +func TestFromReservations_EmptyList(t *testing.T) { + _, err := FromReservations([]v1alpha1.Reservation{}) + if err == nil { + t.Fatal("expected error for empty reservation list, got nil") + } +} + +func TestFromReservations_SkipsInconsistentFlavorGroup(t *testing.T) { + reservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "wrong-group", // Different flavor group + }, + }, + }, + } + + state, err := FromReservations(reservations) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Should only count first reservation 
with matching flavor group + expectedMemory := int64(8 * 1024 * 1024 * 1024) + if state.TotalMemoryBytes != expectedMemory { + t.Errorf("expected memory %d (ignoring inconsistent reservation), got %d", + expectedMemory, state.TotalMemoryBytes) + } +} + +func TestFromReservations_MixedCommitmentUUIDs(t *testing.T) { + reservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + }, + Spec: v1alpha1.ReservationSpec{ + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-xyz789-0", // Different commitment UUID + }, + Spec: v1alpha1.ReservationSpec{ + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + }, + }, + }, + } + + _, err := FromReservations(reservations) + if err == nil { + t.Fatal("expected error for mixed commitment UUIDs, got nil") + } +} + +func TestFromReservations_NonCommittedResourceType(t *testing.T) { + reservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, // Wrong type + }, + }, + } + + _, err := FromReservations(reservations) + if err == nil { + t.Fatal("expected error for non-CR reservation type, got nil") + } +} + +func TestGetFlavorGroupNameFromResource_Valid(t *testing.T) { + name, err := getFlavorGroupNameFromResource("ram_hana_medium_v2") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if name != "hana_medium_v2" { + t.Errorf("expected hana_medium_v2, got %s", name) + } +} + +func TestGetFlavorGroupNameFromResource_Invalid(t *testing.T) { + _, err := getFlavorGroupNameFromResource("invalid_resource") + if err == nil { + t.Fatal("expected error for invalid resource name, got nil") + } +} diff --git 
a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go index 970a44b26..b9e6fe3b4 100644 --- a/internal/scheduling/reservations/commitments/syncer.go +++ b/internal/scheduling/reservations/commitments/syncer.go @@ -5,24 +5,20 @@ package commitments import ( "context" - "errors" "fmt" - "slices" - "sort" "strings" - - ctrl "sigs.k8s.io/controller-runtime" + "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" - k8serrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/api/resource" - "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) var ( - syncLog = ctrl.Log.WithName("sync") // CreatorValue identifies reservations created by this syncer. CreatorValue = "commitments-syncer" ) @@ -35,13 +31,12 @@ type SyncerConfig struct { } type Syncer struct { - // Client to fetch commitments. + // Client to fetch commitments from Limes CommitmentsClient - // Client for the kubernetes API. + // Kubernetes client for CRD operations client.Client } -// Create a new compute reservation syncer. func NewSyncer(k8sClient client.Client) *Syncer { return &Syncer{ CommitmentsClient: NewCommitmentsClient(), @@ -49,233 +44,175 @@ func NewSyncer(k8sClient client.Client) *Syncer { } } -// Initialize the syncer. func (s *Syncer) Init(ctx context.Context, config SyncerConfig) error { - // Initialize the syncer. if err := s.CommitmentsClient.Init(ctx, s.Client, config); err != nil { return err } return nil } -// Helper struct to unify the commitment with metadata needed for reservation creation. -type resolvedCommitment struct { - Commitment - Flavor Flavor -} - -// Get all compute commitments that should be converted to reservations. 
-func (s *Syncer) resolveUnusedCommitments(ctx context.Context) ([]resolvedCommitment, error) { - // Get all data we need from the openstack services. +func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavorGroups map[string]compute.FlavorGroupFeature) ([]*CommitmentState, error) { allProjects, err := s.ListProjects(ctx) if err != nil { return nil, err } - flavors, err := s.ListFlavorsByName(ctx) - if err != nil { - return nil, err - } commitments, err := s.ListCommitmentsByID(ctx, allProjects...) if err != nil { return nil, err } - // Remove non-compute/non-instance commitments or commitments we can't resolve. - var resolvedCommitments []resolvedCommitment + // Filter for compute commitments with RAM flavor group resources + var commitmentStates []*CommitmentState for id, commitment := range commitments { if commitment.ServiceType != "compute" { - delete(commitments, id) - syncLog.Info("skipping non-compute commitment", "id", id, "serviceType", commitment.ServiceType) - continue - } - if !strings.HasPrefix(commitment.ResourceName, "instances_") { - syncLog.Info("skipping non-instance commitment", "id", id, "resourceName", commitment.ResourceName) - delete(commitments, id) + log.Info("skipping non-compute commitment", "id", id, "serviceType", commitment.ServiceType) continue } - flavorName := strings.TrimPrefix(commitment.ResourceName, "instances_") - flavor, ok := flavors[flavorName] - if !ok { - syncLog.Info("skipping commitment without known flavor", "id", id, "flavorName", flavorName) - delete(commitments, id) + if !strings.HasPrefix(commitment.ResourceName, commitmentResourceNamePrefix) { + log.Info("skipping non-RAM-flavor-group commitment", "id", id, "resourceName", commitment.ResourceName) continue } - // We only support cloud-hypervisor and qemu hypervisors for commitments. 
- hvType, ok := flavor.ExtraSpecs["capabilities:hypervisor_type"] - if !ok || !slices.Contains([]string{"ch", "qemu"}, strings.ToLower(hvType)) { - syncLog.Info("skipping commitment with unsupported hv type", "commitmentID", commitment.UUID, "hypervisorType", hvType) - delete(commitments, id) + + // Extract flavor group name from resource name + flavorGroupName, err := getFlavorGroupNameFromResource(commitment.ResourceName) + if err != nil { + log.Info("skipping commitment with invalid resource name", + "id", id, + "resourceName", commitment.ResourceName, + "error", err) continue } - resolvedCommitments = append(resolvedCommitments, resolvedCommitment{ - Commitment: commitment, - Flavor: flavor, - }) - } - // Remove all commitments which are currently actively in use by a vm. - projectsWithCommitments := make([]Project, 0, len(resolvedCommitments)) - projectIDs := make(map[string]bool) - for _, commitment := range resolvedCommitments { - projectIDs[commitment.ProjectID] = true - } - for _, project := range allProjects { - if _, exists := projectIDs[project.ID]; exists { - projectsWithCommitments = append(projectsWithCommitments, project) + // Validate flavor group exists in Knowledge + flavorGroup, exists := flavorGroups[flavorGroupName] + if !exists { + log.Info("skipping commitment with unknown flavor group", + "id", id, + "flavorGroup", flavorGroupName) + continue } - } - // List all servers, not only the active ones, like limes when it calculates - // subresource usage: https://github.com/sapcc/limes/blob/c146c82/internal/liquids/nova/subresources.go#L94 - servers, err := s.ListServersByProjectID(ctx, projectsWithCommitments...) 
- if err != nil { - return nil, err - } - sort.Slice(resolvedCommitments, func(i, j int) bool { - return resolvedCommitments[i].ID < resolvedCommitments[j].ID - }) - mappedServers := map[string]struct{}{} // Servers subtracted from a commitment - var unusedCommitments []resolvedCommitment - for _, commitment := range resolvedCommitments { - matchingServerCount := uint64(0) - activeServers, ok := servers[commitment.ProjectID] - if !ok || len(activeServers) == 0 { - // No active servers in this project, keep the commitment. - unusedCommitments = append(unusedCommitments, commitment) + // Skip commitments with empty UUID + if commitment.UUID == "" { + log.Info("skipping commitment with empty UUID", + "id", id) continue } - // Some active servers, subtract them from the commitment amount. - sort.Slice(activeServers, func(i, j int) bool { - return activeServers[i].ID < activeServers[j].ID - }) - for _, server := range activeServers { - if _, exists := mappedServers[server.ID]; exists { - // This server is already subtracted from another commitment. - continue - } - if server.FlavorName != commitment.Flavor.Name { - // This server is of a different flavor, skip it. 
- continue - } - mappedServers[server.ID] = struct{}{} - matchingServerCount++ - syncLog.Info("subtracting server from commitment", "commitmentID", commitment.UUID, "serverID", server.ID, "remainingAmount", commitment.Amount) - } - if matchingServerCount >= commitment.Amount { - syncLog.Info("skipping commitment that is fully used by active servers", "id", commitment.UUID, "project", commitment.ProjectID) + + // Convert commitment to state using FromCommitment + state, err := FromCommitment(commitment, flavorGroup) + if err != nil { + log.Error(err, "failed to convert commitment to state", + "id", id, + "uuid", commitment.UUID) continue } - commitment.Amount -= matchingServerCount - unusedCommitments = append(unusedCommitments, commitment) + + log.Info("resolved commitment to state", + "commitmentID", commitment.UUID, + "flavorGroup", flavorGroupName, + "amount", commitment.Amount, + "totalMemoryBytes", state.TotalMemoryBytes) + + commitmentStates = append(commitmentStates, state) } - return unusedCommitments, nil + return commitmentStates, nil } -// Fetch commitments and update/create reservations for each of them. +// SyncReservations fetches commitments from Limes and synchronizes Reservation CRDs. func (s *Syncer) SyncReservations(ctx context.Context) error { - // Get all commitments that should be converted to reservations. 
- // TODO keep all commitments, not only the unused ones, propagate allocation correctly - commitments, err := s.resolveUnusedCommitments(ctx) + // TODO handle concurrency with change API: consider creation time of reservations and status ready + + // Create logger with run ID for this sync execution + runID := fmt.Sprintf("sync-%d", time.Now().Unix()) + log := ctrl.Log.WithName("CommitmentSyncer").WithValues("runID", runID) + + log.Info("starting commitment sync") + + // Check if flavor group knowledge is ready + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: s.Client} + knowledgeCRD, err := knowledge.Get(ctx) if err != nil { - syncLog.Error(err, "failed to get compute commitments") + log.Error(err, "failed to check flavor group knowledge readiness") return err } - // Map commitments to reservations. - var reservationsByName = make(map[string]v1alpha1.Reservation) - for _, commitment := range commitments { - // Get only the 5 first characters from the uuid. This should be safe enough. - if len(commitment.UUID) < 5 { - err := errors.New("commitment UUID is too short") - syncLog.Error(err, "uuid is less than 5 characters", "uuid", commitment.UUID) - continue - } - commitmentUUIDShort := commitment.UUID[:5] - spec := v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeCommittedResource, - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity(int64(commitment.Flavor.RAM)*1024*1024, resource.BinarySI), - "cpu": *resource.NewQuantity(int64(commitment.Flavor.VCPUs), resource.DecimalSI), - // Disk is currently not considered. 
- }, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - ProjectID: commitment.ProjectID, - DomainID: commitment.DomainID, - ResourceName: commitment.Flavor.Name, - ResourceGroup: commitment.Flavor.ExtraSpecs["hw_version"], - Allocations: make(map[string]v1alpha1.CommittedResourceAllocation), - Creator: CreatorValue, - }, - } - for n := range commitment.Amount { // N instances - meta := ctrl.ObjectMeta{ - Name: fmt.Sprintf("commitment-%s-%d", commitmentUUIDShort, n), - } - if _, exists := reservationsByName[meta.Name]; exists { - syncLog.Error(errors.New("duplicate reservation name"), - "reservation name already exists", - "name", meta.Name, - "commitmentUUID", commitment.UUID, - ) - continue - } - reservationsByName[meta.Name] = v1alpha1.Reservation{ - ObjectMeta: meta, - Spec: spec, - } - } + if knowledgeCRD == nil { + log.Info("skipping commitment sync - flavor group knowledge not ready yet") + return nil } - // Create new reservations or update existing ones. - for _, res := range reservationsByName { - // Check if the reservation already exists. - nn := types.NamespacedName{Name: res.Name, Namespace: res.Namespace} - var existing v1alpha1.Reservation - if err := s.Get(ctx, nn, &existing); err != nil { - if !k8serrors.IsNotFound(err) { - syncLog.Error(err, "failed to get reservation", "name", nn.Name) - return err - } - // Reservation does not exist, create it. 
- if err := s.Create(ctx, &res); err != nil { - return err - } - syncLog.Info("created reservation", "name", nn.Name) + // Get flavor groups using the CRD we already fetched + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, knowledgeCRD) + if err != nil { + log.Error(err, "failed to get flavor groups from knowledge") + return err + } + + // Get all commitments as states + commitmentStates, err := s.getCommitmentStates(ctx, log, flavorGroups) + if err != nil { + log.Error(err, "failed to get compute commitments") + return err + } + + // Create ReservationManager to handle state application + manager := NewReservationManager(s.Client) + + // Apply each commitment state using the manager + for _, state := range commitmentStates { + log.Info("applying commitment state", + "commitmentUUID", state.CommitmentUUID, + "projectID", state.ProjectID, + "flavorGroup", state.FlavorGroupName, + "totalMemoryBytes", state.TotalMemoryBytes) + + _, _, err := manager.ApplyCommitmentState(ctx, log, state, flavorGroups, CreatorValue) + if err != nil { + log.Error(err, "failed to apply commitment state", + "commitmentUUID", state.CommitmentUUID) + // Continue with other commitments even if one fails continue } - // Reservation exists, update it. - old := existing.DeepCopy() - existing.Spec = res.Spec - patch := client.MergeFrom(old) - if err := s.Patch(ctx, &existing, patch); err != nil { - syncLog.Error(err, "failed to patch reservation", "name", nn.Name) - return err - } - syncLog.Info("updated reservation", "name", nn.Name) } - // Delete reservations that are not in the commitments anymore. 
+ // Delete reservations that are no longer in commitments + // Only query committed resource reservations using labels for efficiency var existingReservations v1alpha1.ReservationList - if err := s.List(ctx, &existingReservations); err != nil { - syncLog.Error(err, "failed to list existing reservations") + if err := s.List(ctx, &existingReservations, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + log.Error(err, "failed to list existing committed resource reservations") return err } + + // Build set of commitment UUIDs we should have + activeCommitments := make(map[string]bool) + for _, state := range commitmentStates { + activeCommitments[state.CommitmentUUID] = true + } + + // Delete reservations for commitments that no longer exist for _, existing := range existingReservations.Items { - // Only manage reservations created by this syncer (identified by Creator field). - if existing.Spec.CommittedResourceReservation == nil || - existing.Spec.CommittedResourceReservation.Creator != CreatorValue { + // Extract commitment UUID from reservation name + commitmentUUID := extractCommitmentUUID(existing.Name) + if commitmentUUID == "" { + log.Info("skipping reservation with unparseable name", "name", existing.Name) continue } - if _, found := reservationsByName[existing.Name]; !found { - // Reservation not found in commitments, delete it. 
+ + if !activeCommitments[commitmentUUID] { + // This commitment no longer exists, delete the reservation if err := s.Delete(ctx, &existing); err != nil { - syncLog.Error(err, "failed to delete reservation", "name", existing.Name) + log.Error(err, "failed to delete reservation", "name", existing.Name) return err } - syncLog.Info("deleted reservation", "name", existing.Name) + log.Info("deleted reservation for expired commitment", + "name", existing.Name, + "commitmentUUID", commitmentUUID) } } - syncLog.Info("synced reservations", "count", len(reservationsByName)) + log.Info("synced reservations", "commitmentCount", len(commitmentStates)) return nil } diff --git a/internal/scheduling/reservations/commitments/syncer_test.go b/internal/scheduling/reservations/commitments/syncer_test.go index 4db74801b..0790545e8 100644 --- a/internal/scheduling/reservations/commitments/syncer_test.go +++ b/internal/scheduling/reservations/commitments/syncer_test.go @@ -7,15 +7,89 @@ import ( "context" "testing" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" ) +// FlavorGroupData holds test data for creating a flavor group +type FlavorGroupData struct { + LargestFlavorName string + LargestFlavorVCPUs uint64 + LargestFlavorMemoryMB uint64 + SmallestFlavorName string + SmallestFlavorVCPUs uint64 + SmallestFlavorMemoryMB uint64 +} + +// createFlavorGroupKnowledge creates a Knowledge CRD with flavor group data for testing +func createFlavorGroupKnowledge(t *testing.T, groups map[string]FlavorGroupData) *v1alpha1.Knowledge { + t.Helper() + + // Build flavor group features + features := 
make([]compute.FlavorGroupFeature, 0, len(groups)) + for groupName, data := range groups { + features = append(features, compute.FlavorGroupFeature{ + Name: groupName, + Flavors: []compute.FlavorInGroup{ + { + Name: data.LargestFlavorName, + VCPUs: data.LargestFlavorVCPUs, + MemoryMB: data.LargestFlavorMemoryMB, + }, + { + Name: data.SmallestFlavorName, + VCPUs: data.SmallestFlavorVCPUs, + MemoryMB: data.SmallestFlavorMemoryMB, + }, + }, + LargestFlavor: compute.FlavorInGroup{ + Name: data.LargestFlavorName, + VCPUs: data.LargestFlavorVCPUs, + MemoryMB: data.LargestFlavorMemoryMB, + }, + SmallestFlavor: compute.FlavorInGroup{ + Name: data.SmallestFlavorName, + VCPUs: data.SmallestFlavorVCPUs, + MemoryMB: data.SmallestFlavorMemoryMB, + }, + }) + } + + // Box the features + rawFeatures, err := v1alpha1.BoxFeatureList(features) + if err != nil { + t.Fatalf("Failed to box flavor group features: %v", err) + } + + return &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flavor-groups", + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor_groups", + }, + }, + Status: v1alpha1.KnowledgeStatus{ + Raw: rawFeatures, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + Reason: "ExtractorSucceeded", + }, + }, + }, + } +} + // Mock CommitmentsClient for testing type mockCommitmentsClient struct { initFunc func(ctx context.Context, client client.Client, conf SyncerConfig) error @@ -123,19 +197,32 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { t.Fatalf("Failed to add scheme: %v", err) } + // Create flavor group knowledge CRD + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": { + LargestFlavorName: "test-flavor", + LargestFlavorVCPUs: 2, + LargestFlavorMemoryMB: 1024, + SmallestFlavorName: "test-flavor", + SmallestFlavorVCPUs: 2, + 
SmallestFlavorMemoryMB: 1024, + }, + }) + k8sClient := fake.NewClientBuilder(). WithScheme(scheme). + WithObjects(flavorGroupsKnowledge). Build() - // Create mock commitments with instance flavors + // Create mock commitments with flavor group resources (using ram_ prefix) mockCommitments := []Commitment{ { ID: 1, UUID: "12345-67890-abcdef", ServiceType: "compute", - ResourceName: "instances_test-flavor", + ResourceName: "ram_test_group_v1", AvailabilityZone: "az1", - Amount: 2, // 2 instances + Amount: 2, // 2 multiples of smallest flavor (2 * 1024MB = 2048MB total) Unit: "", ProjectID: "test-project-1", DomainID: "test-domain-1", @@ -150,23 +237,6 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { } return result, nil }, - listFlavorsByNameFunc: func(ctx context.Context) (map[string]Flavor, error) { - return map[string]Flavor{ - "test-flavor": { - ID: "flavor-1", - Name: "test-flavor", - RAM: 1024, // 1GB in MB - VCPUs: 2, - Disk: 20, // 20GB - ExtraSpecs: map[string]string{ - "hw:cpu_policy": "dedicated", - "hw:numa_nodes": "1", - "aggregate_instance_extra_specs:pinned": "true", - "capabilities:hypervisor_type": "qemu", - }, - }, - }, nil - }, listProjectsFunc: func(ctx context.Context) ([]Project, error) { return []Project{ {ID: "test-project-1", DomainID: "test-domain-1", Name: "Test Project 1"}, @@ -200,7 +270,7 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { return } - // Should have 2 reservations (Amount = 2) + // Should have 2 reservations (Amount = 2, each for smallest flavor) if len(reservations.Items) != 2 { t.Errorf("Expected 2 reservations, got %d", len(reservations.Items)) return @@ -216,11 +286,12 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { t.Errorf("Expected project ID test-project-1, got %v", res.Spec.CommittedResourceReservation.ProjectID) } - if res.Spec.CommittedResourceReservation.ResourceName != "test-flavor" { - t.Errorf("Expected flavor test-flavor, got %v", 
res.Spec.CommittedResourceReservation.ResourceName) + if res.Spec.CommittedResourceReservation.ResourceGroup != "test_group_v1" { + t.Errorf("Expected resource group test_group_v1, got %v", res.Spec.CommittedResourceReservation.ResourceGroup) } - // Check resource values + // Check resource values - should be sized for the flavor that fits + // With 2048MB total capacity, we can fit 2x 1024MB flavors expectedMemory := resource.MustParse("1073741824") // 1024MB in bytes if !res.Spec.Resources["memory"].Equal(expectedMemory) { t.Errorf("Expected memory %v, got %v", expectedMemory, res.Spec.Resources["memory"]) @@ -238,17 +309,34 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { t.Fatalf("Failed to add scheme: %v", err) } - // Create an existing reservation + // Create flavor group knowledge CRD + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "new_group_v1": { + LargestFlavorName: "new-flavor", + LargestFlavorVCPUs: 4, + LargestFlavorMemoryMB: 2048, + SmallestFlavorName: "new-flavor-small", + SmallestFlavorVCPUs: 2, + SmallestFlavorMemoryMB: 1024, + }, + }) + + // Create an existing reservation with mismatched project/flavor group + // The ReservationManager will delete this and create a new one existingReservation := &v1alpha1.Reservation{ ObjectMeta: ctrl.ObjectMeta{ - Name: "commitment-12345-0", // Instance commitments have -0 suffix + Name: "commitment-12345-67890-abcdef-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, }, Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeCommittedResource, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - ProjectID: "old-project", - ResourceName: "old-flavor", - Creator: CreatorValue, + ProjectID: "old-project", + ResourceName: "old-flavor", + ResourceGroup: "old_group", + Creator: CreatorValue, }, Resources: map[string]resource.Quantity{ "memory": 
resource.MustParse("512Mi"), @@ -259,16 +347,16 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { k8sClient := fake.NewClientBuilder(). WithScheme(scheme). - WithObjects(existingReservation). + WithObjects(existingReservation, flavorGroupsKnowledge). Build() - // Create mock commitment that should update the existing reservation + // Create mock commitment that will replace the existing reservation mockCommitments := []Commitment{ { ID: 1, UUID: "12345-67890-abcdef", ServiceType: "compute", - ResourceName: "instances_new-flavor", + ResourceName: "ram_new_group_v1", AvailabilityZone: "az1", Amount: 1, Unit: "", @@ -285,23 +373,6 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { } return result, nil }, - listFlavorsByNameFunc: func(ctx context.Context) (map[string]Flavor, error) { - return map[string]Flavor{ - "new-flavor": { - ID: "flavor-2", - Name: "new-flavor", - RAM: 2048, // 2GB in MB - VCPUs: 4, - Disk: 40, // 40GB - ExtraSpecs: map[string]string{ - "hw:cpu_policy": "shared", - "hw:numa_nodes": "2", - "aggregate_instance_extra_specs:pinned": "false", - "capabilities:hypervisor_type": "qemu", - }, - }, - }, nil - }, listProjectsFunc: func(ctx context.Context) ([]Project, error) { return []Project{ {ID: "new-project", DomainID: "new-domain", Name: "New Project"}, @@ -327,45 +398,66 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { return } - // Verify that the reservation was updated - var updatedReservation v1alpha1.Reservation - err = k8sClient.Get(context.Background(), client.ObjectKey{Name: "commitment-12345-0"}, &updatedReservation) + // Verify that reservations were updated (old one deleted, new one created) + // The new reservation will be at index 0 since the old one was deleted first + var reservations v1alpha1.ReservationList + err = k8sClient.List(context.Background(), &reservations) if err != nil { - t.Errorf("Failed to get updated reservation: %v", err) + t.Errorf("Failed to list reservations: %v", 
err) + return + } + + if len(reservations.Items) != 1 { + t.Errorf("Expected 1 reservation, got %d", len(reservations.Items)) return } - // Verify the reservation was updated with new values - if updatedReservation.Spec.CommittedResourceReservation == nil { + newReservation := reservations.Items[0] + + // Verify the new reservation has correct values + if newReservation.Spec.CommittedResourceReservation == nil { t.Errorf("Expected CommittedResourceReservation to be set") return } - if updatedReservation.Spec.CommittedResourceReservation.ProjectID != "new-project" { - t.Errorf("Expected project ID new-project, got %v", updatedReservation.Spec.CommittedResourceReservation.ProjectID) + if newReservation.Spec.CommittedResourceReservation.ProjectID != "new-project" { + t.Errorf("Expected project ID new-project, got %v", newReservation.Spec.CommittedResourceReservation.ProjectID) } - if updatedReservation.Spec.CommittedResourceReservation.ResourceName != "new-flavor" { - t.Errorf("Expected flavor new-flavor, got %v", updatedReservation.Spec.CommittedResourceReservation.ResourceName) + if newReservation.Spec.CommittedResourceReservation.ResourceGroup != "new_group_v1" { + t.Errorf("Expected resource group new_group_v1, got %v", newReservation.Spec.CommittedResourceReservation.ResourceGroup) } } -func TestSyncer_SyncReservations_ShortUUID(t *testing.T) { +func TestSyncer_SyncReservations_EmptyUUID(t *testing.T) { scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { t.Fatalf("Failed to add scheme: %v", err) } + // Create flavor group knowledge CRD + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": { + LargestFlavorName: "test-flavor", + LargestFlavorVCPUs: 2, + LargestFlavorMemoryMB: 1024, + SmallestFlavorName: "test-flavor", + SmallestFlavorVCPUs: 2, + SmallestFlavorMemoryMB: 1024, + }, + }) + k8sClient := fake.NewClientBuilder(). WithScheme(scheme). + WithObjects(flavorGroupsKnowledge). 
Build() - // Create mock commitment with short UUID (should be skipped) + // Create mock commitment with empty UUID (should be skipped) mockCommitments := []Commitment{ { ID: 1, - UUID: "123", // Too short + UUID: "", // Empty UUID ServiceType: "compute", - ResourceName: "instances_test-flavor", + ResourceName: "ram_test_group_v1", AvailabilityZone: "az1", Amount: 1, Unit: "", @@ -382,23 +474,6 @@ func TestSyncer_SyncReservations_ShortUUID(t *testing.T) { } return result, nil }, - listFlavorsByNameFunc: func(ctx context.Context) (map[string]Flavor, error) { - return map[string]Flavor{ - "test-flavor": { - ID: "flavor-1", - Name: "test-flavor", - RAM: 1024, // 1GB in MB - VCPUs: 2, - Disk: 20, // 20GB - ExtraSpecs: map[string]string{ - "hw:cpu_policy": "dedicated", - "hw:numa_nodes": "1", - "aggregate_instance_extra_specs:pinned": "true", - "capabilities:hypervisor_type": "qemu", - }, - }, - }, nil - }, listProjectsFunc: func(ctx context.Context) ([]Project, error) { return []Project{ {ID: "test-project", DomainID: "test-domain", Name: "Test Project"}, @@ -424,7 +499,7 @@ func TestSyncer_SyncReservations_ShortUUID(t *testing.T) { return } - // Verify that no reservations were created due to short UUID + // Verify that no reservations were created due to empty UUID var reservations v1alpha1.ReservationList err = k8sClient.List(context.Background(), &reservations) if err != nil { @@ -433,6 +508,6 @@ func TestSyncer_SyncReservations_ShortUUID(t *testing.T) { } if len(reservations.Items) != 0 { - t.Errorf("Expected 0 reservations due to short UUID, got %d", len(reservations.Items)) + t.Errorf("Expected 0 reservations due to empty UUID, got %d", len(reservations.Items)) } } diff --git a/internal/scheduling/reservations/commitments/utils.go b/internal/scheduling/reservations/commitments/utils.go new file mode 100644 index 000000000..0afb3ab67 --- /dev/null +++ b/internal/scheduling/reservations/commitments/utils.go @@ -0,0 +1,46 @@ +// Copyright SAP SE +// 
SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "strconv" + "strings" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" +) + +func GetMaxSlotIndex(reservations []v1alpha1.Reservation) int { + maxIndex := -1 + for _, res := range reservations { + // Parse slot index from name: "commitment--" + parts := strings.Split(res.Name, "-") + if len(parts) >= 3 { + if index, err := strconv.Atoi(parts[len(parts)-1]); err == nil { + if index > maxIndex { + maxIndex = index + } + } + } + } + return maxIndex +} + +// Always continue counting slots from max, instead of filling gaps. +func GetNextSlotIndex(reservations []v1alpha1.Reservation) int { + maxIndex := GetMaxSlotIndex(reservations) + return maxIndex + 1 +} + +// extractCommitmentUUID parses UUID from reservation name (commitment--). +func extractCommitmentUUID(name string) string { + // Remove "commitment-" prefix + withoutPrefix := strings.TrimPrefix(name, "commitment-") + // Split by "-" and take all but the last part (which is the slot index) + parts := strings.Split(withoutPrefix, "-") + if len(parts) > 1 { + // Rejoin all parts except the last one (slot index) + return strings.Join(parts[:len(parts)-1], "-") + } + return withoutPrefix +} diff --git a/internal/scheduling/reservations/commitments/utils_test.go b/internal/scheduling/reservations/commitments/utils_test.go new file mode 100644 index 000000000..b16268b2f --- /dev/null +++ b/internal/scheduling/reservations/commitments/utils_test.go @@ -0,0 +1,84 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestGetMaxSlotIndex_FindsHighestIndex(t *testing.T) { + reservations := []v1alpha1.Reservation{ + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123-0"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123-5"}}, + {ObjectMeta: metav1.ObjectMeta{Name: 
"commitment-abc123-2"}}, + } + + maxIndex := GetMaxSlotIndex(reservations) + if maxIndex != 5 { + t.Errorf("expected max index 5, got %d", maxIndex) + } +} + +func TestGetMaxSlotIndex_EmptyList(t *testing.T) { + maxIndex := GetMaxSlotIndex([]v1alpha1.Reservation{}) + if maxIndex != -1 { + t.Errorf("expected -1 for empty list, got %d", maxIndex) + } +} + +func TestGetMaxSlotIndex_InvalidNames(t *testing.T) { + reservations := []v1alpha1.Reservation{ + {ObjectMeta: metav1.ObjectMeta{Name: "invalid-name"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123"}}, // Missing slot index + } + + maxIndex := GetMaxSlotIndex(reservations) + if maxIndex != -1 { + t.Errorf("expected -1 when no valid indices found, got %d", maxIndex) + } +} + +func TestGetNextSlotIndex_IncrementsByOne(t *testing.T) { + reservations := []v1alpha1.Reservation{ + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123-0"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123-3"}}, + } + + nextIndex := GetNextSlotIndex(reservations) + if nextIndex != 4 { + t.Errorf("expected next index 4, got %d", nextIndex) + } +} + +func TestGetNextSlotIndex_EmptyList(t *testing.T) { + nextIndex := GetNextSlotIndex([]v1alpha1.Reservation{}) + if nextIndex != 0 { + t.Errorf("expected 0 for empty list, got %d", nextIndex) + } +} + +func TestExtractCommitmentUUID_SimpleUUID(t *testing.T) { + uuid := extractCommitmentUUID("commitment-abc123-0") + if uuid != "abc123" { + t.Errorf("expected abc123, got %s", uuid) + } +} + +func TestExtractCommitmentUUID_ComplexUUID(t *testing.T) { + // UUID with dashes (like standard UUID format) + uuid := extractCommitmentUUID("commitment-550e8400-e29b-41d4-a716-446655440000-5") + if uuid != "550e8400-e29b-41d4-a716-446655440000" { + t.Errorf("expected full UUID, got %s", uuid) + } +} + +func TestExtractCommitmentUUID_NoSlotIndex(t *testing.T) { + uuid := extractCommitmentUUID("commitment-abc123") + if uuid != "abc123" { + t.Errorf("expected abc123, got %s", uuid) + } +} 
diff --git a/internal/scheduling/reservations/controller/client.go b/internal/scheduling/reservations/controller/client.go deleted file mode 100644 index a57428dc9..000000000 --- a/internal/scheduling/reservations/controller/client.go +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package controller - -import ( - "context" - "encoding/json" - "fmt" - "net/http" - - "github.com/cobaltcore-dev/cortex/pkg/keystone" - "github.com/cobaltcore-dev/cortex/pkg/sso" - "github.com/gophercloud/gophercloud/v2" - "github.com/sapcc/go-bits/must" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -var ( - syncLog = ctrl.Log.WithName("sync") -) - -// OpenStack hypervisor model as returned by the Nova API under /os-hypervisors/detail. -// See: https://docs.openstack.org/api-ref/compute/#list-hypervisors-details -type Hypervisor struct { - ID string `json:"id"` - Hostname string `json:"hypervisor_hostname"` - Service struct { - Host string `json:"host"` - } `json:"service"` - Type string `json:"hypervisor_type"` -} - -// Client to fetch hypervisor data. -type HypervisorClient interface { - // Init the client. - Init(ctx context.Context, client client.Client, conf Config) error - // List all hypervisors. - ListHypervisors(ctx context.Context) ([]Hypervisor, error) -} - -// Hypervisor client fetching commitments from openstack services. -type hypervisorClient struct { - // Providerclient authenticated against openstack. - provider *gophercloud.ProviderClient - // Nova service client for OpenStack. - nova *gophercloud.ServiceClient -} - -// Create a new hypervisor client. -// By default, this client will fetch hypervisors from the nova API. -func NewHypervisorClient() HypervisorClient { - return &hypervisorClient{} -} - -// Init the client. 
-func (c *hypervisorClient) Init(ctx context.Context, client client.Client, conf Config) error { - var authenticatedHTTP = http.DefaultClient - if conf.SSOSecretRef != nil { - var err error - authenticatedHTTP, err = sso.Connector{Client: client}. - FromSecretRef(ctx, *conf.SSOSecretRef) - if err != nil { - return err - } - } - authenticatedKeystone, err := keystone. - Connector{Client: client, HTTPClient: authenticatedHTTP}. - FromSecretRef(ctx, conf.KeystoneSecretRef) - if err != nil { - return err - } - // Automatically fetch the nova endpoint from the keystone service catalog. - c.provider = authenticatedKeystone.Client() - - // Get the nova endpoint. - url := must.Return(c.provider.EndpointLocator(gophercloud.EndpointOpts{ - Type: "compute", - Availability: "public", - })) - syncLog.Info("using nova endpoint", "url", url) - c.nova = &gophercloud.ServiceClient{ - ProviderClient: c.provider, - Endpoint: url, - Type: "compute", - Microversion: "2.61", - } - return nil -} - -func (c *hypervisorClient) ListHypervisors(ctx context.Context) ([]Hypervisor, error) { - // Note: currently we need to fetch this without gophercloud. - // Gophercloud will just assume the request is a single page even when - // the response is paginated, returning only the first page. 
- initialURL := c.nova.Endpoint + "os-hypervisors/detail" - var nextURL = &initialURL - var hypervisors []Hypervisor - for nextURL != nil { - req, err := http.NewRequestWithContext(ctx, http.MethodGet, *nextURL, http.NoBody) - if err != nil { - return nil, err - } - req.Header.Set("X-Auth-Token", c.provider.Token()) - req.Header.Set("X-OpenStack-Nova-API-Version", c.nova.Microversion) - resp, err := c.nova.HTTPClient.Do(req) - if err != nil { - return nil, err - } - defer resp.Body.Close() - if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode) - } - var list struct { - Hypervisors []Hypervisor `json:"hypervisors"` - Links []struct { - Rel string `json:"rel"` - Href string `json:"href"` - } `json:"hypervisors_links"` - } - err = json.NewDecoder(resp.Body).Decode(&list) - if err != nil { - return nil, err - } - hypervisors = append(hypervisors, list.Hypervisors...) - nextURL = nil - for _, link := range list.Links { - if link.Rel == "next" { - nextURL = &link.Href - break - } - } - } - return hypervisors, nil -} diff --git a/internal/scheduling/reservations/controller/client_test.go b/internal/scheduling/reservations/controller/client_test.go deleted file mode 100644 index f2b5582bc..000000000 --- a/internal/scheduling/reservations/controller/client_test.go +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package controller - -import ( - "context" - - "sigs.k8s.io/controller-runtime/pkg/client" -) - -type mockHypervisorClient struct { - hypervisorsToReturn []Hypervisor - errToReturn error -} - -func (m *mockHypervisorClient) Init(ctx context.Context, client client.Client, conf Config) error { - return nil -} - -func (m *mockHypervisorClient) ListHypervisors(ctx context.Context) ([]Hypervisor, error) { - return m.hypervisorsToReturn, m.errToReturn -} diff --git a/internal/scheduling/reservations/controller/controller.go 
b/internal/scheduling/reservations/controller/controller.go index 4eae4cfc2..17177770c 100644 --- a/internal/scheduling/reservations/controller/controller.go +++ b/internal/scheduling/reservations/controller/controller.go @@ -10,6 +10,7 @@ import ( "fmt" "net/http" "strings" + "time" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" @@ -23,10 +24,21 @@ import ( schedulerdelegationapi "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/cobaltcore-dev/cortex/pkg/multicluster" corev1 "k8s.io/api/core/v1" ) +const ( + // RequeueIntervalActive is the interval for requeueing active reservations for verification. + RequeueIntervalActive = 5 * time.Minute + // RequeueIntervalRetry is the interval for requeueing when retrying after knowledge is not ready. + RequeueIntervalRetry = 1 * time.Minute +) + // Endpoints for the reservations operator. type EndpointsConfig struct { // The nova external scheduler endpoint. @@ -42,18 +54,21 @@ type Config struct { // Secret ref to keystone credentials stored in a k8s secret. KeystoneSecretRef corev1.SecretReference `json:"keystoneSecretRef"` + + // Secret ref to the database credentials for querying VM state. + DatabaseSecretRef *corev1.SecretReference `json:"databaseSecretRef,omitempty"` } // ReservationReconciler reconciles a Reservation object type ReservationReconciler struct { - // Client to fetch hypervisors. - HypervisorClient // Client for the kubernetes API. client.Client // Kubernetes scheme to use for the reservations. Scheme *runtime.Scheme // Configuration for the controller. 
Conf Config + // Database connection for querying VM state from Knowledge cache. + DB *db.DB } // Reconcile is part of the main kubernetes reconciliation loop which aims to @@ -63,16 +78,60 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Fetch the reservation object. var res v1alpha1.Reservation if err := r.Get(ctx, req.NamespacedName, &res); err != nil { - // Can happen when the resource was just deleted. - return ctrl.Result{}, err + // Ignore not-found errors, since they can't be fixed by an immediate requeue + return ctrl.Result{}, client.IgnoreNotFound(err) } - // If the reservation is already active (Ready=True), skip it. + if meta.IsStatusConditionTrue(res.Status.Conditions, v1alpha1.ReservationConditionReady) { - log.Info("reservation is already active, skipping", "reservation", req.Name) - return ctrl.Result{}, nil // Don't need to requeue. + log.Info("reservation is active, verifying allocations", "reservation", req.Name) + + // Verify all allocations in Spec against actual VM state from database + if err := r.reconcileAllocations(ctx, &res); err != nil { + log.Error(err, "failed to reconcile allocations") + return ctrl.Result{}, err + } + + // Requeue periodically to keep verifying allocations + return ctrl.Result{RequeueAfter: RequeueIntervalActive}, nil } - // Sync Spec values to Status fields + // TODO trigger re-placement of unused reservations over time + + // Check if this is a pre-allocated reservation with allocations + if res.Spec.CommittedResourceReservation != nil && + len(res.Spec.CommittedResourceReservation.Allocations) > 0 && + res.Spec.TargetHost != "" { + // mark as ready without calling the placement API + log.Info("detected pre-allocated reservation", + "reservation", req.Name, + "targetHost", res.Spec.TargetHost, + "allocatedVMs", len(res.Spec.CommittedResourceReservation.Allocations)) + + old := res.DeepCopy() + res.Status.Host = res.Spec.TargetHost + 
meta.SetStatusCondition(&res.Status.Conditions, metav1.Condition{ + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionTrue, + Reason: "PreAllocated", + Message: "reservation pre-allocated with VM allocations", + }) + patch := client.MergeFrom(old) + if err := r.Status().Patch(ctx, &res, patch); err != nil { + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to patch pre-allocated reservation status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil + } + + log.Info("marked pre-allocated reservation as ready", "reservation", req.Name, "host", res.Status.Host) + // Requeue immediately to run verification in next reconcile loop + return ctrl.Result{Requeue: true}, nil + } + + // Sync Spec values to Status fields for non-pre-allocated reservations // This ensures the observed state reflects the desired state from Spec needsStatusUpdate := false if res.Spec.TargetHost != "" && res.Status.Host != res.Spec.TargetHost { @@ -83,13 +142,18 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) old := res.DeepCopy() patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { - log.Error(err, "failed to sync spec to status") - return ctrl.Result{}, err + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to sync spec to status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil } log.Info("synced spec to status", "reservation", req.Name, "host", res.Status.Host) } - // Currently we can only reconcile nova CommittedResourceReservations (those with ResourceName set). 
+ // filter for CR reservations resourceName := "" if res.Spec.CommittedResourceReservation != nil { resourceName = res.Spec.CommittedResourceReservation.ResourceName @@ -105,8 +169,13 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) }) patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { - log.Error(err, "failed to patch reservation status") - return ctrl.Result{}, err + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to patch reservation status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil } return ctrl.Result{}, nil // Don't need to requeue. } @@ -130,49 +199,67 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) cpu = uint64(cpuValue) } - // Get all hosts and assign zero-weights to them. - hypervisors, err := r.ListHypervisors(ctx) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to list hypervisors: %w", err) + // Get project ID from CommittedResourceReservation spec if available. 
+ projectID := "" + if res.Spec.CommittedResourceReservation != nil { + projectID = res.Spec.CommittedResourceReservation.ProjectID } - var eligibleHosts []schedulerdelegationapi.ExternalSchedulerHost - for _, hv := range hypervisors { - eligibleHosts = append(eligibleHosts, schedulerdelegationapi.ExternalSchedulerHost{ - ComputeHost: hv.Service.Host, - HypervisorHostname: hv.Hostname, - }) + + // Get AvailabilityZone from reservation if available + availabilityZone := "" + if res.Spec.AvailabilityZone != "" { + availabilityZone = res.Spec.AvailabilityZone } - if len(eligibleHosts) == 0 { - log.Info("no eligible hosts found for reservation", "reservation", req.Name) - return ctrl.Result{}, errors.New("no eligible hosts found for reservation") + + // Get flavor details from flavor group knowledge CRD + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: r.Client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + log.Info("flavor knowledge not ready, requeueing", + "resourceName", resourceName, + "error", err) + return ctrl.Result{RequeueAfter: RequeueIntervalRetry}, nil } - weights := make(map[string]float64, len(eligibleHosts)) - for _, host := range eligibleHosts { - weights[host.ComputeHost] = 0.0 + + // Search for the flavor across all flavor groups + var flavorDetails *compute.FlavorInGroup + for _, fg := range flavorGroups { + for _, flavor := range fg.Flavors { + if flavor.Name == resourceName { + flavorDetails = &flavor + break + } + } + if flavorDetails != nil { + break + } } - // Get project ID from CommittedResourceReservation spec if available. 
- projectID := "" - if res.Spec.CommittedResourceReservation != nil { - projectID = res.Spec.CommittedResourceReservation.ProjectID + // Check if flavor was found + if flavorDetails == nil { + log.Error(errors.New("flavor not found"), "flavor not found in any flavor group", + "resourceName", resourceName) + return ctrl.Result{RequeueAfter: 5 * time.Minute}, nil } // Call the external scheduler delegation API to get a host for the reservation. + // Cortex will fetch candidate hosts and weights itself from its knowledge state. externalSchedulerRequest := schedulerdelegationapi.ExternalSchedulerRequest{ Reservation: true, - Hosts: eligibleHosts, - Weights: weights, Spec: schedulerdelegationapi.NovaObject[schedulerdelegationapi.NovaSpec]{ Data: schedulerdelegationapi.NovaSpec{ - InstanceUUID: res.Name, - NumInstances: 1, // One for each reservation. - ProjectID: projectID, + InstanceUUID: res.Name, + NumInstances: 1, // One for each reservation. + ProjectID: projectID, + AvailabilityZone: availabilityZone, Flavor: schedulerdelegationapi.NovaObject[schedulerdelegationapi.NovaFlavor]{ Data: schedulerdelegationapi.NovaFlavor{ - Name: resourceName, - MemoryMB: memoryMB, - VCPUs: cpu, + Name: flavorDetails.Name, + MemoryMB: memoryMB, // take the memory from the reservation spec, not from the flavor - reservation might be bigger + VCPUs: cpu, // take the cpu from the reservation spec, not from the flavor - reservation might be bigger + ExtraSpecs: flavorDetails.ExtraSpecs, // Disk is currently not considered. 
+ }, }, }, @@ -187,13 +274,26 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) } response, err := httpClient.Post(url, "application/json", strings.NewReader(string(reqBody))) if err != nil { - log.Error(err, "failed to send external scheduler request") + log.Error(err, "failed to send external scheduler request", "url", url) return ctrl.Result{}, err } defer response.Body.Close() + + // Check HTTP status code before attempting to decode JSON + if response.StatusCode != http.StatusOK { + err := fmt.Errorf("unexpected HTTP status code: %d", response.StatusCode) + log.Error(err, "external scheduler returned non-OK status", + "url", url, + "statusCode", response.StatusCode, + "status", response.Status) + return ctrl.Result{}, err + } + var externalSchedulerResponse schedulerdelegationapi.ExternalSchedulerResponse if err := json.NewDecoder(response.Body).Decode(&externalSchedulerResponse); err != nil { - log.Error(err, "failed to decode external scheduler response") + log.Error(err, "failed to decode external scheduler response", + "url", url, + "statusCode", response.StatusCode) return ctrl.Result{}, err } if len(externalSchedulerResponse.Hosts) == 0 { @@ -207,8 +307,13 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) }) patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { - log.Error(err, "failed to patch reservation status") - return ctrl.Result{}, err + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to patch reservation status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil } return ctrl.Result{}, nil // No need to requeue, we didn't find a host. 
} @@ -226,12 +331,141 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) res.Status.Host = host patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { - log.Error(err, "failed to patch reservation status") - return ctrl.Result{}, err + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to patch reservation status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil } return ctrl.Result{}, nil // No need to requeue, the reservation is now active. } +// reconcileAllocations verifies all allocations in Spec against actual Nova VM state. +// It updates Status.Allocations based on the actual host location of each VM. +func (r *ReservationReconciler) reconcileAllocations(ctx context.Context, res *v1alpha1.Reservation) error { + log := logf.FromContext(ctx) + + // Skip if no CommittedResourceReservation + if res.Spec.CommittedResourceReservation == nil { + return nil + } + + // TODO trigger migrations of unused reservations (to PAYG VMs) + + // Skip if no allocations to verify + if len(res.Spec.CommittedResourceReservation.Allocations) == 0 { + log.Info("no allocations to verify", "reservation", res.Name) + return nil + } + + // Query all VMs for this project from the database + projectID := res.Spec.CommittedResourceReservation.ProjectID + serverMap, err := r.listServersByProjectID(ctx, projectID) + if err != nil { + return fmt.Errorf("failed to list servers for project %s: %w", projectID, err) + } + + // initialize + if res.Status.CommittedResourceReservation == nil { + res.Status.CommittedResourceReservation = &v1alpha1.CommittedResourceReservationStatus{} + } + + // Build new Status.Allocations map based on actual VM locations + newStatusAllocations := make(map[string]string) + + for vmUUID := range res.Spec.CommittedResourceReservation.Allocations { + server, exists := 
serverMap[vmUUID] + if exists { + // VM found - record its actual host location + actualHost := server.OSEXTSRVATTRHost + newStatusAllocations[vmUUID] = actualHost + + log.Info("verified VM allocation", + "vm", vmUUID, + "reservation", res.Name, + "actualHost", actualHost, + "expectedHost", res.Status.Host) + } else { + // VM not found in database + log.Info("VM not found in database", + "vm", vmUUID, + "reservation", res.Name, + "projectID", projectID) + + // TODO handle entering and leave event + } + } + + // Patch the reservation status + old := res.DeepCopy() + + // Update Status.Allocations + res.Status.CommittedResourceReservation.Allocations = newStatusAllocations + + patch := client.MergeFrom(old) + if err := r.Status().Patch(ctx, res, patch); err != nil { + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) == nil { + // Object was deleted, no need to continue + return nil + } + return fmt.Errorf("failed to patch reservation status: %w", err) + } + + log.Info("reconciled allocations", + "reservation", res.Name, + "specAllocations", len(res.Spec.CommittedResourceReservation.Allocations), + "statusAllocations", len(newStatusAllocations)) + + return nil +} + +// Init initializes the reconciler with required clients and DB connection. +func (r *ReservationReconciler) Init(ctx context.Context, client client.Client, conf Config) error { + // Initialize database connection if DatabaseSecretRef is provided. 
+ if conf.DatabaseSecretRef != nil { + var err error + r.DB, err = db.Connector{Client: client}.FromSecretRef(ctx, *conf.DatabaseSecretRef) + if err != nil { + return fmt.Errorf("failed to initialize database connection: %w", err) + } + logf.FromContext(ctx).Info("database connection initialized for reservation controller") + } + + return nil +} + +func (r *ReservationReconciler) listServersByProjectID(ctx context.Context, projectID string) (map[string]*nova.Server, error) { + if r.DB == nil { + return nil, errors.New("database connection not initialized") + } + + log := logf.FromContext(ctx) + + // Query servers from the database cache. + var servers []nova.Server + _, err := r.DB.Select(&servers, + "SELECT * FROM openstack_servers WHERE tenant_id = $1", + projectID) + if err != nil { + return nil, fmt.Errorf("failed to query servers from database: %w", err) + } + + log.V(1).Info("queried servers from database", + "projectID", projectID, + "serverCount", len(servers)) + + // Build lookup map for O(1) access by VM UUID. + serverMap := make(map[string]*nova.Server, len(servers)) + for i := range servers { + serverMap[servers[i].ID] = &servers[i] + } + + return serverMap, nil +} + // SetupWithManager sets up the controller with the Manager. 
func (r *ReservationReconciler) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error { if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { diff --git a/internal/scheduling/reservations/controller/controller_test.go b/internal/scheduling/reservations/controller/controller_test.go index d716c0b63..548857d3a 100644 --- a/internal/scheduling/reservations/controller/controller_test.go +++ b/internal/scheduling/reservations/controller/controller_test.go @@ -36,7 +36,7 @@ func TestReservationReconciler_Reconcile(t *testing.T) { shouldRequeue bool }{ { - name: "skip already active reservation", + name: "expect already active reservation", reservation: &v1alpha1.Reservation{ ObjectMeta: ctrl.ObjectMeta{ Name: "test-reservation", @@ -59,7 +59,7 @@ func TestReservationReconciler_Reconcile(t *testing.T) { }, }, expectedReady: true, - shouldRequeue: false, + shouldRequeue: true, }, { name: "skip reservation without resource name", @@ -155,10 +155,71 @@ func TestReservationReconciler_reconcileInstanceReservation_Success(t *testing.T }, } + // Create flavor group knowledge CRD for the test + // Need to import compute package for FlavorGroupFeature + flavorGroups := []struct { + Name string `json:"name"` + Flavors []struct { + Name string `json:"name"` + MemoryMB uint64 `json:"memoryMB"` + VCPUs uint64 `json:"vcpus"` + ExtraSpecs map[string]string `json:"extraSpecs"` + } `json:"flavors"` + }{ + { + Name: "test-group", + Flavors: []struct { + Name string `json:"name"` + MemoryMB uint64 `json:"memoryMB"` + VCPUs uint64 `json:"vcpus"` + ExtraSpecs map[string]string `json:"extraSpecs"` + }{ + { + Name: "test-flavor", + MemoryMB: 1024, + VCPUs: 2, + ExtraSpecs: map[string]string{}, + }, + }, + }, + } + + // Marshal flavor groups into runtime.RawExtension + flavorGroupsJSON, err := json.Marshal(map[string]interface{}{ + "features": flavorGroups, + }) + if err != nil { + t.Fatalf("Failed to marshal flavor groups: %v", err) + } + + flavorGroupKnowledge 
:= &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flavor-groups", + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor_groups", // Note: underscore not hyphen + }, + Recency: metav1.Duration{Duration: 0}, + }, + Status: v1alpha1.KnowledgeStatus{ + Raw: runtime.RawExtension{Raw: flavorGroupsJSON}, + RawLength: 1, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + Reason: "TestReady", + }, + }, + }, + } + client := fake.NewClientBuilder(). WithScheme(scheme). - WithObjects(reservation). - WithStatusSubresource(&v1alpha1.Reservation{}). + WithObjects(reservation, flavorGroupKnowledge). + WithStatusSubresource(&v1alpha1.Reservation{}, &v1alpha1.Knowledge{}). Build() // Create a mock server that returns a successful response @@ -196,28 +257,6 @@ func TestReservationReconciler_reconcileInstanceReservation_Success(t *testing.T Client: client, Scheme: scheme, Conf: config, - HypervisorClient: &mockHypervisorClient{ - hypervisorsToReturn: []Hypervisor{ - { - Hostname: "test-host-1", - Type: "qemu", - Service: struct { - Host string `json:"host"` - }{ - Host: "compute1", - }, - }, - { - Hostname: "test-host-2", - Type: "qemu", - Service: struct { - Host string `json:"host"` - }{ - Host: "compute2", - }, - }, - }, - }, } req := ctrl.Request{ diff --git a/internal/scheduling/reservations/flavor_groups.go b/internal/scheduling/reservations/flavor_groups.go new file mode 100644 index 000000000..197406eac --- /dev/null +++ b/internal/scheduling/reservations/flavor_groups.go @@ -0,0 +1,74 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package reservations + +import ( + "context" + "errors" + "fmt" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "k8s.io/apimachinery/pkg/api/meta" + 
"k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// FlavorGroupKnowledgeClient accesses flavor group data from Knowledge CRDs. +type FlavorGroupKnowledgeClient struct { + client.Client +} + +// Get retrieves the flavor groups Knowledge CRD and returns it if ready. +// Returns nil, nil if not ready yet. +func (c *FlavorGroupKnowledgeClient) Get(ctx context.Context) (*v1alpha1.Knowledge, error) { + knowledge := &v1alpha1.Knowledge{} + err := c.Client.Get(ctx, types.NamespacedName{ + Name: "flavor-groups", + // Namespace is empty as Knowledge is cluster-scoped + }, knowledge) + + if err != nil { + return nil, fmt.Errorf("failed to get flavor groups knowledge: %w", err) + } + + if meta.IsStatusConditionTrue(knowledge.Status.Conditions, v1alpha1.KnowledgeConditionReady) { + return knowledge, nil + } + + // Found but not ready yet + return nil, nil +} + +// GetAllFlavorGroups returns all flavor groups as a map. +// If knowledgeCRD is provided, uses it directly. Otherwise fetches the Knowledge CRD. 
+func (c *FlavorGroupKnowledgeClient) GetAllFlavorGroups(ctx context.Context, knowledgeCRD *v1alpha1.Knowledge) (map[string]compute.FlavorGroupFeature, error) { + // If no CRD provided, fetch it + if knowledgeCRD == nil { + var err error + knowledgeCRD, err = c.Get(ctx) + if err != nil { + return nil, err + } + if knowledgeCRD == nil { + return nil, errors.New("flavor groups knowledge is not ready") + } + } + + // Unbox the features from the raw extension + features, err := v1alpha1.UnboxFeatureList[compute.FlavorGroupFeature]( + knowledgeCRD.Status.Raw, + ) + if err != nil { + return nil, fmt.Errorf("failed to unbox flavor group features: %w", err) + } + + // Build map for efficient lookups + flavorGroupMap := make(map[string]compute.FlavorGroupFeature, len(features)) + for _, feature := range features { + flavorGroupMap[feature.Name] = feature + } + + return flavorGroupMap, nil +} From c509b50b96461ab68c85a26ba72322dd4297e6c6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 12 Mar 2026 11:02:15 +0000 Subject: [PATCH 04/55] Bump cortex chart appVersions to sha-3ee76b67 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index f4ef6e2bb..da36e530d 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.25 -appVersion: "sha-081184cd" +appVersion: "sha-3ee76b67" icon: "https://example.com/icon.png" dependencies: [] From 8f6dfef5edefdb785741b367f47f7877ac1abc4f Mon Sep 17 00:00:00 2001 From: Markus Wieland <44964229+SoWieMarkus@users.noreply.github.com> Date: Fri, 13 Mar 2026 07:17:42 +0100 Subject: [PATCH 05/55] Remove duplicate namePrefix from cortex values (#571) [skpi ci] Was defined twice in `values.yaml` --- helm/library/cortex/values.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/helm/library/cortex/values.yaml b/helm/library/cortex/values.yaml index e7a475184..ad3c7ba8b 100644 --- a/helm/library/cortex/values.yaml +++ b/helm/library/cortex/values.yaml @@ -44,9 +44,6 @@ controllerManager: terminationGracePeriodSeconds: 10 serviceAccountName: controller-manager -# Use this to unambiguate multiple cortex deployments in the same cluster. -namePrefix: cortex - # [RBAC]: To enable RBAC (Permissions) configurations rbac: enable: true From 546bf5ef4c93c0c28b8ae483e566f55184694238 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 13 Mar 2026 06:27:02 +0000 Subject: [PATCH 06/55] Bump cortex chart appVersions to sha-8f6dfef5 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index da36e530d..a27bd9a9e 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.25 -appVersion: "sha-3ee76b67" +appVersion: "sha-8f6dfef5" icon: "https://example.com/icon.png" dependencies: [] From 02cfa32d17c3b2630bd3b6d80a4a034369e2b784 Mon Sep 17 00:00:00 2001 From: Markus Wieland <44964229+SoWieMarkus@users.noreply.github.com> Date: Fri, 13 Mar 2026 07:32:06 +0100 Subject: [PATCH 07/55] Remove visualizer components and related files (#570) With the new use case of the decision crd (for now history crd) we can't use the visualizer anymore. Goal is to make the explanation already good enough :D --- Tiltfile | 8 - tools/visualizer/Dockerfile | 9 - tools/visualizer/app.yaml | 54 ---- tools/visualizer/favicon.ico | Bin 15406 -> 0 bytes tools/visualizer/nginx.conf | 18 -- tools/visualizer/nova.html | 503 ----------------------------------- tools/visualizer/role.yaml | 29 -- tools/visualizer/shared.css | 289 -------------------- 8 files changed, 910 deletions(-) delete mode 100644 tools/visualizer/Dockerfile delete mode 100644 tools/visualizer/app.yaml delete mode 100644 tools/visualizer/favicon.ico delete mode 100644 tools/visualizer/nginx.conf delete mode 100644 tools/visualizer/nova.html delete mode 100644 tools/visualizer/role.yaml delete mode 100644 tools/visualizer/shared.css diff --git a/Tiltfile b/Tiltfile index 84d39394b..2de697c32 100644 --- a/Tiltfile +++ b/Tiltfile @@ -268,14 +268,6 @@ k8s_resource( labels=['Monitoring'], ) -k8s_yaml('./tools/visualizer/role.yaml') -docker_build('cortex-visualizer', './tools/visualizer') -k8s_yaml('./tools/visualizer/app.yaml') -k8s_resource('cortex-visualizer', port_forwards=[ - port_forward(4000, 80), -], links=[ - link('localhost:4000', 'nova visualizer'), -], labels=['Monitoring']) docker_build('cortex-plutono', './tools/plutono') k8s_yaml('./tools/plutono/app.yaml') k8s_resource('cortex-plutono', port_forwards=[ diff --git a/tools/visualizer/Dockerfile b/tools/visualizer/Dockerfile deleted file mode 100644 index af7c859dd..000000000 --- a/tools/visualizer/Dockerfile 
+++ /dev/null @@ -1,9 +0,0 @@ -# Copyright SAP SE -# SPDX-License-Identifier: Apache-2.0 - -FROM nginx - -COPY nova.html /usr/share/nginx/html/nova.html -COPY shared.css /usr/share/nginx/html/shared.css -COPY favicon.ico /usr/share/nginx/html/favicon.ico -COPY nginx.conf /etc/nginx/conf.d/default.conf diff --git a/tools/visualizer/app.yaml b/tools/visualizer/app.yaml deleted file mode 100644 index 5697571e3..000000000 --- a/tools/visualizer/app.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright SAP SE -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: cortex-visualizer - labels: - app: cortex-visualizer -spec: - replicas: 1 - selector: - matchLabels: - app: cortex-visualizer - template: - metadata: - labels: - app: cortex-visualizer - spec: - serviceAccountName: cortex-visualizer - containers: - - name: cortex-visualizer - image: cortex-visualizer - ports: - - containerPort: 80 - - name: kubectl-proxy - image: alpine:latest - command: ["/bin/sh"] - args: - - -c - - | - apk add --no-cache curl - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" - chmod +x kubectl - mv kubectl /usr/local/bin/ - kubectl proxy --port=8001 --address=0.0.0.0 --accept-hosts=.* - ports: - - containerPort: 8001 ---- -apiVersion: v1 -kind: Service -metadata: - name: cortex-visualizer -spec: - selector: - app: cortex-visualizer - ports: - - name: http - port: 80 - targetPort: 80 - - name: kubectl-proxy - port: 8001 - targetPort: 8001 - type: ClusterIP \ No newline at end of file diff --git a/tools/visualizer/favicon.ico b/tools/visualizer/favicon.ico deleted file mode 100644 index b4f9d5fb8c202c1528e8458073ffeffdb530c143..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15406 zcmeI21&~xn6UP^KcR%by3kgo}0)l&RmjVi>un^o`fkfyMDL@cXti${k>o3 zo2z$wyLar}?g2%n>g{{`-bhbRPxrsO$KeR)h~S7D*J0iL9Bm>x9Qhp%ho9d+|1-yP 
zIA&Y#a^?E}_y0H?|4rm@WVY&9O{^CGyti&Hi-i2M->UBS((t_#M&mf^#P%g~`i<)@#1(tb{!Jn3K0)dRd5 zJb17?d-hB})1K?psZ&DRUcGt=JaSiR*RCyN$Bqru9$J`3xpL*CUAuPfnoHZZZIwU8 zix-!L3m59xbLY;L>eZ{u*|TTm!Gj0Ewx{#kw{I(*)2C0D(W6I8-@bjNVZ(;ne{kbN zf77N-<^1{cI+szSMyU)OKYm=`Pf+a{W81cEa{c;sog=(M4*T`%r#!iL@187Lw8-uJ znPZCiS>a!P``eCY1y)+@*O$azI}U8O#*#; z^X84LUcFlP!@GCy0)4l=cw~F_^zorfrQpF!^o*Z%m9Q@Kle=OOw$)SOyjb78((&RT zA?=F&a`oy}ZT|T2V>xr?j6UDJdsmP>zQ20)O7FC-q>u%n>sj~X_-+#-a zN1ltq?%lhE{a(3pWtH241q(`(CQW4Ez=66C(V_YC=T|zaRjVdfu3V7;0|sdQ@4ov^ zX3m_cdbw=bvi@{?!Uyy^Ivt(+`|rQ&UPa#X<$ zI>Wj=dGbV%v7SA9O8@@-)gFX|$DY!S{9*T!Cr>W1W5rYW9?88fDoFJE3Ufv5eutAFjeLb|RxIUTu3Wi-#5OKFw_Y+nZPEM4)QJ-(0#tQ3{?@HqRZh~T zO{)!DCvoG(mFCTxD;;M1i4!ME=FFL8)v8rOea1&G;ZvYDLp#2i zG`N@b$d@l)fbsdm-bCKmBj{;+@z(h80b7Gz$8W;sp!>iD%a<<~bYiJer6f+AIEtsi znd}2^jnACG$>8jC>C&l8Llf(fEn7C})Txv5AD;+1vS-gO_#L4gpZTFnx^?R&DO09Y z`}+L(bHzO98e|>0Krfs1KX&Yx;_LnU_lJ6X>^yTpw}Np@LFX}^-R_w2O&#u`o&Wv& z_tkI2ceWSC$M#UrHTY7gQl$!{Cy|q&d>eb8_Gj=Hyxna1-L8rmAD@Q3$@=35+RGn* z{2};f_}-ot~Rv-*x#)k(}rc4RY{o%uhr9_DmDi_Xt3sCU`{(zlo z)~qSfqD7PMzyDr&O*8xWJX64Cty{NN`^TOC-YQ_q9654GiWDg{?t%Sh?p$E)apT76 ze3>`%_eH_3fiTiwM4zLfXGNvsg9qp5WT6DR3rt<#f_xK<$@`mxop>t{;4nKN_vA_|xe zJOS=9x{W*kP5hZUU@+_f-*|6-HtncGUtH87R>egz?Mz*7Fc%t#hY%+Q{}PiyM`J7b z4_4(n@iS}(J`Hg!unRZ~oPu33cm@1HUtGN3yLYdQ88b$CK`w!Kmow%G%{=g*iD^xm zG)a~%Tc$B*Vqa;}q)}fIybl%y3xK)sO?&j{A$981QT~%-03R6p4=yLh3`WB*0>73n zU0OD7+^86x__rOuhK6?9!cY896SG84;8E7BS#`ZxdvF7I4f=~)F+4CPem-mGihg{5 z>XWCTzge?psULtZX2;{9rJeTVQoj1?EA1D0kYBJxmMmG6PsnkC1PK%aK|k?6Xit(P ziN-ix(a(AkgJcb$fAQkQI`_PJ^L|jy$&-1J$0}O1sM3WDk@vtahUSqYM@swl?IlBo z44Mmp{_fqoD}KU%A|Hah^UR&NA@e3TL|=0(jLW|H=9_OczF4thMUA_`CvO$_gTGO- zWJ!TP_}uvNaN)wK zu4S(q8x|J4{LnFw74o>mk0L~fpn8+I3i(Xv$Jgf~evBOoxZjK74q! 
zS;R-kzcUv@KRT6|8~NRY2@~pi5EB9`7~B(<^MT*UHF^-c3r!t5bWnPb@rzcj(3yVT zfeBa-bS7~R;!4;G;%eyVuqfav;(o|@#E20!22GwHn{0|Z`ptK8pV$Rti9XV&Pp|T4 z@59vf)*YOTjR&(K$Jkr)c<3AF5)}P*K9h3+Yk=+8H-!rq)^#OD=I!#=Uw>)Nl6Vf- z?b4-7Zifhpey}XDaQI;gSQ`ui9w9%$`i3MoZTfS*6K@0WfP(LpU`|))D8ef3;Grli4lsGBt0mk(u z{rD{C4`=|tGbX%5mvfH6oOQGFpR)txj<8)|Ap9-bnD{xm20z;5*L+RC+1tc*p#j>- zL)!DHcKXpH_$lam>@fZTzKo&Y^k?p2ML*-A3puw!?w$*qOtI6?e{3@OcFy`?ufcQZ zNLR8Kmh>Y_U}Z25SQ%Yt&hjB!U?b`fLxFziHGU`hjCDhY;**ijW=+GAet2sNa!l@+ zvpY$XCe>Uu_5zu&RH>5W%$ZYtWv~YFPizI<$vFh_9_SzPS|5XcXu>yUpK|sRAJ$+Q z&O@Rf$-Sbph#NsaJjbpOXUUi`qv9#f&ia~u4#3%o7^RKf5eCp8cQ*>aEFiRO3tS^`@F8`x3~2*{n$)=4t$S7 zg$e~ae`2Q}U&RzK6aF!0#L@rAnybRu1YhAFd~*tZBeb9^T@}NB=trlrPl*+>CSZJG zop$_e?*skHxomWow+iFqU!c?R6Aeyq*T0>9`o^!Svx|ox5A;!u%c=Qu{!CCH{ z>wEGayMwQV9%RpQCx*Z{=p*N1r=K;zHZu?B!aU#wXN{ - - - - Cortex Nova Visualizer - - - - - - - - -
-
- -
Cortex Nova Visualizer
-
- - - -
-
-
-
Loading...
- -
-
-
- - - - - - \ No newline at end of file diff --git a/tools/visualizer/role.yaml b/tools/visualizer/role.yaml deleted file mode 100644 index e497f3b88..000000000 --- a/tools/visualizer/role.yaml +++ /dev/null @@ -1,29 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: cortex-visualizer - namespace: default - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: cortex-visualizer-decision-reader -rules: -- apiGroups: ["cortex.cloud"] - resources: ["decisions"] - verbs: ["get", "list", "watch"] - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: cortex-visualizer-decision-reader-binding -subjects: -- kind: ServiceAccount - name: cortex-visualizer - namespace: default -roleRef: - kind: ClusterRole - name: cortex-visualizer-decision-reader - apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/tools/visualizer/shared.css b/tools/visualizer/shared.css deleted file mode 100644 index b3a933375..000000000 --- a/tools/visualizer/shared.css +++ /dev/null @@ -1,289 +0,0 @@ -/* Copyright SAP SE */ -/* SPDX-License-Identifier: Apache-2.0 */ - -body { - font-family: Arial, Helvetica, sans-serif; - - --color-primary: rgb(255, 165, 2); - --color-on-primary: rgb(255, 255, 255); - --color-secondary: rgb(112, 161, 255); - --color-on-secondary: rgb(255, 255, 255); - --color-tertiary: rgb(255, 71, 87); - --color-on-tertiary: rgb(255, 255, 255); - --color-background: rgb(241, 242, 246); - --color-on-background: rgb(74, 74, 74); - --color-surface: rgb(255, 255, 255); - --color-on-surface: rgb(74, 74, 74); - - color: var(--color-text); - background: var(--color-background); - /* Remove the default margin and padding from the body. */ - margin: 0; -} - -/* Nice animated progress bar on top of the page. 
*/ -.progress { - position: fixed; - top: 0; - left: 0; - right: 0; - height: 0.5em; - background: var(--color-primary); - z-index: 1000; -} - -.progress::before { - content: ''; - position: absolute; - top: 0; - left: 0; - right: 0; - height: 0.5em; - background: var(--color-secondary); - animation: progress 2s infinite; -} - -@keyframes progress { - 0% { - left: -100%; - right: 100%; - } - - 100% { - left: 100%; - right: -100%; - } -} - -.progress-text { - position: fixed; - top: 2em; - left: 0; - right: 0; - text-align: center; - font-weight: bold; -} - -/* Navbar that shows information. */ -nav { - padding-left: 0.25em; - background: var(--color-surface); - box-shadow: 0 0 1em rgba(0, 0, 0, 0.1); - z-index: 1; -} - -nav div.element { - display: inline-block; - padding-top: 1em; - padding-bottom: 2em; - padding-left: 1em; - padding-right: 1em; - margin: 0; - background: var(--color-surface); - color: var(--color-on-surface); - border-right: 2px solid var(--color-background); - font-size: 1em; -} - -nav div.element p.highlight { - font-size: 1.25em; - font-weight: bold; -} - -table { - /* Revert the default spacing used by the browser. */ - border-spacing: 0; -} - -/* Table cell showing the weight during scheduling. */ -td.weight { - text-align: center; - position: relative; - animation: weightAnimation 0.25s ease-in-out; -} - -td.weight div { - border-radius: 0.5em; - padding: 0.5em; - margin: 0.5em; - border: 2px solid var(--color-surface); -} - -/* Backdrop white for the weight cells */ -td.weight::after { - content: ''; - position: absolute; - --m: 0.6em; - top: var(--m); - bottom: var(--m); - left: var(--m); - right: var(--m); - border-radius: 0.5em; - background: var(--color-surface); - z-index: -1; -} - -/* Animation for weights when they first appear */ -@keyframes weightAnimation { - 0% { - opacity: 0; - transform: scale(0.5); - } - - 100% { - opacity: 1; - transform: scale(1); - } -} - -/* Table cell showing the hostname/name. 
*/ -th.hostname { - text-align: center; - position: relative; -} - -th.hostname div { - position: relative; - padding: 0.1em; - padding-top: 0.5em; - padding-bottom: 0.5em; - margin: 0.1em; - width: 6em; - height: 6em; - overflow: hidden; -} - -/* Table cell showing additional information. */ -th.metainfo { - text-align: center; - position: relative; -} - -th.metainfo div p { - width: 6em; - overflow: hidden; -} - -th.metainfo div p.issue { - color: var(--color-tertiary); - border-radius: 0.5em; - font-size: 0.8em; -} - -/* Table row showing the name of a step in the pipeline. */ -th.stepkey { - text-align: left; - font-weight: bold; - padding-left: 0.75em; - padding-top: 0.5em; - padding-bottom: 0.25em; -} - -/* Highlighted rows in the table. */ -tr.highlight { - background: var(--color-surface); - /* tr doesn't support border-radius */ - clip-path: xywh(0 0 100% 100% round 0.75em); -} - -/* Chart showing usage statistics. */ -td.chart { - position: relative; - height: 24em; -} - -td.chart div.barsbefore, -td.chart div.barsafter, -td.chart div.backdrop, -td.chart div.stats { - position: absolute; - top: 0; - left: 0; - right: 0; - bottom: 0; - display: flex; - margin-top: 1.5em; - margin-bottom: 0.5em; - padding-left: 0.5em; - padding-right: 0.5em; - flex-direction: row; - justify-content: center; - align-items: flex-end; -} - -td.chart div.barsbefore p, -td.chart div.barsafter p, -td.chart div.backdrop p, -td.chart div.stats p { - margin-left: 0.1em; - margin-right: 0.1em; - display: flex; - border-radius: 0.2em; -} - -td.chart div.backdrop p { - height: 100%; - border-radius: 0.2em; - border: 1px solid rgba(0, 0, 0, 0.05); - background: white; -} - -td.chart div.stats { - text-align: center; - display: flex; - justify-content: center; - align-items: flex-start; -} - -td.chart div.stats p { - writing-mode: vertical-lr; - text-orientation: mixed; - display: flex; - font-size: 1em; - font-weight: bold; - margin-left: 0.1em; - margin-right: 0.1em; - justify-content: 
center; - align-items: center; -} - -/* Animation for chart bars */ -td.chart div.barsafter p, -td.chart div.barsbefore p { - animation: barAnim 0.25s ease-in-out; - overflow: hidden; -} - -@keyframes barAnim { - 0% { - transform: scaleY(0); - } - - 100% { - transform: scaleY(1); - } -} - -td.chart div.barsafter p.cpu { - background: var(--color-primary); -} - -td.chart div.barsafter p.mem { - background: var(--color-primary); -} - -td.chart div.barsafter p.disk { - background: var(--color-primary); -} - -td.chart div.barsbefore p.cpu { - background: var(--color-secondary); -} - -td.chart div.barsbefore p.mem { - background: var(--color-secondary); -} - -td.chart div.barsbefore p.disk { - background: var(--color-secondary); -} From 52e8d8550cf1d3c12665449cc91eaded0392d2b8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 13 Mar 2026 06:41:29 +0000 Subject: [PATCH 08/55] Bump cortex chart appVersions to sha-02cfa32d [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index a27bd9a9e..c4f1606dc 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.25 -appVersion: "sha-8f6dfef5" +appVersion: "sha-02cfa32d" icon: "https://example.com/icon.png" dependencies: [] From 3e9c27eff50ac90561c6f22609d2f0cdfb2a1d5c Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Sat, 14 Mar 2026 09:36:03 +0100 Subject: [PATCH 09/55] Renovate: Update github.com/cobaltcore-dev/openstack-hypervisor-operator digest to 05f22f6 (#573) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [github.com/cobaltcore-dev/openstack-hypervisor-operator](https://redirect.github.com/cobaltcore-dev/openstack-hypervisor-operator) | require | digest | `733c59b` → `05f22f6` | --- ### Configuration 📅 **Schedule**: Branch creation - "before 8am on Friday" (UTC), Automerge - At any time (no schedule defined). 🚦 **Automerge**: Disabled by config. Please merge this manually once you are satisfied. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR was generated by [Mend Renovate](https://mend.io/renovate/). View the [repository job log](https://developer.mend.io/github/cobaltcore-dev/cortex). 
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index f6b914718..8a2f503d1 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/cobaltcore-dev/cortex go 1.26 require ( - github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260305105543-733c59b0b17c + github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260313132145-05f22f69d9fd github.com/go-gorp/gorp v2.2.0+incompatible github.com/gophercloud/gophercloud/v2 v2.10.0 github.com/ironcore-dev/ironcore v0.2.4 diff --git a/go.sum b/go.sum index 03dfd27f0..4d0e2fcd3 100644 --- a/go.sum +++ b/go.sum @@ -20,8 +20,8 @@ github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1x github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260305105543-733c59b0b17c h1:XBqsQQwdSep27eJN7sACjahkhmR2zRlJwv9PrYcEou8= -github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260305105543-733c59b0b17c/go.mod h1:b0KmJdxvRI8UXlGe8cRm5BD8Tm2WhF7zSKMSIRGyVL4= +github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260313132145-05f22f69d9fd h1:IzxramZZRC/9FtQQqpbgf8KIpH4soD9cliCFs2+zPd4= +github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260313132145-05f22f69d9fd/go.mod h1:b0KmJdxvRI8UXlGe8cRm5BD8Tm2WhF7zSKMSIRGyVL4= github.com/containerd/continuity v0.4.5 h1:ZRoN1sXq9u7V6QoHMcVWGhOwDFqZ4B9i5H6un1Wh0x4= github.com/containerd/continuity v0.4.5/go.mod h1:/lNJvtJKUQStBzpVQ1+rasXO1LAWtUQssk28EZvJ3nE= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= From 339dc95e9d46e1534116eef13568720ff413a4c5 Mon Sep 17 
00:00:00 2001 From: "github-actions[bot]" Date: Sat, 14 Mar 2026 08:45:05 +0000 Subject: [PATCH 10/55] Bump cortex chart appVersions to sha-3e9c27ef [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index c4f1606dc..c1abf76a2 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.25 -appVersion: "sha-02cfa32d" +appVersion: "sha-3e9c27ef" icon: "https://example.com/icon.png" dependencies: [] From 7d1b8692a65fddaeefa82c757b2a682f248e24a0 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Sat, 14 Mar 2026 10:55:43 +0100 Subject: [PATCH 11/55] Renovate: Update docker/setup-qemu-action action to v4 (#580) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [docker/setup-qemu-action](https://redirect.github.com/docker/setup-qemu-action) | action | major | `v3` → `v4` | --- ### Release Notes
docker/setup-qemu-action (docker/setup-qemu-action) ### [`v4`](https://redirect.github.com/docker/setup-qemu-action/compare/v3...v4) [Compare Source](https://redirect.github.com/docker/setup-qemu-action/compare/v3...v4)
--- ### Configuration 📅 **Schedule**: Branch creation - "before 8am on Friday" (UTC), Automerge - At any time (no schedule defined). 🚦 **Automerge**: Disabled by config. Please merge this manually once you are satisfied. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR was generated by [Mend Renovate](https://mend.io/renovate/). View the [repository job log](https://developer.mend.io/github/cobaltcore-dev/cortex). Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- .github/workflows/push-images.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml index 7addc3e7a..2f6f4ef5f 100644 --- a/.github/workflows/push-images.yaml +++ b/.github/workflows/push-images.yaml @@ -21,7 +21,7 @@ jobs: steps: - uses: actions/checkout@v6 - name: Set up QEMU - uses: docker/setup-qemu-action@v3 + uses: docker/setup-qemu-action@v4 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Login to Docker Registry From 8c1f61d5269798a1aadc0e1aafd87d63cae84a87 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 14 Mar 2026 10:04:37 +0000 Subject: [PATCH 12/55] Bump cortex chart appVersions to sha-7d1b8692 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index c1abf76a2..346d31c4a 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.25 -appVersion: "sha-3e9c27ef" +appVersion: "sha-7d1b8692" icon: "https://example.com/icon.png" dependencies: [] From 8278d9a7fa67107c3b8a27bbc432fca1b040e4be Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Sat, 14 Mar 2026 12:42:12 +0100 Subject: [PATCH 13/55] Renovate: Update docker/setup-buildx-action action to v4 (#579) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [docker/setup-buildx-action](https://redirect.github.com/docker/setup-buildx-action) | action | major | `v3` → `v4` | --- ### Release Notes
docker/setup-buildx-action (docker/setup-buildx-action) ### [`v4`](https://redirect.github.com/docker/setup-buildx-action/compare/v3...v4) [Compare Source](https://redirect.github.com/docker/setup-buildx-action/compare/v3...v4)
--- ### Configuration 📅 **Schedule**: Branch creation - "before 8am on Friday" (UTC), Automerge - At any time (no schedule defined). 🚦 **Automerge**: Disabled by config. Please merge this manually once you are satisfied. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR was generated by [Mend Renovate](https://mend.io/renovate/). View the [repository job log](https://developer.mend.io/github/cobaltcore-dev/cortex). Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- .github/workflows/push-images.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml index 2f6f4ef5f..22fceaf53 100644 --- a/.github/workflows/push-images.yaml +++ b/.github/workflows/push-images.yaml @@ -23,7 +23,7 @@ jobs: - name: Set up QEMU uses: docker/setup-qemu-action@v4 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@v4 - name: Login to Docker Registry uses: docker/login-action@v3 with: From b33635eb12668607fb91e3c3a7f815983623d2c8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 14 Mar 2026 11:51:06 +0000 Subject: [PATCH 14/55] Bump cortex chart appVersions to sha-8278d9a7 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 346d31c4a..fa99f58ea 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.25 -appVersion: "sha-7d1b8692" +appVersion: "sha-8278d9a7" icon: "https://example.com/icon.png" dependencies: [] From 7151a847a52e43d3544f4e647b76550b01c86a3b Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Sat, 14 Mar 2026 14:09:15 +0100 Subject: [PATCH 15/55] Renovate: Update docker/login-action action to v4 (#577) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [docker/login-action](https://redirect.github.com/docker/login-action) | action | major | `v3` → `v4` | --- ### Release Notes
docker/login-action (docker/login-action) ### [`v4`](https://redirect.github.com/docker/login-action/compare/v3...v4) [Compare Source](https://redirect.github.com/docker/login-action/compare/v3...v4)
--- ### Configuration 📅 **Schedule**: Branch creation - "before 8am on Friday" (UTC), Automerge - At any time (no schedule defined). 🚦 **Automerge**: Disabled by config. Please merge this manually once you are satisfied. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR was generated by [Mend Renovate](https://mend.io/renovate/). View the [repository job log](https://developer.mend.io/github/cobaltcore-dev/cortex). Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- .github/workflows/push-charts.yaml | 2 +- .github/workflows/push-images.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/push-charts.yaml b/.github/workflows/push-charts.yaml index e75ab068d..88b0fe9e6 100644 --- a/.github/workflows/push-charts.yaml +++ b/.github/workflows/push-charts.yaml @@ -21,7 +21,7 @@ jobs: - name: Set up Helm uses: azure/setup-helm@v4.3.1 - name: Log into registry - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml index 22fceaf53..9e0e0cc1e 100644 --- a/.github/workflows/push-images.yaml +++ b/.github/workflows/push-images.yaml @@ -25,7 +25,7 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v4 - name: Login to Docker Registry - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} From 50bf73f94f2800978d2e60e16fb0717ff40ad2cc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 14 Mar 2026 13:18:09 +0000 Subject: [PATCH 16/55] Bump cortex chart appVersions to sha-7151a847 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index fa99f58ea..740fe4fa1 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.25 -appVersion: "sha-8278d9a7" +appVersion: "sha-7151a847" icon: "https://example.com/icon.png" dependencies: [] From 9de0594bf2648807a24bc8b66bac1833a7211abe Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Sat, 14 Mar 2026 15:35:57 +0100 Subject: [PATCH 17/55] Renovate: Update docker/build-push-action action to v7 (#576) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [docker/build-push-action](https://redirect.github.com/docker/build-push-action) | action | major | `v6` → `v7` | --- ### Release Notes
docker/build-push-action (docker/build-push-action) ### [`v7`](https://redirect.github.com/docker/build-push-action/compare/v6...v7) [Compare Source](https://redirect.github.com/docker/build-push-action/compare/v6...v7)
--- ### Configuration 📅 **Schedule**: Branch creation - "before 8am on Friday" (UTC), Automerge - At any time (no schedule defined). 🚦 **Automerge**: Disabled by config. Please merge this manually once you are satisfied. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR was generated by [Mend Renovate](https://mend.io/renovate/). View the [repository job log](https://developer.mend.io/github/cobaltcore-dev/cortex). Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- .github/workflows/push-images.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml index 9e0e0cc1e..c8c98d407 100644 --- a/.github/workflows/push-images.yaml +++ b/.github/workflows/push-images.yaml @@ -54,7 +54,7 @@ jobs: - name: Build and Push Cortex Postgres if: steps.changed_postgres_files.outputs.all_changed_files != '' id: push_cortex_postgres - uses: docker/build-push-action@v6 + uses: docker/build-push-action@v7 with: context: postgres platforms: linux/amd64,linux/arm64 @@ -87,7 +87,7 @@ jobs: DOCKER_METADATA_SHORT_SHA_LENGTH: 8 - name: Build and Push Cortex id: push_cortex - uses: docker/build-push-action@v6 + uses: docker/build-push-action@v7 with: context: . 
file: Dockerfile From b209b55fbf611e88bb326ce682304bd922bf1282 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 14 Mar 2026 14:44:51 +0000 Subject: [PATCH 18/55] Bump cortex chart appVersions to sha-9de0594b [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 740fe4fa1..cbdc6f8ac 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.25 -appVersion: "sha-7151a847" +appVersion: "sha-9de0594b" icon: "https://example.com/icon.png" dependencies: [] From 70974a438fd8237ea377195ac3897fdc672cd02f Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Sat, 14 Mar 2026 16:04:50 +0100 Subject: [PATCH 19/55] Renovate: Update External dependencies (#575) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Change | [Age](https://docs.renovatebot.com/merge-confidence/) | [Adoption](https://docs.renovatebot.com/merge-confidence/) | [Passing](https://docs.renovatebot.com/merge-confidence/) | [Confidence](https://docs.renovatebot.com/merge-confidence/) | Type | Update | |---|---|---|---|---|---|---|---| | [github.com/gophercloud/gophercloud/v2](https://redirect.github.com/gophercloud/gophercloud) | `v2.10.0` → `v2.11.1` | ![age](https://developer.mend.io/api/mc/badges/age/go/github.com%2fgophercloud%2fgophercloud%2fv2/v2.11.1?slim=true) | ![adoption](https://developer.mend.io/api/mc/badges/adoption/go/github.com%2fgophercloud%2fgophercloud%2fv2/v2.11.1?slim=true) | ![passing](https://developer.mend.io/api/mc/badges/compatibility/go/github.com%2fgophercloud%2fgophercloud%2fv2/v2.10.0/v2.11.1?slim=true) | 
![confidence](https://developer.mend.io/api/mc/badges/confidence/go/github.com%2fgophercloud%2fgophercloud%2fv2/v2.10.0/v2.11.1?slim=true) | require | minor | | [k8s.io/api](https://redirect.github.com/kubernetes/api) | `v0.35.1` → `v0.35.2` | ![age](https://developer.mend.io/api/mc/badges/age/go/k8s.io%2fapi/v0.35.2?slim=true) | ![adoption](https://developer.mend.io/api/mc/badges/adoption/go/k8s.io%2fapi/v0.35.2?slim=true) | ![passing](https://developer.mend.io/api/mc/badges/compatibility/go/k8s.io%2fapi/v0.35.1/v0.35.2?slim=true) | ![confidence](https://developer.mend.io/api/mc/badges/confidence/go/k8s.io%2fapi/v0.35.1/v0.35.2?slim=true) | require | patch | | [k8s.io/apimachinery](https://redirect.github.com/kubernetes/apimachinery) | `v0.35.1` → `v0.35.2` | ![age](https://developer.mend.io/api/mc/badges/age/go/k8s.io%2fapimachinery/v0.35.2?slim=true) | ![adoption](https://developer.mend.io/api/mc/badges/adoption/go/k8s.io%2fapimachinery/v0.35.2?slim=true) | ![passing](https://developer.mend.io/api/mc/badges/compatibility/go/k8s.io%2fapimachinery/v0.35.1/v0.35.2?slim=true) | ![confidence](https://developer.mend.io/api/mc/badges/confidence/go/k8s.io%2fapimachinery/v0.35.1/v0.35.2?slim=true) | require | patch | | [k8s.io/client-go](https://redirect.github.com/kubernetes/client-go) | `v0.35.1` → `v0.35.2` | ![age](https://developer.mend.io/api/mc/badges/age/go/k8s.io%2fclient-go/v0.35.2?slim=true) | ![adoption](https://developer.mend.io/api/mc/badges/adoption/go/k8s.io%2fclient-go/v0.35.2?slim=true) | ![passing](https://developer.mend.io/api/mc/badges/compatibility/go/k8s.io%2fclient-go/v0.35.1/v0.35.2?slim=true) | ![confidence](https://developer.mend.io/api/mc/badges/confidence/go/k8s.io%2fclient-go/v0.35.1/v0.35.2?slim=true) | require | patch | | [kube-prometheus-stack](https://redirect.github.com/prometheus-operator/kube-prometheus) ([source](https://redirect.github.com/prometheus-community/helm-charts)) | `82.4.1` → `82.10.3` | 
![age](https://developer.mend.io/api/mc/badges/age/docker/ghcr.io%2fprometheus-community%2fcharts%2fkube-prometheus-stack/82.10.3?slim=true) | ![adoption](https://developer.mend.io/api/mc/badges/adoption/docker/ghcr.io%2fprometheus-community%2fcharts%2fkube-prometheus-stack/82.10.3?slim=true) | ![passing](https://developer.mend.io/api/mc/badges/compatibility/docker/ghcr.io%2fprometheus-community%2fcharts%2fkube-prometheus-stack/82.4.1/82.10.3?slim=true) | ![confidence](https://developer.mend.io/api/mc/badges/confidence/docker/ghcr.io%2fprometheus-community%2fcharts%2fkube-prometheus-stack/82.4.1/82.10.3?slim=true) | | minor | | [sigs.k8s.io/controller-runtime](https://redirect.github.com/kubernetes-sigs/controller-runtime) | `v0.23.1` → `v0.23.3` | ![age](https://developer.mend.io/api/mc/badges/age/go/sigs.k8s.io%2fcontroller-runtime/v0.23.3?slim=true) | ![adoption](https://developer.mend.io/api/mc/badges/adoption/go/sigs.k8s.io%2fcontroller-runtime/v0.23.3?slim=true) | ![passing](https://developer.mend.io/api/mc/badges/compatibility/go/sigs.k8s.io%2fcontroller-runtime/v0.23.1/v0.23.3?slim=true) | ![confidence](https://developer.mend.io/api/mc/badges/confidence/go/sigs.k8s.io%2fcontroller-runtime/v0.23.1/v0.23.3?slim=true) | require | patch | --- ### Release Notes
gophercloud/gophercloud (github.com/gophercloud/gophercloud/v2) ### [`v2.11.1`](https://redirect.github.com/gophercloud/gophercloud/releases/tag/v2.11.1) [Compare Source](https://redirect.github.com/gophercloud/gophercloud/compare/v2.11.0...v2.11.1) #### What's Changed - \[v2] Do not specify go patch version by [@​mandre](https://redirect.github.com/mandre) in [#​3648](https://redirect.github.com/gophercloud/gophercloud/pull/3648) **Full Changelog**: ### [`v2.11.0`](https://redirect.github.com/gophercloud/gophercloud/releases/tag/v2.11.0) [Compare Source](https://redirect.github.com/gophercloud/gophercloud/compare/v2.10.0...v2.11.0) #### What's Changed - \[v2] Add PCIAddress field to baremetal InterfaceType by [@​MahnoorAsghar](https://redirect.github.com/MahnoorAsghar) in [#​3602](https://redirect.github.com/gophercloud/gophercloud/pull/3602) - \[v2] Networking V2: Added support for ML2 extension port\_trusted\_vif by [@​dlaw4608](https://redirect.github.com/dlaw4608) in [#​3610](https://redirect.github.com/gophercloud/gophercloud/pull/3610) - \[v2] networking/v2/layer3/routers: Add external gateways management by [@​aldokimi](https://redirect.github.com/aldokimi) in [#​3611](https://redirect.github.com/gophercloud/gophercloud/pull/3611) - \[v2] Use jimmy amphora in octavia job by [@​eshulman2](https://redirect.github.com/eshulman2) in [#​3625](https://redirect.github.com/gophercloud/gophercloud/pull/3625) - \[v2] CI: Fix fwaas jobs by [@​mandre](https://redirect.github.com/mandre) in [#​3632](https://redirect.github.com/gophercloud/gophercloud/pull/3632) - \[v2] Bump go by [@​mandre](https://redirect.github.com/mandre) in [#​3630](https://redirect.github.com/gophercloud/gophercloud/pull/3630) - \[v2] Add a new Ironic field representing node health to Gophercloud by [@​jacob-anders](https://redirect.github.com/jacob-anders) in [#​3629](https://redirect.github.com/gophercloud/gophercloud/pull/3629) - \[v2] Add TSIG key support for OpenStack DNS v2 API by 
[@​omersch381](https://redirect.github.com/omersch381) in [#​3633](https://redirect.github.com/gophercloud/gophercloud/pull/3633) - \[v2] fix: networkipavailabilities: handle scientific notation in IP counts by [@​ednxzu](https://redirect.github.com/ednxzu) in [#​3640](https://redirect.github.com/gophercloud/gophercloud/pull/3640) - Prepare v2.11.0 by [@​mandre](https://redirect.github.com/mandre) in [#​3641](https://redirect.github.com/gophercloud/gophercloud/pull/3641) **Full Changelog**:
kubernetes/api (k8s.io/api) ### [`v0.35.2`](https://redirect.github.com/kubernetes/api/compare/v0.35.1...v0.35.2) [Compare Source](https://redirect.github.com/kubernetes/api/compare/v0.35.1...v0.35.2)
kubernetes/apimachinery (k8s.io/apimachinery) ### [`v0.35.2`](https://redirect.github.com/kubernetes/apimachinery/compare/v0.35.1...v0.35.2) [Compare Source](https://redirect.github.com/kubernetes/apimachinery/compare/v0.35.1...v0.35.2)
kubernetes/client-go (k8s.io/client-go) ### [`v0.35.2`](https://redirect.github.com/kubernetes/client-go/compare/v0.35.1...v0.35.2) [Compare Source](https://redirect.github.com/kubernetes/client-go/compare/v0.35.1...v0.35.2)
prometheus-community/helm-charts (kube-prometheus-stack) ### [`v82.10.3`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-82.10.3) kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator. #### What's Changed - \[kube-prometheus-stack] Update kube-prometheus-stack dependency non-major updates by [@​renovate](https://redirect.github.com/renovate)\[bot] in [#​6726](https://redirect.github.com/prometheus-community/helm-charts/pull/6726) **Full Changelog**: ### [`v82.10.2`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-82.10.2) kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator. #### What's Changed - \[kube-prometheus-stack] Update kube-prometheus-stack dependency non-major updates by [@​renovate](https://redirect.github.com/renovate)\[bot] in [#​6725](https://redirect.github.com/prometheus-community/helm-charts/pull/6725) **Full Changelog**: ### [`v82.10.1`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-82.10.1) [Compare Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-82.10.0...kube-prometheus-stack-82.10.1) kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator. 
#### What's Changed - \[kube-prometheus-stack] add configurable reloaderWebNodePort by [@​firasmosbehi](https://redirect.github.com/firasmosbehi) in [#​6717](https://redirect.github.com/prometheus-community/helm-charts/pull/6717) **Full Changelog**: ### [`v82.10.0`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-82.10.0) [Compare Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-82.9.0...kube-prometheus-stack-82.10.0) kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator. #### What's Changed - \[kube-prometheus-stack] Update kube-prometheus-stack dependency non-major updates by [@​renovate](https://redirect.github.com/renovate)\[bot] in [#​6722](https://redirect.github.com/prometheus-community/helm-charts/pull/6722) **Full Changelog**: ### [`v82.9.0`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-82.9.0) [Compare Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-82.8.0...kube-prometheus-stack-82.9.0) kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator. 
#### What's Changed - \[kube-prometheus-stack] Update kube-prometheus-stack dependency non-major updates by [@​renovate](https://redirect.github.com/renovate)\[bot] in [#​6706](https://redirect.github.com/prometheus-community/helm-charts/pull/6706) **Full Changelog**: ### [`v82.8.0`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-82.8.0) [Compare Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-82.7.0...kube-prometheus-stack-82.8.0) kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator. #### What's Changed - \[kube-prometheus-stack] Add VPA support for Prometheus by [@​QuentinBisson](https://redirect.github.com/QuentinBisson) in [#​6700](https://redirect.github.com/prometheus-community/helm-charts/pull/6700) **Full Changelog**: ### [`v82.7.0`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-82.7.0) [Compare Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-82.6.1...kube-prometheus-stack-82.7.0) kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator. 
#### What's Changed - \[kube-prometheus-stack] Add VPA support for Alertmanager by [@​QuentinBisson](https://redirect.github.com/QuentinBisson) in [#​6699](https://redirect.github.com/prometheus-community/helm-charts/pull/6699) **Full Changelog**: ### [`v82.6.1`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-82.6.1) [Compare Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-82.6.0...kube-prometheus-stack-82.6.1) kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator. #### What's Changed - \[kube-prometheus-stack] Update Helm release grafana to v11.2.3 by [@​renovate](https://redirect.github.com/renovate)\[bot] in [#​6701](https://redirect.github.com/prometheus-community/helm-charts/pull/6701) **Full Changelog**: ### [`v82.6.0`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-82.6.0) [Compare Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-82.5.0...kube-prometheus-stack-82.6.0) kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator. 
#### What's Changed - \[kube-prometheus-stack] Update digest to [`b9b15e1`](https://redirect.github.com/prometheus-community/helm-charts/commit/b9b15e1) by [@​renovate](https://redirect.github.com/renovate)\[bot] in [#​6697](https://redirect.github.com/prometheus-community/helm-charts/pull/6697) **Full Changelog**: ### [`v82.5.0`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-82.5.0) [Compare Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-82.4.3...kube-prometheus-stack-82.5.0) kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator. #### What's Changed - \[kube-prometheus-stack] Update kube-prometheus-stack dependency non-major updates by [@​renovate](https://redirect.github.com/renovate)\[bot] in [#​6696](https://redirect.github.com/prometheus-community/helm-charts/pull/6696) **Full Changelog**: ### [`v82.4.3`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-82.4.3) [Compare Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-82.4.2...kube-prometheus-stack-82.4.3) kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator. 
#### What's Changed - \[kube-prometheus-stack] Update Helm release grafana to v11.2.2 by [@​renovate](https://redirect.github.com/renovate)\[bot] in [#​6689](https://redirect.github.com/prometheus-community/helm-charts/pull/6689) **Full Changelog**: ### [`v82.4.2`](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-82.4.1...kube-prometheus-stack-82.4.2) [Compare Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-82.4.1...kube-prometheus-stack-82.4.2)
kubernetes-sigs/controller-runtime (sigs.k8s.io/controller-runtime) ### [`v0.23.3`](https://redirect.github.com/kubernetes-sigs/controller-runtime/releases/tag/v0.23.3) [Compare Source](https://redirect.github.com/kubernetes-sigs/controller-runtime/compare/v0.23.2...v0.23.3) #### What's Changed - 🐛 Ensure DefaulterRemoveUnknownOrOmitableFields is still working even if objects are equal by [@​k8s-infra-cherrypick-robot](https://redirect.github.com/k8s-infra-cherrypick-robot) in [#​3469](https://redirect.github.com/kubernetes-sigs/controller-runtime/pull/3469) **Full Changelog**: ### [`v0.23.2`](https://redirect.github.com/kubernetes-sigs/controller-runtime/releases/tag/v0.23.2) [Compare Source](https://redirect.github.com/kubernetes-sigs/controller-runtime/compare/v0.23.1...v0.23.2) #### What's Changed - 🐛 Fix fake client's SSA status patch resource version check by [@​k8s-infra-cherrypick-robot](https://redirect.github.com/k8s-infra-cherrypick-robot) in [#​3446](https://redirect.github.com/kubernetes-sigs/controller-runtime/pull/3446) - ✨ Reduce memory usage of default webhooks by [@​k8s-infra-cherrypick-robot](https://redirect.github.com/k8s-infra-cherrypick-robot) in [#​3467](https://redirect.github.com/kubernetes-sigs/controller-runtime/pull/3467) **Full Changelog**:
--- ### Configuration 📅 **Schedule**: Branch creation - "before 8am on Friday" (UTC), Automerge - At any time (no schedule defined). 🚦 **Automerge**: Enabled. ♻ **Rebasing**: Whenever PR is behind base branch, or you tick the rebase/retry checkbox. 👻 **Immortal**: This PR will be recreated if closed unmerged. Get [config help](https://redirect.github.com/renovatebot/renovate/discussions) if that's undesired. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR was generated by [Mend Renovate](https://mend.io/renovate/). View the [repository job log](https://developer.mend.io/github/cobaltcore-dev/cortex). Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- go.mod | 10 +++++----- go.sum | 20 +++++++++---------- .../dev/cortex-prometheus-operator/Chart.yaml | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/go.mod b/go.mod index 8a2f503d1..65f60cdcf 100644 --- a/go.mod +++ b/go.mod @@ -5,16 +5,16 @@ go 1.26 require ( github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260313132145-05f22f69d9fd github.com/go-gorp/gorp v2.2.0+incompatible - github.com/gophercloud/gophercloud/v2 v2.10.0 + github.com/gophercloud/gophercloud/v2 v2.11.1 github.com/ironcore-dev/ironcore v0.2.4 github.com/majewsky/gg v1.5.0 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 github.com/sapcc/go-bits v0.0.0-20260226170120-c20f89b66c3c - k8s.io/api v0.35.1 - k8s.io/apimachinery v0.35.1 - k8s.io/client-go v0.35.1 - sigs.k8s.io/controller-runtime v0.23.1 + k8s.io/api v0.35.2 + k8s.io/apimachinery v0.35.2 + k8s.io/client-go v0.35.2 + sigs.k8s.io/controller-runtime v0.23.3 ) require ( diff --git a/go.sum b/go.sum index 4d0e2fcd3..bc95e0d6a 100644 --- a/go.sum +++ b/go.sum @@ -97,8 +97,8 @@ github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 h1:z2ogiKUYzX5Is6zr/v github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83/go.mod 
h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gophercloud/gophercloud/v2 v2.10.0 h1:NRadC0aHNvy4iMoFXj5AFiPmut/Sj3hAPAo9B59VMGc= -github.com/gophercloud/gophercloud/v2 v2.10.0/go.mod h1:Ki/ILhYZr/5EPebrPL9Ej+tUg4lqx71/YH2JWVeU+Qk= +github.com/gophercloud/gophercloud/v2 v2.11.1 h1:jCs4vLH8sJgRqrPzqVfWgl7uI6JnIIlsgeIRM0uHjxY= +github.com/gophercloud/gophercloud/v2 v2.11.1/go.mod h1:Rm0YvKQ4QYX2rY9XaDKnjRzSGwlG5ge4h6ABYnmkKQM= github.com/gotestyourself/gotestyourself v2.2.0+incompatible h1:AQwinXlbQR2HvPjQZOmDhRqsv5mZf+Jb1RnSLxcqZcI= github.com/gotestyourself/gotestyourself v2.2.0+incompatible/go.mod h1:zZKM6oeNM8k+FRljX1mnzVYeS8wiGgQyvST1/GafPbY= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 h1:X5VWvz21y3gzm9Nw/kaUeku/1+uBhcekkmy4IkffJww= @@ -283,16 +283,16 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo= gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= -k8s.io/api v0.35.1 h1:0PO/1FhlK/EQNVK5+txc4FuhQibV25VLSdLMmGpDE/Q= -k8s.io/api v0.35.1/go.mod h1:28uR9xlXWml9eT0uaGo6y71xK86JBELShLy4wR1XtxM= +k8s.io/api v0.35.2 h1:tW7mWc2RpxW7HS4CoRXhtYHSzme1PN1UjGHJ1bdrtdw= +k8s.io/api v0.35.2/go.mod h1:7AJfqGoAZcwSFhOjcGM7WV05QxMMgUaChNfLTXDRE60= k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4= k8s.io/apiextensions-apiserver v0.35.0/go.mod h1:E1Ahk9SADaLQ4qtzYFkwUqusXTcaV2uw3l14aqpL2LU= -k8s.io/apimachinery v0.35.1 h1:yxO6gV555P1YV0SANtnTjXYfiivaTPvCTKX6w6qdDsU= -k8s.io/apimachinery v0.35.1/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/apimachinery v0.35.2 h1:NqsM/mmZA7sHW02JZ9RTtk3wInRgbVxL8MPfzSANAK8= +k8s.io/apimachinery 
v0.35.2/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= k8s.io/apiserver v0.35.0 h1:CUGo5o+7hW9GcAEF3x3usT3fX4f9r8xmgQeCBDaOgX4= k8s.io/apiserver v0.35.0/go.mod h1:QUy1U4+PrzbJaM3XGu2tQ7U9A4udRRo5cyxkFX0GEds= -k8s.io/client-go v0.35.1 h1:+eSfZHwuo/I19PaSxqumjqZ9l5XiTEKbIaJ+j1wLcLM= -k8s.io/client-go v0.35.1/go.mod h1:1p1KxDt3a0ruRfc/pG4qT/3oHmUj1AhSHEcxNSGg+OA= +k8s.io/client-go v0.35.2 h1:YUfPefdGJA4aljDdayAXkc98DnPkIetMl4PrKX97W9o= +k8s.io/client-go v0.35.2/go.mod h1:4QqEwh4oQpeK8AaefZ0jwTFJw/9kIjdQi0jpKeYvz7g= k8s.io/component-base v0.35.0 h1:+yBrOhzri2S1BVqyVSvcM3PtPyx5GUxCK2tinZz1G94= k8s.io/component-base v0.35.0/go.mod h1:85SCX4UCa6SCFt6p3IKAPej7jSnF3L8EbfSyMZayJR0= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= @@ -303,8 +303,8 @@ k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzk k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0 h1:qPrZsv1cwQiFeieFlRqT627fVZ+tyfou/+S5S0H5ua0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.23.1 h1:TjJSM80Nf43Mg21+RCy3J70aj/W6KyvDtOlpKf+PupE= -sigs.k8s.io/controller-runtime v0.23.1/go.mod h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0= +sigs.k8s.io/controller-runtime v0.23.3 h1:VjB/vhoPoA9l1kEKZHBMnQF33tdCLQKJtydy4iqwZ80= +sigs.k8s.io/controller-runtime v0.23.3/go.mod h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= diff --git a/helm/dev/cortex-prometheus-operator/Chart.yaml b/helm/dev/cortex-prometheus-operator/Chart.yaml index 2c28dfe7c..fbe10bb3e 100644 --- 
a/helm/dev/cortex-prometheus-operator/Chart.yaml +++ b/helm/dev/cortex-prometheus-operator/Chart.yaml @@ -10,4 +10,4 @@ dependencies: # CRDs of the prometheus operator, such as PrometheusRule, ServiceMonitor, etc. - name: kube-prometheus-stack repository: oci://ghcr.io/prometheus-community/charts - version: 82.4.1 + version: 82.10.3 From 94c919c3c3816b010432fb6b20bca69f5fad2286 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 14 Mar 2026 15:13:46 +0000 Subject: [PATCH 20/55] Bump cortex chart appVersions to sha-70974a43 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index cbdc6f8ac..2f0e5f623 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.25 -appVersion: "sha-9de0594b" +appVersion: "sha-70974a43" icon: "https://example.com/icon.png" dependencies: [] From 7086080f9c795e06109a6a174085ee941007a222 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Sat, 14 Mar 2026 16:35:14 +0100 Subject: [PATCH 21/55] Renovate: Update github.com/sapcc/go-bits digest to 034b497 (#574) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [github.com/sapcc/go-bits](https://redirect.github.com/sapcc/go-bits) | require | digest | `c20f89b` → `034b497` | --- ### Configuration 📅 **Schedule**: Branch creation - "before 8am on Friday" (UTC), Automerge - At any time (no schedule defined). 🚦 **Automerge**: Enabled. ♻ **Rebasing**: Whenever PR is behind base branch, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. 
--- - [ ] If you want to rebase/retry this PR, check this box --- This PR was generated by [Mend Renovate](https://mend.io/renovate/). View the [repository job log](https://developer.mend.io/github/cobaltcore-dev/cortex). Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 65f60cdcf..2b2219b3d 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/majewsky/gg v1.5.0 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 - github.com/sapcc/go-bits v0.0.0-20260226170120-c20f89b66c3c + github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e k8s.io/api v0.35.2 k8s.io/apimachinery v0.35.2 k8s.io/client-go v0.35.2 diff --git a/go.sum b/go.sum index bc95e0d6a..01060515c 100644 --- a/go.sum +++ b/go.sum @@ -176,8 +176,8 @@ github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sapcc/go-api-declarations v1.20.2 h1:GWqv8VgsF4k9id6N051AVTaEpcjT02APsOuz2yCvTPQ= github.com/sapcc/go-api-declarations v1.20.2/go.mod h1:eiRrXXUeQS5C/1kKn8/KMjk0Y0goUzgDQswj30rH0Zc= -github.com/sapcc/go-bits v0.0.0-20260226170120-c20f89b66c3c h1:GX6ADtKR6Bs2bBRBqeJf376MsxLNppC1SOHLIBuQwIA= -github.com/sapcc/go-bits v0.0.0-20260226170120-c20f89b66c3c/go.mod h1:AYC4f8FYO9DllSt4TyMwm1e5iPn32/DGIrlgXtuEsJ0= +github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e h1:4wgkrfAlnL6ffM7HTNoHn1HrBBurCRR71WNOszdiDNQ= +github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e/go.mod h1:NZjMiGVm04U25vwR6ZWvMw0XOOnvS1jkmXpjiepOeUw= github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= From 
a95bd2f627eb49b90d7db330b3c4365ff1179ea4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 14 Mar 2026 15:44:32 +0000 Subject: [PATCH 22/55] Bump cortex chart appVersions to sha-7086080f [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 2f0e5f623..4458a555b 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.25 -appVersion: "sha-70974a43" +appVersion: "sha-7086080f" icon: "https://example.com/icon.png" dependencies: [] From 26ef0b1d0b59683dd2210c0eb67ef7e1641bd2bc Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Sun, 15 Mar 2026 00:07:29 +0100 Subject: [PATCH 23/55] Renovate: Update docker/metadata-action action to v6 (#578) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [docker/metadata-action](https://redirect.github.com/docker/metadata-action) | action | major | `v5` → `v6` | --- ### Release Notes
docker/metadata-action (docker/metadata-action) ### [`v6`](https://redirect.github.com/docker/metadata-action/compare/v5...v6) [Compare Source](https://redirect.github.com/docker/metadata-action/compare/v5...v6)
--- ### Configuration 📅 **Schedule**: Branch creation - "before 8am on Friday" (UTC), Automerge - At any time (no schedule defined). 🚦 **Automerge**: Disabled by config. Please merge this manually once you are satisfied. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR was generated by [Mend Renovate](https://mend.io/renovate/). View the [repository job log](https://developer.mend.io/github/cobaltcore-dev/cortex). Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- .github/workflows/push-images.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml index c8c98d407..997595976 100644 --- a/.github/workflows/push-images.yaml +++ b/.github/workflows/push-images.yaml @@ -41,7 +41,7 @@ jobs: - name: Docker Meta (Cortex Postgres) if: steps.changed_postgres_files.outputs.all_changed_files != '' id: meta_cortex_postgres - uses: docker/metadata-action@v5 + uses: docker/metadata-action@v6 with: images: ${{ env.REGISTRY }}/${{ github.repository }}-postgres tags: | @@ -75,7 +75,7 @@ jobs: # Build & push new cortex image - name: Docker Meta (Cortex) id: meta_cortex - uses: docker/metadata-action@v5 + uses: docker/metadata-action@v6 with: images: ${{ env.REGISTRY }}/${{ github.repository }} tags: | From 4e9d48f1df64b718b05976953d820a3ad7271739 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 14 Mar 2026 23:16:39 +0000 Subject: [PATCH 24/55] Bump cortex chart appVersions to sha-26ef0b1d [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 4458a555b..3c36db4ee 100644 --- a/helm/library/cortex/Chart.yaml +++ 
b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.25 -appVersion: "sha-7086080f" +appVersion: "sha-26ef0b1d" icon: "https://example.com/icon.png" dependencies: [] From 781dec128083ae0a5bdfd6f0ad9f128bba1149b3 Mon Sep 17 00:00:00 2001 From: Markus Wieland <44964229+SoWieMarkus@users.noreply.github.com> Date: Mon, 16 Mar 2026 08:58:45 +0100 Subject: [PATCH 25/55] Configure code rabbit (#582) See [CodeRabbit Config docs](https://docs.coderabbit.ai/reference/configuration#reviews) --- .coderabbit.yaml | 2 ++ .gitignore | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 .coderabbit.yaml diff --git a/.coderabbit.yaml b/.coderabbit.yaml new file mode 100644 index 000000000..d158def60 --- /dev/null +++ b/.coderabbit.yaml @@ -0,0 +1,2 @@ +reviews: + high_level_summary: false diff --git a/.gitignore b/.gitignore index f0ab035e7..907b4f7ca 100644 --- a/.gitignore +++ b/.gitignore @@ -34,4 +34,5 @@ cortex.secrets.yaml !.github !.golangci.yaml !.license-scan-overrides.jsonl -!.license-scan-rules.json \ No newline at end of file +!.license-scan-rules.json +!.coderabbit.yaml \ No newline at end of file From eceedcee3c121c1b14e78217fbdfae6dadbaf6ff Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 16 Mar 2026 08:08:09 +0000 Subject: [PATCH 26/55] Bump cortex chart appVersions to sha-781dec12 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 3c36db4ee..30a83613a 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.25 -appVersion: "sha-26ef0b1d" +appVersion: "sha-781dec12" icon: "https://example.com/icon.png" dependencies: [] From 1aec52b75c4676761fd652a8d0778126e20979e9 Mon Sep 17 00:00:00 2001 From: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com> Date: Mon, 16 Mar 2026 09:30:08 +0100 Subject: [PATCH 27/55] Upgrade to latest hypervisor crd (#581) This upgrade is necessary, because we changed the allocation and capacity types from a generic `map[string]resource.Quantity` to `map[hv1.ResourceName]resource.Quantity`. In addition, this upgrade also makes use of the `hv1.ResourceName` api inside our Reservation CRD. See: https://github.com/cobaltcore-dev/openstack-hypervisor-operator/pull/257 ## Summary by CodeRabbit * **Refactor** * Standardized resource naming to a strongly-typed format across scheduling and reservation logic, improving type safety and consistency for capacity, allocation and reservation handling. * **Chores** * Bumped the OpenStack Hypervisor Operator dependency to the newer release. 
--- api/v1alpha1/reservation_types.go | 5 +- api/v1alpha1/zz_generated.deepcopy.go | 5 +- go.mod | 2 +- go.sum | 2 + .../compute/resource_capacity_kvm_test.go | 66 ++++---- internal/scheduling/nova/integration_test.go | 24 +-- .../filters/filter_has_enough_capacity.go | 6 +- .../filter_has_enough_capacity_test.go | 30 ++-- .../nova/plugins/weighers/kvm_binpack.go | 21 ++- .../nova/plugins/weighers/kvm_binpack_test.go | 147 +++++++---------- .../weighers/kvm_failover_evacuation_test.go | 12 +- .../weighers/kvm_prefer_smaller_hosts.go | 17 +- .../weighers/kvm_prefer_smaller_hosts_test.go | 155 ++++++++---------- .../commitments/reservation_manager.go | 15 +- .../commitments/reservation_manager_test.go | 31 ++-- .../reservations/commitments/state_test.go | 17 +- .../reservations/commitments/syncer_test.go | 15 +- .../controller/controller_test.go | 7 +- .../reservations/controller/monitor.go | 2 +- .../reservations/controller/monitor_test.go | 31 ++-- 20 files changed, 288 insertions(+), 322 deletions(-) diff --git a/api/v1alpha1/reservation_types.go b/api/v1alpha1/reservation_types.go index df3ad473e..913a93a8f 100644 --- a/api/v1alpha1/reservation_types.go +++ b/api/v1alpha1/reservation_types.go @@ -4,6 +4,7 @@ package v1alpha1 import ( + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -44,7 +45,7 @@ type CommittedResourceAllocation struct { // Resources consumed by this instance. // +kubebuilder:validation:Required - Resources map[string]resource.Quantity `json:"resources"` + Resources map[hv1.ResourceName]resource.Quantity `json:"resources"` } // CommittedResourceReservationSpec defines the spec fields specific to committed resource reservations. @@ -99,7 +100,7 @@ type ReservationSpec struct { // Resources to reserve for this instance. 
// +kubebuilder:validation:Optional - Resources map[string]resource.Quantity `json:"resources,omitempty"` + Resources map[hv1.ResourceName]resource.Quantity `json:"resources,omitempty"` // StartTime is the time when the reservation becomes active. // +kubebuilder:validation:Optional diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 564f30cac..96043cc1f 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -8,6 +8,7 @@ package v1alpha1 import ( + apiv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -35,7 +36,7 @@ func (in *CommittedResourceAllocation) DeepCopyInto(out *CommittedResourceAlloca in.CreationTimestamp.DeepCopyInto(&out.CreationTimestamp) if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = make(map[string]resource.Quantity, len(*in)) + *out = make(map[apiv1.ResourceName]resource.Quantity, len(*in)) for key, val := range *in { (*out)[key] = val.DeepCopy() } @@ -1218,7 +1219,7 @@ func (in *ReservationSpec) DeepCopyInto(out *ReservationSpec) { *out = *in if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = make(map[string]resource.Quantity, len(*in)) + *out = make(map[apiv1.ResourceName]resource.Quantity, len(*in)) for key, val := range *in { (*out)[key] = val.DeepCopy() } diff --git a/go.mod b/go.mod index 2b2219b3d..5bda482d6 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/cobaltcore-dev/cortex go 1.26 require ( - github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260313132145-05f22f69d9fd + github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260316070528-80f53bbce409 github.com/go-gorp/gorp v2.2.0+incompatible github.com/gophercloud/gophercloud/v2 v2.11.1 github.com/ironcore-dev/ironcore v0.2.4 diff --git a/go.sum b/go.sum index 01060515c..338b73d74 
100644 --- a/go.sum +++ b/go.sum @@ -22,6 +22,8 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260313132145-05f22f69d9fd h1:IzxramZZRC/9FtQQqpbgf8KIpH4soD9cliCFs2+zPd4= github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260313132145-05f22f69d9fd/go.mod h1:b0KmJdxvRI8UXlGe8cRm5BD8Tm2WhF7zSKMSIRGyVL4= +github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260316070528-80f53bbce409 h1:hiTMLk6JZsmFF+ECBJnOVcDAw2d+iCXhk4eDvVpYHYM= +github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260316070528-80f53bbce409/go.mod h1:b0KmJdxvRI8UXlGe8cRm5BD8Tm2WhF7zSKMSIRGyVL4= github.com/containerd/continuity v0.4.5 h1:ZRoN1sXq9u7V6QoHMcVWGhOwDFqZ4B9i5H6un1Wh0x4= github.com/containerd/continuity v0.4.5/go.mod h1:/lNJvtJKUQStBzpVQ1+rasXO1LAWtUQssk28EZvJ3nE= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go index 015217e15..d0f3b1780 100644 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go +++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go @@ -58,13 +58,13 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("128"), - "memory": resource.MustParse("512Gi"), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("64"), - "memory": resource.MustParse("256Gi"), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: 
resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), }, Traits: []string{}, }, @@ -148,13 +148,13 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("256"), - "memory": resource.MustParse("1Ti"), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("256"), + hv1.ResourceMemory: resource.MustParse("1Ti"), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("128"), - "memory": resource.MustParse("512Gi"), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), }, Traits: []string{ "CUSTOM_HW_SAPPHIRE_RAPIDS", @@ -209,13 +209,13 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("64"), - "memory": resource.MustParse("256Gi"), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("32"), - "memory": resource.MustParse("128Gi"), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("32"), + hv1.ResourceMemory: resource.MustParse("128Gi"), }, Traits: []string{ "CUSTOM_DECOMMISSIONING", @@ -255,13 +255,13 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("100"), - "memory": resource.MustParse("200Gi"), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("100"), + hv1.ResourceMemory: resource.MustParse("200Gi"), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("50"), - "memory": 
resource.MustParse("100Gi"), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("50"), + hv1.ResourceMemory: resource.MustParse("100Gi"), }, Traits: []string{}, }, @@ -274,13 +274,13 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("200"), - "memory": resource.MustParse("400Gi"), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("200"), + hv1.ResourceMemory: resource.MustParse("400Gi"), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("150"), - "memory": resource.MustParse("300Gi"), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("150"), + hv1.ResourceMemory: resource.MustParse("300Gi"), }, Traits: []string{"CUSTOM_HW_SAPPHIRE_RAPIDS"}, }, @@ -332,9 +332,9 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("96"), - "memory": resource.MustParse("384Gi"), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("96"), + hv1.ResourceMemory: resource.MustParse("384Gi"), }, // No Allocation field - simulating missing data Allocation: nil, diff --git a/internal/scheduling/nova/integration_test.go b/internal/scheduling/nova/integration_test.go index 137ac5a10..596d9f2ed 100644 --- a/internal/scheduling/nova/integration_test.go +++ b/internal/scheduling/nova/integration_test.go @@ -48,13 +48,13 @@ func newHypervisor(name, cpuCap, cpuAlloc, memCap, memAlloc string) *hv1.Hypervi Name: name, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpuCap), - "memory": resource.MustParse(memCap), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuCap), + hv1.ResourceMemory: 
resource.MustParse(memCap), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpuAlloc), - "memory": resource.MustParse(memAlloc), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuAlloc), + hv1.ResourceMemory: resource.MustParse(memAlloc), }, }, } @@ -68,9 +68,9 @@ func newCommittedReservation(name, targetHost, observedHost, projectID, flavorNa Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeCommittedResource, TargetHost: targetHost, - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpu), - "memory": resource.MustParse(memory), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpu), + hv1.ResourceMemory: resource.MustParse(memory), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: projectID, @@ -100,9 +100,9 @@ func newFailoverReservation(name, targetHost, resourceGroup, cpu, memory string, Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeFailover, TargetHost: targetHost, - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpu), - "memory": resource.MustParse(memory), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpu), + hv1.ResourceMemory: resource.MustParse(memory), }, FailoverReservation: &v1alpha1.FailoverReservationSpec{ ResourceGroup: resourceGroup, diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go index a90638eac..8852f6151 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go @@ -45,7 +45,7 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa result := s.IncludeAllHostsFromRequest(request) // This map holds the free resources per host. 
- freeResourcesByHost := make(map[string]map[string]resource.Quantity) + freeResourcesByHost := make(map[string]map[hv1.ResourceName]resource.Quantity) // The hypervisor resource auto-discovers its current utilization. // We can use the hypervisor status to calculate the total capacity @@ -145,7 +145,7 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa // For CR reservations with allocations, calculate remaining (unallocated) resources to block. // This prevents double-blocking of resources already consumed by running instances. - var resourcesToBlock map[string]resource.Quantity + var resourcesToBlock map[hv1.ResourceName]resource.Quantity if reservation.Spec.Type == v1alpha1.ReservationTypeCommittedResource && // if the reservation is not being migrated, block only unused resources reservation.Spec.TargetHost == reservation.Status.Host && @@ -154,7 +154,7 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa len(reservation.Spec.CommittedResourceReservation.Allocations) > 0 && len(reservation.Status.CommittedResourceReservation.Allocations) > 0 { // Start with full reservation resources - resourcesToBlock = make(map[string]resource.Quantity) + resourcesToBlock = make(map[hv1.ResourceName]resource.Quantity) for k, v := range reservation.Spec.Resources { resourcesToBlock[k] = v.DeepCopy() } diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go index cb998a286..504bbb523 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go @@ -39,13 +39,13 @@ func newHypervisor(name, cpuCap, cpuAlloc, memCap, memAlloc string) *hv1.Hypervi Name: name, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpuCap), - "memory": 
resource.MustParse(memCap), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuCap), + hv1.ResourceMemory: resource.MustParse(memCap), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpuAlloc), - "memory": resource.MustParse(memAlloc), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuAlloc), + hv1.ResourceMemory: resource.MustParse(memAlloc), }, }, } @@ -64,9 +64,9 @@ func newCommittedReservation( Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeCommittedResource, TargetHost: targetHost, - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpu), - "memory": resource.MustParse(memory), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpu), + hv1.ResourceMemory: resource.MustParse(memory), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: projectID, @@ -104,9 +104,9 @@ func newFailoverReservation(name, targetHost, cpu, memory string, allocations ma Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeFailover, TargetHost: targetHost, - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpu), - "memory": resource.MustParse(memory), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpu), + hv1.ResourceMemory: resource.MustParse(memory), }, FailoverReservation: &v1alpha1.FailoverReservationSpec{ ResourceGroup: "m1.large", @@ -150,9 +150,9 @@ func crSpecAllocs(vms ...crVmAlloc) map[string]v1alpha1.CommittedResourceAllocat for _, v := range vms { allocs[v.uuid] = v1alpha1.CommittedResourceAllocation{ CreationTimestamp: metav1.Now(), - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse(v.cpu), - "memory": resource.MustParse(v.mem), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(v.cpu), + hv1.ResourceMemory: 
resource.MustParse(v.mem), }, } } diff --git a/internal/scheduling/nova/plugins/weighers/kvm_binpack.go b/internal/scheduling/nova/plugins/weighers/kvm_binpack.go index 1a3bd7573..3bed165f4 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_binpack.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_binpack.go @@ -13,7 +13,6 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/internal/scheduling/lib" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" ) @@ -23,7 +22,7 @@ type KVMBinpackStepOpts struct { // node's resource utilizations after placing the VM. // If a resource is not specified, is ignored in the score calculation // (equivalent to a weight of 0). - ResourceWeights map[corev1.ResourceName]float64 `json:"resourceWeights"` + ResourceWeights map[hv1.ResourceName]float64 `json:"resourceWeights"` } // Validate the options to ensure they are correct before running the weigher. 
@@ -31,9 +30,9 @@ func (o KVMBinpackStepOpts) Validate() error { if len(o.ResourceWeights) == 0 { return errors.New("at least one resource weight must be specified") } - supportedResources := []corev1.ResourceName{ - corev1.ResourceMemory, - corev1.ResourceCPU, + supportedResources := []hv1.ResourceName{ + hv1.ResourceMemory, + hv1.ResourceCPU, } for resourceName, value := range o.ResourceWeights { if !slices.Contains(supportedResources, resourceName) { @@ -94,7 +93,7 @@ func (s *KVMBinpackStep) Run(traceLog *slog.Logger, request api.ExternalSchedule var totalWeightedUtilization, totalWeight float64 for resourceName, weight := range s.Options.ResourceWeights { - capacity, ok := hv.Status.Capacity[resourceName.String()] + capacity, ok := hv.Status.Capacity[resourceName] if !ok { traceLog.Warn("no capacity in status, skipping", "host", host, "resource", resourceName) @@ -105,7 +104,7 @@ func (s *KVMBinpackStep) Run(traceLog *slog.Logger, request api.ExternalSchedule "host", host, "resource", resourceName) continue } - allocation, ok := hv.Status.Allocation[resourceName.String()] + allocation, ok := hv.Status.Allocation[resourceName] if !ok { traceLog.Warn("no allocation in status, skipping", "host", host, "resource", resourceName) @@ -138,15 +137,15 @@ func (s *KVMBinpackStep) Run(traceLog *slog.Logger, request api.ExternalSchedule } // calcVMResources calculates the total resource requests for the VM to be scheduled. 
-func (s *KVMBinpackStep) calcVMResources(req api.ExternalSchedulerRequest) map[corev1.ResourceName]resource.Quantity { - resources := make(map[corev1.ResourceName]resource.Quantity) +func (s *KVMBinpackStep) calcVMResources(req api.ExternalSchedulerRequest) map[hv1.ResourceName]resource.Quantity { + resources := make(map[hv1.ResourceName]resource.Quantity) resourcesMemBytes := int64(req.Spec.Data.Flavor.Data.MemoryMB * 1_000_000) //nolint:gosec // memory values are bounded by Nova resourcesMemBytes *= int64(req.Spec.Data.NumInstances) //nolint:gosec // instance count is bounded by Nova - resources[corev1.ResourceMemory] = *resource. + resources[hv1.ResourceMemory] = *resource. NewQuantity(resourcesMemBytes, resource.DecimalSI) resourcesCPU := int64(req.Spec.Data.Flavor.Data.VCPUs) //nolint:gosec // vCPU values are bounded by Nova resourcesCPU *= int64(req.Spec.Data.NumInstances) //nolint:gosec // instance count is bounded by Nova - resources[corev1.ResourceCPU] = *resource. + resources[hv1.ResourceCPU] = *resource. 
NewQuantity(resourcesCPU, resource.DecimalSI) return resources } diff --git a/internal/scheduling/nova/plugins/weighers/kvm_binpack_test.go b/internal/scheduling/nova/plugins/weighers/kvm_binpack_test.go index dde381e71..e867c5bf7 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_binpack_test.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_binpack_test.go @@ -10,7 +10,6 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" @@ -23,13 +22,13 @@ func newHypervisor(name, capacityCPU, capacityMem, allocationCPU, allocationMem Name: name, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse(capacityCPU), - "memory": resource.MustParse(capacityMem), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(capacityCPU), + hv1.ResourceMemory: resource.MustParse(capacityMem), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse(allocationCPU), - "memory": resource.MustParse(allocationMem), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(allocationCPU), + hv1.ResourceMemory: resource.MustParse(allocationMem), }, }, } @@ -81,9 +80,9 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "valid opts with memory and cpu weights", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, + hv1.ResourceCPU: 1.0, }, }, wantErr: false, @@ -91,9 +90,9 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "inverted weights should raise error", opts: KVMBinpackStepOpts{ - ResourceWeights: 
map[corev1.ResourceName]float64{ - corev1.ResourceMemory: -1.0, - corev1.ResourceCPU: -1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: -1.0, + hv1.ResourceCPU: -1.0, }, }, wantErr: true, @@ -101,9 +100,9 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "zero weights should raise error", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 0.0, - corev1.ResourceCPU: 0.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 0.0, + hv1.ResourceCPU: 0.0, }, }, wantErr: true, @@ -111,8 +110,8 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "valid opts with only memory weight", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 2.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 2.0, }, }, wantErr: false, @@ -120,8 +119,8 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "valid opts with only cpu weight", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 0.5, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 0.5, }, }, wantErr: false, @@ -129,9 +128,9 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "zero weights should raise error", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 0.0, - corev1.ResourceCPU: 0.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 0.0, + hv1.ResourceCPU: 0.0, }, }, wantErr: true, @@ -139,7 +138,7 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "valid opts with empty resource weights", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{}, + ResourceWeights: map[hv1.ResourceName]float64{}, }, wantErr: true, }, @@ -148,30 +147,10 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { opts: KVMBinpackStepOpts{}, wantErr: true, }, - { 
- name: "invalid opts with unsupported resource", - opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceStorage: 1.0, - }, - }, - wantErr: true, - errMsg: "unsupported resource", - }, - { - name: "invalid opts with unsupported ephemeral-storage resource", - opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceEphemeralStorage: 1.0, - }, - }, - wantErr: true, - errMsg: "unsupported resource", - }, { name: "invalid opts with custom unsupported resource", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ + ResourceWeights: map[hv1.ResourceName]float64{ "nvidia.com/gpu": 1.0, }, }, @@ -221,8 +200,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 1, []string{"host1", "host2"}), // 8Gi memory opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ // with 0.1 tolerance @@ -243,8 +222,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 1, []string{"host1", "host2"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ // with 0.1 tolerance @@ -261,9 +240,9 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 1, []string{"host1", "host2"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ // with 0.1 tolerance @@ -281,9 +260,9 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: 
newBinpackRequest(8192, 4, 1, []string{"host1"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 2.0, - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ // with 0.1 tolerance @@ -299,8 +278,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 2, []string{"host1"}), // 2 instances opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ // with 0.1 tolerance @@ -314,8 +293,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { hypervisors: []*hv1.Hypervisor{}, request: newBinpackRequest(8192, 4, 1, []string{"host1", "host2"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -333,8 +312,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 1, []string{"host1", "host2"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -351,7 +330,7 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 1, []string{"host1"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{}, + ResourceWeights: map[hv1.ResourceName]float64{}, }, expectedWeights: map[string]float64{ "host1": 0, // No weights configured, score is 0 @@ -364,21 +343,21 @@ func TestKVMBinpackStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: 
map[string]resource.Quantity{ - "cpu": resource.MustParse("0"), - "memory": resource.MustParse("100Gi"), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("100Gi"), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("0"), - "memory": resource.MustParse("80Gi"), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("80Gi"), }, }, }, }, request: newBinpackRequest(8192, 4, 1, []string{"host1"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -392,10 +371,10 @@ func TestKVMBinpackStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("100"), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("100"), }, - Allocation: map[string]resource.Quantity{ + Allocation: map[hv1.ResourceName]resource.Quantity{ // No CPU allocation }, }, @@ -403,8 +382,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 1, []string{"host1"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -418,19 +397,19 @@ func TestKVMBinpackStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ + Capacity: map[hv1.ResourceName]resource.Quantity{ // No CPU capacity }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("80"), + Allocation: 
map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("80"), }, }, }, }, request: newBinpackRequest(8192, 4, 1, []string{"host1"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -446,8 +425,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(20480, 20, 1, []string{"host1"}), // 20Gi, 20 CPUs - more than available opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -547,7 +526,7 @@ func TestKVMBinpackStep_calcVMResources(t *testing.T) { step := &KVMBinpackStep{} resources := step.calcVMResources(tt.request) - memResource, ok := resources[corev1.ResourceMemory] + memResource, ok := resources[hv1.ResourceMemory] if !ok { t.Error("expected memory resource to be present") } else { @@ -557,7 +536,7 @@ func TestKVMBinpackStep_calcVMResources(t *testing.T) { } } - cpuResource, ok := resources[corev1.ResourceCPU] + cpuResource, ok := resources[hv1.ResourceCPU] if !ok { t.Error("expected CPU resource to be present") } else { diff --git a/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation_test.go b/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation_test.go index 9c3ace3ec..0664e55d4 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation_test.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation_test.go @@ -49,9 +49,9 @@ func newFailoverReservation(name, targetHost string, failed bool, allocations ma Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeFailover, TargetHost: targetHost, - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse("4"), - "memory": resource.MustParse("8Gi"), + 
Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("4"), + hv1.ResourceMemory: resource.MustParse("8Gi"), }, FailoverReservation: &v1alpha1.FailoverReservationSpec{ ResourceGroup: "m1.large", @@ -84,9 +84,9 @@ func newCommittedReservation(name, targetHost string) *v1alpha1.Reservation { Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeCommittedResource, TargetHost: targetHost, - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse("4"), - "memory": resource.MustParse("8Gi"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("4"), + hv1.ResourceMemory: resource.MustParse("8Gi"), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: "project-A", diff --git a/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts.go b/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts.go index 1bb070592..8bb5928ee 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts.go @@ -13,7 +13,6 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/internal/scheduling/lib" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" ) @@ -23,7 +22,7 @@ type KVMPreferSmallerHostsStepOpts struct { // of the normalized distances from the smallest capacity for each resource. // If a resource is not specified, it is ignored in the score calculation // (equivalent to a weight of 0). - ResourceWeights map[corev1.ResourceName]float64 `json:"resourceWeights"` + ResourceWeights map[hv1.ResourceName]float64 `json:"resourceWeights"` } // Validate the options to ensure they are correct before running the weigher. 
@@ -31,9 +30,9 @@ func (o KVMPreferSmallerHostsStepOpts) Validate() error { if len(o.ResourceWeights) == 0 { return errors.New("at least one resource weight must be specified") } - supportedResources := []corev1.ResourceName{ - corev1.ResourceMemory, - corev1.ResourceCPU, + supportedResources := []hv1.ResourceName{ + hv1.ResourceMemory, + hv1.ResourceCPU, } for resourceName, val := range o.ResourceWeights { if val < 0 { @@ -73,8 +72,8 @@ func (s *KVMPreferSmallerHostsStep) Run(traceLog *slog.Logger, request api.Exter } // Calculate smallest and largest capacity for each resource across active hosts - smallest := make(map[corev1.ResourceName]*resource.Quantity) - largest := make(map[corev1.ResourceName]*resource.Quantity) + smallest := make(map[hv1.ResourceName]*resource.Quantity) + largest := make(map[hv1.ResourceName]*resource.Quantity) for resourceName := range s.Options.ResourceWeights { for _, hv := range hvs.Items { @@ -82,7 +81,7 @@ func (s *KVMPreferSmallerHostsStep) Run(traceLog *slog.Logger, request api.Exter if _, ok := result.Activations[hv.Name]; !ok { continue } - capacity, ok := hv.Status.Capacity[resourceName.String()] + capacity, ok := hv.Status.Capacity[resourceName] if !ok { traceLog.Warn("hypervisor has no capacity for resource, skipping", "host", hv.Name, "resource", resourceName) @@ -107,7 +106,7 @@ func (s *KVMPreferSmallerHostsStep) Run(traceLog *slog.Logger, request api.Exter var totalWeightedScore, totalWeight float64 for resourceName, weight := range s.Options.ResourceWeights { - capacity, ok := hv.Status.Capacity[resourceName.String()] + capacity, ok := hv.Status.Capacity[resourceName] if !ok { traceLog.Warn("hypervisor has no capacity for resource, skipping", "host", hv.Name, "resource", resourceName) diff --git a/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts_test.go b/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts_test.go index 545c124ab..4a1b70e20 100644 --- 
a/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts_test.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts_test.go @@ -10,7 +10,6 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" @@ -23,9 +22,9 @@ func newHypervisorWithCapacity(name, capacityCPU, capacityMem string) *hv1.Hyper Name: name, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse(capacityCPU), - "memory": resource.MustParse(capacityMem), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(capacityCPU), + hv1.ResourceMemory: resource.MustParse(capacityMem), }, }, } @@ -77,9 +76,9 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "valid opts with memory and cpu weights", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, + hv1.ResourceCPU: 1.0, }, }, wantErr: false, @@ -87,8 +86,8 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "valid opts with only memory weight", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 2.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 2.0, }, }, wantErr: false, @@ -96,8 +95,8 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "valid opts with only cpu weight", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 0.5, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 0.5, }, }, wantErr: false, @@ 
-105,9 +104,9 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "valid opts with zero weights", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 0.0, - corev1.ResourceCPU: 0.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 0.0, + hv1.ResourceCPU: 0.0, }, }, wantErr: false, @@ -115,7 +114,7 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "invalid opts with empty resource weights", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{}, + ResourceWeights: map[hv1.ResourceName]float64{}, }, wantErr: true, errMsg: "at least one resource weight must be specified", @@ -129,8 +128,8 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "invalid opts with negative weight", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: -1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: -1.0, }, }, wantErr: true, @@ -139,37 +138,17 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "invalid opts with negative cpu weight", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: -0.5, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: -0.5, }, }, wantErr: true, errMsg: "resource weights must be greater than or equal to zero", }, - { - name: "invalid opts with unsupported resource", - opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceStorage: 1.0, - }, - }, - wantErr: true, - errMsg: "unsupported resource", - }, - { - name: "invalid opts with unsupported ephemeral-storage resource", - opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceEphemeralStorage: 1.0, - }, - }, - wantErr: true, - errMsg: 
"unsupported resource", - }, { name: "invalid opts with custom unsupported resource", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ + ResourceWeights: map[hv1.ResourceName]float64{ "nvidia.com/gpu": 1.0, }, }, @@ -216,8 +195,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -236,8 +215,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -259,9 +238,9 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -284,9 +263,9 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 2.0, // memory is weighted 2x - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 2.0, // memory is weighted 2x + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -305,8 +284,8 @@ func 
TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -324,8 +303,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -343,8 +322,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -358,8 +337,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { hypervisors: []*hv1.Hypervisor{}, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -378,8 +357,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -397,8 +376,8 @@ func TestKVMPreferSmallerHostsStep_Run(t 
*testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host3"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("100"), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("100"), // No memory capacity }, }, @@ -406,8 +385,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -427,8 +406,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { // Only host1 and host2 in the request (host3 was filtered out) request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -446,8 +425,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -468,8 +447,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3", "host4"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -487,20 +466,20 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) 
{ { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{}, + Capacity: map[hv1.ResourceName]resource.Quantity{}, }, }, { ObjectMeta: metav1.ObjectMeta{Name: "host2"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{}, + Capacity: map[hv1.ResourceName]resource.Quantity{}, }, }, }, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -518,8 +497,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -538,8 +517,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -555,8 +534,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "memory": resource.MustParse("64Gi"), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("64Gi"), // No CPU }, }, @@ -564,8 +543,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host2"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "memory": 
resource.MustParse("128Gi"), + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("128Gi"), // No CPU }, }, @@ -573,9 +552,9 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, - corev1.ResourceCPU: 1.0, // CPU requested but not available + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, + hv1.ResourceCPU: 1.0, // CPU requested but not available }, }, expectedWeights: map[string]float64{ @@ -594,9 +573,9 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 0.0, // zero weight - ignored - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 0.0, // zero weight - ignored + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go index 350de7e8c..13856d992 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager.go +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -9,6 +9,7 @@ import ( "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "github.com/go-logr/logr" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" @@ -77,7 +78,7 @@ func (m *ReservationManager) ApplyCommitmentState( } deltaMemoryBytes := desiredState.TotalMemoryBytes for _, res := range existing { - memoryQuantity := 
res.Spec.Resources["memory"] + memoryQuantity := res.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes -= memoryQuantity.Value() } @@ -104,7 +105,7 @@ func (m *ReservationManager) ApplyCommitmentState( "expectedProjectID", desiredState.ProjectID, "actualProjectID", res.Spec.CommittedResourceReservation.ProjectID) removedReservations = append(removedReservations, res) - memValue := res.Spec.Resources["memory"] + memValue := res.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes += memValue.Value() if err := m.Delete(ctx, &res); err != nil { @@ -132,7 +133,7 @@ func (m *ReservationManager) ApplyCommitmentState( existing = existing[:len(existing)-1] // remove from existing list } removedReservations = append(removedReservations, *reservationToDelete) - memValue := reservationToDelete.Spec.Resources["memory"] + memValue := reservationToDelete.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes += memValue.Value() log.Info("deleting reservation", @@ -153,7 +154,7 @@ func (m *ReservationManager) ApplyCommitmentState( // TODO more sophisticated flavor selection, especially with flavors of different cpu/memory ratio reservation := m.newReservation(desiredState, nextSlotIndex, deltaMemoryBytes, flavorGroup, creator) touchedReservations = append(touchedReservations, *reservation) - memValue := reservation.Spec.Resources["memory"] + memValue := reservation.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes -= memValue.Value() log.Info("creating reservation", @@ -265,12 +266,12 @@ func (m *ReservationManager) newReservation( spec := v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeCommittedResource, - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity( + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity( memoryBytes, resource.BinarySI, ), - "cpu": *resource.NewQuantity( + hv1.ResourceCPU: *resource.NewQuantity( cpus, resource.DecimalSI, ), diff --git 
a/internal/scheduling/reservations/commitments/reservation_manager_test.go b/internal/scheduling/reservations/commitments/reservation_manager_test.go index d8cf9c267..8022999fb 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager_test.go +++ b/internal/scheduling/reservations/commitments/reservation_manager_test.go @@ -9,6 +9,7 @@ import ( "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "github.com/go-logr/logr" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -64,7 +65,7 @@ func TestApplyCommitmentState_CreatesNewReservations(t *testing.T) { // Verify created reservations sum to desired state totalMemory := int64(0) for _, res := range touched { - memQuantity := res.Spec.Resources["memory"] + memQuantity := res.Spec.Resources[hv1.ResourceMemory] totalMemory += memQuantity.Value() } @@ -89,8 +90,8 @@ func TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) { }, }, Spec: v1alpha1.ReservationSpec{ - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: "project-1", @@ -108,8 +109,8 @@ func TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) { }, }, Spec: v1alpha1.ReservationSpec{ - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: "project-1", @@ -168,7 +169,7 @@ func 
TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) { totalMemory := int64(0) for _, res := range remainingList.Items { - memQuantity := res.Spec.Resources["memory"] + memQuantity := res.Spec.Resources[hv1.ResourceMemory] totalMemory += memQuantity.Value() } @@ -193,8 +194,8 @@ func TestApplyCommitmentState_PreservesAllocatedReservations(t *testing.T) { }, }, Spec: v1alpha1.ReservationSpec{ - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: "project-1", @@ -214,8 +215,8 @@ func TestApplyCommitmentState_PreservesAllocatedReservations(t *testing.T) { }, }, Spec: v1alpha1.ReservationSpec{ - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: "project-1", @@ -299,8 +300,8 @@ func TestApplyCommitmentState_HandlesZeroCapacity(t *testing.T) { }, }, Spec: v1alpha1.ReservationSpec{ - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: "project-1", @@ -377,8 +378,8 @@ func TestApplyCommitmentState_FixesWrongFlavorGroup(t *testing.T) { }, }, Spec: v1alpha1.ReservationSpec{ - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + Resources: 
map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: "project-1", @@ -530,7 +531,7 @@ func TestNewReservation_SelectsAppropriateFlavor(t *testing.T) { } // Verify CPU allocation - cpuQuantity := reservation.Spec.Resources["cpu"] + cpuQuantity := reservation.Spec.Resources[hv1.ResourceCPU] if cpuQuantity.Value() != tt.expectedCores { t.Errorf("expected %d cores, got %d", tt.expectedCores, cpuQuantity.Value()) diff --git a/internal/scheduling/reservations/commitments/state_test.go b/internal/scheduling/reservations/commitments/state_test.go index d8581cec1..7060300db 100644 --- a/internal/scheduling/reservations/commitments/state_test.go +++ b/internal/scheduling/reservations/commitments/state_test.go @@ -8,6 +8,7 @@ import ( "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -84,8 +85,8 @@ func TestFromReservations_SumsMemoryCorrectly(t *testing.T) { Name: "commitment-abc123-0", }, Spec: v1alpha1.ReservationSpec{ - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), // 8 GiB + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), // 8 GiB }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: "project-1", @@ -98,8 +99,8 @@ func TestFromReservations_SumsMemoryCorrectly(t *testing.T) { Name: "commitment-abc123-1", }, Spec: v1alpha1.ReservationSpec{ - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), // 16 GiB + Resources: 
map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), // 16 GiB }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: "project-1", @@ -146,8 +147,8 @@ func TestFromReservations_SkipsInconsistentFlavorGroup(t *testing.T) { Name: "commitment-abc123-0", }, Spec: v1alpha1.ReservationSpec{ - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: "project-1", @@ -160,8 +161,8 @@ func TestFromReservations_SkipsInconsistentFlavorGroup(t *testing.T) { Name: "commitment-abc123-1", }, Spec: v1alpha1.ReservationSpec{ - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: "project-1", diff --git a/internal/scheduling/reservations/commitments/syncer_test.go b/internal/scheduling/reservations/commitments/syncer_test.go index 0790545e8..75512299a 100644 --- a/internal/scheduling/reservations/commitments/syncer_test.go +++ b/internal/scheduling/reservations/commitments/syncer_test.go @@ -9,6 +9,7 @@ import ( "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -293,13 +294,13 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { // Check 
resource values - should be sized for the flavor that fits // With 2048MB total capacity, we can fit 2x 1024MB flavors expectedMemory := resource.MustParse("1073741824") // 1024MB in bytes - if !res.Spec.Resources["memory"].Equal(expectedMemory) { - t.Errorf("Expected memory %v, got %v", expectedMemory, res.Spec.Resources["memory"]) + if !res.Spec.Resources[hv1.ResourceMemory].Equal(expectedMemory) { + t.Errorf("Expected memory %v, got %v", expectedMemory, res.Spec.Resources[hv1.ResourceMemory]) } expectedVCPUs := resource.MustParse("2") - if !res.Spec.Resources["cpu"].Equal(expectedVCPUs) { - t.Errorf("Expected vCPUs %v, got %v", expectedVCPUs, res.Spec.Resources["cpu"]) + if !res.Spec.Resources[hv1.ResourceCPU].Equal(expectedVCPUs) { + t.Errorf("Expected vCPUs %v, got %v", expectedVCPUs, res.Spec.Resources[hv1.ResourceCPU]) } } @@ -338,9 +339,9 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { ResourceGroup: "old_group", Creator: CreatorValue, }, - Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("512Mi"), - "cpu": resource.MustParse("1"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("512Mi"), + hv1.ResourceCPU: resource.MustParse("1"), }, }, } diff --git a/internal/scheduling/reservations/controller/controller_test.go b/internal/scheduling/reservations/controller/controller_test.go index 548857d3a..0ef3e253c 100644 --- a/internal/scheduling/reservations/controller/controller_test.go +++ b/internal/scheduling/reservations/controller/controller_test.go @@ -10,6 +10,7 @@ import ( "net/http/httptest" "testing" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -148,9 +149,9 @@ func TestReservationReconciler_reconcileInstanceReservation_Success(t *testing.T ProjectID: "test-project", ResourceName: "test-flavor", }, - Resources: 
map[string]resource.Quantity{ - "memory": resource.MustParse("1Gi"), - "cpu": resource.MustParse("2"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("1Gi"), + hv1.ResourceCPU: resource.MustParse("2"), }, }, } diff --git a/internal/scheduling/reservations/controller/monitor.go b/internal/scheduling/reservations/controller/monitor.go index 3e6c6dae6..0c0ad2875 100644 --- a/internal/scheduling/reservations/controller/monitor.go +++ b/internal/scheduling/reservations/controller/monitor.go @@ -101,7 +101,7 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) { resourcesByLabels[key] = map[string]uint64{} } for resourceName, resourceQuantity := range reservation.Spec.Resources { - resourcesByLabels[key][resourceName] += resourceQuantity.AsDec().UnscaledBig().Uint64() + resourcesByLabels[key][string(resourceName)] += resourceQuantity.AsDec().UnscaledBig().Uint64() } } for key, resources := range resourcesByLabels { diff --git a/internal/scheduling/reservations/controller/monitor_test.go b/internal/scheduling/reservations/controller/monitor_test.go index fef88e35e..eef11892e 100644 --- a/internal/scheduling/reservations/controller/monitor_test.go +++ b/internal/scheduling/reservations/controller/monitor_test.go @@ -14,6 +14,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" ) @@ -98,9 +99,9 @@ func TestMonitor_Collect_WithReservations(t *testing.T) { CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ResourceName: "test-flavor", }, - Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("1Gi"), - "cpu": resource.MustParse("2"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("1Gi"), + hv1.ResourceCPU: 
resource.MustParse("2"), }, }, Status: v1alpha1.ReservationStatus{ @@ -123,9 +124,9 @@ func TestMonitor_Collect_WithReservations(t *testing.T) { CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ResourceName: "test-flavor", }, - Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("2Gi"), - "cpu": resource.MustParse("4"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("2Gi"), + hv1.ResourceCPU: resource.MustParse("4"), }, }, Status: v1alpha1.ReservationStatus{ @@ -148,9 +149,9 @@ func TestMonitor_Collect_WithReservations(t *testing.T) { CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ResourceName: "test-flavor", }, - Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("4Gi"), - "cpu": resource.MustParse("4"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("4Gi"), + hv1.ResourceCPU: resource.MustParse("4"), }, }, Status: v1alpha1.ReservationStatus{ @@ -244,9 +245,9 @@ func TestMonitor_Collect_ResourceMetrics(t *testing.T) { CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ResourceName: "test-flavor", }, - Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("1000Mi"), - "cpu": resource.MustParse("2"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("1000Mi"), + hv1.ResourceCPU: resource.MustParse("2"), }, }, Status: v1alpha1.ReservationStatus{ @@ -367,9 +368,9 @@ func TestMonitor_Collect_LabelSanitization(t *testing.T) { CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ResourceName: "test-flavor", }, - Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("1Gi"), - "cpu": resource.MustParse("2"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("1Gi"), + hv1.ResourceCPU: resource.MustParse("2"), }, }, Status: 
v1alpha1.ReservationStatus{ From ed610ead62515385a61949b739ec8aff5985d356 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 16 Mar 2026 08:39:37 +0000 Subject: [PATCH 28/55] Bump cortex chart appVersions to sha-1aec52b7 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 30a83613a..ded1cd916 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.25 -appVersion: "sha-781dec12" +appVersion: "sha-1aec52b7" icon: "https://example.com/icon.png" dependencies: [] From 60fef16c4b644ec1b3afd747d1767776abcce1bc Mon Sep 17 00:00:00 2001 From: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com> Date: Mon, 16 Mar 2026 15:23:25 +0100 Subject: [PATCH 29/55] Add AGENTS.md (#561) See https://agents.md/#examples -- this file helps agents understand our code, architecture and intentions. --- AGENTS.md | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..6f2e12a17 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,79 @@ + + +# Instructions for Agents + +## Context + +You are developing code in the open-source project github.com/cobaltcore-dev/cortex. + +Cortex is a modular and extensible service for initial placement and scheduling in cloud-native environments covering workloads such as compute, storage, network, and other scheduling domains. + +It improves resource utilization and operational performance by making smart placement decisions based on the current state of the environment and defined constraints and objectives. 
+ +Cortex is written in Golang and is designed for production-scale deployments using algorithmic approaches to balance decision quality, execution efficiency, and maintaining a low resource footprint. + +## Best Practices + +All code files must contain this license header: +```go +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 +``` + +General: +- Keep it concise and always focus on good code quality. We go to production +- We are on modern Golang, so you no longer need `interface{}` and use `any` instead +- Similarly, you no longer have to capture loop variables in closures, as this is now the default behavior in Go +- Don’t document trivial steps you do and avoid unnecessary empty lines between code segments +- When adding imports, keep in mind that the autoformatter will remove them if you don't use them +- `fmt.Errorf` should not be used when there are no parameters. Use `errors.New` +- Errors should always be lowercase like `errors.New("this is an error")` to conform to linting rules +- You can use `maps.Copy` instead of iteratively copying a map +- You can use `strings.Contains` to check if some string is in another +- You can use `slices.Contains` to check if an element is part of a slice +- And definitely use `testlib.Ptr` for test cases that require pointer values + +Testing: +- Ideally test files should be short and contain only the necessary cases +- Avoid creating testing libraries, keep helper functions in the same file as the tests that use them +- Use golang native testing whenever possible, avoid using Ginkgo or testify +- Don't test for the existence of interface methods +- If applicable, use struct based test cases, but limit yourself to the most relevant cases + +Helm charts: +- Note the `# from: file://../../library/cortex-postgres` comment in `Chart.yaml` files, this is required and should point to the local chart path + +## Repository Structure + +Code: +- `cmd/main.go` is the entry point for the manager, which starts the 
controllers and webhooks +- `api/v1alpha1` is where the CRD specs of cortex lives +- `api/external` contains messages sent to cortex via http from external openstack services +- `internal/scheduling` contains the logic for scheduling in different cloud domains +- `internal/knowledge` has all the logic for feature extraction and raw data downloads from sources like prometheus and openstack +- `pkg` is the code that is very non-cortex-specific and can be used across other projects as well + +Deployment: +- `helm/library` contains a generic cortex setup, i.e. the manager and its dependencies +- `helm/dev` contains charts that can deploy cortex dependencies that a typical production cluster already has, such as a fine-tuned kube-prometheus-stack for monitoring +- `helm/bundles` here are the charts that stylize the library chart into a deployment for a specific domain, for example a bundle for deploying cortex with openstack nova +- In the `helm` folders there are also helpers for syncing helm dependencies which are used by the tiltfile for local development and our ci pipelines to replace oci dependencies with local paths + +Tooling: +- `tools` contains miscallaneous tools for development, which should typically **not** be used by agents + +Documentation: +- `docs` contains documentation for cortex, which should be written in markdown + +## Tooling + +Before finishing your task, you should always ensure local tests and lints are passing: +- `make` regenerates CRDs and deepcopy methods, runs tests, and performs lints +- Avoid running `make` when you don't want to apply your crd changes just yet +- `make lint` runs golangci-lint, `make lint-fix` runs golangci-lint with `--fix` +- `make test` runs all the unit tests with `go test ./...` +- If you are struggling with the Makefile, you can use `make help` to get a list of all available commands and their descriptions From cc15c604f4f6c4c1c41c5b6659795b34c26a5b4e Mon Sep 17 00:00:00 2001 From: Philipp Matthes 
<27271818+PhilippMatthes@users.noreply.github.com> Date: Mon, 16 Mar 2026 15:30:49 +0100 Subject: [PATCH 30/55] Support server groups in workload spawner (#560) Support server groups in the workload spawner, so we can spawn vms with anti-affinity or affinity rules. --- tools/spawner/cli/cli.go | 17 ++++++ tools/spawner/main.go | 94 +++++++++++++++++++++++++++-- tools/spawner/types/server_group.go | 11 ++++ 3 files changed, 117 insertions(+), 5 deletions(-) create mode 100644 tools/spawner/types/server_group.go diff --git a/tools/spawner/cli/cli.go b/tools/spawner/cli/cli.go index 0caeaf8d0..57ebfb059 100644 --- a/tools/spawner/cli/cli.go +++ b/tools/spawner/cli/cli.go @@ -12,6 +12,7 @@ import ( "strings" "github.com/cobaltcore-dev/cortex/tools/spawner/defaults" + "github.com/cobaltcore-dev/cortex/tools/spawner/types" "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/flavors" "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/hypervisors" "github.com/gophercloud/gophercloud/v2/openstack/identity/v3/domains" @@ -28,6 +29,8 @@ type CLI interface { ChooseImage([]images.Image) images.Image ChooseHypervisorType([]string) string ChooseHypervisor([]hypervisors.Hypervisor) hypervisors.Hypervisor + ChooseServerGroupPolicy([]string) string + ChooseServerGroup([]types.ServerGroup) types.ServerGroup } type cli struct { @@ -92,6 +95,20 @@ func (c *cli) ChooseHypervisor(hs []hypervisors.Hypervisor) hypervisors.Hypervis return choose(c.defaults, "WS_HYPERVISOR", "📂 Hypervisors", hs, f) } +func (c *cli) ChooseServerGroupPolicy(ps []string) string { + f := func(p string) string { + return p + } + return choose(c.defaults, "WS_SERVER_GROUP_POLICY", "📂 Server Group Policies", ps, f) +} + +func (c *cli) ChooseServerGroup(sgs []types.ServerGroup) types.ServerGroup { + f := func(sg types.ServerGroup) string { + return fmt.Sprintf("%s (%s) id:%s", sg.Name, sg.Policy, sg.ID[:5]) + } + return choose(c.defaults, "WS_SERVER_GROUP", "📂 Server Groups", sgs, f) +} + // Choose 
asks the user to choose one of the given options. // The user can choose by index or by name. The user can also choose the default value. // If the user chooses to input a name, the mapping is done by the displayname function. diff --git a/tools/spawner/main.go b/tools/spawner/main.go index c16b66a1b..2a9bcf12b 100644 --- a/tools/spawner/main.go +++ b/tools/spawner/main.go @@ -19,6 +19,7 @@ import ( "github.com/cobaltcore-dev/cortex/tools/spawner/cli" "github.com/cobaltcore-dev/cortex/tools/spawner/defaults" + "github.com/cobaltcore-dev/cortex/tools/spawner/types" "github.com/gophercloud/gophercloud/v2" "github.com/gophercloud/gophercloud/v2/openstack" "github.com/gophercloud/gophercloud/v2/openstack/blockstorage/v3/volumes" @@ -326,8 +327,8 @@ func main() { var network *networks.Network if len(networksAll) == 1 { fmt.Printf("❓ Delete existing network %s [y/N, default: \033[1;34mN\033[0m]: ", networkName) - reader := bufio.NewReader(os.Stdin) - input := must.Return(reader.ReadString('\n')) + reader = bufio.NewReader(os.Stdin) + input = must.Return(reader.ReadString('\n')) input = strings.TrimSpace(input) if input == "y" { // Delete the subnets. @@ -384,8 +385,8 @@ func main() { // Delete all existing keypairs with the same name. if len(keypairsFiltered) > 0 { fmt.Printf("❓ Delete existing keypairs %v? [y/N, default: \033[1;34my\033[0m]: ", keyName) - reader := bufio.NewReader(os.Stdin) - input := must.Return(reader.ReadString('\n')) + reader = bufio.NewReader(os.Stdin) + input = must.Return(reader.ReadString('\n')) input = strings.TrimSpace(input) if input == "" { input = "y" @@ -412,6 +413,89 @@ func main() { keypair := must.Return(keypairs.Create(ctx, projectCompute, kpo).Extract()) fmt.Printf("🛜 Using keypair %s\n", keyName) + // Check if there are existing server groups and check if the user wants to delete them. + fmt.Println("🔄 Looking up existing server groups") + // Gophercloud doesn't support server groups, so we have to do a raw API call here. 
+ var getServerGroupsResponse struct { + ServerGroups []types.ServerGroup `json:"server_groups"` + } + _ = must.Return(projectCompute.Get(ctx, projectCompute.Endpoint+"/os-server-groups", &getServerGroupsResponse, nil)) + if len(getServerGroupsResponse.ServerGroups) > 0 { + fmt.Printf("❓ Delete existing server groups with name prefix %s [y/N, default: \033[1;34my\033[0m]: ", prefix) + reader = bufio.NewReader(os.Stdin) + input = must.Return(reader.ReadString('\n')) + input = strings.TrimSpace(input) + if input == "" { + input = "y" + } + if input == "y" { + var wg sync.WaitGroup + for _, sg := range getServerGroupsResponse.ServerGroups { + if strings.HasPrefix(sg.Name, prefix) { + wg.Go(func() { + fmt.Printf("🧨 Deleting server group %s\n", sg.Name) + _ = must.Return(projectCompute.Delete(ctx, projectCompute.Endpoint+"/os-server-groups/"+sg.ID, nil)) + fmt.Printf("💥 Deleted server group %s\n", sg.Name) + }) + } + } + wg.Wait() + fmt.Println("🧨 Deleted all existing server groups") + } + } + + var selectedServerGroupID string + + // Get the server groups again and check if the user wants to use an existing one or create a new one. + fmt.Println("🔄 Checking existing server groups again") + _ = must.Return(projectCompute.Get(ctx, projectCompute.Endpoint+"/os-server-groups", &getServerGroupsResponse, nil)) + if len(getServerGroupsResponse.ServerGroups) > 0 { + // Ask the user if they want to use an existing server group. + fmt.Printf("❓ Use existing server group for affinity rules? [y/N, default: \033[1;34mN\033[0m]: ") + reader = bufio.NewReader(os.Stdin) + input = must.Return(reader.ReadString('\n')) + input = strings.TrimSpace(input) + if input == "y" { + selectedServerGroupID = cli.ChooseServerGroup(getServerGroupsResponse.ServerGroups).ID + } + } + // If the user doesn't want to use an existing server group, ask if they want to create a new one. + if selectedServerGroupID == "" { + fmt.Printf("❓ Create a server group for affinity rules? 
[y/N, default: \033[1;34mN\033[0m]: ") + reader = bufio.NewReader(os.Stdin) + input = must.Return(reader.ReadString('\n')) + input = strings.TrimSpace(input) + if input == "y" { + policies := []string{"anti-affinity", "affinity", "soft-anti-affinity", "soft-affinity"} + policy := cli.ChooseServerGroupPolicy(policies) + serverGroupName := prefix + "-server-group" + fmt.Printf("🆕 Creating server group %s with policy %s\n", serverGroupName, policy) + createServerGroupRequest := struct { + ServerGroup struct { + Name string `json:"name"` + Policy string `json:"policy"` + // For simplicity, we don't include rules for now. + } `json:"server_group"` + }{} + createServerGroupRequest.ServerGroup.Name = serverGroupName + createServerGroupRequest.ServerGroup.Policy = policy + var createServerGroupResponse struct { + ServerGroup struct { + ID string `json:"id"` + } `json:"server_group"` + } + _ = must.Return(projectCompute.Post(ctx, projectCompute.Endpoint+"/os-server-groups", &createServerGroupRequest, &createServerGroupResponse, &gophercloud.RequestOpts{ + OkCodes: []int{200, 201, 202}, + })) + selectedServerGroupID = createServerGroupResponse.ServerGroup.ID + } + } + if selectedServerGroupID != "" { + fmt.Printf("🛜 Using server group with id %s\n", selectedServerGroupID) + } else { + fmt.Printf("🚫 Not using a server group for affinity rules\n") + } + // Load the script template tmpl, err := template.ParseFiles("tools/spawner/script.sh.tpl") must.Succeed(err) @@ -473,7 +557,7 @@ func main() { KeyName: keyName, CreateOptsBuilder: sco, } - ho := servers.SchedulerHintOpts{} + ho := servers.SchedulerHintOpts{Group: selectedServerGroupID} serverCreateResult, err := servers.Create(ctx, projectCompute, so, ho).Extract() baseMsg := fmt.Sprintf( "... 
(%d/%d) Spawning VM %s on %s with flavor %s, image %s ", diff --git a/tools/spawner/types/server_group.go b/tools/spawner/types/server_group.go new file mode 100644 index 000000000..da7716db9 --- /dev/null +++ b/tools/spawner/types/server_group.go @@ -0,0 +1,11 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package types + +// Not supported by gophercloud. +type ServerGroup struct { + ID string `json:"id"` + Policy string `json:"policy"` + Name string `json:"name"` +} From bcb664f111b5e1365c6cc81327eaa873b95d193e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 16 Mar 2026 14:40:13 +0000 Subject: [PATCH 31/55] Bump cortex chart appVersions to sha-cc15c604 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index ded1cd916..bf38cc1ce 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.25 -appVersion: "sha-1aec52b7" +appVersion: "sha-cc15c604" icon: "https://example.com/icon.png" dependencies: [] From 6db36b819af6230b9ef0eac09362da2788ded12f Mon Sep 17 00:00:00 2001 From: Malte <140147670+umswmayj@users.noreply.github.com> Date: Tue, 17 Mar 2026 10:14:42 +0100 Subject: [PATCH 32/55] fix: Bump and fix postgresql base image (#586) --- postgres/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/postgres/Dockerfile b/postgres/Dockerfile index 796069d8b..09c049295 100644 --- a/postgres/Dockerfile +++ b/postgres/Dockerfile @@ -1,5 +1,5 @@ -# Last updated: 19 Feb 2026 -FROM debian:trixie-slim +# Last updated: 17 Mar 2026 +FROM debian:trixie-slim@sha256:26f98ccd92fd0a44d6928ce8ff8f4921b4d2f535bfa07555ee5d18f61429cf0c # explicitly set user/group IDs RUN set -eux; \ @@ -194,4 +194,4 @@ STOPSIGNAL SIGINT # that even 90 seconds may not be long enough in many instances. EXPOSE 5432 -CMD ["postgres"] \ No newline at end of file +CMD ["postgres"] From f98b13202ac32e593d73ac29305ab37fd4483253 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 17 Mar 2026 09:30:40 +0000 Subject: [PATCH 33/55] Bump cortex-postgres chart appVersions to sha-6db36b81 [skip ci] --- helm/library/cortex-postgres/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex-postgres/Chart.yaml b/helm/library/cortex-postgres/Chart.yaml index ef1fc6398..f849d8bfc 100644 --- a/helm/library/cortex-postgres/Chart.yaml +++ b/helm/library/cortex-postgres/Chart.yaml @@ -6,4 +6,4 @@ name: cortex-postgres description: Postgres setup for Cortex. 
type: application version: 0.5.12 -appVersion: "sha-73adf5e6" +appVersion: "sha-6db36b81" From 9b9b83573dac33bd9de3d2dc974e8773100eed64 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 17 Mar 2026 09:30:41 +0000 Subject: [PATCH 34/55] Bump cortex chart appVersions to sha-6db36b81 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index bf38cc1ce..323c747d1 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.25 -appVersion: "sha-cc15c604" +appVersion: "sha-6db36b81" icon: "https://example.com/icon.png" dependencies: [] From 6f843fe7983979ecdc4915ab69eeb1e065353afb Mon Sep 17 00:00:00 2001 From: Malte <140147670+umswmayj@users.noreply.github.com> Date: Tue, 17 Mar 2026 11:27:24 +0100 Subject: [PATCH 35/55] fix: Bump postgres chart (#588) --- helm/bundles/cortex-cinder/Chart.yaml | 2 +- helm/bundles/cortex-manila/Chart.yaml | 2 +- helm/bundles/cortex-nova/Chart.yaml | 2 +- helm/library/cortex-postgres/Chart.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/helm/bundles/cortex-cinder/Chart.yaml b/helm/bundles/cortex-cinder/Chart.yaml index b81e033df..2c6bba933 100644 --- a/helm/bundles/cortex-cinder/Chart.yaml +++ b/helm/bundles/cortex-cinder/Chart.yaml @@ -11,7 +11,7 @@ dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.5.12 + version: 0.5.13 # from: file://../../library/cortex - name: cortex diff --git a/helm/bundles/cortex-manila/Chart.yaml b/helm/bundles/cortex-manila/Chart.yaml index 98e7f587c..0fd2bf0a6 100644 --- a/helm/bundles/cortex-manila/Chart.yaml +++ b/helm/bundles/cortex-manila/Chart.yaml @@ -11,7 +11,7 @@ dependencies: # from: 
file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.5.12 + version: 0.5.13 # from: file://../../library/cortex - name: cortex diff --git a/helm/bundles/cortex-nova/Chart.yaml b/helm/bundles/cortex-nova/Chart.yaml index ec1c16b2f..5c349ae49 100644 --- a/helm/bundles/cortex-nova/Chart.yaml +++ b/helm/bundles/cortex-nova/Chart.yaml @@ -11,7 +11,7 @@ dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.5.12 + version: 0.5.13 # from: file://../../library/cortex - name: cortex diff --git a/helm/library/cortex-postgres/Chart.yaml b/helm/library/cortex-postgres/Chart.yaml index f849d8bfc..39710ebad 100644 --- a/helm/library/cortex-postgres/Chart.yaml +++ b/helm/library/cortex-postgres/Chart.yaml @@ -5,5 +5,5 @@ apiVersion: v2 name: cortex-postgres description: Postgres setup for Cortex. type: application -version: 0.5.12 +version: 0.5.13 appVersion: "sha-6db36b81" From 354b3d8ba29e39a1d27acc32c4561f6218ecb70d Mon Sep 17 00:00:00 2001 From: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com> Date: Tue, 17 Mar 2026 11:30:51 +0100 Subject: [PATCH 36/55] Restore descheduler alerting (#587) During the pipeline refactoring we stripped away the metric `cortex_descheduler_pipeline_vm_descheduling_duration_seconds_count`. We now only have the metric `cortex_detector_pipeline_run_duration_seconds_count` which however doesn't support the `error` label. This change adds the `error` label back in and adjusts the corresponding alert. Also, we add a unit test to check if the label is propagated correctly. 
--- .../cortex-nova/alerts/nova.alerts.yaml | 2 +- internal/scheduling/lib/detector_monitor.go | 6 +- internal/scheduling/lib/detector_pipeline.go | 14 ++- .../scheduling/lib/detector_pipeline_test.go | 105 ++++++++++++++++++ .../dashboards/cortex-status.json | 8 +- 5 files changed, 122 insertions(+), 13 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 1c580a9ba..c0190487f 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -40,7 +40,7 @@ groups: but the quality of the responses may be affected. - alert: CortexNovaDeschedulerPipelineErroring - expr: delta(cortex_descheduler_pipeline_vm_descheduling_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 + expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 for: 5m labels: context: descheduler diff --git a/internal/scheduling/lib/detector_monitor.go b/internal/scheduling/lib/detector_monitor.go index 4df9c9950..c72ab2804 100644 --- a/internal/scheduling/lib/detector_monitor.go +++ b/internal/scheduling/lib/detector_monitor.go @@ -17,7 +17,7 @@ type DetectorPipelineMonitor struct { // A counter to measure how many vm ids are selected for descheduling by each step. stepDeschedulingCounter *prometheus.GaugeVec // A histogram to measure how long the pipeline takes to run in total. - pipelineRunTimer prometheus.Histogram + pipelineRunTimer *prometheus.HistogramVec // The name of the pipeline being monitored. 
PipelineName string @@ -34,11 +34,11 @@ func NewDetectorPipelineMonitor() DetectorPipelineMonitor { Name: "cortex_detector_pipeline_step_detections", Help: "Number of resources detected by a detector pipeline step", }, []string{"step"}), - pipelineRunTimer: prometheus.NewHistogram(prometheus.HistogramOpts{ + pipelineRunTimer: prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: "cortex_detector_pipeline_run_duration_seconds", Help: "Duration of descheduler pipeline run", Buckets: prometheus.DefBuckets, - }), + }, []string{"error"}), } } diff --git a/internal/scheduling/lib/detector_pipeline.go b/internal/scheduling/lib/detector_pipeline.go index b0db5f235..bed650562 100644 --- a/internal/scheduling/lib/detector_pipeline.go +++ b/internal/scheduling/lib/detector_pipeline.go @@ -61,12 +61,15 @@ func (p *DetectorPipeline[DetectionType]) Init( // Execute the descheduler steps in parallel and collect the decisions made by // each step. func (p *DetectorPipeline[DetectionType]) Run() map[string][]DetectionType { + lock := sync.Mutex{} + decisionsByStep := map[string][]DetectionType{} + metricErrLabel := "false" if p.Monitor.pipelineRunTimer != nil { - timer := prometheus.NewTimer(p.Monitor.pipelineRunTimer) + timer := prometheus.NewTimer(prometheus.ObserverFunc(func(v float64) { + p.Monitor.pipelineRunTimer.WithLabelValues(metricErrLabel).Observe(v) + })) defer timer.ObserveDuration() } - var lock sync.Mutex - decisionsByStep := map[string][]DetectionType{} var wg sync.WaitGroup for stepName, step := range p.steps { wg.Go(func() { @@ -76,13 +79,14 @@ func (p *DetectorPipeline[DetectionType]) Run() map[string][]DetectionType { slog.Info("descheduler: step skipped") return } + lock.Lock() + defer lock.Unlock() if err != nil { slog.Error("descheduler: failed to run step", "error", err) + metricErrLabel = "true" return } slog.Info("descheduler: finished step") - lock.Lock() - defer lock.Unlock() decisionsByStep[stepName] = decisions }) } diff --git 
a/internal/scheduling/lib/detector_pipeline_test.go b/internal/scheduling/lib/detector_pipeline_test.go index 99ad1834b..9d14cc661 100644 --- a/internal/scheduling/lib/detector_pipeline_test.go +++ b/internal/scheduling/lib/detector_pipeline_test.go @@ -10,6 +10,8 @@ import ( "testing" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -379,3 +381,106 @@ func TestDetectorPipeline_RunWithMonitor(t *testing.T) { t.Errorf("expected 1 step result, got %d", len(result)) } } + +func TestDetectorPipeline_Run_MetricErrorLabel(t *testing.T) { + tests := []struct { + name string + steps map[string]Detector[mockDetection] + expectedErrLabel string + }{ + { + name: "successful run has error=false label", + steps: map[string]Detector[mockDetection]{ + "step1": &mockDetectorStep{ + decisions: []mockDetection{ + {resource: "vm1", host: "host1", reason: "reason1"}, + }, + }, + }, + expectedErrLabel: "false", + }, + { + name: "failed step sets error=true label", + steps: map[string]Detector[mockDetection]{ + "failing_step": &mockDetectorStep{ + runErr: errors.New("run failed"), + }, + }, + expectedErrLabel: "true", + }, + { + name: "one failing step among multiple sets error=true label", + steps: map[string]Detector[mockDetection]{ + "failing_step": &mockDetectorStep{ + runErr: errors.New("run failed"), + }, + "working_step": &mockDetectorStep{ + decisions: []mockDetection{ + {resource: "vm1", host: "host1", reason: "reason1"}, + }, + }, + }, + expectedErrLabel: "true", + }, + { + name: "skipped step does not set error=true label", + steps: map[string]Detector[mockDetection]{ + "skipped_step": &mockDetectorStep{ + runErr: ErrStepSkipped, + }, + }, + expectedErrLabel: "false", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + monitor := 
NewDetectorPipelineMonitor() + pipeline := &DetectorPipeline[mockDetection]{ + steps: tt.steps, + Monitor: monitor, + } + + pipeline.Run() + + // Verify the histogram has observations + count := testutil.CollectAndCount(monitor.pipelineRunTimer, "cortex_detector_pipeline_run_duration_seconds") + if count == 0 { + t.Errorf("expected histogram to have observations") + } + + // Gather metrics from the histogram and check the labels + reg := prometheus.NewRegistry() + reg.MustRegister(monitor.pipelineRunTimer) + families, err := reg.Gather() + if err != nil { + t.Fatalf("failed to gather metrics: %v", err) + } + + found := false + for _, family := range families { + if family.GetName() != "cortex_detector_pipeline_run_duration_seconds" { + continue + } + for _, metric := range family.GetMetric() { + for _, label := range metric.GetLabel() { + if label.GetName() == "error" && label.GetValue() == tt.expectedErrLabel { + found = true + } + // Verify opposite label is not present + oppositeLabel := "true" + if tt.expectedErrLabel == "true" { + oppositeLabel = "false" + } + if label.GetName() == "error" && label.GetValue() == oppositeLabel { + t.Errorf("expected metric to NOT have error=%s label", oppositeLabel) + } + } + } + } + if !found { + t.Errorf("expected metric to have error=%s label", tt.expectedErrLabel) + } + }) + } +} diff --git a/tools/plutono/provisioning/dashboards/cortex-status.json b/tools/plutono/provisioning/dashboards/cortex-status.json index 043d07983..2481aded0 100644 --- a/tools/plutono/provisioning/dashboards/cortex-status.json +++ b/tools/plutono/provisioning/dashboards/cortex-status.json @@ -2124,9 +2124,9 @@ "targets": [ { "exemplar": true, - "expr": "sum by(error, skipped) (rate(cortex_descheduler_pipeline_vm_descheduling_duration_seconds_count{}[2m]))", + "expr": "sum by(error) (rate(cortex_detector_pipeline_run_duration_seconds_count{}[2m]))", "interval": "", - "legendFormat": "Error: {{error}}, Skipped: {{skipped}}", + "legendFormat": "Error: 
{{error}}", "refId": "A" } ], @@ -2321,11 +2321,11 @@ "targets": [ { "exemplar": true, - "expr": "sum by (error, skipped, source_host, target_host, vm_id) (delta(cortex_descheduler_pipeline_vm_descheduling_duration_seconds_count{}[2m]))", + "expr": "sum by (error) (delta(cortex_detector_pipeline_run_duration_seconds_count{}[2m]))", "format": "time_series", "instant": false, "interval": "", - "legendFormat": "{{vm_id}}: ({{source_host}}) -> ({{target_host}}), Error: {{error}}, Skipped: {{skipped}}", + "legendFormat": "Error: {{error}}", "refId": "A" } ], From 19ec6e6194ad16d96a44440c86a8f975a156f3dd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 17 Mar 2026 10:40:18 +0000 Subject: [PATCH 37/55] Bump cortex chart appVersions to sha-354b3d8b [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 323c747d1..3b1609bd6 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.25 -appVersion: "sha-6db36b81" +appVersion: "sha-354b3d8b" icon: "https://example.com/icon.png" dependencies: [] From 47117be277e93f292847d9b57dd89e6b3466a718 Mon Sep 17 00:00:00 2001 From: Malte <140147670+umswmayj@users.noreply.github.com> Date: Tue, 17 Mar 2026 12:47:47 +0100 Subject: [PATCH 38/55] fix: Bump cortex-x (new postgres version) (#591) --- helm/bundles/cortex-cinder/Chart.yaml | 2 +- helm/bundles/cortex-crds/Chart.yaml | 2 +- helm/bundles/cortex-ironcore/Chart.yaml | 2 +- helm/bundles/cortex-manila/Chart.yaml | 2 +- helm/bundles/cortex-nova/Chart.yaml | 2 +- helm/bundles/cortex-pods/Chart.yaml | 2 +- helm/library/cortex/Chart.yaml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/helm/bundles/cortex-cinder/Chart.yaml b/helm/bundles/cortex-cinder/Chart.yaml index 2c6bba933..d9ee0a674 100644 --- a/helm/bundles/cortex-cinder/Chart.yaml +++ b/helm/bundles/cortex-cinder/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-cinder description: A Helm chart deploying Cortex for Cinder. type: application -version: 0.0.38 +version: 0.0.39 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres diff --git a/helm/bundles/cortex-crds/Chart.yaml b/helm/bundles/cortex-crds/Chart.yaml index 5427ac308..840298499 100644 --- a/helm/bundles/cortex-crds/Chart.yaml +++ b/helm/bundles/cortex-crds/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-crds description: A Helm chart deploying Cortex CRDs. type: application -version: 0.0.38 +version: 0.0.39 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex diff --git a/helm/bundles/cortex-ironcore/Chart.yaml b/helm/bundles/cortex-ironcore/Chart.yaml index a555ba200..56f696d1e 100644 --- a/helm/bundles/cortex-ironcore/Chart.yaml +++ b/helm/bundles/cortex-ironcore/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-ironcore description: A Helm chart deploying Cortex for IronCore. 
type: application -version: 0.0.38 +version: 0.0.39 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex diff --git a/helm/bundles/cortex-manila/Chart.yaml b/helm/bundles/cortex-manila/Chart.yaml index 0fd2bf0a6..955676a3f 100644 --- a/helm/bundles/cortex-manila/Chart.yaml +++ b/helm/bundles/cortex-manila/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-manila description: A Helm chart deploying Cortex for Manila. type: application -version: 0.0.38 +version: 0.0.39 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres diff --git a/helm/bundles/cortex-nova/Chart.yaml b/helm/bundles/cortex-nova/Chart.yaml index 5c349ae49..6e8c90172 100644 --- a/helm/bundles/cortex-nova/Chart.yaml +++ b/helm/bundles/cortex-nova/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-nova description: A Helm chart deploying Cortex for Nova. type: application -version: 0.0.38 +version: 0.0.39 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres diff --git a/helm/bundles/cortex-pods/Chart.yaml b/helm/bundles/cortex-pods/Chart.yaml index 949a2493a..d0c040d91 100644 --- a/helm/bundles/cortex-pods/Chart.yaml +++ b/helm/bundles/cortex-pods/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-pods description: A Helm chart deploying Cortex for Pods. type: application -version: 0.0.38 +version: 0.0.39 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 3b1609bd6..57097a54a 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: cortex description: A Helm chart to distribute cortex. 
type: application -version: 0.0.25 +version: 0.0.26 appVersion: "sha-354b3d8b" icon: "https://example.com/icon.png" dependencies: [] From 6b9fa2902d48451e59904e60981236efcd2a330e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 17 Mar 2026 11:56:53 +0000 Subject: [PATCH 39/55] Bump cortex chart appVersions to sha-47117be2 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 57097a54a..acd59c960 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.26 -appVersion: "sha-354b3d8b" +appVersion: "sha-47117be2" icon: "https://example.com/icon.png" dependencies: [] From c02bd8af38be251f5b9a72d72c32fa127b5786a0 Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Tue, 17 Mar 2026 12:59:51 +0100 Subject: [PATCH 40/55] fix: Bump cortex --- helm/bundles/cortex-cinder/Chart.yaml | 4 ++-- helm/bundles/cortex-crds/Chart.yaml | 2 +- helm/bundles/cortex-ironcore/Chart.yaml | 2 +- helm/bundles/cortex-manila/Chart.yaml | 4 ++-- helm/bundles/cortex-nova/Chart.yaml | 4 ++-- helm/bundles/cortex-pods/Chart.yaml | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/helm/bundles/cortex-cinder/Chart.yaml b/helm/bundles/cortex-cinder/Chart.yaml index d9ee0a674..a6e369e27 100644 --- a/helm/bundles/cortex-cinder/Chart.yaml +++ b/helm/bundles/cortex-cinder/Chart.yaml @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster 
with information on diff --git a/helm/bundles/cortex-crds/Chart.yaml b/helm/bundles/cortex-crds/Chart.yaml index 840298499..c4b2d7095 100644 --- a/helm/bundles/cortex-crds/Chart.yaml +++ b/helm/bundles/cortex-crds/Chart.yaml @@ -11,7 +11,7 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-ironcore/Chart.yaml b/helm/bundles/cortex-ironcore/Chart.yaml index 56f696d1e..0a4a278c5 100644 --- a/helm/bundles/cortex-ironcore/Chart.yaml +++ b/helm/bundles/cortex-ironcore/Chart.yaml @@ -11,7 +11,7 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. 
This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-manila/Chart.yaml b/helm/bundles/cortex-manila/Chart.yaml index 955676a3f..ac7420728 100644 --- a/helm/bundles/cortex-manila/Chart.yaml +++ b/helm/bundles/cortex-manila/Chart.yaml @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-nova/Chart.yaml b/helm/bundles/cortex-nova/Chart.yaml index 6e8c90172..0a02660e9 100644 --- a/helm/bundles/cortex-nova/Chart.yaml +++ b/helm/bundles/cortex-nova/Chart.yaml @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-pods/Chart.yaml b/helm/bundles/cortex-pods/Chart.yaml index d0c040d91..dafbc2205 100644 --- a/helm/bundles/cortex-pods/Chart.yaml +++ b/helm/bundles/cortex-pods/Chart.yaml @@ -11,7 +11,7 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. 
This makes it easier to find out who to contact in case From 63434bcd3645a66457184dd161909636c53cbbdd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 17 Mar 2026 12:10:20 +0000 Subject: [PATCH 41/55] Bump cortex chart appVersions to sha-c02bd8af [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index acd59c960..62542d328 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.26 -appVersion: "sha-47117be2" +appVersion: "sha-c02bd8af" icon: "https://example.com/icon.png" dependencies: [] From 13aca98b55d84e1535f84edcf15c4d1662ab92ab Mon Sep 17 00:00:00 2001 From: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com> Date: Tue, 17 Mar 2026 13:30:40 +0100 Subject: [PATCH 42/55] Bump CortexNovaSchedulingDown alert to critical (#589) This alert has been around for some time, and so far never reported false positives or flapped. So we can consider it stable. Since we're going onto the critical path with cortex for nova kvm, it's crucial that we escalate this alert. There's also an actionable playbook for this alert already. 
--- helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index c0190487f..65de5c626 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -10,15 +10,16 @@ groups: context: liveness dashboard: cortex/cortex service: cortex - severity: warning + severity: critical support_group: workload-management playbook: docs/support/playbook/cortex/down annotations: summary: "Cortex Scheduling for Nova is down" description: > The Cortex scheduling service is down. Scheduling requests from Nova will - not be served. This is no immediate problem, since Nova will continue - placing new VMs. However, the placement will be less desirable. + not be served. This is non-critical for vmware virtual machines, but + blocks kvm virtual machines from being scheduled. Thus, it is + recommended to immediately investigate and resolve the issue. - alert: CortexNovaKnowledgeDown expr: | From ca7b0eae663bcb8ece14eb560e92d15a3f5e9651 Mon Sep 17 00:00:00 2001 From: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com> Date: Tue, 17 Mar 2026 13:31:06 +0100 Subject: [PATCH 43/55] Use hv1 effective capacity for weighing + filtering (#583) In https://github.com/cobaltcore-dev/openstack-hypervisor-operator/pull/257 we introduced a new `effectiveCapacity` field of the hypervisor crd. In https://github.com/cobaltcore-dev/kvm-node-agent/pull/70 we populate this field. Now we can upgrade cortex so it performs scheduling on the overcommitted capacity. 
--- internal/scheduling/nova/integration_test.go | 2 +- .../plugins/filters/filter_has_enough_capacity.go | 10 ++++++++-- .../filters/filter_has_enough_capacity_test.go | 2 +- .../nova/plugins/weighers/kvm_binpack.go | 7 ++++--- .../nova/plugins/weighers/kvm_binpack_test.go | 8 ++++---- .../plugins/weighers/kvm_prefer_smaller_hosts.go | 15 ++++++++------- .../weighers/kvm_prefer_smaller_hosts_test.go | 12 ++++++------ 7 files changed, 32 insertions(+), 24 deletions(-) diff --git a/internal/scheduling/nova/integration_test.go b/internal/scheduling/nova/integration_test.go index 596d9f2ed..a1267c9c0 100644 --- a/internal/scheduling/nova/integration_test.go +++ b/internal/scheduling/nova/integration_test.go @@ -48,7 +48,7 @@ func newHypervisor(name, cpuCap, cpuAlloc, memCap, memAlloc string) *hv1.Hypervi Name: name, }, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse(cpuCap), hv1.ResourceMemory: resource.MustParse(memCap), }, diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go index 8852f6151..198f1a28f 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go @@ -56,8 +56,14 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa return nil, err } for _, hv := range hvs.Items { - // Start with the total capacity. - freeResourcesByHost[hv.Name] = hv.Status.Capacity + // This case would be caught below, but we want to log this explicitly. + if hv.Status.EffectiveCapacity == nil { + traceLog.Warn("hypervisor with nil effective capacity, skipping", "host", hv.Name) + continue + } + + // Start with the total effective capacity which is capacity * overcommit ratio. 
+ freeResourcesByHost[hv.Name] = hv.Status.EffectiveCapacity // Subtract allocated resources. for resourceName, allocated := range hv.Status.Allocation { diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go index 504bbb523..4068cf900 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go @@ -39,7 +39,7 @@ func newHypervisor(name, cpuCap, cpuAlloc, memCap, memAlloc string) *hv1.Hypervi Name: name, }, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse(cpuCap), hv1.ResourceMemory: resource.MustParse(memCap), }, diff --git a/internal/scheduling/nova/plugins/weighers/kvm_binpack.go b/internal/scheduling/nova/plugins/weighers/kvm_binpack.go index 3bed165f4..e1509a4cc 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_binpack.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_binpack.go @@ -93,14 +93,15 @@ func (s *KVMBinpackStep) Run(traceLog *slog.Logger, request api.ExternalSchedule var totalWeightedUtilization, totalWeight float64 for resourceName, weight := range s.Options.ResourceWeights { - capacity, ok := hv.Status.Capacity[resourceName] + // Effective capacity = capacity * overcommit ratio. 
+ capacity, ok := hv.Status.EffectiveCapacity[resourceName] if !ok { - traceLog.Warn("no capacity in status, skipping", + traceLog.Warn("no effective capacity in status, skipping", "host", host, "resource", resourceName) continue } if capacity.IsZero() { - traceLog.Warn("capacity is zero, skipping", + traceLog.Warn("effective capacity is zero, skipping", "host", host, "resource", resourceName) continue } diff --git a/internal/scheduling/nova/plugins/weighers/kvm_binpack_test.go b/internal/scheduling/nova/plugins/weighers/kvm_binpack_test.go index e867c5bf7..69e1aa9f6 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_binpack_test.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_binpack_test.go @@ -22,7 +22,7 @@ func newHypervisor(name, capacityCPU, capacityMem, allocationCPU, allocationMem Name: name, }, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse(capacityCPU), hv1.ResourceMemory: resource.MustParse(capacityMem), }, @@ -343,7 +343,7 @@ func TestKVMBinpackStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse("0"), hv1.ResourceMemory: resource.MustParse("100Gi"), }, @@ -371,7 +371,7 @@ func TestKVMBinpackStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse("100"), }, Allocation: map[hv1.ResourceName]resource.Quantity{ @@ -397,7 +397,7 @@ func TestKVMBinpackStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: 
map[hv1.ResourceName]resource.Quantity{ // No CPU capacity }, Allocation: map[hv1.ResourceName]resource.Quantity{ diff --git a/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts.go b/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts.go index 8bb5928ee..b65a5f75f 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts.go @@ -81,9 +81,10 @@ func (s *KVMPreferSmallerHostsStep) Run(traceLog *slog.Logger, request api.Exter if _, ok := result.Activations[hv.Name]; !ok { continue } - capacity, ok := hv.Status.Capacity[resourceName] + // Effective capacity = capacity * overcommit ratio. + capacity, ok := hv.Status.EffectiveCapacity[resourceName] if !ok { - traceLog.Warn("hypervisor has no capacity for resource, skipping", + traceLog.Warn("hypervisor has no effective capacity for resource, skipping", "host", hv.Name, "resource", resourceName) continue } @@ -106,9 +107,9 @@ func (s *KVMPreferSmallerHostsStep) Run(traceLog *slog.Logger, request api.Exter var totalWeightedScore, totalWeight float64 for resourceName, weight := range s.Options.ResourceWeights { - capacity, ok := hv.Status.Capacity[resourceName] + capacity, ok := hv.Status.EffectiveCapacity[resourceName] if !ok { - traceLog.Warn("hypervisor has no capacity for resource, skipping", + traceLog.Warn("hypervisor has no effective capacity for resource, skipping", "host", hv.Name, "resource", resourceName) continue } @@ -117,14 +118,14 @@ func (s *KVMPreferSmallerHostsStep) Run(traceLog *slog.Logger, request api.Exter largestCap := largest[resourceName] if smallestCap == nil || largestCap == nil { - traceLog.Warn("no capacity range found for resource, skipping", + traceLog.Warn("no effective capacity range found for resource, skipping", "resource", resourceName) continue } - // If all hosts have the same capacity for this resource, skip it + // If all hosts have the same effective 
capacity for this resource, skip it if smallestCap.Cmp(*largestCap) == 0 { - traceLog.Info("all hypervisors have the same capacity for resource, skipping", + traceLog.Info("all hypervisors have the same effective capacity for resource, skipping", "resource", resourceName) continue } diff --git a/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts_test.go b/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts_test.go index 4a1b70e20..2ab2deb89 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts_test.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts_test.go @@ -22,7 +22,7 @@ func newHypervisorWithCapacity(name, capacityCPU, capacityMem string) *hv1.Hyper Name: name, }, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse(capacityCPU), hv1.ResourceMemory: resource.MustParse(capacityMem), }, @@ -376,7 +376,7 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host3"}, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse("100"), // No memory capacity }, @@ -466,13 +466,13 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{}, + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{}, }, }, { ObjectMeta: metav1.ObjectMeta{Name: "host2"}, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{}, + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{}, }, }, }, @@ -534,7 +534,7 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: 
map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceMemory: resource.MustParse("64Gi"), // No CPU }, @@ -543,7 +543,7 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host2"}, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceMemory: resource.MustParse("128Gi"), // No CPU }, From c71d9d4649a22617af58c9e12aa11aa5d4bd1be9 Mon Sep 17 00:00:00 2001 From: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com> Date: Tue, 17 Mar 2026 13:36:08 +0100 Subject: [PATCH 44/55] Provide hypervisor overcommit controller (#584) This change adds a controller to the nova scheduling controller manager, which finds hypervisors with specific traits and assigns desired overcommit ratios to them. --- cmd/main.go | 9 + helm/bundles/cortex-nova/values.yaml | 6 + .../templates/rbac/hypervisor_role.yaml | 3 + .../nova/hypervisor_overcommit_controller.go | 242 +++++ .../hypervisor_overcommit_controller_test.go | 936 ++++++++++++++++++ pkg/conf/conf.go | 28 +- 6 files changed, 1218 insertions(+), 6 deletions(-) create mode 100644 internal/scheduling/nova/hypervisor_overcommit_controller.go create mode 100644 internal/scheduling/nova/hypervisor_overcommit_controller_test.go diff --git a/cmd/main.go b/cmd/main.go index 43ae63f9c..46a244de1 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -369,6 +369,15 @@ func main() { os.Exit(1) } } + if slices.Contains(mainConfig.EnabledControllers, "hypervisor-overcommit-controller") { + hypervisorOvercommitController := &nova.HypervisorOvercommitController{} + hypervisorOvercommitController.Client = multiclusterClient + if err := hypervisorOvercommitController.SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", + "controller", "HypervisorOvercommitController") + os.Exit(1) + } + } if 
slices.Contains(mainConfig.EnabledControllers, "manila-decisions-pipeline-controller") { controller := &manila.FilterWeigherPipelineController{ Monitor: filterWeigherPipelineMonitor, diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index 200ba3ff3..c38b8bfc4 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -113,6 +113,7 @@ cortex-scheduling-controllers: enabledControllers: - nova-pipeline-controllers - nova-deschedulings-executor + - hypervisor-overcommit-controller - explanation-controller - reservations-controller enabledTasks: @@ -120,6 +121,11 @@ cortex-scheduling-controllers: # Endpoints configuration for reservations controller endpoints: novaExternalScheduler: "http://localhost:8080/scheduler/nova/external" + # OvercommitMappings is a list of mappings that map hypervisor traits to + # overcommit ratios. Note that this list is applied in order, so if there + # are multiple mappings applying to the same hypervisors, the last mapping + # in this list will override the previous ones. 
+ overcommitMappings: [] cortex-knowledge-controllers: <<: *cortex diff --git a/helm/library/cortex/templates/rbac/hypervisor_role.yaml b/helm/library/cortex/templates/rbac/hypervisor_role.yaml index 14b61e5de..0a2fefa00 100644 --- a/helm/library/cortex/templates/rbac/hypervisor_role.yaml +++ b/helm/library/cortex/templates/rbac/hypervisor_role.yaml @@ -1,5 +1,6 @@ {{- if .Values.rbac.hypervisor.enable }} --- +# TODO: Check if this role can be part of the nova bundle, not the core library apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: @@ -14,6 +15,8 @@ rules: verbs: - get - list + - patch + - update - watch - apiGroups: - kvm.cloud.sap diff --git a/internal/scheduling/nova/hypervisor_overcommit_controller.go b/internal/scheduling/nova/hypervisor_overcommit_controller.go new file mode 100644 index 000000000..8d54475f0 --- /dev/null +++ b/internal/scheduling/nova/hypervisor_overcommit_controller.go @@ -0,0 +1,242 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package nova + +import ( + "context" + "errors" + "fmt" + "maps" + "slices" + + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/cobaltcore-dev/cortex/pkg/multicluster" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/client-go/util/workqueue" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +// HypervisorOvercommitMapping maps hypervisor types to their desired +// overcommit ratios. This mapping will be loaded from a configmap +// that is mounted into the controller pod. +type HypervisorOvercommitMapping struct { + // Overcommit is the overcommit ratio to set for hypervisors by resource name. 
+ // Values must be set to something >= 1.0, otherwise the controller will + // ignore them. + Overcommit map[hv1.ResourceName]float64 `json:"overcommit"` + + // HasTrait specifies a trait that a hypervisor may have, and that, if present, + // triggers the controller to set the overcommit ratio specified in the + // overcommit field for that hypervisor. + HasTrait *string `json:"hasTrait,omitempty"` + + // HasntTrait specifies a trait that a hypervisor may have, and that, if + // NOT present, triggers the controller to set the overcommit ratio + // specified in the overcommit field for that hypervisor. + HasntTrait *string `json:"hasntTrait,omitempty"` +} + +// Validate the provided HypervisorOvercommitMapping, returning an error if the +// mapping is invalid. +func (m *HypervisorOvercommitMapping) Validate() error { + for resource, overcommit := range m.Overcommit { + if overcommit < 1.0 { + return errors.New("invalid overcommit ratio in config, must be >= 1.0. " + + "Invalid value for resource " + string(resource) + ": " + + fmt.Sprintf("%f", overcommit)) + } + // Has trait and hasn't trait are mutually exclusive, so if both are set + // we return an error. + if m.HasTrait != nil && m.HasntTrait != nil { + return errors.New("invalid overcommit mapping, hasTrait and hasntTrait are mutually exclusive") + } + // At least one of has trait and hasn't trait must be set, + // otherwise we don't know when to apply this mapping. + if m.HasTrait == nil && m.HasntTrait == nil { + return errors.New("invalid overcommit mapping, at least one of hasTrait and hasntTrait must be set") + } + } + return nil +} + +// HypervisorOvercommitConfig holds the configuration for the +// HypervisorOvercommitController and is loaded from a configmap that is mounted +// into the controller pod. +type HypervisorOvercommitConfig struct { + // OvercommitMappings is a list of mappings that map hypervisor traits to + // overcommit ratios. 
Note that this list is applied in order, so if there + // are multiple mappings applying to the same hypervisors, the last mapping + // in this list will override the previous ones. + OvercommitMappings []HypervisorOvercommitMapping `json:"overcommitMappings"` +} + +// Validate the provided HypervisorOvercommitConfig, returning an error if the +// config is invalid. +func (c *HypervisorOvercommitConfig) Validate() error { + // Check that all the individual mappings are valid. + for _, mapping := range c.OvercommitMappings { + if err := mapping.Validate(); err != nil { + return err + } + } + return nil +} + +// HypervisorOvercommitController is a controller that reconciles on the +// hypervisor crd and sets desired overcommit ratios based on the hypervisor +// type. +type HypervisorOvercommitController struct { + client.Client + + // config holds the configuration for the controller, which is loaded from a + // configmap that is mounted into the controller pod. + config HypervisorOvercommitConfig +} + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. +// +// For more details, check Reconcile and its Result here: +// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.20.2/pkg/reconcile +// +// For more details about the method shape, read up here: +// - https://ahmet.im/blog/controller-pitfalls/#reconcile-method-shape +func (c *HypervisorOvercommitController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := ctrl.LoggerFrom(ctx) + log.Info("Reconciling resource") + + obj := new(hv1.Hypervisor) + if err := c.Get(ctx, req.NamespacedName, obj); err != nil { + if apierrors.IsNotFound(err) { + // If the custom resource is not found then it usually means + // that it was deleted or not created. + log.Info("Resource not found. 
Ignoring since object must be deleted") + return ctrl.Result{}, nil + } + // Error reading the object - requeue the request. + log.Error(err, "Failed to get resource") + return ctrl.Result{}, err + } + + // Build desired overcommit ratios by iterating mappings in order. + // Later mappings override earlier ones for the same resource, preserving + // non-overlapping resources from previous mappings. + desiredOvercommit := make(map[hv1.ResourceName]float64) + for _, mapping := range c.config.OvercommitMappings { + var applyMapping bool + switch { + // These are mutually exclusive. + case mapping.HasTrait != nil: + applyMapping = slices.Contains(obj.Status.Traits, *mapping.HasTrait) + case mapping.HasntTrait != nil: + applyMapping = !slices.Contains(obj.Status.Traits, *mapping.HasntTrait) + default: + // This should never happen due to validation, but we check it just in case. + log.Info("Skipping overcommit mapping with no trait specified", + "overcommit", mapping.Overcommit) + continue + } + if !applyMapping { + continue + } + log.Info("Applying overcommit mapping on hypervisor", + "overcommit", mapping.Overcommit) + maps.Copy(desiredOvercommit, mapping.Overcommit) + } + log.Info("Desired overcommit ratios based on traits", + "desiredOvercommit", desiredOvercommit) + if maps.Equal(desiredOvercommit, obj.Spec.Overcommit) { + log.Info("Overcommit ratios are up to date, no update needed") + return ctrl.Result{}, nil + } + + // Update the desired overcommit ratios on the hypervisor spec. 
+ orig := obj.DeepCopy() + obj.Spec.Overcommit = desiredOvercommit + if err := c.Patch(ctx, obj, client.MergeFrom(orig)); err != nil { + log.Error(err, "Failed to update hypervisor overcommit ratios") + return ctrl.Result{}, err + } + log.Info("Updated hypervisor with new overcommit ratios", + "overcommit", desiredOvercommit) + + return ctrl.Result{}, nil +} + +// handleRemoteHypervisor is called by watches in remote clusters and triggers +// a reconcile on the hypervisor resource that was changed in the remote cluster. +func (c *HypervisorOvercommitController) handleRemoteHypervisor() handler.EventHandler { + handler := handler.Funcs{} + handler.CreateFunc = func(ctx context.Context, evt event.CreateEvent, + queue workqueue.TypedRateLimitingInterface[reconcile.Request]) { + + queue.Add(ctrl.Request{NamespacedName: client.ObjectKey{ + Name: evt.Object.(*hv1.Hypervisor).Name, // cluster-scoped crd + }}) + } + handler.UpdateFunc = func(ctx context.Context, evt event.UpdateEvent, + queue workqueue.TypedRateLimitingInterface[reconcile.Request]) { + + queue.Add(ctrl.Request{NamespacedName: client.ObjectKey{ + Name: evt.ObjectOld.(*hv1.Hypervisor).Name, // cluster-scoped crd + }}) + } + handler.DeleteFunc = func(ctx context.Context, evt event.DeleteEvent, + queue workqueue.TypedRateLimitingInterface[reconcile.Request]) { + + queue.Add(ctrl.Request{NamespacedName: client.ObjectKey{ + Name: evt.Object.(*hv1.Hypervisor).Name, // cluster-scoped crd + }}) + } + return handler +} + +// predicateRemoteHypervisor is used to filter events from remote clusters, +// so that only events for hypervisors that should be processed by this +// controller will trigger reconciliations. +func (c *HypervisorOvercommitController) predicateRemoteHypervisor() predicate.Predicate { + // Currently we're watching all hypervisors. In this way, if a trait + // gets removed from the hypervisor, we'll still reconcile this + // hypervisor and update the overcommit ratios accordingly. 
+ return predicate.NewPredicateFuncs(func(object client.Object) bool { + _, ok := object.(*hv1.Hypervisor) + return ok + }) +} + +// SetupWithManager sets up the controller with the Manager and a multicluster +// client. The multicluster client is used to watch for changes in the +// Hypervisor CRD across all clusters and trigger reconciliations accordingly. +func (c *HypervisorOvercommitController) SetupWithManager(mgr ctrl.Manager) (err error) { + // This will load the config in a safe way and gracefully handle errors. + c.config, err = conf.GetConfig[HypervisorOvercommitConfig]() + if err != nil { + return err + } + // Validate we don't have any weird values in the config. + if err := c.config.Validate(); err != nil { + return err + } + // Check that the provided client is a multicluster client, since we need + // that to watch for hypervisors across clusters. + mcl, ok := c.Client.(*multicluster.Client) + if !ok { + return errors.New("provided client must be a multicluster client") + } + return multicluster. + BuildController(mcl, mgr). + // The hypervisor crd may be distributed across multiple remote clusters. + WatchesMulticluster(&hv1.Hypervisor{}, + c.handleRemoteHypervisor(), + c.predicateRemoteHypervisor(), + ). + Named("hypervisor-overcommit-controller"). 
+ Complete(c) +} diff --git a/internal/scheduling/nova/hypervisor_overcommit_controller_test.go b/internal/scheduling/nova/hypervisor_overcommit_controller_test.go new file mode 100644 index 000000000..391fe356c --- /dev/null +++ b/internal/scheduling/nova/hypervisor_overcommit_controller_test.go @@ -0,0 +1,936 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package nova + +import ( + "context" + "errors" + "strings" + "testing" + + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/workqueue" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +func TestHypervisorOvercommitMapping_Validate(t *testing.T) { + gpuTrait := "CUSTOM_GPU" + standardTrait := "CUSTOM_STANDARD" + + tests := []struct { + name string + mapping HypervisorOvercommitMapping + expectError bool + }{ + { + name: "valid overcommit ratios with HasTrait", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + hv1.ResourceMemory: 1.5, + }, + HasTrait: &gpuTrait, + }, + expectError: false, + }, + { + name: "valid minimum overcommit ratio of 1.0 with HasntTrait", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, + }, + HasntTrait: &gpuTrait, + }, + expectError: false, + }, + { + name: "invalid overcommit ratio less than 1.0", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 0.5, + }, + HasTrait: &gpuTrait, + }, + expectError: true, + }, + { + name: "invalid overcommit ratio of zero", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + 
hv1.ResourceMemory: 0.0, + }, + HasTrait: &gpuTrait, + }, + expectError: true, + }, + { + name: "invalid negative overcommit ratio", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: -1.0, + }, + HasTrait: &gpuTrait, + }, + expectError: true, + }, + { + name: "empty overcommit map is valid", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + expectError: false, + }, + { + name: "nil overcommit map is valid", + mapping: HypervisorOvercommitMapping{ + Overcommit: nil, + }, + expectError: false, + }, + { + name: "mixed valid and invalid overcommit ratios", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + hv1.ResourceMemory: 0.5, // invalid + }, + HasTrait: &gpuTrait, + }, + expectError: true, + }, + { + name: "invalid: both HasTrait and HasntTrait set", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &gpuTrait, + HasntTrait: &standardTrait, + }, + expectError: true, + }, + { + name: "invalid: neither HasTrait nor HasntTrait set with non-empty overcommit", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + }, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.mapping.Validate() + if tt.expectError && err == nil { + t.Error("expected error but got nil") + } + if !tt.expectError && err != nil { + t.Errorf("expected no error but got: %v", err) + } + }) + } +} + +func TestHypervisorOvercommitConfig_Validate(t *testing.T) { + gpuTrait := "CUSTOM_GPU" + standardTrait := "CUSTOM_STANDARD" + tests := []struct { + name string + config HypervisorOvercommitConfig + expectError bool + }{ + { + name: "valid config with single mapping", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + 
Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectError: false, + }, + { + name: "valid config with multiple mappings", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &gpuTrait, + }, + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.5, + }, + HasntTrait: &standardTrait, + }, + }, + }, + expectError: false, + }, + { + name: "invalid config with bad overcommit ratio", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 0.5, // invalid ratio + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectError: true, + }, + { + name: "invalid config with both HasTrait and HasntTrait", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &gpuTrait, + HasntTrait: &standardTrait, + }, + }, + }, + expectError: true, + }, + { + name: "invalid config with neither HasTrait nor HasntTrait", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + }, + }, + }, + expectError: true, + }, + { + name: "empty config is valid", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{}, + }, + expectError: false, + }, + { + name: "nil mappings is valid", + config: HypervisorOvercommitConfig{ + OvercommitMappings: nil, + }, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.config.Validate() + if tt.expectError && err == nil { + t.Error("expected error but got nil") + } + if !tt.expectError && err != nil { + t.Errorf("expected no error but got: 
%v", err) + } + }) + } +} + +func newTestHypervisorScheme(t *testing.T) *runtime.Scheme { + t.Helper() + scheme := runtime.NewScheme() + if err := hv1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add hv1 to scheme: %v", err) + } + return scheme +} + +func TestHypervisorOvercommitController_Reconcile(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + gpuTrait := "CUSTOM_GPU" + standardTrait := "CUSTOM_STANDARD" + missingTrait := "CUSTOM_MISSING" + + tests := []struct { + name string + hypervisor *hv1.Hypervisor + config HypervisorOvercommitConfig + expectedOvercommit map[hv1.ResourceName]float64 + expectNoUpdate bool + expectNotFoundError bool + }{ + { + name: "apply overcommit for matching HasTrait", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + }, + { + name: "apply overcommit for matching HasntTrait", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{}, // missing trait + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasntTrait: &missingTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + }, + { + name: "skip mapping when HasTrait not present", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ 
+ Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_OTHER"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{}, + }, + { + name: "skip mapping when HasntTrait is present", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, // trait is present + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasntTrait: &gpuTrait, // should skip because GPU trait IS present + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{}, + }, + { + name: "later mappings override earlier ones", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU", "CUSTOM_STANDARD"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &standardTrait, + }, + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, // should override the first + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + }, + { + name: "no update when overcommit already matches", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + 
Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + expectNoUpdate: true, + }, + { + name: "skip mapping without trait specified", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + // No HasTrait or HasntTrait specified + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{}, + }, + { + name: "combine HasTrait and HasntTrait mappings", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, // has GPU, doesn't have STANDARD + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.5, + }, + HasntTrait: &standardTrait, // STANDARD not present + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + hv1.ResourceMemory: 1.5, + }, + }, + { + name: "hypervisor not found", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: 
"nonexistent", + }, + }, + config: HypervisorOvercommitConfig{}, + expectNotFoundError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var fakeClient client.Client + if tt.expectNotFoundError { + // Don't add the hypervisor to the fake client + fakeClient = fake.NewClientBuilder(). + WithScheme(scheme). + Build() + } else { + fakeClient = fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(tt.hypervisor). + Build() + } + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + config: tt.config, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: tt.hypervisor.Name, + }, + } + + ctx := context.Background() + result, err := controller.Reconcile(ctx, req) + + if err != nil { + t.Errorf("unexpected error: %v", err) + return + } + + if result.RequeueAfter > 0 { + t.Error("expected no requeue") + } + + if tt.expectNotFoundError { + // For not found case, we expect no error and no requeue + return + } + + // Get the updated hypervisor + updated := &hv1.Hypervisor{} + if err := fakeClient.Get(ctx, req.NamespacedName, updated); err != nil { + t.Fatalf("failed to get updated hypervisor: %v", err) + } + + // Check overcommit ratios + if len(updated.Spec.Overcommit) != len(tt.expectedOvercommit) { + t.Errorf("expected %d overcommit entries, got %d", + len(tt.expectedOvercommit), len(updated.Spec.Overcommit)) + } + + for resource, expected := range tt.expectedOvercommit { + actual, ok := updated.Spec.Overcommit[resource] + if !ok { + t.Errorf("expected overcommit for resource %s, but not found", resource) + continue + } + if actual != expected { + t.Errorf("expected overcommit %f for resource %s, got %f", + expected, resource, actual) + } + } + }) + } +} + +func TestHypervisorOvercommitController_ReconcileNotFound(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + config: HypervisorOvercommitConfig{}, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "nonexistent-hypervisor", + }, + } + + ctx := context.Background() + result, err := controller.Reconcile(ctx, req) + + if err != nil { + t.Errorf("expected no error for not found resource, got: %v", err) + } + + if result.RequeueAfter > 0 { + t.Error("expected no requeue for not found resource") + } +} + +// mockWorkQueue implements workqueue.TypedRateLimitingInterface for testing +type mockWorkQueue struct { + workqueue.TypedRateLimitingInterface[reconcile.Request] + items []reconcile.Request +} + +func (m *mockWorkQueue) Add(item reconcile.Request) { + m.items = append(m.items, item) +} + +func TestHypervisorOvercommitController_HandleRemoteHypervisor(t *testing.T) { + controller := &HypervisorOvercommitController{} + handler := controller.handleRemoteHypervisor() + + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + } + + ctx := context.Background() + + t.Run("CreateFunc", func(t *testing.T) { + queue := &mockWorkQueue{} + handler.Create(ctx, event.CreateEvent{Object: hypervisor}, queue) + + if len(queue.items) != 1 { + t.Errorf("expected 1 item in queue, got %d", len(queue.items)) + } + if queue.items[0].Name != "test-hypervisor" { + t.Errorf("expected hypervisor name 'test-hypervisor', got %s", queue.items[0].Name) + } + }) + + t.Run("UpdateFunc", func(t *testing.T) { + queue := &mockWorkQueue{} + handler.Update(ctx, event.UpdateEvent{ + ObjectOld: hypervisor, + ObjectNew: hypervisor, + }, queue) + + if len(queue.items) != 1 { + t.Errorf("expected 1 item in queue, got %d", len(queue.items)) + } + if queue.items[0].Name != "test-hypervisor" { + t.Errorf("expected hypervisor name 'test-hypervisor', got %s", queue.items[0].Name) + } + }) + + t.Run("DeleteFunc", func(t *testing.T) { + queue := &mockWorkQueue{} + 
handler.Delete(ctx, event.DeleteEvent{Object: hypervisor}, queue) + + if len(queue.items) != 1 { + t.Errorf("expected 1 item in queue, got %d", len(queue.items)) + } + if queue.items[0].Name != "test-hypervisor" { + t.Errorf("expected hypervisor name 'test-hypervisor', got %s", queue.items[0].Name) + } + }) +} + +func TestHypervisorOvercommitController_PredicateRemoteHypervisor(t *testing.T) { + controller := &HypervisorOvercommitController{} + predicate := controller.predicateRemoteHypervisor() + + t.Run("accepts Hypervisor objects", func(t *testing.T) { + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + } + + if !predicate.Generic(event.GenericEvent{Object: hypervisor}) { + t.Error("expected predicate to accept Hypervisor object") + } + }) + + t.Run("rejects non-Hypervisor objects", func(t *testing.T) { + // Create a non-Hypervisor object by using a different type + // We'll test with a nil object which should return false + type nonHypervisor struct { + client.Object + } + + if predicate.Generic(event.GenericEvent{Object: &nonHypervisor{}}) { + t.Error("expected predicate to reject non-Hypervisor object") + } + }) +} + +func TestHypervisorOvercommitController_SetupWithManager_InvalidClient(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + // Create a regular fake client (not a multicluster client) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + Build() + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + } + + // Create a minimal mock manager for testing + mgr := &mockManager{scheme: scheme} + + // SetupWithManager should fail - either because config loading fails + // (in test environment without config files) or because the client + // is not a multicluster client. 
+ err := controller.SetupWithManager(mgr) + if err == nil { + t.Error("expected error when calling SetupWithManager, got nil") + } + // The error could be either about missing config or about multicluster client + // depending on the test environment. We just verify an error is returned. +} + +// mockManager implements ctrl.Manager for testing SetupWithManager +type mockManager struct { + ctrl.Manager + scheme *runtime.Scheme +} + +func (m *mockManager) GetScheme() *runtime.Scheme { + return m.scheme +} + +// patchFailingClient wraps a client.Client and returns an error on Patch calls +type patchFailingClient struct { + client.Client + patchErr error +} + +func (c *patchFailingClient) Patch(ctx context.Context, obj client.Object, patch client.Patch, opts ...client.PatchOption) error { + return c.patchErr +} + +func TestHypervisorOvercommitController_Reconcile_PatchError(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + gpuTrait := "CUSTOM_GPU" + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + } + + // Create a fake client with the hypervisor, then wrap it to fail on Patch + baseClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hypervisor). 
+ Build() + + patchErr := errors.New("patch failed") + failingClient := &patchFailingClient{ + Client: baseClient, + patchErr: patchErr, + } + + controller := &HypervisorOvercommitController{ + Client: failingClient, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: hypervisor.Name, + }, + } + + ctx := context.Background() + _, err := controller.Reconcile(ctx, req) + + // Reconcile should return an error when Patch fails + if err == nil { + t.Error("expected error when Patch fails, got nil") + } + if !strings.Contains(err.Error(), "patch failed") { + t.Errorf("expected error message to contain 'patch failed', got: %v", err) + } +} + +func TestHypervisorOvercommitController_Reconcile_EmptyConfig(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hypervisor). 
+ Build() + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{}, + }, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: hypervisor.Name, + }, + } + + ctx := context.Background() + result, err := controller.Reconcile(ctx, req) + + if err != nil { + t.Errorf("unexpected error: %v", err) + } + + if result.RequeueAfter > 0 { + t.Error("expected no requeue") + } + + // Verify no changes were made + updated := &hv1.Hypervisor{} + if err := fakeClient.Get(ctx, req.NamespacedName, updated); err != nil { + t.Fatalf("failed to get updated hypervisor: %v", err) + } + + if len(updated.Spec.Overcommit) != 0 { + t.Errorf("expected empty overcommit, got %v", updated.Spec.Overcommit) + } +} + +func TestHypervisorOvercommitController_Reconcile_MultipleResources(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + gpuTrait := "CUSTOM_GPU" + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hypervisor). 
+ Build() + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + hv1.ResourceMemory: 1.5, + }, + HasTrait: &gpuTrait, + }, + }, + }, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: hypervisor.Name, + }, + } + + ctx := context.Background() + _, err := controller.Reconcile(ctx, req) + + if err != nil { + t.Errorf("unexpected error: %v", err) + } + + updated := &hv1.Hypervisor{} + if err := fakeClient.Get(ctx, req.NamespacedName, updated); err != nil { + t.Fatalf("failed to get updated hypervisor: %v", err) + } + + if len(updated.Spec.Overcommit) != 2 { + t.Errorf("expected 2 overcommit entries, got %d", len(updated.Spec.Overcommit)) + } + + if updated.Spec.Overcommit[hv1.ResourceCPU] != 4.0 { + t.Errorf("expected CPU overcommit 4.0, got %f", updated.Spec.Overcommit[hv1.ResourceCPU]) + } + + if updated.Spec.Overcommit[hv1.ResourceMemory] != 1.5 { + t.Errorf("expected Memory overcommit 1.5, got %f", updated.Spec.Overcommit[hv1.ResourceMemory]) + } +} diff --git a/pkg/conf/conf.go b/pkg/conf/conf.go index 595b33bf2..b0feb02c2 100644 --- a/pkg/conf/conf.go +++ b/pkg/conf/conf.go @@ -17,35 +17,51 @@ import ( // // The values read from secrets.json will override the values in conf.json func GetConfigOrDie[C any]() C { + c, err := GetConfig[C]() + if err != nil { + panic(err) + } + return c +} + +// Create a new configuration from the default config json file. +// Return an error if the config cannot be read or parsed. +// +// This will read two files: +// - /etc/config/conf.json +// - /etc/secrets/secrets.json +// +// The values read from secrets.json will override the values in conf.json +func GetConfig[C any]() (C, error) { // Note: We need to read the config as a raw map first, to avoid golang // unmarshalling default values for the fields. 
// Read the base config from the configmap (not including secrets). cmConf, err := readRawConfig("/etc/config/conf.json") if err != nil { - panic(err) + return *new(C), err } // Read the secrets config from the kubernetes secret. secretConf, err := readRawConfig("/etc/secrets/secrets.json") if err != nil { - panic(err) + return *new(C), err } return newConfigFromMaps[C](cmConf, secretConf) } -func newConfigFromMaps[C any](base, override map[string]any) C { +func newConfigFromMaps[C any](base, override map[string]any) (C, error) { // Merge the base config with the override config. mergedConf := mergeMaps(base, override) // Marshal again, and then unmarshal into the config struct. mergedBytes, err := json.Marshal(mergedConf) if err != nil { - panic(err) + return *new(C), err } var c C if err := json.Unmarshal(mergedBytes, &c); err != nil { - panic(err) + return *new(C), err } - return c + return c, nil } // Read the json as a map from the given file path. From 3c025dfa6ea1f794267596a1333beca6ca6172a8 Mon Sep 17 00:00:00 2001 From: Markus Wieland <44964229+SoWieMarkus@users.noreply.github.com> Date: Tue, 17 Mar 2026 13:42:54 +0100 Subject: [PATCH 45/55] Refactor kvm resource capacity kpi to use effective capacity (#585) ## Changes - Adjusted the kvm resource capacity kpi to use the new `EffectiveCapacity` from the hypervisor CRD, see: https://github.com/cobaltcore-dev/openstack-hypervisor-operator/pull/257 --- .../plugins/compute/resource_capacity_kvm.go | 18 ++++-- .../compute/resource_capacity_kvm_test.go | 61 +++++++++++++++++-- 2 files changed, 69 insertions(+), 10 deletions(-) diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go index 38d3b68d1..638df91da 100644 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go +++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go @@ -154,20 +154,30 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- 
prometheus.Metric) { } for _, hypervisor := range hvs.Items { - cpuTotal, hasCPUTotal := hypervisor.Status.Capacity["cpu"] - ramTotal, hasRAMTotal := hypervisor.Status.Capacity["memory"] + if hypervisor.Status.EffectiveCapacity == nil { + slog.Warn("hypervisor with nil effective capacity, skipping", "host", hypervisor.Name) + continue + } + + cpuTotal, hasCPUTotal := hypervisor.Status.EffectiveCapacity[hv1.ResourceCPU] + ramTotal, hasRAMTotal := hypervisor.Status.EffectiveCapacity[hv1.ResourceMemory] if !hasCPUTotal || !hasRAMTotal { slog.Error("hypervisor missing cpu or ram total capacity", "hypervisor", hypervisor.Name) continue } - cpuUsed, hasCPUUtilized := hypervisor.Status.Allocation["cpu"] + if cpuTotal.IsZero() || ramTotal.IsZero() { + slog.Warn("hypervisor with zero cpu or ram total capacity, skipping", "host", hypervisor.Name) + continue + } + + cpuUsed, hasCPUUtilized := hypervisor.Status.Allocation[hv1.ResourceCPU] if !hasCPUUtilized { cpuUsed = resource.MustParse("0") } - ramUsed, hasRAMUtilized := hypervisor.Status.Allocation["memory"] + ramUsed, hasRAMUtilized := hypervisor.Status.Allocation[hv1.ResourceMemory] if !hasRAMUtilized { ramUsed = resource.MustParse("0") } diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go index d0f3b1780..bb2e5f91a 100644 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go +++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go @@ -47,6 +47,55 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { hypervisors []hv1.Hypervisor expectedMetrics map[string][]expectedMetric // metric_name -> []expectedMetric }{ + { + name: "single hypervisor with nil effective capacity", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + 
EffectiveCapacity: nil, // Simulate nil effective capacity + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Traits: []string{}, + }, + }, + }, + // No metrics should be emitted for this hypervisor since effective capacity is nil + expectedMetrics: map[string][]expectedMetric{}, + }, + { + name: "single hypervisor with zero total capacity", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), // Simulate zero CPU capacity + hv1.ResourceMemory: resource.MustParse("0"), // Simulate zero RAM capacity + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), + }, + Traits: []string{}, + }, + }, + }, + // No metrics should be emitted for this hypervisor since total capacity is zero + expectedMetrics: map[string][]expectedMetric{}, + }, { name: "single hypervisor with default traits", hypervisors: []hv1.Hypervisor{ @@ -58,7 +107,7 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse("128"), hv1.ResourceMemory: resource.MustParse("512Gi"), }, @@ -148,7 +197,7 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse("256"), hv1.ResourceMemory: resource.MustParse("1Ti"), }, @@ -209,7 +258,7 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, 
Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse("64"), hv1.ResourceMemory: resource.MustParse("256Gi"), }, @@ -255,7 +304,7 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse("100"), hv1.ResourceMemory: resource.MustParse("200Gi"), }, @@ -274,7 +323,7 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse("200"), hv1.ResourceMemory: resource.MustParse("400Gi"), }, @@ -332,7 +381,7 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse("96"), hv1.ResourceMemory: resource.MustParse("384Gi"), }, From 5bd67771cfb3ab9fcb4d37058e8f2eaa11a9c936 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 17 Mar 2026 12:52:10 +0000 Subject: [PATCH 46/55] Bump cortex chart appVersions to sha-3c025dfa [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 62542d328..cf616c345 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.26 -appVersion: "sha-c02bd8af" +appVersion: "sha-3c025dfa" icon: "https://example.com/icon.png" dependencies: [] From cd9a8be3be2c307dfeccc57e7d1838f652564fab Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Tue, 17 Mar 2026 14:28:52 +0100 Subject: [PATCH 47/55] Fix hypervisor crd url --- Tiltfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tiltfile b/Tiltfile index 2de697c32..a42fe43f4 100644 --- a/Tiltfile +++ b/Tiltfile @@ -75,7 +75,7 @@ local('kubectl wait --namespace cert-manager --for=condition=available deploymen ########### Dependency CRDs # Make sure the local cluster is running if you are running into startup issues here. -url = 'https://raw.githubusercontent.com/cobaltcore-dev/openstack-hypervisor-operator/refs/heads/main/charts/openstack-hypervisor-operator/crds/hypervisor-crd.yaml' +url = 'https://raw.githubusercontent.com/cobaltcore-dev/openstack-hypervisor-operator/refs/heads/main/charts/openstack-hypervisor-operator/crds/kvm.cloud.sap_hypervisors.yaml' local('curl -L ' + url + ' | kubectl apply -f -') ########### Cortex Operator & CRDs From 9b5fa57406e6fc32b55e5a96c17bbdb2ad480b14 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Tue, 17 Mar 2026 14:29:04 +0100 Subject: [PATCH 48/55] Fix incorrect scoping in hypervisor overcommit controller --- .../nova/hypervisor_overcommit_controller.go | 23 +++++++++++-------- .../hypervisor_overcommit_controller_test.go | 8 +++---- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/internal/scheduling/nova/hypervisor_overcommit_controller.go b/internal/scheduling/nova/hypervisor_overcommit_controller.go index 8d54475f0..946fabec4 100644 --- a/internal/scheduling/nova/hypervisor_overcommit_controller.go +++ b/internal/scheduling/nova/hypervisor_overcommit_controller.go @@ -52,16 +52,16 @@ func (m *HypervisorOvercommitMapping) Validate() error { "Invalid value for resource " + string(resource) + ": " + fmt.Sprintf("%f", overcommit)) } - // 
Has trait and hasn't trait are mutually exclusive, so if both are set - // we return an error. - if m.HasTrait != nil && m.HasntTrait != nil { - return errors.New("invalid overcommit mapping, hasTrait and hasntTrait are mutually exclusive") - } - // At least one of has trait and hasn't trait must be set, - // otherwise we don't know when to apply this mapping. - if m.HasTrait == nil && m.HasntTrait == nil { - return errors.New("invalid overcommit mapping, at least one of hasTrait and hasntTrait must be set") - } + } + // Has trait and hasn't trait are mutually exclusive, so if both are set + // we return an error. + if m.HasTrait != nil && m.HasntTrait != nil { + return errors.New("invalid overcommit mapping, hasTrait and hasntTrait are mutually exclusive") + } + // At least one of has trait and hasn't trait must be set, + // otherwise we don't know when to apply this mapping. + if m.HasTrait == nil && m.HasntTrait == nil { + return errors.New("invalid overcommit mapping, at least one of hasTrait and hasntTrait must be set") } return nil } @@ -130,6 +130,9 @@ func (c *HypervisorOvercommitController) Reconcile(ctx context.Context, req ctrl // non-overlapping resources from previous mappings. desiredOvercommit := make(map[hv1.ResourceName]float64) for _, mapping := range c.config.OvercommitMappings { + log.Info("Processing overcommit mapping", + "mapping", mapping, + "hypervisorTraits", obj.Status.Traits) var applyMapping bool switch { // These are mutually exclusive. 
diff --git a/internal/scheduling/nova/hypervisor_overcommit_controller_test.go b/internal/scheduling/nova/hypervisor_overcommit_controller_test.go index 391fe356c..e52669c3a 100644 --- a/internal/scheduling/nova/hypervisor_overcommit_controller_test.go +++ b/internal/scheduling/nova/hypervisor_overcommit_controller_test.go @@ -82,18 +82,18 @@ func TestHypervisorOvercommitMapping_Validate(t *testing.T) { expectError: true, }, { - name: "empty overcommit map is valid", + name: "empty overcommit map is invalid", mapping: HypervisorOvercommitMapping{ Overcommit: map[hv1.ResourceName]float64{}, }, - expectError: false, + expectError: true, }, { - name: "nil overcommit map is valid", + name: "nil overcommit map is invalid", mapping: HypervisorOvercommitMapping{ Overcommit: nil, }, - expectError: false, + expectError: true, }, { name: "mixed valid and invalid overcommit ratios", From 03638cfc8998c8a91f0e79e2396e57ee9ba0ccff Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 17 Mar 2026 13:38:43 +0000 Subject: [PATCH 49/55] Bump cortex chart appVersions to sha-9b5fa574 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index cf616c345..dd0bad528 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.26 -appVersion: "sha-3c025dfa" +appVersion: "sha-9b5fa574" icon: "https://example.com/icon.png" dependencies: [] From 1669faac960bcaee0c09e386b15c948a91b6170d Mon Sep 17 00:00:00 2001 From: mblos <156897072+mblos@users.noreply.github.com> Date: Wed, 18 Mar 2026 08:36:33 +0100 Subject: [PATCH 50/55] fix: commitment change API integration test (#592) - fix hypervisor crd Tilt reference - make test with proper summary based on gotestsum - Adding integration tests for commitment change API - Commitment change API handles more corner cases - commitment config added - moving endpoint to "/v1/commitments/..." --- Makefile | 18 + api/v1alpha1/reservation_types.go | 4 + docs/develop.md | 15 + .../files/crds/cortex.cloud_reservations.yaml | 4 + .../reservations/commitments/api.go | 10 +- .../commitments/api_change_commitments.go | 130 +- .../api_change_commitments_test.go | 1770 +++++++++++++++-- .../reservations/commitments/config.go | 22 + .../commitments/reservation_manager.go | 33 +- .../reservations/commitments/state.go | 2 +- 10 files changed, 1791 insertions(+), 217 deletions(-) create mode 100644 internal/scheduling/reservations/commitments/config.go diff --git a/Makefile b/Makefile index b63e2e267..3d90f6161 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,17 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes test: ## Run all tests. go test ./... +.PHONY: testsum +testsum: gotestsum ## Run all tests (clean output for passing, verbose for failing). Options: WATCH=1, RUN=, PACKAGE=, FORMAT= (e.g., standard-verbose for all output) + $(GOTESTSUM) \ + $(if $(WATCH),--watch) \ + --format $(if $(FORMAT),$(FORMAT),testname) \ + --hide-summary=all \ + -- \ + $(if $(VERBOSE),-v) \ + $(if $(RUN),-run $(RUN)) \ + $(if $(PACKAGE),$(PACKAGE),./...) + .PHONY: generate generate: deepcopy crds ## Regenerate CRDs and DeepCopy after API type changes. 
@@ -45,9 +56,11 @@ $(LOCALBIN): CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen GOLANGCI_LINT = $(LOCALBIN)/golangci-lint +GOTESTSUM = $(LOCALBIN)/gotestsum CONTROLLER_TOOLS_VERSION ?= v0.20.0 GOLANGCI_LINT_VERSION ?= v2.9.0 +GOTESTSUM_VERSION ?= v1.13.0 .PHONY: controller-gen controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary. @@ -59,6 +72,11 @@ golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary. $(GOLANGCI_LINT): $(LOCALBIN) $(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/v2/cmd/golangci-lint,$(GOLANGCI_LINT_VERSION)) +.PHONY: gotestsum +gotestsum: $(GOTESTSUM) ## Download gotestsum locally if necessary. +$(GOTESTSUM): $(LOCALBIN) + $(call go-install-tool,$(GOTESTSUM),gotest.tools/gotestsum,$(GOTESTSUM_VERSION)) + # go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist # $1 - target path with name of binary # $2 - package url which can be installed diff --git a/api/v1alpha1/reservation_types.go b/api/v1alpha1/reservation_types.go index 913a93a8f..5e6a30b01 100644 --- a/api/v1alpha1/reservation_types.go +++ b/api/v1alpha1/reservation_types.go @@ -54,6 +54,10 @@ type CommittedResourceReservationSpec struct { // +kubebuilder:validation:Optional ResourceName string `json:"resourceName,omitempty"` + // CommitmentUUID is the UUID of the commitment that this reservation corresponds to. + // +kubebuilder:validation:Optional + CommitmentUUID string `json:"commitmentUUID,omitempty"` + // ResourceGroup is the group/category of the resource (e.g., flavor group for Nova) // +kubebuilder:validation:Optional ResourceGroup string `json:"resourceGroup,omitempty"` diff --git a/docs/develop.md b/docs/develop.md index 5b090c889..c39cbd61a 100644 --- a/docs/develop.md +++ b/docs/develop.md @@ -34,6 +34,21 @@ Cortex is developed using the Go programming language. 
To get started with the d Run `make` in your terminal from the cortex root directory to perform linting and testing tasks. +### Working on Tests + +```bash +# Watch mode for continuous testing; print logs for failed tests only +make testsum WATCH=1 +``` + +The `testsum` target provides cleaner output by showing only full verbose output for failing tests. + +**Available options:** +- `WATCH=1` - Automatically re-run tests when files change +- `RUN=` - Run specific tests matching the pattern +- `PACKAGE=` - Test specific package(s) +- `FORMAT=` - Change output format (e.g., `standard-verbose` for verbose output on all tests) + ## Helm Charts Helm charts bundle the application into a package, containing all the [Kubernetes](https://kubernetes.io/docs/tutorials/hello-minikube/) resources needed to run the application. The configuration for the application is specified in the [Helm `values.yaml`](cortex.secrets.example.yaml). diff --git a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml index 915e5677e..d9256e5db 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml @@ -87,6 +87,10 @@ spec: Key: Workload UUID (VM UUID for Nova, Pod UID for Pods, Machine UID for IronCore, etc.) Value: allocation state and metadata type: object + commitmentUUID: + description: CommitmentUUID is the UUID of the commitment that + this reservation corresponds to. + type: string creator: description: |- Creator identifies the system or component that created this reservation. 
diff --git a/internal/scheduling/reservations/commitments/api.go b/internal/scheduling/reservations/commitments/api.go index ba83e2ab8..9d8fd5944 100644 --- a/internal/scheduling/reservations/commitments/api.go +++ b/internal/scheduling/reservations/commitments/api.go @@ -14,20 +14,26 @@ import ( // HTTPAPI implements Limes LIQUID commitment validation endpoints. type HTTPAPI struct { client client.Client + config Config // Mutex to serialize change-commitments requests changeMutex sync.Mutex } func NewAPI(client client.Client) *HTTPAPI { + return NewAPIWithConfig(client, DefaultConfig()) +} + +func NewAPIWithConfig(client client.Client, config Config) *HTTPAPI { return &HTTPAPI{ client: client, + config: config, } } func (api *HTTPAPI) Init(mux *http.ServeMux) { - mux.HandleFunc("/v1/change-commitments", api.HandleChangeCommitments) + mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments) // mux.HandleFunc("/v1/report-capacity", api.HandleReportCapacity) - mux.HandleFunc("/v1/info", api.HandleInfo) + mux.HandleFunc("/v1/commitments/info", api.HandleInfo) } var commitmentApiLog = ctrl.Log.WithName("commitment_api") diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go index 3134b3b9d..1c6276ade 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -9,6 +9,8 @@ import ( "errors" "fmt" "net/http" + "sort" + "strings" "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" @@ -16,20 +18,23 @@ import ( "github.com/go-logr/logr" . 
"github.com/majewsky/gg/option" "github.com/sapcc/go-api-declarations/liquid" - apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" ) -const ( - // watchTimeout is how long to wait for all reservations to become ready - watchTimeout = 20 * time.Second - - // pollInterval is how frequently to poll reservation status - pollInterval = 1 * time.Second -) +// sortedKeys returns map keys sorted alphabetically for deterministic iteration. +func sortedKeys[K ~string, V any](m map[K]V) []K { + keys := make([]K, 0, len(m)) + for k := range m { + keys = append(keys, k) + } + sort.Slice(keys, func(i, j int) bool { + return string(keys[i]) < string(keys[j]) + }) + return keys +} // implements POST /v1/change-commitments from Limes LIQUID API: // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go @@ -99,6 +104,7 @@ func (api *HTTPAPI) processCommitmentChanges(w http.ResponseWriter, log logr.Log ctx := context.Background() manager := NewReservationManager(api.client) requireRollback := false + failedCommitments := make(map[string]string) // commitmentUUID to reason for failure, for better response messages in case of rollback log.Info("processing commitment change request", "availabilityZone", req.AZ, "dryRun", req.DryRun, "affectedProjects", len(req.ByProject)) knowledge := &reservations.FlavorGroupKnowledgeClient{Client: api.client} @@ -135,8 +141,10 @@ func (api *HTTPAPI) processCommitmentChanges(w http.ResponseWriter, log logr.Log } ProcessLoop: - for projectID, projectChanges := range req.ByProject { - for resourceName, resourceChanges := range projectChanges.ByResource { + for _, projectID := range sortedKeys(req.ByProject) { + projectChanges := req.ByProject[projectID] + for _, resourceName := range sortedKeys(projectChanges.ByResource) { + resourceChanges := 
projectChanges.ByResource[resourceName] // Validate resource name pattern (instances_group_*) flavorGroupName, err := getFlavorGroupNameFromResource(string(resourceName)) if err != nil { @@ -157,6 +165,7 @@ ProcessLoop: // Additional per-commitment validation if needed log.Info("processing commitment change", "commitmentUUID", commitment.UUID, "projectID", projectID, "resourceName", resourceName, "oldStatus", commitment.OldStatus.UnwrapOr("none"), "newStatus", commitment.NewStatus.UnwrapOr("none")) + // TODO add configurable upper limit validation for commitment size (number of instances) to prevent excessive reservation creation // TODO add domain // List all committed resource reservations, then filter by name prefix @@ -164,7 +173,8 @@ ProcessLoop: if err := api.client.List(ctx, &all_reservations, client.MatchingLabels{ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, }); err != nil { - resp.RejectionReason = fmt.Sprintf("failed to list reservations for commitment %s: %v", commitment.UUID, err) + failedCommitments[string(commitment.UUID)] = "failed to list reservations" + log.Info(fmt.Sprintf("failed to list reservations for commitment %s: %v", commitment.UUID, err)) requireRollback = true break ProcessLoop } @@ -189,7 +199,8 @@ ProcessLoop: } else { stateBefore, err = FromReservations(existing_reservations.Items) if err != nil { - resp.RejectionReason = fmt.Sprintf("failed to get existing state for commitment %s: %v", commitment.UUID, err) + failedCommitments[string(commitment.UUID)] = "failed to parse existing commitment reservations" + log.Info(fmt.Sprintf("failed to get existing state for commitment %s: %v", commitment.UUID, err)) requireRollback = true break ProcessLoop } @@ -199,7 +210,8 @@ ProcessLoop: // get desired state stateDesired, err := FromChangeCommitmentTargetState(commitment, string(projectID), flavorGroupName, flavorGroup, string(req.AZ)) if err != nil { - resp.RejectionReason = fmt.Sprintf("failed to get desired 
state for commitment %s: %v", commitment.UUID, err) + failedCommitments[string(commitment.UUID)] = "failed to determine desired commitment state" + log.Info(fmt.Sprintf("failed to get desired state for commitment %s: %v", commitment.UUID, err)) requireRollback = true break ProcessLoop } @@ -208,7 +220,8 @@ ProcessLoop: touchedReservations, deletedReservations, err := manager.ApplyCommitmentState(ctx, log, stateDesired, flavorGroups, "changeCommitmentsApi") if err != nil { - resp.RejectionReason = fmt.Sprintf("failed to apply commitment state for commitment %s: %v", commitment.UUID, err) + failedCommitments[string(commitment.UUID)] = "failed to apply commitment state" + log.Info(fmt.Sprintf("failed to apply commitment state for commitment %s: %v", commitment.UUID, err)) requireRollback = true break ProcessLoop } @@ -224,10 +237,17 @@ ProcessLoop: time_start := time.Now() - if err := watchReservationsUntilReady(ctx, log, api.client, reservationsToWatch, watchTimeout); err != nil { + if failedReservations, errors := watchReservationsUntilReady(ctx, log, api.client, reservationsToWatch, api.config.ChangeAPIWatchReservationsTimeout, api.config.ChangeAPIWatchReservationsPollInterval); len(failedReservations) > 0 || len(errors) > 0 { log.Info("reservations failed to become ready, initiating rollback", - "reason", err.Error()) - resp.RejectionReason = fmt.Sprintf("Not all reservations can be fulfilled: %v", err) + "failedReservations", len(failedReservations), + "errors", errors) + + for _, res := range failedReservations { + failedCommitments[res.Spec.CommittedResourceReservation.CommitmentUUID] = "not sufficient capacity" + } + if len(failedReservations) == 0 { + resp.RejectionReason += "timeout reached while processing commitment changes" + } requireRollback = true } @@ -235,6 +255,16 @@ ProcessLoop: } if requireRollback { + // Build rejection reason from failed commitments + if len(failedCommitments) > 0 { + var reasonBuilder strings.Builder + 
reasonBuilder.WriteString(fmt.Sprintf("%d commitment(s) failed to apply: ", len(failedCommitments))) + for commitmentUUID, reason := range failedCommitments { + reasonBuilder.WriteString(fmt.Sprintf("\n- commitment %s: %s", commitmentUUID, reason)) + } + resp.RejectionReason = reasonBuilder.String() + } + log.Info("rollback of commitment changes") for commitmentUUID, state := range statesBefore { // Rollback to statesBefore for this commitment @@ -247,16 +277,10 @@ ProcessLoop: } log.Info("finished applying rollbacks for commitment changes", "reasonOfRollback", resp.RejectionReason) - - // TODO improve human-readable reasoning based on actual failure, i.e. polish resp.RejectionReason return nil } log.Info("commitment changes accepted") - if resp.RejectionReason != "" { - log.Info("unexpected non-empty rejection reason without rollback", "reason", resp.RejectionReason) - resp.RejectionReason = "" - } return nil } @@ -267,23 +291,28 @@ func watchReservationsUntilReady( k8sClient client.Client, reservations []v1alpha1.Reservation, timeout time.Duration, -) error { + pollInterval time.Duration, +) (failedReservations []v1alpha1.Reservation, errors []error) { if len(reservations) == 0 { - return nil + return failedReservations, nil } deadline := time.Now().Add(timeout) + reservationsToWatch := make([]v1alpha1.Reservation, len(reservations)) + copy(reservationsToWatch, reservations) + for { + var stillWaiting []v1alpha1.Reservation if time.Now().After(deadline) { - return fmt.Errorf("timeout after %v waiting for reservations to become ready", timeout) + errors = append(errors, fmt.Errorf("timeout after %v waiting for reservations to become ready", timeout)) + return failedReservations, errors } - allReady := true - var notReadyReasons []string + allChecked := true - for _, res := range reservations { + for _, res := range reservationsToWatch { // Fetch current state var current v1alpha1.Reservation nn := types.NamespacedName{ @@ -292,12 +321,11 @@ func 
watchReservationsUntilReady( } if err := k8sClient.Get(ctx, nn, ¤t); err != nil { - if apierrors.IsNotFound(err) { - // Reservation is still in process of being created - allReady = false - continue - } - return fmt.Errorf("failed to get reservation %s: %w", res.Name, err) + allChecked = false + // Reservation is still in process of being created, or there is a transient error, continue waiting for it + log.V(1).Info("transient error getting reservation, will retry", "reservation", res.Name, "error", err) + stillWaiting = append(stillWaiting, res) + continue } // Check Ready condition @@ -308,37 +336,33 @@ func watchReservationsUntilReady( if readyCond == nil { // Condition not set yet, keep waiting - allReady = false - notReadyReasons = append(notReadyReasons, - res.Name+": condition not set") + allChecked = false + stillWaiting = append(stillWaiting, res) continue } switch readyCond.Status { case metav1.ConditionTrue: - // This reservation is ready - continue + // TODO use more than readyCondition case metav1.ConditionFalse: - // Explicit failure - stop immediately - return fmt.Errorf("reservation %s failed: %s (reason: %s)", - res.Name, readyCond.Message, readyCond.Reason) + allChecked = false + failedReservations = append(failedReservations, res) case metav1.ConditionUnknown: - // Still processing - allReady = false - notReadyReasons = append(notReadyReasons, - fmt.Sprintf("%s: %s", res.Name, readyCond.Message)) + allChecked = false + stillWaiting = append(stillWaiting, res) } } - if allReady { - log.Info("all reservations are ready", - "count", len(reservations)) - return nil + if allChecked || len(stillWaiting) == 0 { + log.Info("all reservations checked", + "failed", len(failedReservations)) + return failedReservations, errors } + reservationsToWatch = stillWaiting // Log progress log.Info("waiting for reservations to become ready", - "notReady", len(notReadyReasons), + "notReady", len(reservationsToWatch), "total", len(reservations), "timeRemaining", 
time.Until(deadline).Round(time.Second)) @@ -347,7 +371,7 @@ func watchReservationsUntilReady( case <-time.After(pollInterval): // Continue polling case <-ctx.Done(): - return fmt.Errorf("context cancelled while waiting for reservations: %w", ctx.Err()) + return failedReservations, append(errors, fmt.Errorf("context cancelled while waiting for reservations: %w", ctx.Err())) } } } diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_test.go index c4703c4a1..871e72b54 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_test.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_test.go @@ -1,246 +1,1720 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 +//nolint:unparam,unused // test helper functions have fixed parameters for simplicity package commitments import ( "bytes" + "context" "encoding/json" + "fmt" + "io" "net/http" "net/http/httptest" + "os" + "sort" + "strconv" + "strings" + "sync" "testing" "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "github.com/sapcc/go-api-declarations/liquid" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" ) -// TODO refactor with proper integration tests +// ============================================================================ +// Integration Tests +// ============================================================================ -func TestHandleChangeCommitments_VersionMismatch(t *testing.T) { - // Create a fake 
Kubernetes client with a Knowledge CRD - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("failed to add scheme: %v", err) +func TestCommitmentChangeIntegration(t *testing.T) { + m1Tiny := &TestFlavor{Name: "m1.tiny", Group: "gp_1", MemoryMB: 256, VCPUs: 1} + m1Small := &TestFlavor{Name: "m1.small", Group: "hana_1", MemoryMB: 1024, VCPUs: 4} + m1Large := &TestFlavor{Name: "m1.large", Group: "hana_1", MemoryMB: 4096, VCPUs: 16} + m1XL := &TestFlavor{Name: "m1.xl", Group: "hana_1", MemoryMB: 8192, VCPUs: 32} + + testCases := []CommitmentChangeTestCase{ + { + Name: "Shrinking CR - unused reservations removed, used reservations untouched", + VMs: []*TestVM{{UUID: "vm-a1", Flavor: m1Large, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}}, + Flavors: []*TestFlavor{m1Small, m1Large}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-a1"}}, + {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-123", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("ram_hana_1", "project-A", "uuid-123", "confirmed", 2)), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-a1"}}, + {CommitmentID: "uuid-123", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Insufficient capacity when increasing CR", + VMs: []*TestVM{}, + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("ram_hana_1", "project-A", "uuid-456", "confirmed", 3)), + 
AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 1024, "host-2": 0}}, + ExpectedReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "", Flavor: m1Small, ProjectID: "project-A"}}, + ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment uuid-456: not sufficient capacity"), + }, + { + Name: "Swap capacity between CRs - order dependent - delete-first succeeds", + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-456", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-456", "confirmed", 0), + createCommitment("ram_hana_1", "project-B", "uuid-123", "confirmed", 2), + ), + AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0}}, + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-B"}, + {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}}, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Swap capacity between CRs - order dependent - create-first fails", + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-B"}, + {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-456", "confirmed", 2), + createCommitment("ram_hana_1", "project-B", "uuid-123", "confirmed", 0), + ), + AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0}}, + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-123", Host: "host-1", Flavor: 
m1Small, ProjectID: "project-B"}, + {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}}, + ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment uuid-456: not sufficient capacity"), + }, + { + Name: "Flavor bin-packing - mixed sizes when largest doesn't fit", + // Greedy selection: 10GB request with 8/4/1GB flavors → picks 1×8GB + 2×1GB + Flavors: []*TestFlavor{m1XL, m1Large, m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-binpack", "confirmed", 10), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-binpack", Flavor: m1XL, ProjectID: "project-A"}, + {CommitmentID: "uuid-binpack", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-binpack", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Version mismatch - request rejected with 409 Conflict", + // InfoVersion validation prevents stale requests (1233 vs 1234) + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1233, + createCommitment("ram_hana_1", "project-A", "uuid-version", "confirmed", 2), + ), + EnvInfoVersion: 1234, + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: APIResponseExpectation{StatusCode: 409}, + }, + { + Name: "Multi-project rollback - one failure rolls back all", + // Transactional: project-B fails (insufficient capacity) → both projects rollback + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-project-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-project-a", "confirmed", 2), + createCommitment("ram_hana_1", "project-B", "uuid-project-b", "confirmed", 2), + ), + AvailableResources: &AvailableResources{PerHost: 
map[string]int64{"host-1": 1024, "host-2": 0}}, + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-project-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse("uuid-project-b", "not sufficient capacity"), + }, + { + Name: "Rollback with VMs allocated - limitation: VM allocations not rolled back", + // Controller will eventually clean up and repair inconsistent state + VMs: []*TestVM{{UUID: "vm-rollback", Flavor: m1Small, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}}, + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "commitment-A", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-rollback"}}, + {CommitmentID: "commitment-A", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "commitment-A", "confirmed", 0), + createCommitment("ram_hana_1", "project-B", "commitment-B", "confirmed", 6), + ), + AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0}}, + ExpectedReservations: []*TestReservation{ + // Rollback creates unscheduled reservations (empty Host accepts any in matching) + {CommitmentID: "commitment-A", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "commitment-A", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse("commitment-B", "not sufficient capacity"), + }, + { + Name: "New commitment creation - from zero to N reservations", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-new", "confirmed", 3), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-new", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-new", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-new", Flavor: 
m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "New commitment creation - large batch", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-new", "confirmed", 200), + ), + ExpectedReservations: func() []*TestReservation { + var reservations []*TestReservation + for range 200 { + reservations = append(reservations, &TestReservation{ + CommitmentID: "uuid-new", + Flavor: m1Small, + ProjectID: "project-A", + }) + } + return reservations + }(), + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "With reservations of custom size - total unchanged", + // Preserves custom-sized reservations when total matches (2×2GB = 4GB) + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-custom", "confirmed", 4), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "With reservations of custom size - increase total", + // 4GB (2×2GB custom) → 6GB: preserves custom sizes, adds standard-sized reservations + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + }, + CommitmentRequest: 
newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-custom", "confirmed", 6), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "With reservations of custom size - decrease total", + // 4GB (2×2GB custom) → 3GB: removes 1×2GB custom, adds 1×1GB standard + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-custom", "confirmed", 3), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Complete commitment deletion - N to zero reservations", + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-delete", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-delete", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-delete", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-b-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"}, + {CommitmentID: "uuid-a-1", Host: "host-3", Flavor: m1Small, ProjectID: 
"project-A"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-delete", "confirmed", 0), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-b-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"}, + {CommitmentID: "uuid-a-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "VM allocation preservation - keep VMs during growth", + VMs: []*TestVM{{UUID: "vm-existing", Flavor: m1Small, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}}, + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-growth", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-existing"}}, + {CommitmentID: "uuid-growth", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-growth", "confirmed", 3), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-growth", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-existing"}}, + {CommitmentID: "uuid-growth", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-growth", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Multi-project success - both projects succeed", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-a", "confirmed", 2), + createCommitment("ram_hana_1", "project-B", "uuid-b", "confirmed", 2), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-a", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-a", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-b", Flavor: m1Small, ProjectID: "project-B"}, + 
{CommitmentID: "uuid-b", Flavor: m1Small, ProjectID: "project-B"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Multiple flavor groups - ram_hana_1 and ram_hana_2", + // Amount in multiples of smallest flavor: hana_1 (2×1GB), hana_2 (2×2GB) + Flavors: []*TestFlavor{ + m1Small, + {Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-hana1", "confirmed", 2), + createCommitment("ram_hana_2", "project-A", "uuid-hana2", "confirmed", 2), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-hana1", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-hana1", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-hana2", Flavor: &TestFlavor{Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8}, ProjectID: "project-A"}, + {CommitmentID: "uuid-hana2", Flavor: &TestFlavor{Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8}, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Unknown flavor group - clear rejection message", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_nonexistent", "project-A", "uuid-unknown", "confirmed", 2), + ), + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: newAPIResponse("flavor group not found"), + }, + { + Name: "Three-way capacity swap - complex reallocation", + // A:2→0, B:1→0, C:0→3 in single transaction + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-a", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-b", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + 
createCommitment("ram_hana_1", "project-A", "uuid-a", "confirmed", 0), + createCommitment("ram_hana_1", "project-B", "uuid-b", "confirmed", 0), + createCommitment("ram_hana_1", "project-C", "uuid-c", "confirmed", 3), + ), + AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0, "host-3": 0}}, + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-c", Host: "host-1", Flavor: m1Small, ProjectID: "project-C"}, + {CommitmentID: "uuid-c", Host: "host-2", Flavor: m1Small, ProjectID: "project-C"}, + {CommitmentID: "uuid-c", Host: "host-3", Flavor: m1Small, ProjectID: "project-C"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Reservation repair - existing reservations with wrong metadata", + Flavors: []*TestFlavor{m1Small, m1Large}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-repair", Host: "host-preserved", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Host: "host-1", Flavor: m1Small, ProjectID: "wrong-project", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Host: "host-2", Flavor: &TestFlavor{Name: "m1.small", Group: "hana_13", MemoryMB: 1024, VCPUs: 4}, ProjectID: "project-A", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Host: "host-4", Flavor: m1Small, ProjectID: "project-A", AZ: "wrong-az"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-repair", "confirmed", 8, "az-a"), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-repair", Host: "host-preserved", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Flavor: m1Large, ProjectID: "project-A", AZ: "az-a"}, + }, 
+ ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Empty request - no commitment changes", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234), + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Dry run request - feature not yet implemented", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", true, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-dryrun", "confirmed", 2), + ), + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: newAPIResponse("Dry run not supported"), + }, + { + Name: "Knowledge not ready - clear rejection with RetryAt", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-knowledge", "confirmed", 2), + ), + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: APIResponseExpectation{ + StatusCode: 200, + RejectReasonSubstrings: []string{"caches not ready"}, + RetryAtPresent: true, + }, + EnvInfoVersion: -1, // Skip Knowledge CRD creation + }, + { + Name: "Multiple commitments insufficient capacity - all listed in error", + // Tests that multiple failed commitments are all mentioned in the rejection reason + Flavors: []*TestFlavor{m1Small, m1Tiny}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-multi-fail-1", "confirmed", 3), + createCommitment("ram_hana_1", "project-B", "uuid-multi-fail-2", "confirmed", 3), + createCommitment("ram_gp_1", "project-C", "uuid-would-not-fail", "confirmed", 1), // would be rolled back, but not part of the reject reason + ), + AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 256}}, + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: newAPIResponse("2 commitment(s) failed", "commitment uuid-multi-fail-1: not sufficient capacity", 
"commitment uuid-multi-fail-2: not sufficient capacity"), + }, + { + Name: "Watch timeout with custom config - triggers rollback with timeout error", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-timeout", "confirmed", 2), + ), + // With 0ms timeout, the watch will timeout immediately before reservations become ready + CustomConfig: &Config{ + ChangeAPIWatchReservationsTimeout: 0 * time.Millisecond, + ChangeAPIWatchReservationsPollInterval: 100 * time.Millisecond, + }, + ExpectedReservations: []*TestReservation{}, // Rollback removes all reservations + ExpectedAPIResponse: newAPIResponse("timeout reached while processing commitment changes"), + }, } - // Create a Knowledge CRD with a specific version timestamp and flavor groups - knowledgeTimestamp := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC) - flavorGroup := createTestFlavorGroup() + for _, tc := range testCases { + t.Run(tc.Name, func(t *testing.T) { + runCommitmentChangeTest(t, tc) + }) + } +} - // Box the features using the Knowledge API - rawExt, err := v1alpha1.BoxFeatureList([]compute.FlavorGroupFeature{flavorGroup}) - if err != nil { - t.Fatalf("failed to box feature list: %v", err) +// runCommitmentChangeTest executes a single commitment change integration test case. 
+func runCommitmentChangeTest(t *testing.T, tc CommitmentChangeTestCase) { + t.Helper() + + // Convert test types to actual types + var vms []VM + for _, testVM := range tc.VMs { + vms = append(vms, testVM.ToVM()) } - knowledge := &v1alpha1.Knowledge{ - ObjectMeta: metav1.ObjectMeta{ - Name: "flavor-groups", + var flavorInGroups []compute.FlavorInGroup + for _, testFlavor := range tc.Flavors { + flavorInGroups = append(flavorInGroups, testFlavor.ToFlavorInGroup()) + } + + // Use EnvInfoVersion if specified (non-zero), otherwise default to CommitmentRequest.InfoVersion + envInfoVersion := tc.CommitmentRequest.InfoVersion + if tc.EnvInfoVersion != 0 { + envInfoVersion = tc.EnvInfoVersion + } + + flavorGroups := TestFlavorGroup{ + infoVersion: envInfoVersion, + flavors: flavorInGroups, + }.ToFlavorGroupsKnowledge() + + // Convert existing reservations with auto-numbering per commitment + var existingReservations []*v1alpha1.Reservation + numberCounters := make(map[string]int) + for _, testRes := range tc.ExistingReservations { + number := numberCounters[testRes.CommitmentID] + numberCounters[testRes.CommitmentID]++ + existingReservations = append(existingReservations, testRes.toReservation(number)) + } + + // Create test environment with available resources and custom config if provided + env := newCommitmentTestEnv(t, vms, nil, existingReservations, flavorGroups, tc.AvailableResources, tc.CustomConfig) + defer env.Close() + + t.Log("Initial state:") + env.LogStateSummary() + + // Call commitment change API + reqJSON := buildRequestJSON(tc.CommitmentRequest) + resp, respJSON, statusCode := env.CallChangeCommitmentsAPI(reqJSON) + + t.Log("After API call:") + env.LogStateSummary() + + // Verify API response + env.VerifyAPIResponse(tc.ExpectedAPIResponse, resp, respJSON, statusCode) + + // Verify reservations using content-based matching + env.VerifyReservationsMatch(tc.ExpectedReservations) + + // Log final test result + if t.Failed() { + t.Log("❌ Test FAILED") + } else 
{ + t.Log("✅ Test PASSED") + } +} + +// ============================================================================ +// Test Types & Constants +// ============================================================================ + +const ( + defaultFlavorDiskGB = 40 + flavorGroupsKnowledgeName = "flavor-groups" + knowledgeRecencyDuration = 60 * time.Second + defaultCommitmentExpiryYears = 1 +) + +type CommitmentChangeTestCase struct { + Name string + VMs []*TestVM + Flavors []*TestFlavor + ExistingReservations []*TestReservation + CommitmentRequest CommitmentChangeRequest + ExpectedReservations []*TestReservation + ExpectedAPIResponse APIResponseExpectation + AvailableResources *AvailableResources // If nil, all reservations accepted without checks + EnvInfoVersion int64 // Override InfoVersion for version mismatch tests + CustomConfig *Config // Override default config for testing timeout behavior +} + +// AvailableResources defines available memory per host (MB). +// Scheduler uses first-come-first-serve. CPU is ignored. 
+type AvailableResources struct { + PerHost map[string]int64 // host -> available memory MB +} + +type TestFlavorGroup struct { + infoVersion int64 + flavors []compute.FlavorInGroup +} + +func (tfg TestFlavorGroup) ToFlavorGroupsKnowledge() FlavorGroupsKnowledge { + groupMap := make(map[string][]compute.FlavorInGroup) + + for _, flavor := range tfg.flavors { + groupName := flavor.ExtraSpecs["quota:hw_version"] + if groupName == "" { + panic("Flavor " + flavor.Name + " is missing quota:hw_version in extra specs") + } + groupMap[groupName] = append(groupMap[groupName], flavor) + } + + var groups []compute.FlavorGroupFeature + for groupName, groupFlavors := range groupMap { + if len(groupFlavors) == 0 { + continue + } + + // Sort descending: required by reservation manager's flavor selection + sort.Slice(groupFlavors, func(i, j int) bool { + return groupFlavors[i].MemoryMB > groupFlavors[j].MemoryMB + }) + + smallest := groupFlavors[len(groupFlavors)-1] + largest := groupFlavors[0] + + groups = append(groups, compute.FlavorGroupFeature{ + Name: groupName, + Flavors: groupFlavors, + SmallestFlavor: smallest, + LargestFlavor: largest, + }) + } + + return FlavorGroupsKnowledge{ + InfoVersion: tfg.infoVersion, + Groups: groups, + } +} + +type FlavorGroupsKnowledge struct { + InfoVersion int64 + Groups []compute.FlavorGroupFeature +} + +type CommitmentChangeRequest struct { + AZ string + DryRun bool + InfoVersion int64 + Commitments []TestCommitment +} + +type TestCommitment struct { + ResourceName liquid.ResourceName + ProjectID string + ConfirmationID string + State string + Amount uint64 +} + +type APIResponseExpectation struct { + StatusCode int + RejectReasonSubstrings []string + RetryAtPresent bool +} + +type ReservationVerification struct { + Host string + Allocations map[string]string +} + +type VM struct { + UUID string + FlavorName string + ProjectID string + CurrentHypervisor string + AvailabilityZone string + Resources map[string]int64 + FlavorExtraSpecs 
map[string]string +} + +type TestFlavor struct { + Name string + Group string + MemoryMB int64 + VCPUs int64 + DiskGB uint64 +} + +func (f *TestFlavor) ToFlavorInGroup() compute.FlavorInGroup { + diskGB := f.DiskGB + if diskGB == 0 { + diskGB = defaultFlavorDiskGB + } + return compute.FlavorInGroup{ + Name: f.Name, + MemoryMB: uint64(f.MemoryMB), //nolint:gosec // test values are always positive + VCPUs: uint64(f.VCPUs), //nolint:gosec // test values are always positive + DiskGB: diskGB, + ExtraSpecs: map[string]string{ + "quota:hw_version": f.Group, }, - Spec: v1alpha1.KnowledgeSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - Extractor: v1alpha1.KnowledgeExtractorSpec{ - Name: "flavor-groups", + } +} + +type TestVM struct { + UUID string + Flavor *TestFlavor + ProjectID string + Host string + AZ string +} + +func (vm *TestVM) ToVM() VM { + return VM{ + UUID: vm.UUID, + FlavorName: vm.Flavor.Name, + ProjectID: vm.ProjectID, + CurrentHypervisor: vm.Host, + AvailabilityZone: vm.AZ, + Resources: map[string]int64{ + "memory": vm.Flavor.MemoryMB, + "vcpus": vm.Flavor.VCPUs, + }, + FlavorExtraSpecs: map[string]string{ + "quota:hw_version": vm.Flavor.Group, + }, + } +} + +type TestReservation struct { + CommitmentID string + Host string // Empty = any host accepted in matching + Flavor *TestFlavor + ProjectID string + VMs []string // VM UUIDs + MemoryMB int64 // If 0, uses Flavor.MemoryMB; else custom size + AZ string +} + +func (tr *TestReservation) toReservation(number int) *v1alpha1.Reservation { + name := fmt.Sprintf("commitment-%s-%d", tr.CommitmentID, number) + + memoryMB := tr.MemoryMB + if memoryMB == 0 { + memoryMB = tr.Flavor.MemoryMB + } + + specAllocations := make(map[string]v1alpha1.CommittedResourceAllocation) + statusAllocations := make(map[string]string) + for _, vmUUID := range tr.VMs { + specAllocations[vmUUID] = v1alpha1.CommittedResourceAllocation{ + CreationTimestamp: metav1.Now(), + Resources: map[hv1.ResourceName]resource.Quantity{ + 
"memory": resource.MustParse(strconv.FormatInt(memoryMB, 10) + "Mi"), + "cpu": resource.MustParse(strconv.FormatInt(tr.Flavor.VCPUs, 10)), }, + } + statusAllocations[vmUUID] = tr.Host + } + + spec := v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + TargetHost: tr.Host, + Resources: map[hv1.ResourceName]resource.Quantity{ + "memory": resource.MustParse(strconv.FormatInt(memoryMB, 10) + "Mi"), + "cpu": resource.MustParse(strconv.FormatInt(tr.Flavor.VCPUs, 10)), }, - Status: v1alpha1.KnowledgeStatus{ - LastContentChange: metav1.Time{Time: knowledgeTimestamp}, - Raw: rawExt, - RawLength: 1, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + CommitmentUUID: tr.CommitmentID, + ProjectID: tr.ProjectID, + ResourceName: tr.Flavor.Name, + ResourceGroup: tr.Flavor.Group, + Allocations: specAllocations, + }, + } + + if tr.AZ != "" { + spec.AvailabilityZone = tr.AZ + } + + return &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: spec, + Status: v1alpha1.ReservationStatus{ Conditions: []metav1.Condition{ { - Type: v1alpha1.KnowledgeConditionReady, + Type: v1alpha1.ReservationConditionReady, Status: metav1.ConditionTrue, - Reason: "Ready", + Reason: "ReservationActive", }, }, + Host: tr.Host, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationStatus{ + Allocations: statusAllocations, + }, }, } +} - k8sClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(knowledge). - WithStatusSubresource(knowledge). 
- Build() +// ============================================================================ +// Test Environment +// ============================================================================ - api := &HTTPAPI{ - client: k8sClient, - } - - // Create request JSON with mismatched version - requestJSON := `{ - "az": "az-a", - "dryRun": false, - "infoVersion": 12345, - "byProject": {} - }` +type CommitmentTestEnv struct { + T *testing.T + Scheme *runtime.Scheme + K8sClient client.Client + VMSource *MockVMSource + FlavorGroups FlavorGroupsKnowledge + HTTPServer *httptest.Server + API *HTTPAPI + availableResources map[string]int64 // host -> available memory MB + processedReserv map[string]bool // track processed reservations + mu sync.Mutex // protects availableResources and processedReserv +} - req := httptest.NewRequest(http.MethodPost, "/v1/change-commitments", bytes.NewReader([]byte(requestJSON))) - req.Header.Set("Content-Type", "application/json") +// FakeReservationController simulates synchronous reservation controller. 
+type FakeReservationController struct { + env *CommitmentTestEnv +} - w := httptest.NewRecorder() +func (c *FakeReservationController) OnReservationCreated(res *v1alpha1.Reservation) { + c.env.processNewReservation(res) +} - // Call the handler - api.HandleChangeCommitments(w, req) +func (c *FakeReservationController) OnReservationDeleted(res *v1alpha1.Reservation) { + c.env.mu.Lock() + defer c.env.mu.Unlock() - // Check response - resp := w.Result() - defer resp.Body.Close() + // Return memory when Delete() is called directly (before deletion timestamp is set) + if c.env.availableResources != nil && res.Status.Host != "" { + memoryQuantity := res.Spec.Resources["memory"] + memoryBytes := memoryQuantity.Value() + memoryMB := memoryBytes / (1024 * 1024) - // Verify HTTP 409 Conflict status - if resp.StatusCode != http.StatusConflict { - t.Errorf("expected status code %d (Conflict), got %d", http.StatusConflict, resp.StatusCode) + if _, exists := c.env.availableResources[res.Status.Host]; exists { + c.env.availableResources[res.Status.Host] += memoryMB + c.env.T.Logf("↩ Returned %d MB to %s (now %d MB available) via OnReservationDeleted for %s", + memoryMB, res.Status.Host, c.env.availableResources[res.Status.Host], res.Name) + } } - // Verify Content-Type is text/plain (set by http.Error) - contentType := resp.Header.Get("Content-Type") - if contentType != "text/plain; charset=utf-8" { - t.Errorf("expected Content-Type 'text/plain; charset=utf-8', got %q", contentType) + // Clear tracking so recreated reservations with same name are processed + delete(c.env.processedReserv, res.Name) +} + +// operationInterceptorClient routes reservation events to FakeReservationController. +type operationInterceptorClient struct { + client.Client + controller *FakeReservationController +} + +func (d *operationInterceptorClient) Create(ctx context.Context, obj client.Object, opts ...client.CreateOption) error { + err := d.Client.Create(ctx, obj, opts...) 
+ if err != nil { + return err } - // Verify error message contains version information - var responseBody bytes.Buffer - if _, err = responseBody.ReadFrom(resp.Body); err != nil { - t.Fatalf("failed to read response body: %v", err) + if res, ok := obj.(*v1alpha1.Reservation); ok { + d.controller.OnReservationCreated(res) } - bodyStr := responseBody.String() - if !bytes.Contains([]byte(bodyStr), []byte("Version mismatch")) { - t.Errorf("expected response to contain 'Version mismatch', got: %s", bodyStr) + return nil +} + +func (d *operationInterceptorClient) Delete(ctx context.Context, obj client.Object, opts ...client.DeleteOption) error { + if res, ok := obj.(*v1alpha1.Reservation); ok { + d.controller.OnReservationDeleted(res) } - if !bytes.Contains([]byte(bodyStr), []byte("12345")) { - t.Errorf("expected response to contain request version '12345', got: %s", bodyStr) + + return d.Client.Delete(ctx, obj, opts...) +} + +func (env *CommitmentTestEnv) Close() { + if env.HTTPServer != nil { + env.HTTPServer.Close() } } -func TestHandleChangeCommitments_DryRun(t *testing.T) { + +func newCommitmentTestEnv( + t *testing.T, + vms []VM, + hypervisors []*hv1.Hypervisor, + reservations []*v1alpha1.Reservation, + flavorGroups FlavorGroupsKnowledge, + resources *AvailableResources, + customConfig *Config, +) *CommitmentTestEnv { + + t.Helper() + + log.SetLogger(zap.New(zap.WriteTo(os.Stderr), zap.UseDevMode(true))) + + objects := make([]client.Object, 0, len(hypervisors)+len(reservations)) + for _, hv := range hypervisors { + objects = append(objects, hv) + } + for _, res := range reservations { + objects = append(objects, res) + } + scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("failed to add scheme: %v", err) + t.Fatalf("Failed to add v1alpha1 scheme: %v", err) + } + if err := hv1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add hv1 scheme: %v", err) + } + + // InfoVersion of -1 skips Knowledge CRD creation (tests 
"not ready" scenario) + if flavorGroups.InfoVersion != -1 { + knowledgeCRD := createKnowledgeCRD(flavorGroups) + objects = append(objects, knowledgeCRD) } - k8sClient := fake.NewClientBuilder(). + baseK8sClient := fake.NewClientBuilder(). WithScheme(scheme). + WithObjects(objects...). + WithStatusSubresource(&v1alpha1.Reservation{}). + WithStatusSubresource(&v1alpha1.Knowledge{}). + WithIndex(&v1alpha1.Reservation{}, "spec.type", func(obj client.Object) []string { + res := obj.(*v1alpha1.Reservation) + return []string{string(res.Spec.Type)} + }). Build() - api := &HTTPAPI{ - client: k8sClient, + var availableResources map[string]int64 + if resources != nil && resources.PerHost != nil { + availableResources = make(map[string]int64) + for host, memMB := range resources.PerHost { + availableResources[host] = memMB + } } - // Create dry run request JSON - requestJSON := `{ - "az": "az-a", - "dryRun": true, - "infoVersion": 12345, - "byProject": {} - }` + env := &CommitmentTestEnv{ + T: t, + Scheme: scheme, + K8sClient: nil, // Will be set below + VMSource: NewMockVMSource(vms), + FlavorGroups: flavorGroups, + HTTPServer: nil, // Will be set below + API: nil, // Will be set below + availableResources: availableResources, + processedReserv: make(map[string]bool), + } + + controller := &FakeReservationController{env: env} + wrappedClient := &operationInterceptorClient{ + Client: baseK8sClient, + controller: controller, + } + env.K8sClient = wrappedClient - req := httptest.NewRequest(http.MethodPost, "/v1/change-commitments", bytes.NewReader([]byte(requestJSON))) - req.Header.Set("Content-Type", "application/json") - w := httptest.NewRecorder() + // Use custom config if provided, otherwise use default + var api *HTTPAPI + if customConfig != nil { + api = NewAPIWithConfig(wrappedClient, *customConfig) + } else { + api = NewAPI(wrappedClient) + } + mux := http.NewServeMux() + api.Init(mux) + httpServer := httptest.NewServer(mux) + + env.HTTPServer = httpServer + env.API = 
api - api.HandleChangeCommitments(w, req) + return env +} - resp := w.Result() - defer resp.Body.Close() +// ============================================================================ +// Environment Helper Methods +// ============================================================================ - // Dry run should return 200 OK with rejection reason - if resp.StatusCode != http.StatusOK { - t.Errorf("expected status code %d (OK), got %d", http.StatusOK, resp.StatusCode) +// ListVMs returns all VMs from the VMSource. +func (env *CommitmentTestEnv) ListVMs() []VM { + vms, err := env.VMSource.ListVMs(context.Background()) + if err != nil { + env.T.Fatalf("Failed to list VMs: %v", err) } + return vms +} - // Verify response is JSON - contentType := resp.Header.Get("Content-Type") - if contentType != "application/json" { - t.Errorf("expected Content-Type 'application/json', got %q", contentType) +// ListReservations returns all reservations. +func (env *CommitmentTestEnv) ListReservations() []v1alpha1.Reservation { + var list v1alpha1.ReservationList + if err := env.K8sClient.List(context.Background(), &list); err != nil { + env.T.Fatalf("Failed to list reservations: %v", err) } + return list.Items +} - // Parse response - var response liquid.CommitmentChangeResponse - if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { - t.Fatalf("failed to decode response: %v", err) +// ListHypervisors returns all hypervisors. +func (env *CommitmentTestEnv) ListHypervisors() []hv1.Hypervisor { + var list hv1.HypervisorList + if err := env.K8sClient.List(context.Background(), &list); err != nil { + env.T.Fatalf("Failed to list hypervisors: %v", err) } + return list.Items +} + +// LogStateSummary logs a summary of the current state. 
+func (env *CommitmentTestEnv) LogStateSummary() { + env.T.Helper() + + hypervisors := env.ListHypervisors() + vms := env.ListVMs() + reservations := env.ListReservations() - if response.RejectionReason != "Dry run not supported yet" { - t.Errorf("expected rejection reason 'Dry run not supported yet', got %q", response.RejectionReason) + env.T.Log("=== State Summary ===") + env.T.Logf("Hypervisors: %d", len(hypervisors)) + env.T.Logf("VMs: %d", len(vms)) + env.T.Logf("Reservations: %d", len(reservations)) + + for _, res := range reservations { + allocCount := 0 + if res.Status.CommittedResourceReservation != nil { + allocCount = len(res.Status.CommittedResourceReservation.Allocations) + } + env.T.Logf(" - %s (host: %s, allocations: %d)", res.Name, res.Status.Host, allocCount) } + env.T.Log("=====================") } -func TestProcessCommitmentChanges_KnowledgeNotReady(t *testing.T) { - // Test when flavor groups knowledge is not available - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("failed to add scheme: %v", err) +// CallChangeCommitmentsAPI calls the change commitments API endpoint with JSON. +// It uses a hybrid approach: fast polling during API execution + synchronous final pass. 
+func (env *CommitmentTestEnv) CallChangeCommitmentsAPI(reqJSON string) (resp liquid.CommitmentChangeResponse, respJSON string, statusCode int) { + env.T.Helper() + + // Start fast polling in background to handle reservations during API execution + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + + go func() { + ticker := time.NewTicker(5 * time.Millisecond) // Very fast - 5ms + defer ticker.Stop() + defer close(done) + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + env.processReservations() + } + } + }() + + // Make HTTP request + url := env.HTTPServer.URL + "/v1/commitments/change-commitments" + httpResp, err := http.Post(url, "application/json", bytes.NewReader([]byte(reqJSON))) //nolint:gosec,noctx // test server URL, not user input + if err != nil { + cancel() + <-done + env.T.Fatalf("Failed to make HTTP request: %v", err) } + defer httpResp.Body.Close() - // No Knowledge CRD created - simulates knowledge not ready - k8sClient := fake.NewClientBuilder(). - WithScheme(scheme). 
- Build() + // Read response body + respBytes, err := io.ReadAll(httpResp.Body) + if err != nil { + cancel() + <-done + env.T.Fatalf("Failed to read response body: %v", err) + } - api := &HTTPAPI{ - client: k8sClient, + respJSON = string(respBytes) + + // Parse response - only for 200 OK responses + // Non-200 responses (like 409 Conflict for version mismatch) use plain text via http.Error() + if httpResp.StatusCode == http.StatusOK { + if err := json.Unmarshal(respBytes, &resp); err != nil { + cancel() + <-done + env.T.Fatalf("Failed to unmarshal response: %v", err) + } } - requestJSON := `{ - "az": "az-a", - "dryRun": false, - "infoVersion": 12345, - "byProject": {} - }` + // Stop background polling + cancel() + <-done + + // Final synchronous pass to ensure all reservations are processed + // This eliminates any race conditions + env.processReservations() + + statusCode = httpResp.StatusCode + return resp, respJSON, statusCode +} + +// processReservations handles all reservation lifecycle events synchronously. +// This includes marking reservations as Ready/Failed and removing finalizers from deleted reservations. 
+func (env *CommitmentTestEnv) processReservations() { + ctx := context.Background() + reservations := env.ListReservations() + + for _, res := range reservations { + // Handle deletion - return memory to host and remove finalizers + if !res.DeletionTimestamp.IsZero() { + env.T.Logf("Processing deletion for reservation %s (host: %s)", res.Name, res.Status.Host) + + env.mu.Lock() + // Return memory to host if resource tracking is enabled + if env.availableResources != nil { + env.T.Logf("Resource tracking enabled, returning memory for %s", res.Name) + memoryQuantity := res.Spec.Resources["memory"] + memoryBytes := memoryQuantity.Value() + memoryMB := memoryBytes / (1024 * 1024) - req := httptest.NewRequest(http.MethodPost, "/v1/change-commitments", bytes.NewReader([]byte(requestJSON))) - req.Header.Set("Content-Type", "application/json") - w := httptest.NewRecorder() + env.T.Logf("Reservation %s has host=%s, memory=%d MB", res.Name, res.Status.Host, memoryMB) - api.HandleChangeCommitments(w, req) + // Check if host exists in our tracking + if _, exists := env.availableResources[res.Status.Host]; !exists { + env.mu.Unlock() + env.T.Fatalf("Host %s not found in available resources for reservation %s - this indicates an inconsistency", + res.Status.Host, res.Name) + } - resp := w.Result() - defer resp.Body.Close() + // Return memory to host + env.availableResources[res.Status.Host] += memoryMB + env.T.Logf("↩ Returned %d MB to %s (now %d MB available) from deleted reservation %s", + memoryMB, res.Status.Host, env.availableResources[res.Status.Host], res.Name) + } else { + env.T.Logf("Resource tracking NOT enabled for %s", res.Name) + } - // Should return 200 OK with rejection reason - if resp.StatusCode != http.StatusOK { - t.Errorf("expected status code %d (OK), got %d", http.StatusOK, resp.StatusCode) + // Clear tracking so recreated reservations with same name are processed + delete(env.processedReserv, res.Name) + env.mu.Unlock() + + // Remove finalizers to allow 
deletion + if len(res.Finalizers) > 0 { + res.Finalizers = []string{} + if err := env.K8sClient.Update(ctx, &res); err != nil { + // Ignore errors - might be already deleted + continue + } + } + continue + } + + // Skip if already processed (has a condition set) + if env.hasCondition(&res) { + continue + } + + env.mu.Lock() + alreadyProcessed := env.processedReserv[res.Name] + env.mu.Unlock() + + // Skip if already tracked as processed + if alreadyProcessed { + continue + } + + // Process new reservation with resource-based scheduling + env.processNewReservation(&res) } +} - var response liquid.CommitmentChangeResponse - if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { - t.Fatalf("failed to decode response: %v", err) +// hasCondition checks if a reservation has any Ready condition set. +func (env *CommitmentTestEnv) hasCondition(res *v1alpha1.Reservation) bool { + for _, cond := range res.Status.Conditions { + if cond.Type == v1alpha1.ReservationConditionReady { + return true + } } + return false +} + +// processNewReservation implements first-come-first-serve scheduling based on available resources. +// It tries to find a host with enough memory capacity and assigns the reservation to that host. 
+func (env *CommitmentTestEnv) processNewReservation(res *v1alpha1.Reservation) { + env.mu.Lock() + defer env.mu.Unlock() + + env.processedReserv[res.Name] = true + + // If no available resources configured, accept all reservations without host assignment + if env.availableResources == nil { + env.markReservationReady(res) + return + } + + // Get required memory from reservation spec + memoryQuantity := res.Spec.Resources["memory"] + memoryBytes := memoryQuantity.Value() + memoryMB := memoryBytes / (1024 * 1024) - if response.RejectionReason != "caches not ready" { - t.Errorf("expected rejection reason 'caches not ready', got %q", response.RejectionReason) + // First-come-first-serve: find first host with enough capacity + // Sort hosts to ensure deterministic behavior (Go map iteration is random) + hosts := make([]string, 0, len(env.availableResources)) + for host := range env.availableResources { + hosts = append(hosts, host) } + sort.Strings(hosts) + + var selectedHost string + for _, host := range hosts { + if env.availableResources[host] >= memoryMB { + selectedHost = host + break + } + } + + if selectedHost != "" { + // SUCCESS: Schedule on this host + env.availableResources[selectedHost] -= memoryMB + + // Update reservation with selected host + ctx := context.Background() + + // Update spec (TargetHost) + res.Spec.TargetHost = selectedHost + if err := env.K8sClient.Update(ctx, res); err != nil { + env.T.Logf("Warning: Failed to update reservation spec: %v", err) + } + + // Update status (Host) - requires Status().Update + res.Status.Host = selectedHost + if err := env.K8sClient.Status().Update(ctx, res); err != nil { + env.T.Logf("Warning: Failed to update reservation status host: %v", err) + } - if response.RetryAt.IsNone() { - t.Error("expected RetryAt to be set") + env.markReservationReady(res) + env.T.Logf("✓ Scheduled reservation %s on %s (%d MB used, %d MB remaining)", + res.Name, selectedHost, memoryMB, env.availableResources[selectedHost]) + } else 
{ + // FAILURE: No host has enough capacity + env.markReservationFailed(res, "Insufficient capacity on all hosts") + env.T.Logf("✗ Failed to schedule reservation %s (needs %d MB, no host has capacity)", + res.Name, memoryMB) } } -// Helper function to create a minimal flavor group for testing -func createTestFlavorGroup() compute.FlavorGroupFeature { - return compute.FlavorGroupFeature{ - Name: "test_group", - Flavors: []compute.FlavorInGroup{ - { - Name: "test.small", - MemoryMB: 8192, - VCPUs: 2, - DiskGB: 40, - ExtraSpecs: map[string]string{ - "quota:separate": "true", - }, +// markReservationReady updates a reservation to have Ready=True status. +func (env *CommitmentTestEnv) markReservationReady(res *v1alpha1.Reservation) { + res.Status.Conditions = []metav1.Condition{ + { + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionTrue, + Reason: "ReservationActive", + Message: "Reservation is ready (set by test controller)", + LastTransitionTime: metav1.Now(), + }, + } + + if err := env.K8sClient.Status().Update(context.Background(), res); err != nil { + // Ignore errors - might be deleted during update + return + } +} + +// markReservationFailed updates a reservation to have Ready=False status (scheduling failed). +func (env *CommitmentTestEnv) markReservationFailed(res *v1alpha1.Reservation, reason string) { + res.Status.Conditions = []metav1.Condition{ + { + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionFalse, + Reason: "SchedulingFailed", + Message: reason, + LastTransitionTime: metav1.Now(), + }, + } + + if err := env.K8sClient.Status().Update(context.Background(), res); err != nil { + // Ignore errors - might be deleted during update + return + } +} + +// VerifyAPIResponse verifies the API response matches expectations. +// For rejection reasons, it checks if ALL expected substrings are present in the actual rejection reason. 
+func (env *CommitmentTestEnv) VerifyAPIResponse(expected APIResponseExpectation, actual liquid.CommitmentChangeResponse, respJSON string, statusCode int) { + env.T.Helper() + + if statusCode != expected.StatusCode { + env.T.Errorf("Expected status code %d, got %d", expected.StatusCode, statusCode) + } + + if len(expected.RejectReasonSubstrings) > 0 { + if actual.RejectionReason == "" { + env.T.Errorf("Expected rejection reason containing substrings %v, got none", expected.RejectReasonSubstrings) + } else { + // Check that ALL expected substrings are present + for _, substring := range expected.RejectReasonSubstrings { + if !strings.Contains(actual.RejectionReason, substring) { + env.T.Errorf("Expected rejection reason to contain %q, but got %q", substring, actual.RejectionReason) + } + } + } + } else { + if actual.RejectionReason != "" { + env.T.Errorf("Expected no rejection reason, got %q", actual.RejectionReason) + } + } + + // Check RetryAt field presence in JSON (avoids dealing with option.Option type) + retryAtPresent := strings.Contains(respJSON, `"retryAt"`) + if expected.RetryAtPresent { + if !retryAtPresent { + env.T.Error("Expected retryAt field to be present in JSON response, but it was not found") + } + } else { + if retryAtPresent { + env.T.Error("Expected retryAt field to be absent from JSON response, but it was found") + } + } +} + +// VerifyReservationsMatch verifies that actual reservations match expected reservations by content. 
+func (env *CommitmentTestEnv) VerifyReservationsMatch(expected []*TestReservation) { + env.T.Helper() + + actualReservations := env.ListReservations() + + // Make copies of both lists so we can remove matched items + expectedCopy := make([]*TestReservation, len(expected)) + copy(expectedCopy, expected) + + actualCopy := make([]v1alpha1.Reservation, len(actualReservations)) + copy(actualCopy, actualReservations) + + // Track unmatched items for detailed reporting + var unmatchedExpected []*TestReservation + var unmatchedActual []v1alpha1.Reservation + + // Greedy matching: while there are expected items, find matches and remove + for len(expectedCopy) > 0 { + exp := expectedCopy[0] + found := false + + // Find first actual that matches this expected + for i, actual := range actualCopy { + if env.reservationMatches(exp, &actual) { + expectedCopy = expectedCopy[1:] + actualCopy = append(actualCopy[:i], actualCopy[i+1:]...) + found = true + break + } + } + + if !found { + unmatchedExpected = append(unmatchedExpected, exp) + expectedCopy = expectedCopy[1:] + } + } + + unmatchedActual = actualCopy + + // If there are any mismatches, print detailed comparison + if len(unmatchedExpected) > 0 || len(unmatchedActual) > 0 { + env.T.Error("❌ Reservation mismatch detected!") + env.T.Log("") + env.T.Log("═══════════════════════════════════════════════════════════════") + env.T.Log("EXPECTED RESERVATIONS:") + env.T.Log("═══════════════════════════════════════════════════════════════") + env.printExpectedReservations(expected, unmatchedExpected) + + env.T.Log("") + env.T.Log("═══════════════════════════════════════════════════════════════") + env.T.Log("ACTUAL RESERVATIONS:") + env.T.Log("═══════════════════════════════════════════════════════════════") + env.printActualReservations(actualReservations, unmatchedActual) + + env.T.Log("") + env.T.Log("═══════════════════════════════════════════════════════════════") + env.T.Log("DIFF SUMMARY:") + 
env.T.Log("═══════════════════════════════════════════════════════════════") + env.printDiffSummary(unmatchedExpected, unmatchedActual) + env.T.Log("═══════════════════════════════════════════════════════════════") + } +} + +// String returns a compact string representation of a TestReservation. +func (tr *TestReservation) String() string { + flavorName := "" + flavorGroup := "" + if tr.Flavor != nil { + flavorName = tr.Flavor.Name + flavorGroup = tr.Flavor.Group + } + + host := tr.Host + if host == "" { + host = "" + } + + az := tr.AZ + if az == "" { + az = "" + } + + vmInfo := "" + if len(tr.VMs) > 0 { + vmInfo = fmt.Sprintf(" VMs=%v", tr.VMs) + } + + return fmt.Sprintf("%s/%s/%s(%s)/%s/az=%s%s", tr.CommitmentID, tr.ProjectID, flavorName, flavorGroup, host, az, vmInfo) +} + +// compactReservationString returns a compact string representation of an actual Reservation. +func compactReservationString(res *v1alpha1.Reservation) string { + commitmentID := "" + projectID := "" + flavorName := "" + flavorGroup := "" + vmCount := 0 + + if res.Spec.CommittedResourceReservation != nil { + commitmentID = res.Spec.CommittedResourceReservation.CommitmentUUID + projectID = res.Spec.CommittedResourceReservation.ProjectID + flavorName = res.Spec.CommittedResourceReservation.ResourceName + flavorGroup = res.Spec.CommittedResourceReservation.ResourceGroup + if res.Status.CommittedResourceReservation != nil { + vmCount = len(res.Status.CommittedResourceReservation.Allocations) + } + } + + host := res.Status.Host + if host == "" { + host = "" + } + + az := res.Spec.AvailabilityZone + if az == "" { + az = "" + } + + vmInfo := "" + if vmCount > 0 { + vmInfo = fmt.Sprintf(" VMs=%d", vmCount) + } + + return fmt.Sprintf("%s/%s/%s(%s)/%s/az=%s%s", commitmentID, projectID, flavorName, flavorGroup, host, az, vmInfo) +} + +// printExpectedReservations prints all expected reservations with markers for unmatched ones. 
+func (env *CommitmentTestEnv) printExpectedReservations(all, unmatched []*TestReservation) { + env.T.Helper() + + unmatchedMap := make(map[*TestReservation]bool) + for _, res := range unmatched { + unmatchedMap[res] = true + } + + if len(all) == 0 { + env.T.Log(" (none)") + return + } + + for i, res := range all { + marker := "✓" + if unmatchedMap[res] { + marker = "✗" + } + env.T.Logf(" %s [%d] %s", marker, i+1, res.String()) + } + + env.T.Logf(" Total: %d (%d matched, %d missing)", + len(all), len(all)-len(unmatched), len(unmatched)) +} + +// printActualReservations prints all actual reservations with markers for unmatched ones. +func (env *CommitmentTestEnv) printActualReservations(all, unmatched []v1alpha1.Reservation) { + env.T.Helper() + + unmatchedMap := make(map[string]bool) + for _, res := range unmatched { + unmatchedMap[res.Name] = true + } + + if len(all) == 0 { + env.T.Log(" (none)") + return + } + + for i, res := range all { + marker := "✓" + if unmatchedMap[res.Name] { + marker = "⊕" + } + env.T.Logf(" %s [%d] %s", marker, i+1, compactReservationString(&res)) + } + + env.T.Logf(" Total: %d (%d matched, %d unexpected)", + len(all), len(all)-len(unmatched), len(unmatched)) +} + +// printDiffSummary prints a summary of differences between expected and actual. 
+func (env *CommitmentTestEnv) printDiffSummary(unmatchedExpected []*TestReservation, unmatchedActual []v1alpha1.Reservation) { + env.T.Helper() + + if len(unmatchedExpected) > 0 { + env.T.Logf(" MISSING (%d expected, not found):", len(unmatchedExpected)) + for _, res := range unmatchedExpected { + env.T.Logf(" • %s", res.String()) + } + } + + if len(unmatchedActual) > 0 { + env.T.Logf(" UNEXPECTED (%d found, not expected):", len(unmatchedActual)) + for _, res := range unmatchedActual { + env.T.Logf(" • %s", compactReservationString(&res)) + } + } + + if len(unmatchedExpected) == 0 && len(unmatchedActual) == 0 { + env.T.Log(" ✓ All match!") + } +} + +// reservationMatches checks if an actual reservation matches an expected one. +// All fields are checked comprehensively for complete validation. +func (env *CommitmentTestEnv) reservationMatches(expected *TestReservation, actual *v1alpha1.Reservation) bool { + // Check CommitmentID (from reservation name prefix) + if !strings.HasPrefix(actual.Name, "commitment-"+expected.CommitmentID+"-") { + return false + } + + // Check that CommittedResourceReservation spec exists + if actual.Spec.CommittedResourceReservation == nil { + return false + } + + // Check CommitmentUUID in spec matches + if actual.Spec.CommittedResourceReservation.CommitmentUUID != expected.CommitmentID { + return false + } + + // Check ProjectID + if actual.Spec.CommittedResourceReservation.ProjectID != expected.ProjectID { + return false + } + + // Check ResourceName (flavor name) + if expected.Flavor != nil { + if actual.Spec.CommittedResourceReservation.ResourceName != expected.Flavor.Name { + return false + } + } + + // Check ResourceGroup (flavor group) + if expected.Flavor != nil { + if actual.Spec.CommittedResourceReservation.ResourceGroup != expected.Flavor.Group { + return false + } + } + + // Check Host (if specified in expected) + if expected.Host != "" && actual.Status.Host != expected.Host { + return false + } + + // Check AZ (if specified 
in expected) + if expected.AZ != "" && actual.Spec.AvailabilityZone != expected.AZ { + return false + } + + // Check Memory (use custom MemoryMB if non-zero, otherwise use flavor size) + expectedMemoryMB := expected.MemoryMB + if expectedMemoryMB == 0 && expected.Flavor != nil { + expectedMemoryMB = expected.Flavor.MemoryMB + } + memoryQuantity := actual.Spec.Resources["memory"] + actualMemoryBytes := memoryQuantity.Value() + actualMemoryMB := actualMemoryBytes / (1024 * 1024) + if actualMemoryMB != expectedMemoryMB { + return false + } + + // Check CPU (from flavor if available) + if expected.Flavor != nil { + cpuQuantity := actual.Spec.Resources["cpu"] + actualCPU := cpuQuantity.Value() + if actualCPU != expected.Flavor.VCPUs { + return false + } + } + + // Check VM allocations (set comparison - order doesn't matter) + if !env.vmAllocationsMatch(expected.VMs, actual) { + return false + } + + // Check reservation type + if actual.Spec.Type != v1alpha1.ReservationTypeCommittedResource { + return false + } + + return true +} + +// vmAllocationsMatch checks if VM allocations match (set comparison). +func (env *CommitmentTestEnv) vmAllocationsMatch(expectedVMs []string, actual *v1alpha1.Reservation) bool { + if actual.Status.CommittedResourceReservation == nil { + return len(expectedVMs) == 0 + } + + actualVMs := make(map[string]bool) + for vmUUID := range actual.Status.CommittedResourceReservation.Allocations { + actualVMs[vmUUID] = true + } + + // Check counts match + if len(expectedVMs) != len(actualVMs) { + return false + } + + // Check all expected VMs are in actual + for _, vmUUID := range expectedVMs { + if !actualVMs[vmUUID] { + return false + } + } + + return true +} + +// ============================================================================ +// Mock VM Source +// ============================================================================ + +// MockVMSource implements VMSource for testing. 
+type MockVMSource struct { + VMs []VM +} + +// NewMockVMSource creates a new MockVMSource with the given VMs. +func NewMockVMSource(vms []VM) *MockVMSource { + return &MockVMSource{VMs: vms} +} + +// ListVMs returns the configured VMs. +func (s *MockVMSource) ListVMs(_ context.Context) ([]VM, error) { + return s.VMs, nil +} + +// ============================================================================ +// Helper Functions +// ============================================================================ + +// newHypervisorWithAZ creates a Hypervisor CRD with the given parameters including availability zone. +func newHypervisorWithAZ(name string, cpuCap, memoryGi, cpuAlloc, memoryGiAlloc int, instances []hv1.Instance, traits []string, az string) *hv1.Hypervisor { + labels := make(map[string]string) + if az != "" { + labels[corev1.LabelTopologyZone] = az + } + return &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: labels, + }, + Status: hv1.HypervisorStatus{ + Capacity: map[hv1.ResourceName]resource.Quantity{ + "cpu": resource.MustParse(strconv.Itoa(cpuCap)), + "memory": resource.MustParse(strconv.Itoa(memoryGi) + "Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + "cpu": resource.MustParse(strconv.Itoa(cpuAlloc)), + "memory": resource.MustParse(strconv.Itoa(memoryGiAlloc) + "Gi"), + }, + NumInstances: len(instances), + Instances: instances, + Traits: traits, + }, + } +} + +// createCommitment creates a TestCommitment for use in test cases. +// The az parameter is optional - if empty string, no AZ constraint is set. +func createCommitment(resourceName, projectID, confirmationID, state string, amount uint64, az ...string) TestCommitment { + return TestCommitment{ + ResourceName: liquid.ResourceName(resourceName), + ProjectID: projectID, + ConfirmationID: confirmationID, + State: state, + Amount: amount, + } +} + +// newCommitmentRequest creates a CommitmentChangeRequest with the given commitments. 
+func newCommitmentRequest(az string, dryRun bool, infoVersion int64, commitments ...TestCommitment) CommitmentChangeRequest { + return CommitmentChangeRequest{ + AZ: az, + DryRun: dryRun, + InfoVersion: infoVersion, + Commitments: commitments, + } +} + +// newAPIResponse creates an APIResponseExpectation with 200 OK status. +func newAPIResponse(rejectReasonSubstrings ...string) APIResponseExpectation { + return APIResponseExpectation{ + StatusCode: 200, + RejectReasonSubstrings: rejectReasonSubstrings, + } +} + +// buildRequestJSON converts a test CommitmentChangeRequest to JSON string. +// Builds the nested JSON structure directly for simplicity. +func buildRequestJSON(req CommitmentChangeRequest) string { + // Group commitments by project and resource for nested structure + type projectResources map[liquid.ResourceName][]TestCommitment + byProject := make(map[string]projectResources) + + for _, commit := range req.Commitments { + if byProject[commit.ProjectID] == nil { + byProject[commit.ProjectID] = make(projectResources) + } + byProject[commit.ProjectID][commit.ResourceName] = append( + byProject[commit.ProjectID][commit.ResourceName], + commit, + ) + } + + // Build nested JSON structure + var projectParts []string + for projectID, resources := range byProject { + var resourceParts []string + for resourceName, commits := range resources { + var commitParts []string + for _, c := range commits { + expiryTime := time.Now().Add(time.Duration(defaultCommitmentExpiryYears) * 365 * 24 * time.Hour) + commitParts = append(commitParts, fmt.Sprintf(`{"uuid":"%s","newStatus":"%s","amount":%d,"expiresAt":"%s"}`, + c.ConfirmationID, c.State, c.Amount, expiryTime.Format(time.RFC3339))) + } + resourceParts = append(resourceParts, fmt.Sprintf(`"%s":{"commitments":[%s]}`, + resourceName, strings.Join(commitParts, ","))) + } + projectParts = append(projectParts, fmt.Sprintf(`"%s":{"byResource":{%s}}`, + projectID, strings.Join(resourceParts, ","))) + } + + return 
fmt.Sprintf(`{"az":"%s","dryRun":%t,"infoVersion":%d,"byProject":{%s}}`, + req.AZ, req.DryRun, req.InfoVersion, strings.Join(projectParts, ",")) +} + +// createKnowledgeCRD creates a Knowledge CRD populated with flavor groups. +func createKnowledgeCRD(flavorGroups FlavorGroupsKnowledge) *v1alpha1.Knowledge { + rawExt, err := v1alpha1.BoxFeatureList(flavorGroups.Groups) + if err != nil { + panic("Failed to box flavor groups: " + err.Error()) + } + + lastContentChange := time.Unix(flavorGroups.InfoVersion, 0) + + return &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: flavorGroupsKnowledgeName, + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: flavorGroupsKnowledgeName, }, + Recency: metav1.Duration{Duration: knowledgeRecencyDuration}, }, - SmallestFlavor: compute.FlavorInGroup{ - Name: "test.small", - MemoryMB: 8192, - VCPUs: 2, - DiskGB: 40, + Status: v1alpha1.KnowledgeStatus{ + LastExtracted: metav1.Time{Time: lastContentChange}, + LastContentChange: metav1.Time{Time: lastContentChange}, + Raw: rawExt, + RawLength: len(flavorGroups.Groups), + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + Reason: "KnowledgeReady", + Message: "Flavor groups knowledge is ready", + LastTransitionTime: metav1.Time{Time: lastContentChange}, + }, + }, }, } } diff --git a/internal/scheduling/reservations/commitments/config.go b/internal/scheduling/reservations/commitments/config.go new file mode 100644 index 000000000..95dc904d8 --- /dev/null +++ b/internal/scheduling/reservations/commitments/config.go @@ -0,0 +1,22 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import "time" + +// Config defines the configuration for the commitments HTTP API. +type Config struct { + // how long to wait for reservations to become ready before timing out and rolling back. 
+ ChangeAPIWatchReservationsTimeout time.Duration `json:"changeAPIWatchReservationsTimeout"` + + // how frequently to poll reservation status during watch. + ChangeAPIWatchReservationsPollInterval time.Duration `json:"changeAPIWatchReservationsPollInterval"` +} + +func DefaultConfig() Config { + return Config{ + ChangeAPIWatchReservationsTimeout: 2 * time.Second, + ChangeAPIWatchReservationsPollInterval: 100 * time.Millisecond, + } +} diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go index 13856d992..21ee1fee1 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager.go +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -136,7 +136,7 @@ func (m *ReservationManager) ApplyCommitmentState( memValue := reservationToDelete.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes += memValue.Value() - log.Info("deleting reservation", + log.Info("deleting reservation (capacity decrease)", "commitmentUUID", desiredState.CommitmentUUID, "deltaMemoryBytes", deltaMemoryBytes, "name", reservationToDelete.Name, @@ -205,19 +205,25 @@ func (m *ReservationManager) syncReservationMetadata( state *CommitmentState, ) (*v1alpha1.Reservation, error) { - // if any of AZ, StarTime, EndTime differ from desired state, need to patch - if (state.AvailabilityZone != "" && reservation.Spec.AvailabilityZone != state.AvailabilityZone) || + // if any of CommitmentUUID, AZ, StarTime, EndTime differ from desired state, need to patch + if (state.CommitmentUUID != "" && reservation.Spec.CommittedResourceReservation.CommitmentUUID != state.CommitmentUUID) || + (state.AvailabilityZone != "" && reservation.Spec.AvailabilityZone != state.AvailabilityZone) || (state.StartTime != nil && (reservation.Spec.StartTime == nil || !reservation.Spec.StartTime.Time.Equal(*state.StartTime))) || (state.EndTime != nil && (reservation.Spec.EndTime == nil || 
!reservation.Spec.EndTime.Time.Equal(*state.EndTime))) { // Apply patch log.Info("syncing reservation metadata", - "reservation", reservation.Name, - "availabilityZone", state.AvailabilityZone, - "startTime", state.StartTime, - "endTime", state.EndTime) + "reservation", reservation, + "desired commitmentUUID", state.CommitmentUUID, + "desired availabilityZone", state.AvailabilityZone, + "desired startTime", state.StartTime, + "desired endTime", state.EndTime) patch := client.MergeFrom(reservation.DeepCopy()) + if state.CommitmentUUID != "" { + reservation.Spec.CommittedResourceReservation.CommitmentUUID = state.CommitmentUUID + } + if state.AvailabilityZone != "" { reservation.Spec.AvailabilityZone = state.AvailabilityZone } @@ -277,12 +283,13 @@ func (m *ReservationManager) newReservation( ), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - ProjectID: state.ProjectID, - DomainID: state.DomainID, - ResourceGroup: state.FlavorGroupName, - ResourceName: flavorInGroup.Name, - Creator: creator, - Allocations: nil, + ProjectID: state.ProjectID, + CommitmentUUID: state.CommitmentUUID, + DomainID: state.DomainID, + ResourceGroup: state.FlavorGroupName, + ResourceName: flavorInGroup.Name, + Creator: creator, + Allocations: nil, }, } diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go index 996efff8e..50108beef 100644 --- a/internal/scheduling/reservations/commitments/state.go +++ b/internal/scheduling/reservations/commitments/state.go @@ -29,7 +29,7 @@ func getFlavorGroupNameFromResource(resourceName string) (string, error) { // CommitmentState represents desired or current commitment resource allocation. type CommitmentState struct { - // CommitmentUUID uniquely identifies this commitment + // CommitmentUUID is the UUID of the commitment this state corresponds to. 
CommitmentUUID string // ProjectID is the OpenStack project this commitment belongs to ProjectID string From c2bc6830fff789f2a97a57bb9d8558253587b0fd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 18 Mar 2026 07:44:46 +0000 Subject: [PATCH 51/55] Bump cortex chart appVersions to sha-1669faac [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index dd0bad528..258a23db6 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.26 -appVersion: "sha-9b5fa574" +appVersion: "sha-1669faac" icon: "https://example.com/icon.png" dependencies: [] From 9daeb2443ad3a149647e0eee30dc149d2b00e782 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Wed, 18 Mar 2026 09:35:10 +0100 Subject: [PATCH 52/55] Add filter_correct_az to "hot" pipelines This change adds the filter_correct_az to the pipelines we're actively using for nova vms outside an experimental stage. In this way we stay consistent to what we have deployed, slowly phasing in the nova filters. --- .../bundles/cortex-nova/templates/pipelines_kvm.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml index a17f75f5b..68ec01352 100644 --- a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml @@ -15,7 +15,11 @@ spec: Specifically, this pipeline is used for general purpose workloads. type: filter-weigher createDecisions: true - filters: [] + filters: + - name: filter_correct_az + description: | + This step will filter out hosts whose aggregate information indicates they + are not placed in the requested availability zone. 
weighers: - name: kvm_prefer_smaller_hosts params: @@ -61,7 +65,11 @@ spec: Specifically, this pipeline is used for hana virtual machines. type: filter-weigher createDecisions: true - filters: [] + filters: + - name: filter_correct_az + description: | + This step will filter out hosts whose aggregate information indicates they + are not placed in the requested availability zone. weighers: - name: kvm_prefer_smaller_hosts params: From b3401a27abcad5fdb9c148a9fa51a962c1bc43c2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 18 Mar 2026 08:45:47 +0000 Subject: [PATCH 53/55] Bump cortex chart appVersions to sha-9daeb244 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 258a23db6..11bcbc532 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.26 -appVersion: "sha-1669faac" +appVersion: "sha-9daeb244" icon: "https://example.com/icon.png" dependencies: [] From 1401e19ee646d0f7f6b417cd6390cf5a0edb3052 Mon Sep 17 00:00:00 2001 From: Malte <140147670+umswmayj@users.noreply.github.com> Date: Wed, 18 Mar 2026 13:51:20 +0100 Subject: [PATCH 54/55] feat: Add fallback to capacity filter and external customer filter (#593) --- .../filters/filter_external_customer.go | 4 +- .../filters/filter_external_customer_test.go | 12 +- .../filters/filter_has_enough_capacity.go | 11 +- .../filter_has_enough_capacity_test.go | 129 +++++++++++++++++- 4 files changed, 143 insertions(+), 13 deletions(-) diff --git a/internal/scheduling/nova/plugins/filters/filter_external_customer.go b/internal/scheduling/nova/plugins/filters/filter_external_customer.go index 62c059b10..56f73c8ac 100644 --- a/internal/scheduling/nova/plugins/filters/filter_external_customer.go +++ 
b/internal/scheduling/nova/plugins/filters/filter_external_customer.go @@ -37,8 +37,8 @@ func (s *FilterExternalCustomerStep) Run(traceLog *slog.Logger, request api.Exte result := s.IncludeAllHostsFromRequest(request) domainName, err := request.Spec.Data.GetSchedulerHintStr("domain_name") if err != nil { - traceLog.Error("failed to get domain_name scheduler hint", "error", err) - return nil, err + traceLog.Error("failed to get domain_name scheduler hint, skipping filter", "error", err) + return result, nil } if slices.Contains(s.Options.CustomerIgnoredDomainNames, domainName) { traceLog.Info("domain is no external customer domain, skipping filter", "domain", domainName) diff --git a/internal/scheduling/nova/plugins/filters/filter_external_customer_test.go b/internal/scheduling/nova/plugins/filters/filter_external_customer_test.go index fb9971a83..7ca313dc3 100644 --- a/internal/scheduling/nova/plugins/filters/filter_external_customer_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_external_customer_test.go @@ -245,7 +245,7 @@ func TestFilterExternalCustomerStep_Run(t *testing.T) { filteredHosts: []string{"host3"}, }, { - name: "Missing domain_name in scheduler hints - error", + name: "Missing domain_name in scheduler hints - skips filter, all hosts pass", opts: FilterExternalCustomerStepOpts{ CustomerDomainNamePrefixes: []string{"ext-"}, }, @@ -257,12 +257,14 @@ func TestFilterExternalCustomerStep_Run(t *testing.T) { }, Hosts: []api.ExternalSchedulerHost{ {ComputeHost: "host1"}, + {ComputeHost: "host3"}, }, }, - expectError: true, + expectedHosts: []string{"host1", "host3"}, + filteredHosts: []string{}, }, { - name: "Nil scheduler hints - error", + name: "Nil scheduler hints - skips filter, all hosts pass", opts: FilterExternalCustomerStepOpts{ CustomerDomainNamePrefixes: []string{"ext-"}, }, @@ -274,9 +276,11 @@ func TestFilterExternalCustomerStep_Run(t *testing.T) { }, Hosts: []api.ExternalSchedulerHost{ {ComputeHost: "host1"}, + {ComputeHost: 
"host2"}, }, }, - expectError: true, + expectedHosts: []string{"host1", "host2"}, + filteredHosts: []string{}, }, { name: "Case sensitive prefix matching", diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go index 198f1a28f..5e1f1dc3c 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go @@ -56,15 +56,14 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa return nil, err } for _, hv := range hvs.Items { - // This case would be caught below, but we want to log this explicitly. if hv.Status.EffectiveCapacity == nil { - traceLog.Warn("hypervisor with nil effective capacity, skipping", "host", hv.Name) - continue + traceLog.Warn("hypervisor with nil effective capacity, use capacity instead (overprovisioning not considered)", "host", hv.Name) + freeResourcesByHost[hv.Name] = hv.Status.Capacity + } else { + // Start with the total effective capacity which is capacity * overcommit ratio. + freeResourcesByHost[hv.Name] = hv.Status.EffectiveCapacity } - // Start with the total effective capacity which is capacity * overcommit ratio. - freeResourcesByHost[hv.Name] = hv.Status.EffectiveCapacity - // Subtract allocated resources. 
for resourceName, allocated := range hv.Status.Allocation { free, ok := freeResourcesByHost[hv.Name][resourceName] diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go index 4068cf900..452782484 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go @@ -51,6 +51,49 @@ func newHypervisor(name, cpuCap, cpuAlloc, memCap, memAlloc string) *hv1.Hypervi } } +// newHypervisorWithCapacityOnly creates a hypervisor with only Capacity set (no EffectiveCapacity). +func newHypervisorWithCapacityOnly(name, cpuCap, memCap string) *hv1.Hypervisor { + return &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Status: hv1.HypervisorStatus{ + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuCap), + hv1.ResourceMemory: resource.MustParse(memCap), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), + }, + }, + } +} + +// newHypervisorWithBothCapacities creates a hypervisor with both Capacity and EffectiveCapacity set. +// EffectiveCapacity is typically >= Capacity due to overcommit ratio. 
+func newHypervisorWithBothCapacities(name, cpuCap, cpuEffCap, memCap, memEffCap string) *hv1.Hypervisor { + return &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Status: hv1.HypervisorStatus{ + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuCap), + hv1.ResourceMemory: resource.MustParse(memCap), + }, + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuEffCap), + hv1.ResourceMemory: resource.MustParse(memEffCap), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), + }, + }, + } +} + func newCommittedReservation( name, targetHost, observedHost, projectID, flavorName, flavorGroup, cpu, memory string, specAllocations map[string]v1alpha1.CommittedResourceAllocation, // Spec allocations for CR @@ -440,7 +483,7 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { t.Run(tt.name, func(t *testing.T) { objects := make([]client.Object, 0, len(hypervisors)+len(tt.reservations)) for _, h := range hypervisors { - objects = append(objects, h) + objects = append(objects, h.DeepCopy()) } for _, r := range tt.reservations { objects = append(objects, r) @@ -469,3 +512,87 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { }) } } + +func TestFilterHasEnoughCapacity_NilEffectiveCapacityFallback(t *testing.T) { + scheme := buildTestScheme(t) + + tests := []struct { + name string + hypervisors []*hv1.Hypervisor + request api.ExternalSchedulerRequest + expectedHosts []string + filteredHosts []string + }{ + { + name: "Hypervisor with nil EffectiveCapacity uses Capacity fallback", + hypervisors: []*hv1.Hypervisor{ + newHypervisor("host1", "16", "8", "32Gi", "16Gi"), // has EffectiveCapacity: 8 CPU free, 16Gi free + newHypervisorWithCapacityOnly("host2", "8", "16Gi"), // nil EffectiveCapacity, uses Capacity: 8 CPU free, 16Gi free + 
newHypervisorWithCapacityOnly("host3", "2", "4Gi"), // nil EffectiveCapacity, uses Capacity: 2 CPU free (not enough) + newHypervisorWithCapacityOnly("host4", "16", "32Gi"), // nil EffectiveCapacity, uses Capacity: 16 CPU free, 32Gi free + }, + request: newNovaRequest("instance-123", "project-A", "m1.small", "gp-1", 4, "8Gi", false, []string{"host1", "host2", "host3", "host4"}), + expectedHosts: []string{"host1", "host2", "host4"}, + filteredHosts: []string{"host3"}, + }, + { + name: "All hypervisors with nil EffectiveCapacity use Capacity fallback", + hypervisors: []*hv1.Hypervisor{ + newHypervisorWithCapacityOnly("host1", "8", "16Gi"), + newHypervisorWithCapacityOnly("host2", "4", "8Gi"), + }, + request: newNovaRequest("instance-123", "project-A", "m1.small", "gp-1", 4, "8Gi", false, []string{"host1", "host2"}), + expectedHosts: []string{"host1", "host2"}, + filteredHosts: []string{}, + }, + { + name: "EffectiveCapacity used when both are set (overcommit scenario)", + hypervisors: []*hv1.Hypervisor{ + // host1: Capacity=8 CPU, EffectiveCapacity=16 CPU (2x overcommit) + // With Capacity only: 8 free -> passes + // With EffectiveCapacity: 16 free -> passes (more capacity available) + newHypervisorWithBothCapacities("host1", "8", "16", "16Gi", "32Gi"), + // host2: Capacity=4 CPU, EffectiveCapacity=8 CPU (2x overcommit) + // With Capacity only: 4 free -> would be filtered (need 5) + // With EffectiveCapacity: 8 free -> passes + newHypervisorWithBothCapacities("host2", "4", "8", "8Gi", "16Gi"), + // host3: Capacity=4 CPU, EffectiveCapacity=4 CPU (no overcommit) + // Both: 4 free -> filtered (need 5) + newHypervisorWithBothCapacities("host3", "4", "4", "8Gi", "8Gi"), + }, + request: newNovaRequest("instance-123", "project-A", "m1.small", "gp-1", 5, "8Gi", false, []string{"host1", "host2", "host3"}), + expectedHosts: []string{"host1", "host2"}, + filteredHosts: []string{"host3"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + objects := 
make([]client.Object, 0, len(tt.hypervisors)) + for _, h := range tt.hypervisors { + objects = append(objects, h.DeepCopy()) + } + + step := &FilterHasEnoughCapacity{} + step.Client = fake.NewClientBuilder().WithScheme(scheme).WithObjects(objects...).Build() + step.Options = FilterHasEnoughCapacityOpts{LockReserved: false} + + result, err := step.Run(slog.Default(), tt.request) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + for _, host := range tt.expectedHosts { + if _, ok := result.Activations[host]; !ok { + t.Errorf("expected host %s to be present in activations, but got %+v", host, result.Activations) + } + } + + for _, host := range tt.filteredHosts { + if _, ok := result.Activations[host]; ok { + t.Errorf("expected host %s to be filtered out", host) + } + } + }) + } +} From a43b058b779ee871cb7f033cbc58a71da5b1c2f9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 18 Mar 2026 13:01:33 +0000 Subject: [PATCH 55/55] Bump cortex chart appVersions to sha-1401e19e [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 11bcbc532..aa0316575 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.26 -appVersion: "sha-9daeb244" +appVersion: "sha-1401e19e" icon: "https://example.com/icon.png" dependencies: []