diff --git a/.coderabbit.yaml b/.coderabbit.yaml new file mode 100644 index 000000000..d158def60 --- /dev/null +++ b/.coderabbit.yaml @@ -0,0 +1,2 @@ +reviews: + high_level_summary: false diff --git a/.github/workflows/push-charts.yaml b/.github/workflows/push-charts.yaml index e75ab068d..88b0fe9e6 100644 --- a/.github/workflows/push-charts.yaml +++ b/.github/workflows/push-charts.yaml @@ -21,7 +21,7 @@ jobs: - name: Set up Helm uses: azure/setup-helm@v4.3.1 - name: Log into registry - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml index 7addc3e7a..997595976 100644 --- a/.github/workflows/push-images.yaml +++ b/.github/workflows/push-images.yaml @@ -21,11 +21,11 @@ jobs: steps: - uses: actions/checkout@v6 - name: Set up QEMU - uses: docker/setup-qemu-action@v3 + uses: docker/setup-qemu-action@v4 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@v4 - name: Login to Docker Registry - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} @@ -41,7 +41,7 @@ jobs: - name: Docker Meta (Cortex Postgres) if: steps.changed_postgres_files.outputs.all_changed_files != '' id: meta_cortex_postgres - uses: docker/metadata-action@v5 + uses: docker/metadata-action@v6 with: images: ${{ env.REGISTRY }}/${{ github.repository }}-postgres tags: | @@ -54,7 +54,7 @@ jobs: - name: Build and Push Cortex Postgres if: steps.changed_postgres_files.outputs.all_changed_files != '' id: push_cortex_postgres - uses: docker/build-push-action@v6 + uses: docker/build-push-action@v7 with: context: postgres platforms: linux/amd64,linux/arm64 @@ -75,7 +75,7 @@ jobs: # Build & push new cortex image - name: Docker Meta (Cortex) id: meta_cortex - uses: docker/metadata-action@v5 + uses: docker/metadata-action@v6 with: images: 
${{ env.REGISTRY }}/${{ github.repository }} tags: | @@ -87,7 +87,7 @@ jobs: DOCKER_METADATA_SHORT_SHA_LENGTH: 8 - name: Build and Push Cortex id: push_cortex - uses: docker/build-push-action@v6 + uses: docker/build-push-action@v7 with: context: . file: Dockerfile diff --git a/.gitignore b/.gitignore index f0ab035e7..907b4f7ca 100644 --- a/.gitignore +++ b/.gitignore @@ -34,4 +34,5 @@ cortex.secrets.yaml !.github !.golangci.yaml !.license-scan-overrides.jsonl -!.license-scan-rules.json \ No newline at end of file +!.license-scan-rules.json +!.coderabbit.yaml \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..6f2e12a17 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,79 @@ + + +# Instructions for Agents + +## Context + +You are developing code in the open-source project github.com/cobaltcore-dev/cortex. + +Cortex is a modular and extensible service for initial placement and scheduling in cloud-native environments covering workloads such as compute, storage, network, and other scheduling domains. + +It improves resource utilization and operational performance by making smart placement decisions based on the current state of the environment and defined constraints and objectives. + +Cortex is written in Golang and is designed for production-scale deployments using algorithmic approaches to balance decision quality, execution efficiency, and maintaining a low resource footprint. + +## Best Practices + +All code files must contain this license header: +```go +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 +``` + +General: +- Keep it concise and always focus on good code quality. 
We go to production +- We are on modern Golang, so you no longer need `interface{}` and use `any` instead +- Similarly, you no longer have to capture loop variables in closures, as this is now the default behavior in Go +- Don’t document trivial steps you do and avoid unnecessary empty lines between code segments +- When adding imports, keep in mind that the autoformatter will remove them if you don't use them +- `fmt.Errorf` should not be used when there are no parameters. Use `errors.New` +- Errors should always be lowercase like `errors.New("this is an error")` to conform to linting rules +- You can use `maps.Copy` instead of iteratively copying a map +- You can use `strings.Contains` to check if some string is in another +- You can use `slices.Contains` to check if an element is part of a slice +- And definitely use `testlib.Ptr` for test cases that require pointer values + +Testing: +- Ideally test files should be short and contain only the necessary cases +- Avoid creating testing libraries, keep helper functions in the same file as the tests that use them +- Use golang native testing whenever possible, avoid using Ginkgo or testify +- Don't test for the existence of interface methods +- If applicable, use struct based test cases, but limit yourself to the most relevant cases + +Helm charts: +- Note the `# from: file://../../library/cortex-postgres` comment in `Chart.yaml` files, this is required and should point to the local chart path + +## Repository Structure + +Code: +- `cmd/main.go` is the entry point for the manager, which starts the controllers and webhooks +- `api/v1alpha1` is where the CRD specs of cortex lives +- `api/external` contains messages sent to cortex via http from external openstack services +- `internal/scheduling` contains the logic for scheduling in different cloud domains +- `internal/knowledge` has all the logic for feature extraction and raw data downloads from sources like prometheus and openstack +- `pkg` is the code that is very 
non-cortex-specific and can be used across other projects as well + +Deployment: +- `helm/library` contains a generic cortex setup, i.e. the manager and its dependencies +- `helm/dev` contains charts that can deploy cortex dependencies that a typical production cluster already has, such as a fine-tuned kube-prometheus-stack for monitoring +- `helm/bundles` here are the charts that stylize the library chart into a deployment for a specific domain, for example a bundle for deploying cortex with openstack nova +- In the `helm` folders there are also helpers for syncing helm dependencies which are used by the tiltfile for local development and our ci pipelines to replace oci dependencies with local paths + +Tooling: +- `tools` contains miscellaneous tools for development, which should typically **not** be used by agents + +Documentation: +- `docs` contains documentation for cortex, which should be written in markdown + +## Tooling + +Before finishing your task, you should always ensure local tests and lints are passing: +- `make` regenerates CRDs and deepcopy methods, runs tests, and performs lints +- Avoid running `make` when you don't want to apply your crd changes just yet +- `make lint` runs golangci-lint, `make lint-fix` runs golangci-lint with `--fix` +- `make test` runs all the unit tests with `go test ./...` +- If you are struggling with the Makefile, you can use `make help` to get a list of all available commands and their descriptions diff --git a/CODEOWNERS index 23f370f22..f6e423ecb 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* arno.uhlig@sap.com julius.clausnitzer@sap.com malte.viering@sap.com marcel.bloecher@sap.com markus.wieland@sap.com p.matthes@sap.com \ No newline at end of file +* arno.uhlig@sap.com julius.clausnitzer@sap.com malte.viering@sap.com marcel.gute@sap.com markus.wieland@sap.com p.matthes@sap.com \ No newline at end of file diff --git a/Makefile index b63e2e267..3d90f6161 100644 --- a/Makefile +++ 
b/Makefile @@ -28,6 +28,17 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes test: ## Run all tests. go test ./... +.PHONY: testsum +testsum: gotestsum ## Run all tests (clean output for passing, verbose for failing). Options: WATCH=1, RUN=, PACKAGE=, FORMAT= (e.g., standard-verbose for all output) + $(GOTESTSUM) \ + $(if $(WATCH),--watch) \ + --format $(if $(FORMAT),$(FORMAT),testname) \ + --hide-summary=all \ + -- \ + $(if $(VERBOSE),-v) \ + $(if $(RUN),-run $(RUN)) \ + $(if $(PACKAGE),$(PACKAGE),./...) + .PHONY: generate generate: deepcopy crds ## Regenerate CRDs and DeepCopy after API type changes. @@ -45,9 +56,11 @@ $(LOCALBIN): CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen GOLANGCI_LINT = $(LOCALBIN)/golangci-lint +GOTESTSUM = $(LOCALBIN)/gotestsum CONTROLLER_TOOLS_VERSION ?= v0.20.0 GOLANGCI_LINT_VERSION ?= v2.9.0 +GOTESTSUM_VERSION ?= v1.13.0 .PHONY: controller-gen controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary. @@ -59,6 +72,11 @@ golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary. $(GOLANGCI_LINT): $(LOCALBIN) $(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/v2/cmd/golangci-lint,$(GOLANGCI_LINT_VERSION)) +.PHONY: gotestsum +gotestsum: $(GOTESTSUM) ## Download gotestsum locally if necessary. +$(GOTESTSUM): $(LOCALBIN) + $(call go-install-tool,$(GOTESTSUM),gotest.tools/gotestsum,$(GOTESTSUM_VERSION)) + # go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist # $1 - target path with name of binary # $2 - package url which can be installed diff --git a/Tiltfile b/Tiltfile index 84d39394b..a42fe43f4 100644 --- a/Tiltfile +++ b/Tiltfile @@ -75,7 +75,7 @@ local('kubectl wait --namespace cert-manager --for=condition=available deploymen ########### Dependency CRDs # Make sure the local cluster is running if you are running into startup issues here. 
-url = 'https://raw.githubusercontent.com/cobaltcore-dev/openstack-hypervisor-operator/refs/heads/main/charts/openstack-hypervisor-operator/crds/hypervisor-crd.yaml' +url = 'https://raw.githubusercontent.com/cobaltcore-dev/openstack-hypervisor-operator/refs/heads/main/charts/openstack-hypervisor-operator/crds/kvm.cloud.sap_hypervisors.yaml' local('curl -L ' + url + ' | kubectl apply -f -') ########### Cortex Operator & CRDs @@ -268,14 +268,6 @@ k8s_resource( labels=['Monitoring'], ) -k8s_yaml('./tools/visualizer/role.yaml') -docker_build('cortex-visualizer', './tools/visualizer') -k8s_yaml('./tools/visualizer/app.yaml') -k8s_resource('cortex-visualizer', port_forwards=[ - port_forward(4000, 80), -], links=[ - link('localhost:4000', 'nova visualizer'), -], labels=['Monitoring']) docker_build('cortex-plutono', './tools/plutono') k8s_yaml('./tools/plutono/app.yaml') k8s_resource('cortex-plutono', port_forwards=[ diff --git a/api/v1alpha1/knowledge_types.go b/api/v1alpha1/knowledge_types.go index d90f76565..504b30449 100644 --- a/api/v1alpha1/knowledge_types.go +++ b/api/v1alpha1/knowledge_types.go @@ -93,6 +93,11 @@ type KnowledgeStatus struct { // +kubebuilder:validation:Optional LastExtracted metav1.Time `json:"lastExtracted"` + // When the extracted knowledge content last changed. + // Updated only when the Raw data actually changes, not on every reconcile. + // +kubebuilder:validation:Optional + LastContentChange metav1.Time `json:"lastContentChange,omitempty"` + // The raw data behind the extracted knowledge, e.g. a list of features. 
// +kubebuilder:validation:Optional Raw runtime.RawExtension `json:"raw"` @@ -111,6 +116,7 @@ type KnowledgeStatus struct { // +kubebuilder:printcolumn:name="Domain",type="string",JSONPath=".spec.schedulingDomain" // +kubebuilder:printcolumn:name="Created",type="date",JSONPath=".metadata.creationTimestamp" // +kubebuilder:printcolumn:name="Extracted",type="date",JSONPath=".status.lastExtracted" +// +kubebuilder:printcolumn:name="Changed",type="date",JSONPath=".status.lastContentChange" // +kubebuilder:printcolumn:name="Recency",type="string",JSONPath=".spec.recency" // +kubebuilder:printcolumn:name="Features",type="integer",JSONPath=".status.rawLength" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" diff --git a/api/v1alpha1/reservation_types.go b/api/v1alpha1/reservation_types.go index ed8e42f43..5e6a30b01 100644 --- a/api/v1alpha1/reservation_types.go +++ b/api/v1alpha1/reservation_types.go @@ -4,6 +4,7 @@ package v1alpha1 import ( + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -21,6 +22,20 @@ const ( ReservationTypeFailover ReservationType = "FailoverReservation" ) +// Label keys for Reservation metadata. +// Labels follow Kubernetes naming conventions using reverse-DNS notation +const ( + // ===== Common Reservation Labels ===== + + // LabelReservationType identifies the type of reservation. + // This label is present on all reservations to enable type-based filtering. + LabelReservationType = "reservations.cortex.sap.com/type" + + // Reservation type label values + ReservationTypeLabelCommittedResource = "committed-resource" + ReservationTypeLabelFailover = "failover" +) + // CommittedResourceAllocation represents a workload's assignment to a committed resource reservation slot. // The workload could be a VM (Nova/IronCore), Pod (Kubernetes), or other resource. 
type CommittedResourceAllocation struct { @@ -30,7 +45,7 @@ type CommittedResourceAllocation struct { // Resources consumed by this instance. // +kubebuilder:validation:Required - Resources map[string]resource.Quantity `json:"resources"` + Resources map[hv1.ResourceName]resource.Quantity `json:"resources"` } // CommittedResourceReservationSpec defines the spec fields specific to committed resource reservations. @@ -39,6 +54,10 @@ type CommittedResourceReservationSpec struct { // +kubebuilder:validation:Optional ResourceName string `json:"resourceName,omitempty"` + // CommitmentUUID is the UUID of the commitment that this reservation corresponds to. + // +kubebuilder:validation:Optional + CommitmentUUID string `json:"commitmentUUID,omitempty"` + // ResourceGroup is the group/category of the resource (e.g., flavor group for Nova) // +kubebuilder:validation:Optional ResourceGroup string `json:"resourceGroup,omitempty"` @@ -79,9 +98,13 @@ type ReservationSpec struct { // +kubebuilder:validation:Optional SchedulingDomain string `json:"schedulingDomain,omitempty"` + // AvailabilityZone specifies the availability zone for this reservation, if restricted to a specific AZ. + // +kubebuilder:validation:Optional + AvailabilityZone string `json:"availabilityZone,omitempty"` + // Resources to reserve for this instance. // +kubebuilder:validation:Optional - Resources map[string]resource.Quantity `json:"resources,omitempty"` + Resources map[hv1.ResourceName]resource.Quantity `json:"resources,omitempty"` // StartTime is the time when the reservation becomes active. 
// +kubebuilder:validation:Optional @@ -166,7 +189,7 @@ type ReservationStatus struct { // +kubebuilder:object:root=true // +kubebuilder:subresource:status // +kubebuilder:resource:scope=Cluster -// +kubebuilder:printcolumn:name="Type",type="string",JSONPath=".spec.type" +// +kubebuilder:printcolumn:name="Type",type="string",JSONPath=".metadata.labels['reservations\\.cortex\\.sap\\.com/type']" // +kubebuilder:printcolumn:name="Host",type="string",JSONPath=".status.host" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 5a756e045..96043cc1f 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -8,6 +8,7 @@ package v1alpha1 import ( + apiv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -35,7 +36,7 @@ func (in *CommittedResourceAllocation) DeepCopyInto(out *CommittedResourceAlloca in.CreationTimestamp.DeepCopyInto(&out.CreationTimestamp) if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = make(map[string]resource.Quantity, len(*in)) + *out = make(map[apiv1.ResourceName]resource.Quantity, len(*in)) for key, val := range *in { (*out)[key] = val.DeepCopy() } @@ -833,6 +834,7 @@ func (in *KnowledgeSpec) DeepCopy() *KnowledgeSpec { func (in *KnowledgeStatus) DeepCopyInto(out *KnowledgeStatus) { *out = *in in.LastExtracted.DeepCopyInto(&out.LastExtracted) + in.LastContentChange.DeepCopyInto(&out.LastContentChange) in.Raw.DeepCopyInto(&out.Raw) if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions @@ -1217,7 +1219,7 @@ func (in *ReservationSpec) DeepCopyInto(out *ReservationSpec) { *out = *in if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = make(map[string]resource.Quantity, len(*in)) + 
*out = make(map[apiv1.ResourceName]resource.Quantity, len(*in)) for key, val := range *in { (*out)[key] = val.DeepCopy() } diff --git a/cmd/main.go b/cmd/main.go index 4e4865567..46a244de1 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -309,6 +309,10 @@ func main() { httpAPIConf := conf.GetConfigOrDie[nova.HTTPAPIConfig]() nova.NewAPI(httpAPIConf, filterWeigherController).Init(mux) + // Initialize commitments API for LIQUID interface + commitmentsAPI := commitments.NewAPI(multiclusterClient) + commitmentsAPI.Init(mux) + // Detector pipeline controller setup. novaClient := nova.NewNovaClient() novaClientConfig := conf.GetConfigOrDie[nova.NovaClientConfig]() @@ -365,6 +369,15 @@ func main() { os.Exit(1) } } + if slices.Contains(mainConfig.EnabledControllers, "hypervisor-overcommit-controller") { + hypervisorOvercommitController := &nova.HypervisorOvercommitController{} + hypervisorOvercommitController.Client = multiclusterClient + if err := hypervisorOvercommitController.SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", + "controller", "HypervisorOvercommitController") + os.Exit(1) + } + } if slices.Contains(mainConfig.EnabledControllers, "manila-decisions-pipeline-controller") { controller := &manila.FilterWeigherPipelineController{ Monitor: filterWeigherPipelineMonitor, @@ -456,11 +469,11 @@ func main() { monitor := reservationscontroller.NewControllerMonitor(multiclusterClient) metrics.Registry.MustRegister(&monitor) reservationsControllerConfig := conf.GetConfigOrDie[reservationscontroller.Config]() + if err := (&reservationscontroller.ReservationReconciler{ - Client: multiclusterClient, - Scheme: mgr.GetScheme(), - Conf: reservationsControllerConfig, - HypervisorClient: reservationscontroller.NewHypervisorClient(), + Client: multiclusterClient, + Scheme: mgr.GetScheme(), + Conf: reservationsControllerConfig, }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", 
"controller", "Reservation") os.Exit(1) diff --git a/docs/develop.md b/docs/develop.md index 5b090c889..c39cbd61a 100644 --- a/docs/develop.md +++ b/docs/develop.md @@ -34,6 +34,21 @@ Cortex is developed using the Go programming language. To get started with the d Run `make` in your terminal from the cortex root directory to perform linting and testing tasks. +### Working on Tests + +```bash +# Watch mode for continuous testing; print logs for failed tests only +make testsum WATCH=1 +``` + +The `testsum` target provides cleaner output by showing only full verbose output for failing tests. + +**Available options:** +- `WATCH=1` - Automatically re-run tests when files change +- `RUN=` - Run specific tests matching the pattern +- `PACKAGE=` - Test specific package(s) +- `FORMAT=` - Change output format (e.g., `standard-verbose` for verbose output on all tests) + ## Helm Charts Helm charts bundle the application into a package, containing all the [Kubernetes](https://kubernetes.io/docs/tutorials/hello-minikube/) resources needed to run the application. The configuration for the application is specified in the [Helm `values.yaml`](cortex.secrets.example.yaml). 
diff --git a/go.mod b/go.mod index 245513cb3..5bda482d6 100644 --- a/go.mod +++ b/go.mod @@ -3,17 +3,18 @@ module github.com/cobaltcore-dev/cortex go 1.26 require ( - github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260305105543-733c59b0b17c + github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260316070528-80f53bbce409 github.com/go-gorp/gorp v2.2.0+incompatible - github.com/gophercloud/gophercloud/v2 v2.10.0 + github.com/gophercloud/gophercloud/v2 v2.11.1 github.com/ironcore-dev/ironcore v0.2.4 + github.com/majewsky/gg v1.5.0 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 - github.com/sapcc/go-bits v0.0.0-20260226170120-c20f89b66c3c - k8s.io/api v0.35.1 - k8s.io/apimachinery v0.35.1 - k8s.io/client-go v0.35.1 - sigs.k8s.io/controller-runtime v0.23.1 + github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e + k8s.io/api v0.35.2 + k8s.io/apimachinery v0.35.2 + k8s.io/client-go v0.35.2 + sigs.k8s.io/controller-runtime v0.23.3 ) require ( @@ -36,7 +37,7 @@ require ( github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect - github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/logr v1.4.3 github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.22.1 // indirect @@ -71,7 +72,7 @@ require ( github.com/poy/onpar v0.3.5 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/procfs v0.17.0 // indirect - github.com/sapcc/go-api-declarations v1.20.2 // indirect + github.com/sapcc/go-api-declarations v1.20.2 github.com/sirupsen/logrus v1.9.3 // indirect github.com/spf13/cobra v1.10.1 // indirect github.com/spf13/pflag v1.0.10 // indirect diff --git a/go.sum b/go.sum index 03dfd27f0..338b73d74 100644 --- a/go.sum +++ b/go.sum @@ -20,8 +20,10 @@ github.com/cenkalti/backoff/v5 v5.0.3 
h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1x github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260305105543-733c59b0b17c h1:XBqsQQwdSep27eJN7sACjahkhmR2zRlJwv9PrYcEou8= -github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260305105543-733c59b0b17c/go.mod h1:b0KmJdxvRI8UXlGe8cRm5BD8Tm2WhF7zSKMSIRGyVL4= +github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260313132145-05f22f69d9fd h1:IzxramZZRC/9FtQQqpbgf8KIpH4soD9cliCFs2+zPd4= +github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260313132145-05f22f69d9fd/go.mod h1:b0KmJdxvRI8UXlGe8cRm5BD8Tm2WhF7zSKMSIRGyVL4= +github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260316070528-80f53bbce409 h1:hiTMLk6JZsmFF+ECBJnOVcDAw2d+iCXhk4eDvVpYHYM= +github.com/cobaltcore-dev/openstack-hypervisor-operator v0.0.0-20260316070528-80f53bbce409/go.mod h1:b0KmJdxvRI8UXlGe8cRm5BD8Tm2WhF7zSKMSIRGyVL4= github.com/containerd/continuity v0.4.5 h1:ZRoN1sXq9u7V6QoHMcVWGhOwDFqZ4B9i5H6un1Wh0x4= github.com/containerd/continuity v0.4.5/go.mod h1:/lNJvtJKUQStBzpVQ1+rasXO1LAWtUQssk28EZvJ3nE= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= @@ -97,8 +99,8 @@ github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 h1:z2ogiKUYzX5Is6zr/v github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gophercloud/gophercloud/v2 v2.10.0 h1:NRadC0aHNvy4iMoFXj5AFiPmut/Sj3hAPAo9B59VMGc= -github.com/gophercloud/gophercloud/v2 v2.10.0/go.mod 
h1:Ki/ILhYZr/5EPebrPL9Ej+tUg4lqx71/YH2JWVeU+Qk= +github.com/gophercloud/gophercloud/v2 v2.11.1 h1:jCs4vLH8sJgRqrPzqVfWgl7uI6JnIIlsgeIRM0uHjxY= +github.com/gophercloud/gophercloud/v2 v2.11.1/go.mod h1:Rm0YvKQ4QYX2rY9XaDKnjRzSGwlG5ge4h6ABYnmkKQM= github.com/gotestyourself/gotestyourself v2.2.0+incompatible h1:AQwinXlbQR2HvPjQZOmDhRqsv5mZf+Jb1RnSLxcqZcI= github.com/gotestyourself/gotestyourself v2.2.0+incompatible/go.mod h1:zZKM6oeNM8k+FRljX1mnzVYeS8wiGgQyvST1/GafPbY= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 h1:X5VWvz21y3gzm9Nw/kaUeku/1+uBhcekkmy4IkffJww= @@ -176,8 +178,8 @@ github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sapcc/go-api-declarations v1.20.2 h1:GWqv8VgsF4k9id6N051AVTaEpcjT02APsOuz2yCvTPQ= github.com/sapcc/go-api-declarations v1.20.2/go.mod h1:eiRrXXUeQS5C/1kKn8/KMjk0Y0goUzgDQswj30rH0Zc= -github.com/sapcc/go-bits v0.0.0-20260226170120-c20f89b66c3c h1:GX6ADtKR6Bs2bBRBqeJf376MsxLNppC1SOHLIBuQwIA= -github.com/sapcc/go-bits v0.0.0-20260226170120-c20f89b66c3c/go.mod h1:AYC4f8FYO9DllSt4TyMwm1e5iPn32/DGIrlgXtuEsJ0= +github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e h1:4wgkrfAlnL6ffM7HTNoHn1HrBBurCRR71WNOszdiDNQ= +github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e/go.mod h1:NZjMiGVm04U25vwR6ZWvMw0XOOnvS1jkmXpjiepOeUw= github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= @@ -283,16 +285,16 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo= gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= 
-k8s.io/api v0.35.1 h1:0PO/1FhlK/EQNVK5+txc4FuhQibV25VLSdLMmGpDE/Q= -k8s.io/api v0.35.1/go.mod h1:28uR9xlXWml9eT0uaGo6y71xK86JBELShLy4wR1XtxM= +k8s.io/api v0.35.2 h1:tW7mWc2RpxW7HS4CoRXhtYHSzme1PN1UjGHJ1bdrtdw= +k8s.io/api v0.35.2/go.mod h1:7AJfqGoAZcwSFhOjcGM7WV05QxMMgUaChNfLTXDRE60= k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4= k8s.io/apiextensions-apiserver v0.35.0/go.mod h1:E1Ahk9SADaLQ4qtzYFkwUqusXTcaV2uw3l14aqpL2LU= -k8s.io/apimachinery v0.35.1 h1:yxO6gV555P1YV0SANtnTjXYfiivaTPvCTKX6w6qdDsU= -k8s.io/apimachinery v0.35.1/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/apimachinery v0.35.2 h1:NqsM/mmZA7sHW02JZ9RTtk3wInRgbVxL8MPfzSANAK8= +k8s.io/apimachinery v0.35.2/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= k8s.io/apiserver v0.35.0 h1:CUGo5o+7hW9GcAEF3x3usT3fX4f9r8xmgQeCBDaOgX4= k8s.io/apiserver v0.35.0/go.mod h1:QUy1U4+PrzbJaM3XGu2tQ7U9A4udRRo5cyxkFX0GEds= -k8s.io/client-go v0.35.1 h1:+eSfZHwuo/I19PaSxqumjqZ9l5XiTEKbIaJ+j1wLcLM= -k8s.io/client-go v0.35.1/go.mod h1:1p1KxDt3a0ruRfc/pG4qT/3oHmUj1AhSHEcxNSGg+OA= +k8s.io/client-go v0.35.2 h1:YUfPefdGJA4aljDdayAXkc98DnPkIetMl4PrKX97W9o= +k8s.io/client-go v0.35.2/go.mod h1:4QqEwh4oQpeK8AaefZ0jwTFJw/9kIjdQi0jpKeYvz7g= k8s.io/component-base v0.35.0 h1:+yBrOhzri2S1BVqyVSvcM3PtPyx5GUxCK2tinZz1G94= k8s.io/component-base v0.35.0/go.mod h1:85SCX4UCa6SCFt6p3IKAPej7jSnF3L8EbfSyMZayJR0= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= @@ -303,8 +305,8 @@ k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzk k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0 h1:qPrZsv1cwQiFeieFlRqT627fVZ+tyfou/+S5S0H5ua0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.23.1 
h1:TjJSM80Nf43Mg21+RCy3J70aj/W6KyvDtOlpKf+PupE= -sigs.k8s.io/controller-runtime v0.23.1/go.mod h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0= +sigs.k8s.io/controller-runtime v0.23.3 h1:VjB/vhoPoA9l1kEKZHBMnQF33tdCLQKJtydy4iqwZ80= +sigs.k8s.io/controller-runtime v0.23.3/go.mod h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= diff --git a/helm/bundles/cortex-cinder/Chart.yaml b/helm/bundles/cortex-cinder/Chart.yaml index b81e033df..a6e369e27 100644 --- a/helm/bundles/cortex-cinder/Chart.yaml +++ b/helm/bundles/cortex-cinder/Chart.yaml @@ -5,23 +5,23 @@ apiVersion: v2 name: cortex-cinder description: A Helm chart deploying Cortex for Cinder. type: application -version: 0.0.38 +version: 0.0.39 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.5.12 + version: 0.5.13 # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-cinder/values.yaml b/helm/bundles/cortex-cinder/values.yaml index f002fc58b..b01656205 100644 --- a/helm/bundles/cortex-cinder/values.yaml +++ b/helm/bundles/cortex-cinder/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - 
"markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" diff --git a/helm/bundles/cortex-crds/Chart.yaml b/helm/bundles/cortex-crds/Chart.yaml index 5427ac308..c4b2d7095 100644 --- a/helm/bundles/cortex-crds/Chart.yaml +++ b/helm/bundles/cortex-crds/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-crds description: A Helm chart deploying Cortex CRDs. type: application -version: 0.0.38 +version: 0.0.39 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-crds/values.yaml b/helm/bundles/cortex-crds/values.yaml index 2033e435c..bf072086c 100644 --- a/helm/bundles/cortex-crds/values.yaml +++ b/helm/bundles/cortex-crds/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" diff --git a/helm/bundles/cortex-ironcore/Chart.yaml b/helm/bundles/cortex-ironcore/Chart.yaml index a555ba200..0a4a278c5 100644 --- a/helm/bundles/cortex-ironcore/Chart.yaml +++ b/helm/bundles/cortex-ironcore/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-ironcore description: A Helm chart deploying Cortex for IronCore. type: application -version: 0.0.38 +version: 0.0.39 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. 
This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-ironcore/values.yaml b/helm/bundles/cortex-ironcore/values.yaml index 2f885c7a5..82e490585 100644 --- a/helm/bundles/cortex-ironcore/values.yaml +++ b/helm/bundles/cortex-ironcore/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" diff --git a/helm/bundles/cortex-manila/Chart.yaml b/helm/bundles/cortex-manila/Chart.yaml index 98e7f587c..ac7420728 100644 --- a/helm/bundles/cortex-manila/Chart.yaml +++ b/helm/bundles/cortex-manila/Chart.yaml @@ -5,23 +5,23 @@ apiVersion: v2 name: cortex-manila description: A Helm chart deploying Cortex for Manila. type: application -version: 0.0.38 +version: 0.0.39 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.5.12 + version: 0.5.13 # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-manila/values.yaml b/helm/bundles/cortex-manila/values.yaml index cc341a112..50d16352e 100644 --- a/helm/bundles/cortex-manila/values.yaml +++ b/helm/bundles/cortex-manila/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" 
diff --git a/helm/bundles/cortex-nova/Chart.yaml b/helm/bundles/cortex-nova/Chart.yaml index ec1c16b2f..0a02660e9 100644 --- a/helm/bundles/cortex-nova/Chart.yaml +++ b/helm/bundles/cortex-nova/Chart.yaml @@ -5,23 +5,23 @@ apiVersion: v2 name: cortex-nova description: A Helm chart deploying Cortex for Nova. type: application -version: 0.0.38 +version: 0.0.39 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.5.12 + version: 0.5.13 # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 1c580a9ba..65de5c626 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -10,15 +10,16 @@ groups: context: liveness dashboard: cortex/cortex service: cortex - severity: warning + severity: critical support_group: workload-management playbook: docs/support/playbook/cortex/down annotations: summary: "Cortex Scheduling for Nova is down" description: > The Cortex scheduling service is down. Scheduling requests from Nova will - not be served. This is no immediate problem, since Nova will continue - placing new VMs. However, the placement will be less desirable. + not be served. This is non-critical for vmware virtual machines, but + blocks kvm virtual machines from being scheduled. Thus, it is + recommended to immediately investigate and resolve the issue. 
- alert: CortexNovaKnowledgeDown expr: | @@ -40,7 +41,7 @@ groups: but the quality of the responses may be affected. - alert: CortexNovaDeschedulerPipelineErroring - expr: delta(cortex_descheduler_pipeline_vm_descheduling_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 + expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 for: 5m labels: context: descheduler diff --git a/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml b/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml index f2181fe96..6b3d9fcbc 100644 --- a/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml @@ -2,6 +2,23 @@ --- apiVersion: cortex.cloud/v1alpha1 kind: Knowledge +metadata: + name: flavor-groups +spec: + schedulingDomain: nova + extractor: + name: flavor_groups + recency: "5m" + description: | + This knowledge extracts flavor groups from Nova flavors based on the + hw_version extra_spec. It identifies all flavors belonging to each group + and determines the largest flavor for reservation slot sizing. + dependencies: + datasources: + - name: nova-flavors +--- +apiVersion: cortex.cloud/v1alpha1 +kind: Knowledge metadata: name: kvm-libvirt-domain-cpu-steal-pct spec: diff --git a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml index 815fed441..68ec01352 100644 --- a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml @@ -15,7 +15,11 @@ spec: Specifically, this pipeline is used for general purpose workloads. type: filter-weigher createDecisions: true - filters: [] + filters: + - name: filter_correct_az + description: | + This step will filter out hosts whose aggregate information indicates they + are not placed in the requested availability zone. 
weighers: - name: kvm_prefer_smaller_hosts params: @@ -61,7 +65,11 @@ spec: Specifically, this pipeline is used for hana virtual machines. type: filter-weigher createDecisions: true - filters: [] + filters: + - name: filter_correct_az + description: | + This step will filter out hosts whose aggregate information indicates they + are not placed in the requested availability zone. weighers: - name: kvm_prefer_smaller_hosts params: @@ -151,11 +159,6 @@ spec: `domain_name` scheduler hint from the nova request spec. params: - {key: domainNamePrefixes, stringListValue: ["iaas-"]} - - name: filter_packed_virtqueue - description: | - If the flavor extra specs contain the `hw:virtio_packed_ring` key, or the - image properties contain the `hw_virtio_packed_ring` key, this step will - filter out hosts that do not have the `COMPUTE_NET_VIRTIO_PACKED` trait. - name: filter_allowed_projects description: | This step filters hosts based on allowed projects defined in the @@ -282,11 +285,6 @@ spec: `domain_name` scheduler hint from the nova request spec. params: - {key: domainNamePrefixes, stringListValue: ["iaas-"]} - - name: filter_packed_virtqueue - description: | - If the flavor extra specs contain the `hw:virtio_packed_ring` key, or the - image properties contain the `hw_virtio_packed_ring` key, this step will - filter out hosts that do not have the `COMPUTE_NET_VIRTIO_PACKED` trait. 
- name: filter_allowed_projects description: | This step filters hosts based on allowed projects defined in the diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index b2dbba788..c38b8bfc4 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" @@ -113,9 +113,19 @@ cortex-scheduling-controllers: enabledControllers: - nova-pipeline-controllers - nova-deschedulings-executor + - hypervisor-overcommit-controller - explanation-controller + - reservations-controller enabledTasks: - nova-decisions-cleanup-task + # Endpoints configuration for reservations controller + endpoints: + novaExternalScheduler: "http://localhost:8080/scheduler/nova/external" + # OvercommitMappings is a list of mappings that map hypervisor traits to + # overcommit ratios. Note that this list is applied in order, so if there + # are multiple mappings applying to the same hypervisors, the last mapping + # in this list will override the previous ones. + overcommitMappings: [] cortex-knowledge-controllers: <<: *cortex @@ -134,7 +144,6 @@ cortex-knowledge-controllers: - datasource-controllers - knowledge-controllers - kpis-controller - - reservations-controller enabledTasks: - commitments-sync-task diff --git a/helm/bundles/cortex-pods/Chart.yaml b/helm/bundles/cortex-pods/Chart.yaml index 949a2493a..dafbc2205 100644 --- a/helm/bundles/cortex-pods/Chart.yaml +++ b/helm/bundles/cortex-pods/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-pods description: A Helm chart deploying Cortex for Pods. 
type: application -version: 0.0.38 +version: 0.0.39 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.25 + version: 0.0.26 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-pods/values.yaml b/helm/bundles/cortex-pods/values.yaml index b7aab8a6d..4c381f539 100644 --- a/helm/bundles/cortex-pods/values.yaml +++ b/helm/bundles/cortex-pods/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" diff --git a/helm/dev/cortex-prometheus-operator/Chart.yaml b/helm/dev/cortex-prometheus-operator/Chart.yaml index 2c28dfe7c..fbe10bb3e 100644 --- a/helm/dev/cortex-prometheus-operator/Chart.yaml +++ b/helm/dev/cortex-prometheus-operator/Chart.yaml @@ -10,4 +10,4 @@ dependencies: # CRDs of the prometheus operator, such as PrometheusRule, ServiceMonitor, etc. - name: kube-prometheus-stack repository: oci://ghcr.io/prometheus-community/charts - version: 82.4.1 + version: 82.10.3 diff --git a/helm/library/cortex-postgres/Chart.yaml b/helm/library/cortex-postgres/Chart.yaml index ef1fc6398..39710ebad 100644 --- a/helm/library/cortex-postgres/Chart.yaml +++ b/helm/library/cortex-postgres/Chart.yaml @@ -5,5 +5,5 @@ apiVersion: v2 name: cortex-postgres description: Postgres setup for Cortex. 
type: application -version: 0.5.12 -appVersion: "sha-73adf5e6" +version: 0.5.13 +appVersion: "sha-6db36b81" diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 684bbd759..aa0316575 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: cortex description: A Helm chart to distribute cortex. type: application -version: 0.0.25 -appVersion: "sha-349d742b" +version: 0.0.26 +appVersion: "sha-1401e19e" icon: "https://example.com/icon.png" dependencies: [] diff --git a/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml b/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml index 0ac596bc2..2e3891ffa 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml @@ -24,6 +24,9 @@ spec: - jsonPath: .status.lastExtracted name: Extracted type: date + - jsonPath: .status.lastContentChange + name: Changed + type: date - jsonPath: .spec.recency name: Recency type: string @@ -248,6 +251,12 @@ spec: - type type: object type: array + lastContentChange: + description: |- + When the extracted knowledge content last changed. + Updated only when the Raw data actually changes, not on every reconcile. + format: date-time + type: string lastExtracted: description: When the knowledge was last successfully extracted. 
format: date-time diff --git a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml index 5d341cdf6..d9256e5db 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml @@ -15,7 +15,7 @@ spec: scope: Cluster versions: - additionalPrinterColumns: - - jsonPath: .spec.type + - jsonPath: .metadata.labels['reservations\.cortex\.sap\.com/type'] name: Type type: string - jsonPath: .status.host @@ -49,6 +49,10 @@ spec: spec: description: spec defines the desired state of Reservation properties: + availabilityZone: + description: AvailabilityZone specifies the availability zone for + this reservation, if restricted to a specific AZ. + type: string committedResourceReservation: description: |- CommittedResourceReservation contains fields specific to committed resource reservations. @@ -83,6 +87,10 @@ spec: Key: Workload UUID (VM UUID for Nova, Pod UID for Pods, Machine UID for IronCore, etc.) Value: allocation state and metadata type: object + commitmentUUID: + description: CommitmentUUID is the UUID of the commitment that + this reservation corresponds to. + type: string creator: description: |- Creator identifies the system or component that created this reservation. 
diff --git a/helm/library/cortex/templates/rbac/hypervisor_role.yaml b/helm/library/cortex/templates/rbac/hypervisor_role.yaml index 14b61e5de..0a2fefa00 100644 --- a/helm/library/cortex/templates/rbac/hypervisor_role.yaml +++ b/helm/library/cortex/templates/rbac/hypervisor_role.yaml @@ -1,5 +1,6 @@ {{- if .Values.rbac.hypervisor.enable }} --- +# TODO: Check if this role can be part of the nova bundle, not the core library apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: @@ -14,6 +15,8 @@ rules: verbs: - get - list + - patch + - update - watch - apiGroups: - kvm.cloud.sap diff --git a/helm/library/cortex/values.yaml b/helm/library/cortex/values.yaml index e7a475184..ad3c7ba8b 100644 --- a/helm/library/cortex/values.yaml +++ b/helm/library/cortex/values.yaml @@ -44,9 +44,6 @@ controllerManager: terminationGracePeriodSeconds: 10 serviceAccountName: controller-manager -# Use this to unambiguate multiple cortex deployments in the same cluster. -namePrefix: cortex - # [RBAC]: To enable RBAC (Permissions) configurations rbac: enable: true diff --git a/internal/knowledge/extractor/controller.go b/internal/knowledge/extractor/controller.go index cd4f63972..3dd511b39 100644 --- a/internal/knowledge/extractor/controller.go +++ b/internal/knowledge/extractor/controller.go @@ -5,6 +5,8 @@ package extractor import ( "context" + "encoding/json" + "reflect" "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" @@ -202,9 +204,27 @@ func (r *KnowledgeReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( Reason: "KnowledgeExtracted", Message: "knowledge extracted successfully", }) + + // Check if content actually changed by comparing deserialized data structures. + // This avoids false positives from JSON serialization non-determinism (e.g., map key ordering). 
+ contentChanged := true + if len(knowledge.Status.Raw.Raw) > 0 { + var oldData, newData interface{} + if err := json.Unmarshal(knowledge.Status.Raw.Raw, &oldData); err == nil { + if err := json.Unmarshal(raw.Raw, &newData); err == nil { + contentChanged = !reflect.DeepEqual(oldData, newData) + } + } + } + knowledge.Status.Raw = raw knowledge.Status.LastExtracted = metav1.NewTime(time.Now()) knowledge.Status.RawLength = len(features) + + if contentChanged { + log.Info("content of knowledge has changed", "name", knowledge.Name) + knowledge.Status.LastContentChange = metav1.NewTime(time.Now()) + } patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, knowledge, patch); err != nil { log.Error(err, "failed to patch knowledge status") diff --git a/internal/knowledge/extractor/plugins/compute/flavor_groups.go b/internal/knowledge/extractor/plugins/compute/flavor_groups.go new file mode 100644 index 000000000..d5c47cf2a --- /dev/null +++ b/internal/knowledge/extractor/plugins/compute/flavor_groups.go @@ -0,0 +1,154 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package compute + +import ( + _ "embed" + "encoding/json" + "errors" + "sort" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins" + ctrl "sigs.k8s.io/controller-runtime" +) + +// FlavorInGroup represents a single flavor within a flavor group. +type FlavorInGroup struct { + Name string `json:"name"` + VCPUs uint64 `json:"vcpus"` + MemoryMB uint64 `json:"memoryMB"` + DiskGB uint64 `json:"diskGB"` + EphemeralGB uint64 `json:"ephemeralGB,omitempty"` + ExtraSpecs map[string]string `json:"extraSpecs,omitempty"` +} + +// FlavorGroupFeature represents a flavor group with all its member flavors. +// This is the feature that gets stored in the Knowledge CRD. 
+type FlavorGroupFeature struct { + // Name of the flavor group (from hw_version extra_spec) + Name string `json:"name"` + + // All flavors belonging to this group + Flavors []FlavorInGroup `json:"flavors"` + + // The largest flavor in the group (used for reservation slot sizing) + LargestFlavor FlavorInGroup `json:"largestFlavor"` + + // The smallest flavor in the group (used for CR size quantification) + SmallestFlavor FlavorInGroup `json:"smallestFlavor"` +} + +// flavorRow represents a row from the SQL query. +type flavorRow struct { + Name string `db:"name"` + VCPUs uint64 `db:"vcpus"` + MemoryMB uint64 `db:"memory_mb"` + DiskGB uint64 `db:"disk"` + EphemeralGB uint64 `db:"ephemeral"` + ExtraSpecs string `db:"extra_specs"` +} + +// FlavorGroupExtractor extracts flavor group information from the database. +type FlavorGroupExtractor struct { + // Common base for all extractors that provides standard functionality. + plugins.BaseExtractor[ + struct{}, // No options passed through yaml config + FlavorGroupFeature, // Feature model + ] +} + +//go:embed flavor_groups.sql +var flavorGroupsQuery string + +var flavorGroupLog = ctrl.Log.WithName("flavor_group_extractor") + +// Extract flavor groups from the database. 
+func (e *FlavorGroupExtractor) Extract() ([]plugins.Feature, error) { + if e.DB == nil { + return nil, errors.New("database connection is not initialized") + } + + // Query all flavors from database + var rows []flavorRow + if _, err := e.DB.Select(&rows, flavorGroupsQuery); err != nil { + flavorGroupLog.Error(err, "failed to query flavors") + return nil, err + } + + // Group flavors by flavorGroupIdentifierName + groupMap := make(map[string][]FlavorInGroup) + + for _, row := range rows { + // Parse extra_specs JSON + var extraSpecs map[string]string + if row.ExtraSpecs != "" { + if err := json.Unmarshal([]byte(row.ExtraSpecs), &extraSpecs); err != nil { + flavorGroupLog.Info("failed to parse extra_specs for flavor", "flavor", row.Name, "error", err) + continue + } + } + + hwVersion, exists := extraSpecs["quota:hw_version"] + if !exists || hwVersion == "" { + flavorGroupLog.Info("flavor missing hw_version extra_spec", "flavor", row.Name) + continue + } + + // Add flavor to its group + flavor := FlavorInGroup{ + Name: row.Name, + VCPUs: row.VCPUs, + MemoryMB: row.MemoryMB, + DiskGB: row.DiskGB, + EphemeralGB: row.EphemeralGB, + ExtraSpecs: extraSpecs, + } + groupMap[hwVersion] = append(groupMap[hwVersion], flavor) + } + + // Convert map to features + features := make([]FlavorGroupFeature, 0, len(groupMap)) + for groupName, flavors := range groupMap { + if len(flavors) == 0 { + continue + } + + // Sort flavors by size descending (largest first), tie break by name for consistent ordering + sort.Slice(flavors, func(i, j int) bool { + if flavors[i].MemoryMB != flavors[j].MemoryMB { + return flavors[i].MemoryMB > flavors[j].MemoryMB + } + if flavors[i].VCPUs != flavors[j].VCPUs { + return flavors[i].VCPUs > flavors[j].VCPUs + } + return flavors[i].Name < flavors[j].Name + }) + + largest := flavors[0] + smallest := flavors[len(flavors)-1] + + flavorGroupLog.Info("identified largest and smallest flavors", + "groupName", groupName, + "largestFlavor", largest.Name, + 
"largestMemoryMB", largest.MemoryMB, + "largestVCPUs", largest.VCPUs, + "smallestFlavor", smallest.Name, + "smallestMemoryMB", smallest.MemoryMB, + "smallestVCPUs", smallest.VCPUs) + + features = append(features, FlavorGroupFeature{ + Name: groupName, + Flavors: flavors, + LargestFlavor: largest, + SmallestFlavor: smallest, + }) + } + + // Sort features by group name for consistent ordering + sort.Slice(features, func(i, j int) bool { + return features[i].Name < features[j].Name + }) + + return e.Extracted(features) +} diff --git a/internal/knowledge/extractor/plugins/compute/flavor_groups.sql b/internal/knowledge/extractor/plugins/compute/flavor_groups.sql new file mode 100644 index 000000000..0905e0b7d --- /dev/null +++ b/internal/knowledge/extractor/plugins/compute/flavor_groups.sql @@ -0,0 +1,17 @@ +-- Copyright SAP SE +-- SPDX-License-Identifier: Apache-2.0 + +-- Query to extract flavor groups from the openstack_flavors_v2 table +-- Groups flavors by their hw_version extra_spec (or flavor name prefix as workaround) +-- Filters to only include KVM flavors (QEMU and Cloud-Hypervisor) +SELECT + name, + vcpus, + ram as memory_mb, + disk, + ephemeral, + extra_specs +FROM openstack_flavors_v2 +WHERE LOWER(extra_specs) LIKE '%"capabilities:hypervisor_type":"qemu"%' + OR LOWER(extra_specs) LIKE '%"capabilities:hypervisor_type":"ch"%' +ORDER BY name; diff --git a/internal/knowledge/extractor/plugins/compute/flavor_groups_test.go b/internal/knowledge/extractor/plugins/compute/flavor_groups_test.go new file mode 100644 index 000000000..becccadd0 --- /dev/null +++ b/internal/knowledge/extractor/plugins/compute/flavor_groups_test.go @@ -0,0 +1,273 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package compute + +import ( + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + testlibDB 
"github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" +) + +func TestFlavorGroupExtractor_Extract(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + defer dbEnv.Close() + testDB := db.DB{DbMap: dbEnv.DbMap} + + // Setup test data - create flavors table + if err := testDB.CreateTable( + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatal(err) + } + + // Insert test flavors with quota:hw_version in extra_specs + // Mix of KVM flavors (should be included) and VMware flavors (should be excluded) + flavors := []any{ + &nova.Flavor{ + ID: "1", + Name: "hana_c30_m480_v2", + VCPUs: 30, + RAM: 491520, // 480GB in MB + Disk: 100, + Ephemeral: 0, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","hw:cpu_policy":"dedicated","quota:hw_version":"v2"}`, + }, + &nova.Flavor{ + ID: "2", + Name: "hana_c60_m960_v2", + VCPUs: 60, + RAM: 983040, // 960GB in MB + Disk: 100, + Ephemeral: 0, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","hw:cpu_policy":"dedicated","quota:hw_version":"v2"}`, + }, + &nova.Flavor{ + ID: "3", + Name: "hana_c240_m3840_v2", + VCPUs: 240, + RAM: 3932160, // 3840GB in MB + Disk: 100, + Ephemeral: 0, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","hw:cpu_policy":"dedicated","hw:numa_nodes":"4","quota:hw_version":"v2"}`, + }, + &nova.Flavor{ + ID: "4", + Name: "gp_c8_m32_v2", + VCPUs: 8, + RAM: 32768, // 32GB in MB + Disk: 50, + Ephemeral: 10, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","quota:hw_version":"v2"}`, + }, + &nova.Flavor{ + ID: "5", + Name: "gp_c16_m64_v2", + VCPUs: 16, + RAM: 65536, // 64GB in MB + Disk: 50, + Ephemeral: 20, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","quota:hw_version":"v2"}`, + }, + // VMware flavor - should be excluded from results (filtered by SQL query) + &nova.Flavor{ + ID: "6", + Name: "vmwa_c32_m512_v1", + VCPUs: 32, + RAM: 524288, // 512GB in MB + Disk: 200, + Ephemeral: 0, + ExtraSpecs: `{"capabilities:hypervisor_type":"VMware vCenter 
Server","quota:hw_version":"v1"}`, + }, + // Cloud-Hypervisor flavor - should be included (case insensitive) + &nova.Flavor{ + ID: "7", + Name: "gp_c4_m16_ch", + VCPUs: 4, + RAM: 16384, // 16GB in MB + Disk: 25, + Ephemeral: 5, + ExtraSpecs: `{"capabilities:hypervisor_type":"CH","quota:hw_version":"ch"}`, + }, + // Corner case: Same memory as gp_c8_m32_v2 but MORE VCPUs (should come first) + &nova.Flavor{ + ID: "8", + Name: "gp_c12_m32_v2", + VCPUs: 12, + RAM: 32768, // 32GB in MB - same as gp_c8_m32_v2 + Disk: 50, + Ephemeral: 10, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","quota:hw_version":"v2"}`, + }, + // Corner case: Same memory AND same VCPUs as gp_c12_m32_v2 (tests name sorting) + &nova.Flavor{ + ID: "9", + Name: "gp_c12_m32_alt", + VCPUs: 12, + RAM: 32768, // 32GB in MB + Disk: 50, + Ephemeral: 10, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","quota:hw_version":"v2"}`, + }, + } + + if err := testDB.Insert(flavors...); err != nil { + t.Fatal(err) + } + + // Create and run extractor + extractor := &FlavorGroupExtractor{} + config := v1alpha1.KnowledgeSpec{} + if err := extractor.Init(&testDB, nil, config); err != nil { + t.Fatal(err) + } + + features, err := extractor.Extract() + if err != nil { + t.Fatal(err) + } + + // Verify results - should be 2 groups (v2 and ch based on hw_version) + // VMware flavor should be filtered out, Cloud-Hypervisor should be included + if len(features) != 2 { + t.Fatalf("expected 2 flavor groups, got %d", len(features)) + } + + // Convert to typed features for easier testing + var v2Group, chGroup *FlavorGroupFeature + for _, f := range features { + fg := f.(FlavorGroupFeature) + switch fg.Name { + case "v2": + v2Group = &fg + case "ch": + chGroup = &fg + } + } + + // Verify v2 group (contains both HANA and general purpose flavors) + if v2Group == nil { + t.Fatal("v2 group not found") + } + if len(v2Group.Flavors) != 7 { + t.Errorf("expected 7 flavors in v2 group (3 HANA + 4 general purpose), got %d", 
len(v2Group.Flavors)) + } + // Largest flavor in v2 group should be hana_c240_m3840_v2 (highest memory) + if v2Group.LargestFlavor.Name != "hana_c240_m3840_v2" { + t.Errorf("expected largest flavor to be hana_c240_m3840_v2, got %s", v2Group.LargestFlavor.Name) + } + if v2Group.LargestFlavor.VCPUs != 240 { + t.Errorf("expected largest flavor VCPUs to be 240, got %d", v2Group.LargestFlavor.VCPUs) + } + if v2Group.LargestFlavor.MemoryMB != 3932160 { + t.Errorf("expected largest flavor memory to be 3932160 MB, got %d", v2Group.LargestFlavor.MemoryMB) + } + if v2Group.LargestFlavor.DiskGB != 100 { + t.Errorf("expected largest flavor disk to be 100 GB, got %d", v2Group.LargestFlavor.DiskGB) + } + if v2Group.LargestFlavor.ExtraSpecs == nil { + t.Error("expected largest flavor to have extra_specs") + } + if v2Group.LargestFlavor.ExtraSpecs["hw:numa_nodes"] != "4" { + t.Errorf("expected largest flavor to have hw:numa_nodes=4, got %s", v2Group.LargestFlavor.ExtraSpecs["hw:numa_nodes"]) + } + if v2Group.LargestFlavor.ExtraSpecs["quota:hw_version"] != "v2" { + t.Errorf("expected largest flavor to have quota:hw_version=v2, got %s", v2Group.LargestFlavor.ExtraSpecs["quota:hw_version"]) + } + + // Verify smallest flavor in v2 group should be gp_c4_m16_ch is NOT in v2, so it's gp_c8_m32_v2 (lowest memory among v2 flavors) + if v2Group.SmallestFlavor.Name != "gp_c8_m32_v2" { + t.Errorf("expected smallest flavor to be gp_c8_m32_v2, got %s", v2Group.SmallestFlavor.Name) + } + if v2Group.SmallestFlavor.MemoryMB != 32768 { + t.Errorf("expected smallest flavor memory to be 32768 MB, got %d", v2Group.SmallestFlavor.MemoryMB) + } + if v2Group.SmallestFlavor.VCPUs != 8 { + t.Errorf("expected smallest flavor VCPUs to be 8, got %d", v2Group.SmallestFlavor.VCPUs) + } + + // Verify Cloud-Hypervisor group + if chGroup == nil { + t.Fatal("ch group not found") + } + if len(chGroup.Flavors) != 1 { + t.Errorf("expected 1 flavor in ch group, got %d", len(chGroup.Flavors)) + } + if 
chGroup.LargestFlavor.Name != "gp_c4_m16_ch" { + t.Errorf("expected largest flavor to be gp_c4_m16_ch, got %s", chGroup.LargestFlavor.Name) + } + if chGroup.LargestFlavor.ExtraSpecs["quota:hw_version"] != "ch" { + t.Errorf("expected ch flavor to have quota:hw_version=ch, got %s", chGroup.LargestFlavor.ExtraSpecs["quota:hw_version"]) + } + + // Verify smallest flavor in ch group (only has 1 flavor, so same as largest) + if chGroup.SmallestFlavor.Name != "gp_c4_m16_ch" { + t.Errorf("expected smallest flavor to be gp_c4_m16_ch, got %s", chGroup.SmallestFlavor.Name) + } + + // Generic check: Verify all flavor groups have correctly ordered flavors + // Flavors must be sorted descending by memory (largest first), with VCPUs as tiebreaker + for _, f := range features { + fg := f.(FlavorGroupFeature) + + // Check that flavors are sorted in descending order + for i := range len(fg.Flavors) - 1 { + current := fg.Flavors[i] + next := fg.Flavors[i+1] + + // Primary sort: memory descending + if current.MemoryMB < next.MemoryMB { + t.Errorf("Flavors in group %s not sorted by memory: %s (%d MB) should come after %s (%d MB)", + fg.Name, current.Name, current.MemoryMB, next.Name, next.MemoryMB) + } + + // Secondary sort: if memory equal, VCPUs descending + if current.MemoryMB == next.MemoryMB && current.VCPUs < next.VCPUs { + t.Errorf("Flavors in group %s with equal memory not sorted by VCPUs: %s (%d VCPUs) should come after %s (%d VCPUs)", + fg.Name, current.Name, current.VCPUs, next.Name, next.VCPUs) + } + } + + // Verify LargestFlavor matches the first flavor in sorted list + if len(fg.Flavors) > 0 && fg.LargestFlavor.Name != fg.Flavors[0].Name { + t.Errorf("Group %s: LargestFlavor (%s) doesn't match first flavor in sorted list (%s)", + fg.Name, fg.LargestFlavor.Name, fg.Flavors[0].Name) + } + + // Verify SmallestFlavor matches the last flavor in sorted list + if len(fg.Flavors) > 0 && fg.SmallestFlavor.Name != fg.Flavors[len(fg.Flavors)-1].Name { + t.Errorf("Group %s: 
SmallestFlavor (%s) doesn't match last flavor in sorted list (%s)", + fg.Name, fg.SmallestFlavor.Name, fg.Flavors[len(fg.Flavors)-1].Name) + } + } + + // Verify that VMware flavor was filtered out + for _, f := range features { + fg := f.(FlavorGroupFeature) + for _, flavor := range fg.Flavors { + if flavor.Name == "vmwa_c32_m512_v1" { + t.Errorf("VMware flavor should have been filtered out but was found in group %s", fg.Name) + } + } + } + + // Verify that Cloud-Hypervisor flavor was included in ch group + foundCH := false + for _, flavor := range chGroup.Flavors { + if flavor.Name == "gp_c4_m16_ch" { + foundCH = true + if flavor.ExtraSpecs["capabilities:hypervisor_type"] != "CH" { + t.Errorf("expected CH hypervisor_type, got %s", flavor.ExtraSpecs["capabilities:hypervisor_type"]) + } + if flavor.ExtraSpecs["quota:hw_version"] != "ch" { + t.Errorf("expected quota:hw_version=ch, got %s", flavor.ExtraSpecs["quota:hw_version"]) + } + } + } + if !foundCH { + t.Error("Cloud-Hypervisor flavor should have been included but was not found") + } +} diff --git a/internal/knowledge/extractor/supported_extractors.go b/internal/knowledge/extractor/supported_extractors.go index 684697928..6f1cb2fd2 100644 --- a/internal/knowledge/extractor/supported_extractors.go +++ b/internal/knowledge/extractor/supported_extractors.go @@ -23,6 +23,7 @@ var supportedExtractors = map[string]plugins.FeatureExtractor{ "host_az_extractor": &compute.HostAZExtractor{}, "host_pinned_projects_extractor": &compute.HostPinnedProjectsExtractor{}, "sap_host_details_extractor": &compute.HostDetailsExtractor{}, + "flavor_groups": &compute.FlavorGroupExtractor{}, "netapp_storage_pool_cpu_usage_extractor": &storage.StoragePoolCPUUsageExtractor{}, } diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go index 38d3b68d1..638df91da 100644 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go +++ 
b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go @@ -154,20 +154,30 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) { } for _, hypervisor := range hvs.Items { - cpuTotal, hasCPUTotal := hypervisor.Status.Capacity["cpu"] - ramTotal, hasRAMTotal := hypervisor.Status.Capacity["memory"] + if hypervisor.Status.EffectiveCapacity == nil { + slog.Warn("hypervisor with nil effective capacity, skipping", "host", hypervisor.Name) + continue + } + + cpuTotal, hasCPUTotal := hypervisor.Status.EffectiveCapacity[hv1.ResourceCPU] + ramTotal, hasRAMTotal := hypervisor.Status.EffectiveCapacity[hv1.ResourceMemory] if !hasCPUTotal || !hasRAMTotal { slog.Error("hypervisor missing cpu or ram total capacity", "hypervisor", hypervisor.Name) continue } - cpuUsed, hasCPUUtilized := hypervisor.Status.Allocation["cpu"] + if cpuTotal.IsZero() || ramTotal.IsZero() { + slog.Warn("hypervisor with zero cpu or ram total capacity, skipping", "host", hypervisor.Name) + continue + } + + cpuUsed, hasCPUUtilized := hypervisor.Status.Allocation[hv1.ResourceCPU] if !hasCPUUtilized { cpuUsed = resource.MustParse("0") } - ramUsed, hasRAMUtilized := hypervisor.Status.Allocation["memory"] + ramUsed, hasRAMUtilized := hypervisor.Status.Allocation[hv1.ResourceMemory] if !hasRAMUtilized { ramUsed = resource.MustParse("0") } diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go index 015217e15..bb2e5f91a 100644 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go +++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go @@ -47,6 +47,55 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { hypervisors []hv1.Hypervisor expectedMetrics map[string][]expectedMetric // metric_name -> []expectedMetric }{ + { + name: "single hypervisor with nil effective capacity", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + 
Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: nil, // Simulate nil effective capacity + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Traits: []string{}, + }, + }, + }, + // No metrics should be emitted for this hypervisor since effective capacity is nil + expectedMetrics: map[string][]expectedMetric{}, + }, + { + name: "single hypervisor with zero total capacity", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), // Simulate zero CPU capacity + hv1.ResourceMemory: resource.MustParse("0"), // Simulate zero RAM capacity + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), + }, + Traits: []string{}, + }, + }, + }, + // No metrics should be emitted for this hypervisor since total capacity is zero + expectedMetrics: map[string][]expectedMetric{}, + }, { name: "single hypervisor with default traits", hypervisors: []hv1.Hypervisor{ @@ -58,13 +107,13 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("128"), - "memory": resource.MustParse("512Gi"), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("64"), - "memory": resource.MustParse("256Gi"), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: 
resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), }, Traits: []string{}, }, @@ -148,13 +197,13 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("256"), - "memory": resource.MustParse("1Ti"), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("256"), + hv1.ResourceMemory: resource.MustParse("1Ti"), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("128"), - "memory": resource.MustParse("512Gi"), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), }, Traits: []string{ "CUSTOM_HW_SAPPHIRE_RAPIDS", @@ -209,13 +258,13 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("64"), - "memory": resource.MustParse("256Gi"), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("32"), - "memory": resource.MustParse("128Gi"), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("32"), + hv1.ResourceMemory: resource.MustParse("128Gi"), }, Traits: []string{ "CUSTOM_DECOMMISSIONING", @@ -255,13 +304,13 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("100"), - "memory": resource.MustParse("200Gi"), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("100"), + hv1.ResourceMemory: resource.MustParse("200Gi"), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("50"), - 
"memory": resource.MustParse("100Gi"), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("50"), + hv1.ResourceMemory: resource.MustParse("100Gi"), }, Traits: []string{}, }, @@ -274,13 +323,13 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("200"), - "memory": resource.MustParse("400Gi"), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("200"), + hv1.ResourceMemory: resource.MustParse("400Gi"), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("150"), - "memory": resource.MustParse("300Gi"), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("150"), + hv1.ResourceMemory: resource.MustParse("300Gi"), }, Traits: []string{"CUSTOM_HW_SAPPHIRE_RAPIDS"}, }, @@ -332,9 +381,9 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("96"), - "memory": resource.MustParse("384Gi"), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("96"), + hv1.ResourceMemory: resource.MustParse("384Gi"), }, // No Allocation field - simulating missing data Allocation: nil, diff --git a/internal/scheduling/lib/detector_monitor.go b/internal/scheduling/lib/detector_monitor.go index 4df9c9950..c72ab2804 100644 --- a/internal/scheduling/lib/detector_monitor.go +++ b/internal/scheduling/lib/detector_monitor.go @@ -17,7 +17,7 @@ type DetectorPipelineMonitor struct { // A counter to measure how many vm ids are selected for descheduling by each step. stepDeschedulingCounter *prometheus.GaugeVec // A histogram to measure how long the pipeline takes to run in total. 
- pipelineRunTimer prometheus.Histogram + pipelineRunTimer *prometheus.HistogramVec // The name of the pipeline being monitored. PipelineName string @@ -34,11 +34,11 @@ func NewDetectorPipelineMonitor() DetectorPipelineMonitor { Name: "cortex_detector_pipeline_step_detections", Help: "Number of resources detected by a detector pipeline step", }, []string{"step"}), - pipelineRunTimer: prometheus.NewHistogram(prometheus.HistogramOpts{ + pipelineRunTimer: prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: "cortex_detector_pipeline_run_duration_seconds", Help: "Duration of descheduler pipeline run", Buckets: prometheus.DefBuckets, - }), + }, []string{"error"}), } } diff --git a/internal/scheduling/lib/detector_pipeline.go b/internal/scheduling/lib/detector_pipeline.go index b0db5f235..bed650562 100644 --- a/internal/scheduling/lib/detector_pipeline.go +++ b/internal/scheduling/lib/detector_pipeline.go @@ -61,12 +61,15 @@ func (p *DetectorPipeline[DetectionType]) Init( // Execute the descheduler steps in parallel and collect the decisions made by // each step. 
func (p *DetectorPipeline[DetectionType]) Run() map[string][]DetectionType { + lock := sync.Mutex{} + decisionsByStep := map[string][]DetectionType{} + metricErrLabel := "false" if p.Monitor.pipelineRunTimer != nil { - timer := prometheus.NewTimer(p.Monitor.pipelineRunTimer) + timer := prometheus.NewTimer(prometheus.ObserverFunc(func(v float64) { + p.Monitor.pipelineRunTimer.WithLabelValues(metricErrLabel).Observe(v) + })) defer timer.ObserveDuration() } - var lock sync.Mutex - decisionsByStep := map[string][]DetectionType{} var wg sync.WaitGroup for stepName, step := range p.steps { wg.Go(func() { @@ -76,13 +79,14 @@ func (p *DetectorPipeline[DetectionType]) Run() map[string][]DetectionType { slog.Info("descheduler: step skipped") return } + lock.Lock() + defer lock.Unlock() if err != nil { slog.Error("descheduler: failed to run step", "error", err) + metricErrLabel = "true" return } slog.Info("descheduler: finished step") - lock.Lock() - defer lock.Unlock() decisionsByStep[stepName] = decisions }) } diff --git a/internal/scheduling/lib/detector_pipeline_test.go b/internal/scheduling/lib/detector_pipeline_test.go index 99ad1834b..9d14cc661 100644 --- a/internal/scheduling/lib/detector_pipeline_test.go +++ b/internal/scheduling/lib/detector_pipeline_test.go @@ -10,6 +10,8 @@ import ( "testing" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -379,3 +381,106 @@ func TestDetectorPipeline_RunWithMonitor(t *testing.T) { t.Errorf("expected 1 step result, got %d", len(result)) } } + +func TestDetectorPipeline_Run_MetricErrorLabel(t *testing.T) { + tests := []struct { + name string + steps map[string]Detector[mockDetection] + expectedErrLabel string + }{ + { + name: "successful run has error=false label", + steps: map[string]Detector[mockDetection]{ + "step1": 
&mockDetectorStep{ + decisions: []mockDetection{ + {resource: "vm1", host: "host1", reason: "reason1"}, + }, + }, + }, + expectedErrLabel: "false", + }, + { + name: "failed step sets error=true label", + steps: map[string]Detector[mockDetection]{ + "failing_step": &mockDetectorStep{ + runErr: errors.New("run failed"), + }, + }, + expectedErrLabel: "true", + }, + { + name: "one failing step among multiple sets error=true label", + steps: map[string]Detector[mockDetection]{ + "failing_step": &mockDetectorStep{ + runErr: errors.New("run failed"), + }, + "working_step": &mockDetectorStep{ + decisions: []mockDetection{ + {resource: "vm1", host: "host1", reason: "reason1"}, + }, + }, + }, + expectedErrLabel: "true", + }, + { + name: "skipped step does not set error=true label", + steps: map[string]Detector[mockDetection]{ + "skipped_step": &mockDetectorStep{ + runErr: ErrStepSkipped, + }, + }, + expectedErrLabel: "false", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + monitor := NewDetectorPipelineMonitor() + pipeline := &DetectorPipeline[mockDetection]{ + steps: tt.steps, + Monitor: monitor, + } + + pipeline.Run() + + // Verify the histogram has observations + count := testutil.CollectAndCount(monitor.pipelineRunTimer, "cortex_detector_pipeline_run_duration_seconds") + if count == 0 { + t.Errorf("expected histogram to have observations") + } + + // Gather metrics from the histogram and check the labels + reg := prometheus.NewRegistry() + reg.MustRegister(monitor.pipelineRunTimer) + families, err := reg.Gather() + if err != nil { + t.Fatalf("failed to gather metrics: %v", err) + } + + found := false + for _, family := range families { + if family.GetName() != "cortex_detector_pipeline_run_duration_seconds" { + continue + } + for _, metric := range family.GetMetric() { + for _, label := range metric.GetLabel() { + if label.GetName() == "error" && label.GetValue() == tt.expectedErrLabel { + found = true + } + // Verify opposite label is not 
present + oppositeLabel := "true" + if tt.expectedErrLabel == "true" { + oppositeLabel = "false" + } + if label.GetName() == "error" && label.GetValue() == oppositeLabel { + t.Errorf("expected metric to NOT have error=%s label", oppositeLabel) + } + } + } + } + if !found { + t.Errorf("expected metric to have error=%s label", tt.expectedErrLabel) + } + }) + } +} diff --git a/internal/scheduling/nova/hypervisor_overcommit_controller.go b/internal/scheduling/nova/hypervisor_overcommit_controller.go new file mode 100644 index 000000000..946fabec4 --- /dev/null +++ b/internal/scheduling/nova/hypervisor_overcommit_controller.go @@ -0,0 +1,245 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package nova + +import ( + "context" + "errors" + "fmt" + "maps" + "slices" + + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/cobaltcore-dev/cortex/pkg/multicluster" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/client-go/util/workqueue" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +// HypervisorOvercommitMapping maps hypervisor types to their desired +// overcommit ratios. This mapping will be loaded from a configmap +// that is mounted into the controller pod. +type HypervisorOvercommitMapping struct { + // Overcommit is the overcommit ratio to set for hypervisors by resource name. + // Values must be set to something >= 1.0, otherwise the controller will + // ignore them. + Overcommit map[hv1.ResourceName]float64 `json:"overcommit"` + + // HasTrait specifies a trait that a hypervisor may have, and that, if present, + // triggers the controller to set the overcommit ratio specified in the + // overcommit field for that hypervisor. 
+ HasTrait *string `json:"hasTrait,omitempty"` + + // HasntTrait specifies a trait that a hypervisor may have, and that, if + // NOT present, triggers the controller to set the overcommit ratio + // specified in the overcommit field for that hypervisor. + HasntTrait *string `json:"hasntTrait,omitempty"` +} + +// Validate the provided HypervisorOvercommitMapping, returning an error if the +// mapping is invalid. +func (m *HypervisorOvercommitMapping) Validate() error { + for resource, overcommit := range m.Overcommit { + if overcommit < 1.0 { + return errors.New("invalid overcommit ratio in config, must be >= 1.0. " + + "Invalid value for resource " + string(resource) + ": " + + fmt.Sprintf("%f", overcommit)) + } + } + // Has trait and hasn't trait are mutually exclusive, so if both are set + // we return an error. + if m.HasTrait != nil && m.HasntTrait != nil { + return errors.New("invalid overcommit mapping, hasTrait and hasntTrait are mutually exclusive") + } + // At least one of has trait and hasn't trait must be set, + // otherwise we don't know when to apply this mapping. + if m.HasTrait == nil && m.HasntTrait == nil { + return errors.New("invalid overcommit mapping, at least one of hasTrait and hasntTrait must be set") + } + return nil +} + +// HypervisorOvercommitConfig holds the configuration for the +// HypervisorOvercommitController and is loaded from a configmap that is mounted +// into the controller pod. +type HypervisorOvercommitConfig struct { + // OvercommitMappings is a list of mappings that map hypervisor traits to + // overcommit ratios. Note that this list is applied in order, so if there + // are multiple mappings applying to the same hypervisors, the last mapping + // in this list will override the previous ones. + OvercommitMappings []HypervisorOvercommitMapping `json:"overcommitMappings"` +} + +// Validate the provided HypervisorOvercommitConfig, returning an error if the +// config is invalid. 
+func (c *HypervisorOvercommitConfig) Validate() error { + // Check that all the individual mappings are valid. + for _, mapping := range c.OvercommitMappings { + if err := mapping.Validate(); err != nil { + return err + } + } + return nil +} + +// HypervisorOvercommitController is a controller that reconciles on the +// hypervisor crd and sets desired overcommit ratios based on the hypervisor +// type. +type HypervisorOvercommitController struct { + client.Client + + // config holds the configuration for the controller, which is loaded from a + // configmap that is mounted into the controller pod. + config HypervisorOvercommitConfig +} + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. +// +// For more details, check Reconcile and its Result here: +// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.20.2/pkg/reconcile +// +// For more details about the method shape, read up here: +// - https://ahmet.im/blog/controller-pitfalls/#reconcile-method-shape +func (c *HypervisorOvercommitController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := ctrl.LoggerFrom(ctx) + log.Info("Reconciling resource") + + obj := new(hv1.Hypervisor) + if err := c.Get(ctx, req.NamespacedName, obj); err != nil { + if apierrors.IsNotFound(err) { + // If the custom resource is not found then it usually means + // that it was deleted or not created. + log.Info("Resource not found. Ignoring since object must be deleted") + return ctrl.Result{}, nil + } + // Error reading the object - requeue the request. + log.Error(err, "Failed to get resource") + return ctrl.Result{}, err + } + + // Build desired overcommit ratios by iterating mappings in order. + // Later mappings override earlier ones for the same resource, preserving + // non-overlapping resources from previous mappings. 
+ desiredOvercommit := make(map[hv1.ResourceName]float64) + for _, mapping := range c.config.OvercommitMappings { + log.Info("Processing overcommit mapping", + "mapping", mapping, + "hypervisorTraits", obj.Status.Traits) + var applyMapping bool + switch { + // These are mutually exclusive. + case mapping.HasTrait != nil: + applyMapping = slices.Contains(obj.Status.Traits, *mapping.HasTrait) + case mapping.HasntTrait != nil: + applyMapping = !slices.Contains(obj.Status.Traits, *mapping.HasntTrait) + default: + // This should never happen due to validation, but we check it just in case. + log.Info("Skipping overcommit mapping with no trait specified", + "overcommit", mapping.Overcommit) + continue + } + if !applyMapping { + continue + } + log.Info("Applying overcommit mapping on hypervisor", + "overcommit", mapping.Overcommit) + maps.Copy(desiredOvercommit, mapping.Overcommit) + } + log.Info("Desired overcommit ratios based on traits", + "desiredOvercommit", desiredOvercommit) + if maps.Equal(desiredOvercommit, obj.Spec.Overcommit) { + log.Info("Overcommit ratios are up to date, no update needed") + return ctrl.Result{}, nil + } + + // Update the desired overcommit ratios on the hypervisor spec. + orig := obj.DeepCopy() + obj.Spec.Overcommit = desiredOvercommit + if err := c.Patch(ctx, obj, client.MergeFrom(orig)); err != nil { + log.Error(err, "Failed to update hypervisor overcommit ratios") + return ctrl.Result{}, err + } + log.Info("Updated hypervisor with new overcommit ratios", + "overcommit", desiredOvercommit) + + return ctrl.Result{}, nil +} + +// handleRemoteHypervisor is called by watches in remote clusters and triggers +// a reconcile on the hypervisor resource that was changed in the remote cluster. 
+func (c *HypervisorOvercommitController) handleRemoteHypervisor() handler.EventHandler { + handler := handler.Funcs{} + handler.CreateFunc = func(ctx context.Context, evt event.CreateEvent, + queue workqueue.TypedRateLimitingInterface[reconcile.Request]) { + + queue.Add(ctrl.Request{NamespacedName: client.ObjectKey{ + Name: evt.Object.(*hv1.Hypervisor).Name, // cluster-scoped crd + }}) + } + handler.UpdateFunc = func(ctx context.Context, evt event.UpdateEvent, + queue workqueue.TypedRateLimitingInterface[reconcile.Request]) { + + queue.Add(ctrl.Request{NamespacedName: client.ObjectKey{ + Name: evt.ObjectOld.(*hv1.Hypervisor).Name, // cluster-scoped crd + }}) + } + handler.DeleteFunc = func(ctx context.Context, evt event.DeleteEvent, + queue workqueue.TypedRateLimitingInterface[reconcile.Request]) { + + queue.Add(ctrl.Request{NamespacedName: client.ObjectKey{ + Name: evt.Object.(*hv1.Hypervisor).Name, // cluster-scoped crd + }}) + } + return handler +} + +// predicateRemoteHypervisor is used to filter events from remote clusters, +// so that only events for hypervisors that should be processed by this +// controller will trigger reconciliations. +func (c *HypervisorOvercommitController) predicateRemoteHypervisor() predicate.Predicate { + // Currently we're watching all hypervisors. In this way, if a trait + // gets removed from the hypervisor, we'll still reconcile this + // hypervisor and update the overcommit ratios accordingly. + return predicate.NewPredicateFuncs(func(object client.Object) bool { + _, ok := object.(*hv1.Hypervisor) + return ok + }) +} + +// SetupWithManager sets up the controller with the Manager and a multicluster +// client. The multicluster client is used to watch for changes in the +// Hypervisor CRD across all clusters and trigger reconciliations accordingly. +func (c *HypervisorOvercommitController) SetupWithManager(mgr ctrl.Manager) (err error) { + // This will load the config in a safe way and gracefully handle errors. 
+ c.config, err = conf.GetConfig[HypervisorOvercommitConfig]() + if err != nil { + return err + } + // Validate we don't have any weird values in the config. + if err := c.config.Validate(); err != nil { + return err + } + // Check that the provided client is a multicluster client, since we need + // that to watch for hypervisors across clusters. + mcl, ok := c.Client.(*multicluster.Client) + if !ok { + return errors.New("provided client must be a multicluster client") + } + return multicluster. + BuildController(mcl, mgr). + // The hypervisor crd may be distributed across multiple remote clusters. + WatchesMulticluster(&hv1.Hypervisor{}, + c.handleRemoteHypervisor(), + c.predicateRemoteHypervisor(), + ). + Named("hypervisor-overcommit-controller"). + Complete(c) +} diff --git a/internal/scheduling/nova/hypervisor_overcommit_controller_test.go b/internal/scheduling/nova/hypervisor_overcommit_controller_test.go new file mode 100644 index 000000000..e52669c3a --- /dev/null +++ b/internal/scheduling/nova/hypervisor_overcommit_controller_test.go @@ -0,0 +1,936 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package nova + +import ( + "context" + "errors" + "strings" + "testing" + + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/workqueue" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +func TestHypervisorOvercommitMapping_Validate(t *testing.T) { + gpuTrait := "CUSTOM_GPU" + standardTrait := "CUSTOM_STANDARD" + + tests := []struct { + name string + mapping HypervisorOvercommitMapping + expectError bool + }{ + { + name: "valid overcommit ratios with HasTrait", + mapping: HypervisorOvercommitMapping{ + 
Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + hv1.ResourceMemory: 1.5, + }, + HasTrait: &gpuTrait, + }, + expectError: false, + }, + { + name: "valid minimum overcommit ratio of 1.0 with HasntTrait", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, + }, + HasntTrait: &gpuTrait, + }, + expectError: false, + }, + { + name: "invalid overcommit ratio less than 1.0", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 0.5, + }, + HasTrait: &gpuTrait, + }, + expectError: true, + }, + { + name: "invalid overcommit ratio of zero", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 0.0, + }, + HasTrait: &gpuTrait, + }, + expectError: true, + }, + { + name: "invalid negative overcommit ratio", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: -1.0, + }, + HasTrait: &gpuTrait, + }, + expectError: true, + }, + { + name: "empty overcommit map is invalid", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + expectError: true, + }, + { + name: "nil overcommit map is invalid", + mapping: HypervisorOvercommitMapping{ + Overcommit: nil, + }, + expectError: true, + }, + { + name: "mixed valid and invalid overcommit ratios", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + hv1.ResourceMemory: 0.5, // invalid + }, + HasTrait: &gpuTrait, + }, + expectError: true, + }, + { + name: "invalid: both HasTrait and HasntTrait set", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &gpuTrait, + HasntTrait: &standardTrait, + }, + expectError: true, + }, + { + name: "invalid: neither HasTrait nor HasntTrait set with non-empty overcommit", + mapping: HypervisorOvercommitMapping{ + 
Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + }, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.mapping.Validate() + if tt.expectError && err == nil { + t.Error("expected error but got nil") + } + if !tt.expectError && err != nil { + t.Errorf("expected no error but got: %v", err) + } + }) + } +} + +func TestHypervisorOvercommitConfig_Validate(t *testing.T) { + gpuTrait := "CUSTOM_GPU" + standardTrait := "CUSTOM_STANDARD" + tests := []struct { + name string + config HypervisorOvercommitConfig + expectError bool + }{ + { + name: "valid config with single mapping", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectError: false, + }, + { + name: "valid config with multiple mappings", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &gpuTrait, + }, + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.5, + }, + HasntTrait: &standardTrait, + }, + }, + }, + expectError: false, + }, + { + name: "invalid config with bad overcommit ratio", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 0.5, // invalid ratio + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectError: true, + }, + { + name: "invalid config with both HasTrait and HasntTrait", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &gpuTrait, + HasntTrait: &standardTrait, + }, + }, + }, + expectError: true, + }, + { + name: "invalid config with neither HasTrait nor HasntTrait", + config: 
HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + }, + }, + }, + expectError: true, + }, + { + name: "empty config is valid", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{}, + }, + expectError: false, + }, + { + name: "nil mappings is valid", + config: HypervisorOvercommitConfig{ + OvercommitMappings: nil, + }, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.config.Validate() + if tt.expectError && err == nil { + t.Error("expected error but got nil") + } + if !tt.expectError && err != nil { + t.Errorf("expected no error but got: %v", err) + } + }) + } +} + +func newTestHypervisorScheme(t *testing.T) *runtime.Scheme { + t.Helper() + scheme := runtime.NewScheme() + if err := hv1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add hv1 to scheme: %v", err) + } + return scheme +} + +func TestHypervisorOvercommitController_Reconcile(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + gpuTrait := "CUSTOM_GPU" + standardTrait := "CUSTOM_STANDARD" + missingTrait := "CUSTOM_MISSING" + + tests := []struct { + name string + hypervisor *hv1.Hypervisor + config HypervisorOvercommitConfig + expectedOvercommit map[hv1.ResourceName]float64 + expectNoUpdate bool + expectNotFoundError bool + }{ + { + name: "apply overcommit for matching HasTrait", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + 
hv1.ResourceCPU: 4.0, + }, + }, + { + name: "apply overcommit for matching HasntTrait", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{}, // missing trait + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasntTrait: &missingTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + }, + { + name: "skip mapping when HasTrait not present", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_OTHER"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{}, + }, + { + name: "skip mapping when HasntTrait is present", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, // trait is present + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasntTrait: &gpuTrait, // should skip because GPU trait IS present + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{}, + }, + { + name: "later mappings override earlier ones", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, 
+ Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU", "CUSTOM_STANDARD"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &standardTrait, + }, + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, // should override the first + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + }, + { + name: "no update when overcommit already matches", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + expectNoUpdate: true, + }, + { + name: "skip mapping without trait specified", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + // No HasTrait or HasntTrait specified + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{}, + }, + { + name: "combine HasTrait and HasntTrait mappings", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + 
Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, // has GPU, doesn't have STANDARD + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.5, + }, + HasntTrait: &standardTrait, // STANDARD not present + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + hv1.ResourceMemory: 1.5, + }, + }, + { + name: "hypervisor not found", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nonexistent", + }, + }, + config: HypervisorOvercommitConfig{}, + expectNotFoundError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var fakeClient client.Client + if tt.expectNotFoundError { + // Don't add the hypervisor to the fake client + fakeClient = fake.NewClientBuilder(). + WithScheme(scheme). + Build() + } else { + fakeClient = fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(tt.hypervisor). 
+ Build() + } + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + config: tt.config, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: tt.hypervisor.Name, + }, + } + + ctx := context.Background() + result, err := controller.Reconcile(ctx, req) + + if err != nil { + t.Errorf("unexpected error: %v", err) + return + } + + if result.RequeueAfter > 0 { + t.Error("expected no requeue") + } + + if tt.expectNotFoundError { + // For not found case, we expect no error and no requeue + return + } + + // Get the updated hypervisor + updated := &hv1.Hypervisor{} + if err := fakeClient.Get(ctx, req.NamespacedName, updated); err != nil { + t.Fatalf("failed to get updated hypervisor: %v", err) + } + + // Check overcommit ratios + if len(updated.Spec.Overcommit) != len(tt.expectedOvercommit) { + t.Errorf("expected %d overcommit entries, got %d", + len(tt.expectedOvercommit), len(updated.Spec.Overcommit)) + } + + for resource, expected := range tt.expectedOvercommit { + actual, ok := updated.Spec.Overcommit[resource] + if !ok { + t.Errorf("expected overcommit for resource %s, but not found", resource) + continue + } + if actual != expected { + t.Errorf("expected overcommit %f for resource %s, got %f", + expected, resource, actual) + } + } + }) + } +} + +func TestHypervisorOvercommitController_ReconcileNotFound(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + config: HypervisorOvercommitConfig{}, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "nonexistent-hypervisor", + }, + } + + ctx := context.Background() + result, err := controller.Reconcile(ctx, req) + + if err != nil { + t.Errorf("expected no error for not found resource, got: %v", err) + } + + if result.RequeueAfter > 0 { + t.Error("expected no requeue for not found resource") + } +} + +// mockWorkQueue implements workqueue.TypedRateLimitingInterface for testing +type mockWorkQueue struct { + workqueue.TypedRateLimitingInterface[reconcile.Request] + items []reconcile.Request +} + +func (m *mockWorkQueue) Add(item reconcile.Request) { + m.items = append(m.items, item) +} + +func TestHypervisorOvercommitController_HandleRemoteHypervisor(t *testing.T) { + controller := &HypervisorOvercommitController{} + handler := controller.handleRemoteHypervisor() + + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + } + + ctx := context.Background() + + t.Run("CreateFunc", func(t *testing.T) { + queue := &mockWorkQueue{} + handler.Create(ctx, event.CreateEvent{Object: hypervisor}, queue) + + if len(queue.items) != 1 { + t.Errorf("expected 1 item in queue, got %d", len(queue.items)) + } + if queue.items[0].Name != "test-hypervisor" { + t.Errorf("expected hypervisor name 'test-hypervisor', got %s", queue.items[0].Name) + } + }) + + t.Run("UpdateFunc", func(t *testing.T) { + queue := &mockWorkQueue{} + handler.Update(ctx, event.UpdateEvent{ + ObjectOld: hypervisor, + ObjectNew: hypervisor, + }, queue) + + if len(queue.items) != 1 { + t.Errorf("expected 1 item in queue, got %d", len(queue.items)) + } + if queue.items[0].Name != "test-hypervisor" { + t.Errorf("expected hypervisor name 'test-hypervisor', got %s", queue.items[0].Name) + } + }) + + t.Run("DeleteFunc", func(t *testing.T) { + queue := &mockWorkQueue{} + 
handler.Delete(ctx, event.DeleteEvent{Object: hypervisor}, queue) + + if len(queue.items) != 1 { + t.Errorf("expected 1 item in queue, got %d", len(queue.items)) + } + if queue.items[0].Name != "test-hypervisor" { + t.Errorf("expected hypervisor name 'test-hypervisor', got %s", queue.items[0].Name) + } + }) +} + +func TestHypervisorOvercommitController_PredicateRemoteHypervisor(t *testing.T) { + controller := &HypervisorOvercommitController{} + predicate := controller.predicateRemoteHypervisor() + + t.Run("accepts Hypervisor objects", func(t *testing.T) { + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + } + + if !predicate.Generic(event.GenericEvent{Object: hypervisor}) { + t.Error("expected predicate to accept Hypervisor object") + } + }) + + t.Run("rejects non-Hypervisor objects", func(t *testing.T) { + // Create a non-Hypervisor object by using a different type + // We'll test with a nil object which should return false + type nonHypervisor struct { + client.Object + } + + if predicate.Generic(event.GenericEvent{Object: &nonHypervisor{}}) { + t.Error("expected predicate to reject non-Hypervisor object") + } + }) +} + +func TestHypervisorOvercommitController_SetupWithManager_InvalidClient(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + // Create a regular fake client (not a multicluster client) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + Build() + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + } + + // Create a minimal mock manager for testing + mgr := &mockManager{scheme: scheme} + + // SetupWithManager should fail - either because config loading fails + // (in test environment without config files) or because the client + // is not a multicluster client. 
+ err := controller.SetupWithManager(mgr) + if err == nil { + t.Error("expected error when calling SetupWithManager, got nil") + } + // The error could be either about missing config or about multicluster client + // depending on the test environment. We just verify an error is returned. +} + +// mockManager implements ctrl.Manager for testing SetupWithManager +type mockManager struct { + ctrl.Manager + scheme *runtime.Scheme +} + +func (m *mockManager) GetScheme() *runtime.Scheme { + return m.scheme +} + +// patchFailingClient wraps a client.Client and returns an error on Patch calls +type patchFailingClient struct { + client.Client + patchErr error +} + +func (c *patchFailingClient) Patch(ctx context.Context, obj client.Object, patch client.Patch, opts ...client.PatchOption) error { + return c.patchErr +} + +func TestHypervisorOvercommitController_Reconcile_PatchError(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + gpuTrait := "CUSTOM_GPU" + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + } + + // Create a fake client with the hypervisor, then wrap it to fail on Patch + baseClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hypervisor). 
+ Build() + + patchErr := errors.New("patch failed") + failingClient := &patchFailingClient{ + Client: baseClient, + patchErr: patchErr, + } + + controller := &HypervisorOvercommitController{ + Client: failingClient, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: hypervisor.Name, + }, + } + + ctx := context.Background() + _, err := controller.Reconcile(ctx, req) + + // Reconcile should return an error when Patch fails + if err == nil { + t.Error("expected error when Patch fails, got nil") + } + if !strings.Contains(err.Error(), "patch failed") { + t.Errorf("expected error message to contain 'patch failed', got: %v", err) + } +} + +func TestHypervisorOvercommitController_Reconcile_EmptyConfig(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hypervisor). 
+ Build() + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{}, + }, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: hypervisor.Name, + }, + } + + ctx := context.Background() + result, err := controller.Reconcile(ctx, req) + + if err != nil { + t.Errorf("unexpected error: %v", err) + } + + if result.RequeueAfter > 0 { + t.Error("expected no requeue") + } + + // Verify no changes were made + updated := &hv1.Hypervisor{} + if err := fakeClient.Get(ctx, req.NamespacedName, updated); err != nil { + t.Fatalf("failed to get updated hypervisor: %v", err) + } + + if len(updated.Spec.Overcommit) != 0 { + t.Errorf("expected empty overcommit, got %v", updated.Spec.Overcommit) + } +} + +func TestHypervisorOvercommitController_Reconcile_MultipleResources(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + gpuTrait := "CUSTOM_GPU" + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hypervisor). 
+ Build() + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + hv1.ResourceMemory: 1.5, + }, + HasTrait: &gpuTrait, + }, + }, + }, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: hypervisor.Name, + }, + } + + ctx := context.Background() + _, err := controller.Reconcile(ctx, req) + + if err != nil { + t.Errorf("unexpected error: %v", err) + } + + updated := &hv1.Hypervisor{} + if err := fakeClient.Get(ctx, req.NamespacedName, updated); err != nil { + t.Fatalf("failed to get updated hypervisor: %v", err) + } + + if len(updated.Spec.Overcommit) != 2 { + t.Errorf("expected 2 overcommit entries, got %d", len(updated.Spec.Overcommit)) + } + + if updated.Spec.Overcommit[hv1.ResourceCPU] != 4.0 { + t.Errorf("expected CPU overcommit 4.0, got %f", updated.Spec.Overcommit[hv1.ResourceCPU]) + } + + if updated.Spec.Overcommit[hv1.ResourceMemory] != 1.5 { + t.Errorf("expected Memory overcommit 1.5, got %f", updated.Spec.Overcommit[hv1.ResourceMemory]) + } +} diff --git a/internal/scheduling/nova/integration_test.go b/internal/scheduling/nova/integration_test.go index 137ac5a10..a1267c9c0 100644 --- a/internal/scheduling/nova/integration_test.go +++ b/internal/scheduling/nova/integration_test.go @@ -48,13 +48,13 @@ func newHypervisor(name, cpuCap, cpuAlloc, memCap, memAlloc string) *hv1.Hypervi Name: name, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpuCap), - "memory": resource.MustParse(memCap), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuCap), + hv1.ResourceMemory: resource.MustParse(memCap), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpuAlloc), - "memory": resource.MustParse(memAlloc), + Allocation: 
map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuAlloc), + hv1.ResourceMemory: resource.MustParse(memAlloc), }, }, } @@ -68,9 +68,9 @@ func newCommittedReservation(name, targetHost, observedHost, projectID, flavorNa Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeCommittedResource, TargetHost: targetHost, - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpu), - "memory": resource.MustParse(memory), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpu), + hv1.ResourceMemory: resource.MustParse(memory), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: projectID, @@ -100,9 +100,9 @@ func newFailoverReservation(name, targetHost, resourceGroup, cpu, memory string, Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeFailover, TargetHost: targetHost, - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpu), - "memory": resource.MustParse(memory), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpu), + hv1.ResourceMemory: resource.MustParse(memory), }, FailoverReservation: &v1alpha1.FailoverReservationSpec{ ResourceGroup: resourceGroup, diff --git a/internal/scheduling/nova/plugins/filters/filter_external_customer.go b/internal/scheduling/nova/plugins/filters/filter_external_customer.go index 62c059b10..56f73c8ac 100644 --- a/internal/scheduling/nova/plugins/filters/filter_external_customer.go +++ b/internal/scheduling/nova/plugins/filters/filter_external_customer.go @@ -37,8 +37,8 @@ func (s *FilterExternalCustomerStep) Run(traceLog *slog.Logger, request api.Exte result := s.IncludeAllHostsFromRequest(request) domainName, err := request.Spec.Data.GetSchedulerHintStr("domain_name") if err != nil { - traceLog.Error("failed to get domain_name scheduler hint", "error", err) - return nil, err + traceLog.Error("failed to get domain_name scheduler hint, skipping 
filter", "error", err) + return result, nil } if slices.Contains(s.Options.CustomerIgnoredDomainNames, domainName) { traceLog.Info("domain is no external customer domain, skipping filter", "domain", domainName) diff --git a/internal/scheduling/nova/plugins/filters/filter_external_customer_test.go b/internal/scheduling/nova/plugins/filters/filter_external_customer_test.go index fb9971a83..7ca313dc3 100644 --- a/internal/scheduling/nova/plugins/filters/filter_external_customer_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_external_customer_test.go @@ -245,7 +245,7 @@ func TestFilterExternalCustomerStep_Run(t *testing.T) { filteredHosts: []string{"host3"}, }, { - name: "Missing domain_name in scheduler hints - error", + name: "Missing domain_name in scheduler hints - skips filter, all hosts pass", opts: FilterExternalCustomerStepOpts{ CustomerDomainNamePrefixes: []string{"ext-"}, }, @@ -257,12 +257,14 @@ func TestFilterExternalCustomerStep_Run(t *testing.T) { }, Hosts: []api.ExternalSchedulerHost{ {ComputeHost: "host1"}, + {ComputeHost: "host3"}, }, }, - expectError: true, + expectedHosts: []string{"host1", "host3"}, + filteredHosts: []string{}, }, { - name: "Nil scheduler hints - error", + name: "Nil scheduler hints - skips filter, all hosts pass", opts: FilterExternalCustomerStepOpts{ CustomerDomainNamePrefixes: []string{"ext-"}, }, @@ -274,9 +276,11 @@ func TestFilterExternalCustomerStep_Run(t *testing.T) { }, Hosts: []api.ExternalSchedulerHost{ {ComputeHost: "host1"}, + {ComputeHost: "host2"}, }, }, - expectError: true, + expectedHosts: []string{"host1", "host2"}, + filteredHosts: []string{}, }, { name: "Case sensitive prefix matching", diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go index a90638eac..5e1f1dc3c 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go +++ 
b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go @@ -45,7 +45,7 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa result := s.IncludeAllHostsFromRequest(request) // This map holds the free resources per host. - freeResourcesByHost := make(map[string]map[string]resource.Quantity) + freeResourcesByHost := make(map[string]map[hv1.ResourceName]resource.Quantity) // The hypervisor resource auto-discovers its current utilization. // We can use the hypervisor status to calculate the total capacity @@ -56,8 +56,13 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa return nil, err } for _, hv := range hvs.Items { - // Start with the total capacity. - freeResourcesByHost[hv.Name] = hv.Status.Capacity + if hv.Status.EffectiveCapacity == nil { + traceLog.Warn("hypervisor with nil effective capacity, use capacity instead (overprovisioning not considered)", "host", hv.Name) + freeResourcesByHost[hv.Name] = hv.Status.Capacity + } else { + // Start with the total effective capacity which is capacity * overcommit ratio. + freeResourcesByHost[hv.Name] = hv.Status.EffectiveCapacity + } // Subtract allocated resources. for resourceName, allocated := range hv.Status.Allocation { @@ -145,7 +150,7 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa // For CR reservations with allocations, calculate remaining (unallocated) resources to block. // This prevents double-blocking of resources already consumed by running instances. 
- var resourcesToBlock map[string]resource.Quantity + var resourcesToBlock map[hv1.ResourceName]resource.Quantity if reservation.Spec.Type == v1alpha1.ReservationTypeCommittedResource && // if the reservation is not being migrated, block only unused resources reservation.Spec.TargetHost == reservation.Status.Host && @@ -154,7 +159,7 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa len(reservation.Spec.CommittedResourceReservation.Allocations) > 0 && len(reservation.Status.CommittedResourceReservation.Allocations) > 0 { // Start with full reservation resources - resourcesToBlock = make(map[string]resource.Quantity) + resourcesToBlock = make(map[hv1.ResourceName]resource.Quantity) for k, v := range reservation.Spec.Resources { resourcesToBlock[k] = v.DeepCopy() } diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go index cb998a286..452782484 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go @@ -39,13 +39,56 @@ func newHypervisor(name, cpuCap, cpuAlloc, memCap, memAlloc string) *hv1.Hypervi Name: name, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpuCap), - "memory": resource.MustParse(memCap), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuCap), + hv1.ResourceMemory: resource.MustParse(memCap), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpuAlloc), - "memory": resource.MustParse(memAlloc), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuAlloc), + hv1.ResourceMemory: resource.MustParse(memAlloc), + }, + }, + } +} + +// newHypervisorWithCapacityOnly creates a hypervisor with only Capacity set (no EffectiveCapacity). 
+func newHypervisorWithCapacityOnly(name, cpuCap, memCap string) *hv1.Hypervisor { + return &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Status: hv1.HypervisorStatus{ + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuCap), + hv1.ResourceMemory: resource.MustParse(memCap), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), + }, + }, + } +} + +// newHypervisorWithBothCapacities creates a hypervisor with both Capacity and EffectiveCapacity set. +// EffectiveCapacity is typically >= Capacity due to overcommit ratio. +func newHypervisorWithBothCapacities(name, cpuCap, cpuEffCap, memCap, memEffCap string) *hv1.Hypervisor { + return &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Status: hv1.HypervisorStatus{ + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuCap), + hv1.ResourceMemory: resource.MustParse(memCap), + }, + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpuEffCap), + hv1.ResourceMemory: resource.MustParse(memEffCap), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), }, }, } @@ -64,9 +107,9 @@ func newCommittedReservation( Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeCommittedResource, TargetHost: targetHost, - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpu), - "memory": resource.MustParse(memory), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpu), + hv1.ResourceMemory: resource.MustParse(memory), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: projectID, @@ -104,9 +147,9 @@ func newFailoverReservation(name, targetHost, cpu, memory string, allocations ma 
Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeFailover, TargetHost: targetHost, - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse(cpu), - "memory": resource.MustParse(memory), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(cpu), + hv1.ResourceMemory: resource.MustParse(memory), }, FailoverReservation: &v1alpha1.FailoverReservationSpec{ ResourceGroup: "m1.large", @@ -150,9 +193,9 @@ func crSpecAllocs(vms ...crVmAlloc) map[string]v1alpha1.CommittedResourceAllocat for _, v := range vms { allocs[v.uuid] = v1alpha1.CommittedResourceAllocation{ CreationTimestamp: metav1.Now(), - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse(v.cpu), - "memory": resource.MustParse(v.mem), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(v.cpu), + hv1.ResourceMemory: resource.MustParse(v.mem), }, } } @@ -440,7 +483,7 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { t.Run(tt.name, func(t *testing.T) { objects := make([]client.Object, 0, len(hypervisors)+len(tt.reservations)) for _, h := range hypervisors { - objects = append(objects, h) + objects = append(objects, h.DeepCopy()) } for _, r := range tt.reservations { objects = append(objects, r) @@ -469,3 +512,87 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { }) } } + +func TestFilterHasEnoughCapacity_NilEffectiveCapacityFallback(t *testing.T) { + scheme := buildTestScheme(t) + + tests := []struct { + name string + hypervisors []*hv1.Hypervisor + request api.ExternalSchedulerRequest + expectedHosts []string + filteredHosts []string + }{ + { + name: "Hypervisor with nil EffectiveCapacity uses Capacity fallback", + hypervisors: []*hv1.Hypervisor{ + newHypervisor("host1", "16", "8", "32Gi", "16Gi"), // has EffectiveCapacity: 8 CPU free, 16Gi free + newHypervisorWithCapacityOnly("host2", "8", "16Gi"), // nil EffectiveCapacity, uses Capacity: 8 CPU free, 16Gi free 
+ newHypervisorWithCapacityOnly("host3", "2", "4Gi"), // nil EffectiveCapacity, uses Capacity: 2 CPU free (not enough) + newHypervisorWithCapacityOnly("host4", "16", "32Gi"), // nil EffectiveCapacity, uses Capacity: 16 CPU free, 32Gi free + }, + request: newNovaRequest("instance-123", "project-A", "m1.small", "gp-1", 4, "8Gi", false, []string{"host1", "host2", "host3", "host4"}), + expectedHosts: []string{"host1", "host2", "host4"}, + filteredHosts: []string{"host3"}, + }, + { + name: "All hypervisors with nil EffectiveCapacity use Capacity fallback", + hypervisors: []*hv1.Hypervisor{ + newHypervisorWithCapacityOnly("host1", "8", "16Gi"), + newHypervisorWithCapacityOnly("host2", "4", "8Gi"), + }, + request: newNovaRequest("instance-123", "project-A", "m1.small", "gp-1", 4, "8Gi", false, []string{"host1", "host2"}), + expectedHosts: []string{"host1", "host2"}, + filteredHosts: []string{}, + }, + { + name: "EffectiveCapacity used when both are set (overcommit scenario)", + hypervisors: []*hv1.Hypervisor{ + // host1: Capacity=8 CPU, EffectiveCapacity=16 CPU (2x overcommit) + // With Capacity only: 8 free -> passes + // With EffectiveCapacity: 16 free -> passes (more capacity available) + newHypervisorWithBothCapacities("host1", "8", "16", "16Gi", "32Gi"), + // host2: Capacity=4 CPU, EffectiveCapacity=8 CPU (2x overcommit) + // With Capacity only: 4 free -> would be filtered (need 5) + // With EffectiveCapacity: 8 free -> passes + newHypervisorWithBothCapacities("host2", "4", "8", "8Gi", "16Gi"), + // host3: Capacity=4 CPU, EffectiveCapacity=4 CPU (no overcommit) + // Both: 4 free -> filtered (need 5) + newHypervisorWithBothCapacities("host3", "4", "4", "8Gi", "8Gi"), + }, + request: newNovaRequest("instance-123", "project-A", "m1.small", "gp-1", 5, "8Gi", false, []string{"host1", "host2", "host3"}), + expectedHosts: []string{"host1", "host2"}, + filteredHosts: []string{"host3"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + objects := 
make([]client.Object, 0, len(tt.hypervisors)) + for _, h := range tt.hypervisors { + objects = append(objects, h.DeepCopy()) + } + + step := &FilterHasEnoughCapacity{} + step.Client = fake.NewClientBuilder().WithScheme(scheme).WithObjects(objects...).Build() + step.Options = FilterHasEnoughCapacityOpts{LockReserved: false} + + result, err := step.Run(slog.Default(), tt.request) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + for _, host := range tt.expectedHosts { + if _, ok := result.Activations[host]; !ok { + t.Errorf("expected host %s to be present in activations, but got %+v", host, result.Activations) + } + } + + for _, host := range tt.filteredHosts { + if _, ok := result.Activations[host]; ok { + t.Errorf("expected host %s to be filtered out", host) + } + } + }) + } +} diff --git a/internal/scheduling/nova/plugins/filters/filter_packed_virtqueue.go b/internal/scheduling/nova/plugins/filters/filter_packed_virtqueue.go deleted file mode 100644 index dac317e59..000000000 --- a/internal/scheduling/nova/plugins/filters/filter_packed_virtqueue.go +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package filters - -import ( - "context" - "log/slog" - "slices" - - api "github.com/cobaltcore-dev/cortex/api/external/nova" - "github.com/cobaltcore-dev/cortex/internal/scheduling/lib" - hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" -) - -type FilterPackedVirtqueueStep struct { - lib.BaseFilter[api.ExternalSchedulerRequest, lib.EmptyFilterWeigherPipelineStepOpts] -} - -// If requested, only get hosts with packed virtqueues. -func (s *FilterPackedVirtqueueStep) Run(traceLog *slog.Logger, request api.ExternalSchedulerRequest) (*lib.FilterWeigherPipelineStepResult, error) { - result := s.IncludeAllHostsFromRequest(request) - // We don't care about the value. 
- _, reqInSpecs := request.Spec.Data.Flavor.Data.ExtraSpecs["hw:virtio_packed_ring"] - _, reqInProps := request.Spec.Data.Image.Data.Properties.Data["hw_virtio_packed_ring"] - if !reqInSpecs && !reqInProps { - traceLog.Info("no request for packed virtqueues, skipping filter") - return result, nil // No packed virtqueue requested, nothing to filter. - } - - hvs := &hv1.HypervisorList{} - if err := s.Client.List(context.Background(), hvs); err != nil { - traceLog.Error("failed to list hypervisors", "error", err) - return nil, err - } - hvsWithTrait := make(map[string]struct{}) - for _, hv := range hvs.Items { - traits := hv.Status.Traits - traits = append(traits, hv.Spec.CustomTraits...) - if !slices.Contains(traits, "COMPUTE_NET_VIRTIO_PACKED") { - continue - } - hvsWithTrait[hv.Name] = struct{}{} - } - - traceLog.Info("hosts with packed virtqueues", "hosts", hvsWithTrait) - for host := range result.Activations { - if _, ok := hvsWithTrait[host]; ok { - traceLog.Info("host has packed virtqueues, keeping", "host", host) - continue - } - delete(result.Activations, host) - traceLog.Info("filtering host without packed virtqueues", "host", host) - } - return result, nil -} - -func init() { - Index["filter_packed_virtqueue"] = func() NovaFilter { return &FilterPackedVirtqueueStep{} } -} diff --git a/internal/scheduling/nova/plugins/filters/filter_packed_virtqueue_test.go b/internal/scheduling/nova/plugins/filters/filter_packed_virtqueue_test.go deleted file mode 100644 index 82b68da81..000000000 --- a/internal/scheduling/nova/plugins/filters/filter_packed_virtqueue_test.go +++ /dev/null @@ -1,510 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package filters - -import ( - "log/slog" - "testing" - - api "github.com/cobaltcore-dev/cortex/api/external/nova" - hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client" - 
"sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestFilterPackedVirtqueueStep_Run(t *testing.T) { - scheme, err := hv1.SchemeBuilder.Build() - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - hvs := []client.Object{ - &hv1.Hypervisor{ - ObjectMeta: v1.ObjectMeta{ - Name: "host1", - }, - Status: hv1.HypervisorStatus{ - Traits: []string{"COMPUTE_NET_VIRTIO_PACKED"}, - }, - }, - &hv1.Hypervisor{ - ObjectMeta: v1.ObjectMeta{ - Name: "host2", - }, - Status: hv1.HypervisorStatus{ - Traits: []string{"COMPUTE_NET_VIRTIO_PACKED", "SOME_OTHER_TRAIT"}, - }, - }, - &hv1.Hypervisor{ - ObjectMeta: v1.ObjectMeta{ - Name: "host3", - }, - Status: hv1.HypervisorStatus{ - Traits: []string{"SOME_OTHER_TRAIT"}, - }, - }, - &hv1.Hypervisor{ - ObjectMeta: v1.ObjectMeta{ - Name: "host4", - }, - Status: hv1.HypervisorStatus{ - Traits: []string{}, - }, - }, - } - - tests := []struct { - name string - request api.ExternalSchedulerRequest - expectedHosts []string - filteredHosts []string - }{ - { - name: "No packed virtqueue requested - all hosts pass", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{}, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host2"}, - {ComputeHost: "host3"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: []string{"host1", "host2", "host3", "host4"}, - filteredHosts: []string{}, - }, - { - name: "Packed virtqueue requested in flavor extra specs", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": 
"true", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host2"}, - {ComputeHost: "host3"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: []string{"host1", "host2"}, - filteredHosts: []string{"host3", "host4"}, - }, - { - name: "Packed virtqueue requested in image properties", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{}, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{ - "hw_virtio_packed_ring": "true", - }, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host2"}, - {ComputeHost: "host3"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: []string{"host1", "host2"}, - filteredHosts: []string{"host3", "host4"}, - }, - { - name: "Packed virtqueue requested in both flavor and image", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{ - "hw_virtio_packed_ring": "true", - }, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host3"}, - }, - }, - expectedHosts: []string{"host1"}, - filteredHosts: []string{"host3"}, - }, - { - name: "Packed virtqueue with false value in flavor - still triggers filter", - request: api.ExternalSchedulerRequest{ - Spec: 
api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "false", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host2"}, - {ComputeHost: "host3"}, - }, - }, - expectedHosts: []string{"host1", "host2"}, - filteredHosts: []string{"host3"}, - }, - { - name: "Packed virtqueue with empty value in image - still triggers filter", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{}, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{ - "hw_virtio_packed_ring": "", - }, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: []string{"host1"}, - filteredHosts: []string{"host4"}, - }, - { - name: "No hosts with trait - all filtered", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host3"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: []string{}, - filteredHosts: []string{"host3", "host4"}, - }, - { - name: "All hosts have trait", - request: api.ExternalSchedulerRequest{ - Spec: 
api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host2"}, - }, - }, - expectedHosts: []string{"host1", "host2"}, - filteredHosts: []string{}, - }, - { - name: "Empty host list with packed virtqueue requested", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{}, - }, - expectedHosts: []string{}, - filteredHosts: []string{}, - }, - { - name: "Empty host list without packed virtqueue requested", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{}, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{}, - }, - expectedHosts: []string{}, - filteredHosts: []string{}, - }, - { - name: "Host not in database with packed virtqueue requested", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - }, 
- }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host-unknown"}, - }, - }, - expectedHosts: []string{"host1"}, - filteredHosts: []string{"host-unknown"}, - }, - { - name: "Packed virtqueue with additional extra specs", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - "hw:cpu_policy": "dedicated", - "hw:mem_page_size": "large", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host3"}, - }, - }, - expectedHosts: []string{"host1"}, - filteredHosts: []string{"host3"}, - }, - { - name: "Mixed hosts with and without trait", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{ - "hw:virtio_packed_ring": "true", - }, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{}, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host1"}, - {ComputeHost: "host2"}, - {ComputeHost: "host3"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: []string{"host1", "host2"}, - filteredHosts: []string{"host3", "host4"}, - }, - { - name: "Image property with additional properties", - request: api.ExternalSchedulerRequest{ - Spec: api.NovaObject[api.NovaSpec]{ - Data: api.NovaSpec{ - Flavor: 
api.NovaObject[api.NovaFlavor]{ - Data: api.NovaFlavor{ - ExtraSpecs: map[string]string{}, - }, - }, - Image: api.NovaObject[api.NovaImageMeta]{ - Data: api.NovaImageMeta{ - Properties: api.NovaObject[map[string]any]{ - Data: map[string]any{ - "hw_virtio_packed_ring": "true", - "hw_disk_bus": "virtio", - "hw_vif_model": "virtio", - }, - }, - }, - }, - }, - }, - Hosts: []api.ExternalSchedulerHost{ - {ComputeHost: "host2"}, - {ComputeHost: "host4"}, - }, - }, - expectedHosts: []string{"host2"}, - filteredHosts: []string{"host4"}, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - step := &FilterPackedVirtqueueStep{} - step.Client = fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(hvs...). - Build() - - result, err := step.Run(slog.Default(), tt.request) - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - // Check expected hosts are present - for _, host := range tt.expectedHosts { - if _, ok := result.Activations[host]; !ok { - t.Errorf("expected host %s to be present in activations", host) - } - } - - // Check filtered hosts are not present - for _, host := range tt.filteredHosts { - if _, ok := result.Activations[host]; ok { - t.Errorf("expected host %s to be filtered out", host) - } - } - - // Check total count - if len(result.Activations) != len(tt.expectedHosts) { - t.Errorf("expected %d hosts, got %d", len(tt.expectedHosts), len(result.Activations)) - } - }) - } -} diff --git a/internal/scheduling/nova/plugins/weighers/kvm_binpack.go b/internal/scheduling/nova/plugins/weighers/kvm_binpack.go index 1a3bd7573..e1509a4cc 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_binpack.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_binpack.go @@ -13,7 +13,6 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/internal/scheduling/lib" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - corev1 "k8s.io/api/core/v1" 
"k8s.io/apimachinery/pkg/api/resource" ) @@ -23,7 +22,7 @@ type KVMBinpackStepOpts struct { // node's resource utilizations after placing the VM. // If a resource is not specified, is ignored in the score calculation // (equivalent to a weight of 0). - ResourceWeights map[corev1.ResourceName]float64 `json:"resourceWeights"` + ResourceWeights map[hv1.ResourceName]float64 `json:"resourceWeights"` } // Validate the options to ensure they are correct before running the weigher. @@ -31,9 +30,9 @@ func (o KVMBinpackStepOpts) Validate() error { if len(o.ResourceWeights) == 0 { return errors.New("at least one resource weight must be specified") } - supportedResources := []corev1.ResourceName{ - corev1.ResourceMemory, - corev1.ResourceCPU, + supportedResources := []hv1.ResourceName{ + hv1.ResourceMemory, + hv1.ResourceCPU, } for resourceName, value := range o.ResourceWeights { if !slices.Contains(supportedResources, resourceName) { @@ -94,18 +93,19 @@ func (s *KVMBinpackStep) Run(traceLog *slog.Logger, request api.ExternalSchedule var totalWeightedUtilization, totalWeight float64 for resourceName, weight := range s.Options.ResourceWeights { - capacity, ok := hv.Status.Capacity[resourceName.String()] + // Effective capacity = capacity * overcommit ratio. 
+ capacity, ok := hv.Status.EffectiveCapacity[resourceName] if !ok { - traceLog.Warn("no capacity in status, skipping", + traceLog.Warn("no effective capacity in status, skipping", "host", host, "resource", resourceName) continue } if capacity.IsZero() { - traceLog.Warn("capacity is zero, skipping", + traceLog.Warn("effective capacity is zero, skipping", "host", host, "resource", resourceName) continue } - allocation, ok := hv.Status.Allocation[resourceName.String()] + allocation, ok := hv.Status.Allocation[resourceName] if !ok { traceLog.Warn("no allocation in status, skipping", "host", host, "resource", resourceName) @@ -138,15 +138,15 @@ func (s *KVMBinpackStep) Run(traceLog *slog.Logger, request api.ExternalSchedule } // calcVMResources calculates the total resource requests for the VM to be scheduled. -func (s *KVMBinpackStep) calcVMResources(req api.ExternalSchedulerRequest) map[corev1.ResourceName]resource.Quantity { - resources := make(map[corev1.ResourceName]resource.Quantity) +func (s *KVMBinpackStep) calcVMResources(req api.ExternalSchedulerRequest) map[hv1.ResourceName]resource.Quantity { + resources := make(map[hv1.ResourceName]resource.Quantity) resourcesMemBytes := int64(req.Spec.Data.Flavor.Data.MemoryMB * 1_000_000) //nolint:gosec // memory values are bounded by Nova resourcesMemBytes *= int64(req.Spec.Data.NumInstances) //nolint:gosec // instance count is bounded by Nova - resources[corev1.ResourceMemory] = *resource. + resources[hv1.ResourceMemory] = *resource. NewQuantity(resourcesMemBytes, resource.DecimalSI) resourcesCPU := int64(req.Spec.Data.Flavor.Data.VCPUs) //nolint:gosec // vCPU values are bounded by Nova resourcesCPU *= int64(req.Spec.Data.NumInstances) //nolint:gosec // instance count is bounded by Nova - resources[corev1.ResourceCPU] = *resource. + resources[hv1.ResourceCPU] = *resource. 
NewQuantity(resourcesCPU, resource.DecimalSI) return resources } diff --git a/internal/scheduling/nova/plugins/weighers/kvm_binpack_test.go b/internal/scheduling/nova/plugins/weighers/kvm_binpack_test.go index dde381e71..69e1aa9f6 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_binpack_test.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_binpack_test.go @@ -10,7 +10,6 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" @@ -23,13 +22,13 @@ func newHypervisor(name, capacityCPU, capacityMem, allocationCPU, allocationMem Name: name, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse(capacityCPU), - "memory": resource.MustParse(capacityMem), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(capacityCPU), + hv1.ResourceMemory: resource.MustParse(capacityMem), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse(allocationCPU), - "memory": resource.MustParse(allocationMem), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(allocationCPU), + hv1.ResourceMemory: resource.MustParse(allocationMem), }, }, } @@ -81,9 +80,9 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "valid opts with memory and cpu weights", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, + hv1.ResourceCPU: 1.0, }, }, wantErr: false, @@ -91,9 +90,9 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "inverted weights should raise error", opts: KVMBinpackStepOpts{ - ResourceWeights: 
map[corev1.ResourceName]float64{ - corev1.ResourceMemory: -1.0, - corev1.ResourceCPU: -1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: -1.0, + hv1.ResourceCPU: -1.0, }, }, wantErr: true, @@ -101,9 +100,9 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "zero weights should raise error", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 0.0, - corev1.ResourceCPU: 0.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 0.0, + hv1.ResourceCPU: 0.0, }, }, wantErr: true, @@ -111,8 +110,8 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "valid opts with only memory weight", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 2.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 2.0, }, }, wantErr: false, @@ -120,8 +119,8 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "valid opts with only cpu weight", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 0.5, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 0.5, }, }, wantErr: false, @@ -129,9 +128,9 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "zero weights should raise error", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 0.0, - corev1.ResourceCPU: 0.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 0.0, + hv1.ResourceCPU: 0.0, }, }, wantErr: true, @@ -139,7 +138,7 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { { name: "valid opts with empty resource weights", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{}, + ResourceWeights: map[hv1.ResourceName]float64{}, }, wantErr: true, }, @@ -148,30 +147,10 @@ func TestKVMBinpackStepOpts_Validate(t *testing.T) { opts: KVMBinpackStepOpts{}, wantErr: true, }, - { 
- name: "invalid opts with unsupported resource", - opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceStorage: 1.0, - }, - }, - wantErr: true, - errMsg: "unsupported resource", - }, - { - name: "invalid opts with unsupported ephemeral-storage resource", - opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceEphemeralStorage: 1.0, - }, - }, - wantErr: true, - errMsg: "unsupported resource", - }, { name: "invalid opts with custom unsupported resource", opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ + ResourceWeights: map[hv1.ResourceName]float64{ "nvidia.com/gpu": 1.0, }, }, @@ -221,8 +200,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 1, []string{"host1", "host2"}), // 8Gi memory opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ // with 0.1 tolerance @@ -243,8 +222,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 1, []string{"host1", "host2"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ // with 0.1 tolerance @@ -261,9 +240,9 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 1, []string{"host1", "host2"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ // with 0.1 tolerance @@ -281,9 +260,9 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: 
newBinpackRequest(8192, 4, 1, []string{"host1"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 2.0, - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ // with 0.1 tolerance @@ -299,8 +278,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 2, []string{"host1"}), // 2 instances opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ // with 0.1 tolerance @@ -314,8 +293,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { hypervisors: []*hv1.Hypervisor{}, request: newBinpackRequest(8192, 4, 1, []string{"host1", "host2"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -333,8 +312,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 1, []string{"host1", "host2"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -351,7 +330,7 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 1, []string{"host1"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{}, + ResourceWeights: map[hv1.ResourceName]float64{}, }, expectedWeights: map[string]float64{ "host1": 0, // No weights configured, score is 0 @@ -364,21 +343,21 @@ func TestKVMBinpackStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: 
map[string]resource.Quantity{ - "cpu": resource.MustParse("0"), - "memory": resource.MustParse("100Gi"), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("100Gi"), }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("0"), - "memory": resource.MustParse("80Gi"), + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("80Gi"), }, }, }, }, request: newBinpackRequest(8192, 4, 1, []string{"host1"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -392,10 +371,10 @@ func TestKVMBinpackStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("100"), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("100"), }, - Allocation: map[string]resource.Quantity{ + Allocation: map[hv1.ResourceName]resource.Quantity{ // No CPU allocation }, }, @@ -403,8 +382,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(8192, 4, 1, []string{"host1"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -418,19 +397,19 @@ func TestKVMBinpackStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ // No CPU capacity }, - Allocation: map[string]resource.Quantity{ - "cpu": resource.MustParse("80"), + Allocation: 
map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("80"), }, }, }, }, request: newBinpackRequest(8192, 4, 1, []string{"host1"}), opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -446,8 +425,8 @@ func TestKVMBinpackStep_Run(t *testing.T) { }, request: newBinpackRequest(20480, 20, 1, []string{"host1"}), // 20Gi, 20 CPUs - more than available opts: KVMBinpackStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -547,7 +526,7 @@ func TestKVMBinpackStep_calcVMResources(t *testing.T) { step := &KVMBinpackStep{} resources := step.calcVMResources(tt.request) - memResource, ok := resources[corev1.ResourceMemory] + memResource, ok := resources[hv1.ResourceMemory] if !ok { t.Error("expected memory resource to be present") } else { @@ -557,7 +536,7 @@ func TestKVMBinpackStep_calcVMResources(t *testing.T) { } } - cpuResource, ok := resources[corev1.ResourceCPU] + cpuResource, ok := resources[hv1.ResourceCPU] if !ok { t.Error("expected CPU resource to be present") } else { diff --git a/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation_test.go b/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation_test.go index 9c3ace3ec..0664e55d4 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation_test.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation_test.go @@ -49,9 +49,9 @@ func newFailoverReservation(name, targetHost string, failed bool, allocations ma Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeFailover, TargetHost: targetHost, - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse("4"), - "memory": resource.MustParse("8Gi"), + 
Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("4"), + hv1.ResourceMemory: resource.MustParse("8Gi"), }, FailoverReservation: &v1alpha1.FailoverReservationSpec{ ResourceGroup: "m1.large", @@ -84,9 +84,9 @@ func newCommittedReservation(name, targetHost string) *v1alpha1.Reservation { Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeCommittedResource, TargetHost: targetHost, - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse("4"), - "memory": resource.MustParse("8Gi"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("4"), + hv1.ResourceMemory: resource.MustParse("8Gi"), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ProjectID: "project-A", diff --git a/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts.go b/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts.go index 1bb070592..b65a5f75f 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts.go @@ -13,7 +13,6 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/internal/scheduling/lib" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" ) @@ -23,7 +22,7 @@ type KVMPreferSmallerHostsStepOpts struct { // of the normalized distances from the smallest capacity for each resource. // If a resource is not specified, it is ignored in the score calculation // (equivalent to a weight of 0). - ResourceWeights map[corev1.ResourceName]float64 `json:"resourceWeights"` + ResourceWeights map[hv1.ResourceName]float64 `json:"resourceWeights"` } // Validate the options to ensure they are correct before running the weigher. 
@@ -31,9 +30,9 @@ func (o KVMPreferSmallerHostsStepOpts) Validate() error { if len(o.ResourceWeights) == 0 { return errors.New("at least one resource weight must be specified") } - supportedResources := []corev1.ResourceName{ - corev1.ResourceMemory, - corev1.ResourceCPU, + supportedResources := []hv1.ResourceName{ + hv1.ResourceMemory, + hv1.ResourceCPU, } for resourceName, val := range o.ResourceWeights { if val < 0 { @@ -73,8 +72,8 @@ func (s *KVMPreferSmallerHostsStep) Run(traceLog *slog.Logger, request api.Exter } // Calculate smallest and largest capacity for each resource across active hosts - smallest := make(map[corev1.ResourceName]*resource.Quantity) - largest := make(map[corev1.ResourceName]*resource.Quantity) + smallest := make(map[hv1.ResourceName]*resource.Quantity) + largest := make(map[hv1.ResourceName]*resource.Quantity) for resourceName := range s.Options.ResourceWeights { for _, hv := range hvs.Items { @@ -82,9 +81,10 @@ func (s *KVMPreferSmallerHostsStep) Run(traceLog *slog.Logger, request api.Exter if _, ok := result.Activations[hv.Name]; !ok { continue } - capacity, ok := hv.Status.Capacity[resourceName.String()] + // Effective capacity = capacity * overcommit ratio. 
+ capacity, ok := hv.Status.EffectiveCapacity[resourceName] if !ok { - traceLog.Warn("hypervisor has no capacity for resource, skipping", + traceLog.Warn("hypervisor has no effective capacity for resource, skipping", "host", hv.Name, "resource", resourceName) continue } @@ -107,9 +107,9 @@ func (s *KVMPreferSmallerHostsStep) Run(traceLog *slog.Logger, request api.Exter var totalWeightedScore, totalWeight float64 for resourceName, weight := range s.Options.ResourceWeights { - capacity, ok := hv.Status.Capacity[resourceName.String()] + capacity, ok := hv.Status.EffectiveCapacity[resourceName] if !ok { - traceLog.Warn("hypervisor has no capacity for resource, skipping", + traceLog.Warn("hypervisor has no effective capacity for resource, skipping", "host", hv.Name, "resource", resourceName) continue } @@ -118,14 +118,14 @@ func (s *KVMPreferSmallerHostsStep) Run(traceLog *slog.Logger, request api.Exter largestCap := largest[resourceName] if smallestCap == nil || largestCap == nil { - traceLog.Warn("no capacity range found for resource, skipping", + traceLog.Warn("no effective capacity range found for resource, skipping", "resource", resourceName) continue } - // If all hosts have the same capacity for this resource, skip it + // If all hosts have the same effective capacity for this resource, skip it if smallestCap.Cmp(*largestCap) == 0 { - traceLog.Info("all hypervisors have the same capacity for resource, skipping", + traceLog.Info("all hypervisors have the same effective capacity for resource, skipping", "resource", resourceName) continue } diff --git a/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts_test.go b/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts_test.go index 545c124ab..2ab2deb89 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts_test.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_prefer_smaller_hosts_test.go @@ -10,7 +10,6 @@ import ( api 
"github.com/cobaltcore-dev/cortex/api/external/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" @@ -23,9 +22,9 @@ func newHypervisorWithCapacity(name, capacityCPU, capacityMem string) *hv1.Hyper Name: name, }, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse(capacityCPU), - "memory": resource.MustParse(capacityMem), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse(capacityCPU), + hv1.ResourceMemory: resource.MustParse(capacityMem), }, }, } @@ -77,9 +76,9 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "valid opts with memory and cpu weights", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, + hv1.ResourceCPU: 1.0, }, }, wantErr: false, @@ -87,8 +86,8 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "valid opts with only memory weight", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 2.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 2.0, }, }, wantErr: false, @@ -96,8 +95,8 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "valid opts with only cpu weight", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 0.5, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 0.5, }, }, wantErr: false, @@ -105,9 +104,9 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "valid opts with zero weights", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: 
map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 0.0, - corev1.ResourceCPU: 0.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 0.0, + hv1.ResourceCPU: 0.0, }, }, wantErr: false, @@ -115,7 +114,7 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "invalid opts with empty resource weights", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{}, + ResourceWeights: map[hv1.ResourceName]float64{}, }, wantErr: true, errMsg: "at least one resource weight must be specified", @@ -129,8 +128,8 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "invalid opts with negative weight", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: -1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: -1.0, }, }, wantErr: true, @@ -139,37 +138,17 @@ func TestKVMPreferSmallerHostsStepOpts_Validate(t *testing.T) { { name: "invalid opts with negative cpu weight", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: -0.5, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: -0.5, }, }, wantErr: true, errMsg: "resource weights must be greater than or equal to zero", }, - { - name: "invalid opts with unsupported resource", - opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceStorage: 1.0, - }, - }, - wantErr: true, - errMsg: "unsupported resource", - }, - { - name: "invalid opts with unsupported ephemeral-storage resource", - opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceEphemeralStorage: 1.0, - }, - }, - wantErr: true, - errMsg: "unsupported resource", - }, { name: "invalid opts with custom unsupported resource", opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ + 
ResourceWeights: map[hv1.ResourceName]float64{ "nvidia.com/gpu": 1.0, }, }, @@ -216,8 +195,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -236,8 +215,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -259,9 +238,9 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -284,9 +263,9 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 2.0, // memory is weighted 2x - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 2.0, // memory is weighted 2x + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ @@ -305,8 +284,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: 
map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -324,8 +303,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -343,8 +322,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -358,8 +337,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { hypervisors: []*hv1.Hypervisor{}, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -378,8 +357,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -397,8 +376,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host3"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "cpu": resource.MustParse("100"), + 
EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("100"), // No memory capacity }, }, @@ -406,8 +385,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -427,8 +406,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { // Only host1 and host2 in the request (host3 was filtered out) request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -446,8 +425,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -468,8 +447,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3", "host4"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -487,20 +466,20 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{}, + EffectiveCapacity: 
map[hv1.ResourceName]resource.Quantity{}, }, }, { ObjectMeta: metav1.ObjectMeta{Name: "host2"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{}, + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{}, }, }, }, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -518,8 +497,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -538,8 +517,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, }, }, expectedWeights: map[string]float64{ @@ -555,8 +534,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "memory": resource.MustParse("64Gi"), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("64Gi"), // No CPU }, }, @@ -564,8 +543,8 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "host2"}, Status: hv1.HypervisorStatus{ - Capacity: map[string]resource.Quantity{ - "memory": resource.MustParse("128Gi"), + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: 
resource.MustParse("128Gi"), // No CPU }, }, @@ -573,9 +552,9 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 1.0, - corev1.ResourceCPU: 1.0, // CPU requested but not available + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.0, + hv1.ResourceCPU: 1.0, // CPU requested but not available }, }, expectedWeights: map[string]float64{ @@ -594,9 +573,9 @@ func TestKVMPreferSmallerHostsStep_Run(t *testing.T) { }, request: newPreferSmallerHostsRequest([]string{"host1", "host2", "host3"}), opts: KVMPreferSmallerHostsStepOpts{ - ResourceWeights: map[corev1.ResourceName]float64{ - corev1.ResourceMemory: 0.0, // zero weight - ignored - corev1.ResourceCPU: 1.0, + ResourceWeights: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 0.0, // zero weight - ignored + hv1.ResourceCPU: 1.0, }, }, expectedWeights: map[string]float64{ diff --git a/internal/scheduling/reservations/commitments/api.go b/internal/scheduling/reservations/commitments/api.go new file mode 100644 index 000000000..9d8fd5944 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api.go @@ -0,0 +1,39 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "net/http" + "sync" + + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// HTTPAPI implements Limes LIQUID commitment validation endpoints. 
+type HTTPAPI struct { + client client.Client + config Config + // Mutex to serialize change-commitments requests + changeMutex sync.Mutex +} + +func NewAPI(client client.Client) *HTTPAPI { + return NewAPIWithConfig(client, DefaultConfig()) +} + +func NewAPIWithConfig(client client.Client, config Config) *HTTPAPI { + return &HTTPAPI{ + client: client, + config: config, + } +} + +func (api *HTTPAPI) Init(mux *http.ServeMux) { + mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments) + // mux.HandleFunc("/v1/report-capacity", api.HandleReportCapacity) + mux.HandleFunc("/v1/commitments/info", api.HandleInfo) +} + +var commitmentApiLog = ctrl.Log.WithName("commitment_api") diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go new file mode 100644 index 000000000..1c6276ade --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -0,0 +1,377 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "sort" + "strings" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/go-logr/logr" + . "github.com/majewsky/gg/option" + "github.com/sapcc/go-api-declarations/liquid" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// sortedKeys returns map keys sorted alphabetically for deterministic iteration. 
+func sortedKeys[K ~string, V any](m map[K]V) []K { + keys := make([]K, 0, len(m)) + for k := range m { + keys = append(keys, k) + } + sort.Slice(keys, func(i, j int) bool { + return string(keys[i]) < string(keys[j]) + }) + return keys +} + +// implements POST /v1/change-commitments from Limes LIQUID API: +// See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid +// +// This endpoint handles commitment changes by creating/updating/deleting Reservation CRDs based on the commitment lifecycle. +// A request may contain multiple commitment changes which are processed in a single transaction. If any change fails, all changes are rolled back. +func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Request) { + // Serialize all change-commitments requests + api.changeMutex.Lock() + defer api.changeMutex.Unlock() + + // Extract or generate request ID for tracing + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = fmt.Sprintf("req-%d", time.Now().UnixNano()) + } + log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/change-commitments") + + // Only accept POST method + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + // Parse request body + var req liquid.CommitmentChangeRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + log.Error(err, "invalid request body") + http.Error(w, "Invalid request body: "+err.Error(), http.StatusBadRequest) + return + } + + log.Info("received change commitments request", "affectedProjects", len(req.ByProject), "dryRun", req.DryRun, "availabilityZone", req.AZ) + + // Initialize response + resp := liquid.CommitmentChangeResponse{} + + // Check for dry run -> early reject, not supported yet + if req.DryRun { + resp.RejectionReason = "Dry run not supported yet" + log.Info("rejecting dry run 
request") + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(resp); err != nil { + return + } + return + } + + // Process commitment changes + // For now, we'll implement a simplified path that checks capacity for immediate start CRs + if err := api.processCommitmentChanges(w, log, req, &resp); err != nil { + // Error already written to response by processCommitmentChanges + return + } + + // Return response + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(resp); err != nil { + return + } +} + +func (api *HTTPAPI) processCommitmentChanges(w http.ResponseWriter, log logr.Logger, req liquid.CommitmentChangeRequest, resp *liquid.CommitmentChangeResponse) error { + ctx := context.Background() + manager := NewReservationManager(api.client) + requireRollback := false + failedCommitments := make(map[string]string) // commitmentUUID to reason for failure, for better response messages in case of rollback + log.Info("processing commitment change request", "availabilityZone", req.AZ, "dryRun", req.DryRun, "affectedProjects", len(req.ByProject)) + + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: api.client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + log.Info("failed to get flavor groups from knowledge extractor", "error", err) + resp.RejectionReason = "caches not ready" + retryTime := time.Now().Add(1 * time.Minute) + resp.RetryAt = Some(retryTime) + return nil + } + + // Validate InfoVersion from request matches current version (= last content change of flavor group knowledge) + var currentVersion int64 = -1 + if knowledgeCRD, err := knowledge.Get(ctx); err == nil && knowledgeCRD != nil && !knowledgeCRD.Status.LastContentChange.IsZero() { + currentVersion = knowledgeCRD.Status.LastContentChange.Unix() + } + + if req.InfoVersion != currentVersion { + log.Info("version mismatch in 
commitment change request", + "requestVersion", req.InfoVersion, + "currentVersion", currentVersion) + http.Error(w, fmt.Sprintf("Version mismatch: request version %d, current version %d. Please refresh and retry.", + req.InfoVersion, currentVersion), http.StatusConflict) + return errors.New("version mismatch") + } + + statesBefore := make(map[string]*CommitmentState) // map of commitmentID to existing state for rollback + var reservationsToWatch []v1alpha1.Reservation + + if req.DryRun { + resp.RejectionReason = "Dry run not supported yet" + return nil + } + +ProcessLoop: + for _, projectID := range sortedKeys(req.ByProject) { + projectChanges := req.ByProject[projectID] + for _, resourceName := range sortedKeys(projectChanges.ByResource) { + resourceChanges := projectChanges.ByResource[resourceName] + // Validate resource name pattern (instances_group_*) + flavorGroupName, err := getFlavorGroupNameFromResource(string(resourceName)) + if err != nil { + resp.RejectionReason = fmt.Sprintf("project with unknown resource name %s: %v", projectID, err) + requireRollback = true + break ProcessLoop + } + + // Verify flavor group exists in Knowledge CRDs + flavorGroup, flavorGroupExists := flavorGroups[flavorGroupName] + if !flavorGroupExists { + resp.RejectionReason = "flavor group not found: " + flavorGroupName + requireRollback = true + break ProcessLoop + } + + for _, commitment := range resourceChanges.Commitments { + // Additional per-commitment validation if needed + log.Info("processing commitment change", "commitmentUUID", commitment.UUID, "projectID", projectID, "resourceName", resourceName, "oldStatus", commitment.OldStatus.UnwrapOr("none"), "newStatus", commitment.NewStatus.UnwrapOr("none")) + + // TODO add configurable upper limit validation for commitment size (number of instances) to prevent excessive reservation creation + // TODO add domain + + // List all committed resource reservations, then filter by name prefix + var all_reservations 
v1alpha1.ReservationList + if err := api.client.List(ctx, &all_reservations, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + failedCommitments[string(commitment.UUID)] = "failed to list reservations" + log.Info(fmt.Sprintf("failed to list reservations for commitment %s: %v", commitment.UUID, err)) + requireRollback = true + break ProcessLoop + } + + // Filter by name prefix to find reservations for this commitment + namePrefix := fmt.Sprintf("commitment-%s-", string(commitment.UUID)) + var existing_reservations v1alpha1.ReservationList + for _, res := range all_reservations.Items { + if len(res.Name) >= len(namePrefix) && res.Name[:len(namePrefix)] == namePrefix { + existing_reservations.Items = append(existing_reservations.Items, res) + } + } + + var stateBefore *CommitmentState + if len(existing_reservations.Items) == 0 { + stateBefore = &CommitmentState{ + CommitmentUUID: string(commitment.UUID), + ProjectID: string(projectID), + FlavorGroupName: flavorGroupName, + TotalMemoryBytes: 0, + } + } else { + stateBefore, err = FromReservations(existing_reservations.Items) + if err != nil { + failedCommitments[string(commitment.UUID)] = "failed to parse existing commitment reservations" + log.Info(fmt.Sprintf("failed to get existing state for commitment %s: %v", commitment.UUID, err)) + requireRollback = true + break ProcessLoop + } + } + statesBefore[string(commitment.UUID)] = stateBefore + + // get desired state + stateDesired, err := FromChangeCommitmentTargetState(commitment, string(projectID), flavorGroupName, flavorGroup, string(req.AZ)) + if err != nil { + failedCommitments[string(commitment.UUID)] = "failed to determine desired commitment state" + log.Info(fmt.Sprintf("failed to get desired state for commitment %s: %v", commitment.UUID, err)) + requireRollback = true + break ProcessLoop + } + + log.Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldState", 
stateBefore, "desiredState", stateDesired) + + touchedReservations, deletedReservations, err := manager.ApplyCommitmentState(ctx, log, stateDesired, flavorGroups, "changeCommitmentsApi") + if err != nil { + failedCommitments[string(commitment.UUID)] = "failed to apply commitment state" + log.Info(fmt.Sprintf("failed to apply commitment state for commitment %s: %v", commitment.UUID, err)) + requireRollback = true + break ProcessLoop + } + log.Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(touchedReservations), "deletedReservations", len(deletedReservations)) + reservationsToWatch = append(reservationsToWatch, touchedReservations...) + } + } + } + + // TODO make the rollback defer safe + if !requireRollback { + log.Info("applied commitment changes, now watching for reservation readiness", "reservationsToWatch", len(reservationsToWatch)) + + time_start := time.Now() + + if failedReservations, errors := watchReservationsUntilReady(ctx, log, api.client, reservationsToWatch, api.config.ChangeAPIWatchReservationsTimeout, api.config.ChangeAPIWatchReservationsPollInterval); len(failedReservations) > 0 || len(errors) > 0 { + log.Info("reservations failed to become ready, initiating rollback", + "failedReservations", len(failedReservations), + "errors", errors) + + for _, res := range failedReservations { + failedCommitments[res.Spec.CommittedResourceReservation.CommitmentUUID] = "not sufficient capacity" + } + if len(failedReservations) == 0 { + resp.RejectionReason += "timeout reached while processing commitment changes" + } + requireRollback = true + } + + log.Info("finished watching reservation", "totalSchedulingTimeSeconds", time.Since(time_start).Seconds()) + } + + if requireRollback { + // Build rejection reason from failed commitments + if len(failedCommitments) > 0 { + var reasonBuilder strings.Builder + reasonBuilder.WriteString(fmt.Sprintf("%d commitment(s) failed to apply: ", len(failedCommitments))) + for 
commitmentUUID, reason := range failedCommitments { + reasonBuilder.WriteString(fmt.Sprintf("\n- commitment %s: %s", commitmentUUID, reason)) + } + resp.RejectionReason = reasonBuilder.String() + } + + log.Info("rollback of commitment changes") + for commitmentUUID, state := range statesBefore { + // Rollback to statesBefore for this commitment + log.Info("applying rollback for commitment", "commitmentUUID", commitmentUUID, "stateBefore", state) + _, _, err := manager.ApplyCommitmentState(ctx, log, state, flavorGroups, "changeCommitmentsApiRollback") + if err != nil { + log.Info("failed to apply rollback state for commitment", "commitmentUUID", commitmentUUID, "error", err) + // continue with best effort rollback for other projects + } + } + + log.Info("finished applying rollbacks for commitment changes", "reasonOfRollback", resp.RejectionReason) + return nil + } + + log.Info("commitment changes accepted") + return nil +} + +// watchReservationsUntilReady polls until all reservations reach Ready=True or timeout. 
+func watchReservationsUntilReady( + ctx context.Context, + log logr.Logger, + k8sClient client.Client, + reservations []v1alpha1.Reservation, + timeout time.Duration, + pollInterval time.Duration, +) (failedReservations []v1alpha1.Reservation, errors []error) { + + if len(reservations) == 0 { + return failedReservations, nil + } + + deadline := time.Now().Add(timeout) + + reservationsToWatch := make([]v1alpha1.Reservation, len(reservations)) + copy(reservationsToWatch, reservations) + + for { + var stillWaiting []v1alpha1.Reservation + if time.Now().After(deadline) { + errors = append(errors, fmt.Errorf("timeout after %v waiting for reservations to become ready", timeout)) + return failedReservations, errors + } + + allChecked := true + + for _, res := range reservationsToWatch { + // Fetch current state + var current v1alpha1.Reservation + nn := types.NamespacedName{ + Name: res.Name, + Namespace: res.Namespace, + } + + if err := k8sClient.Get(ctx, nn, ¤t); err != nil { + allChecked = false + // Reservation is still in process of being created, or there is a transient error, continue waiting for it + log.V(1).Info("transient error getting reservation, will retry", "reservation", res.Name, "error", err) + stillWaiting = append(stillWaiting, res) + continue + } + + // Check Ready condition + readyCond := meta.FindStatusCondition( + current.Status.Conditions, + v1alpha1.ReservationConditionReady, + ) + + if readyCond == nil { + // Condition not set yet, keep waiting + allChecked = false + stillWaiting = append(stillWaiting, res) + continue + } + + switch readyCond.Status { + case metav1.ConditionTrue: + // TODO use more than readyCondition + case metav1.ConditionFalse: + allChecked = false + failedReservations = append(failedReservations, res) + case metav1.ConditionUnknown: + allChecked = false + stillWaiting = append(stillWaiting, res) + } + } + + if allChecked || len(stillWaiting) == 0 { + log.Info("all reservations checked", + "failed", len(failedReservations)) 
+ return failedReservations, errors + } + + reservationsToWatch = stillWaiting + // Log progress + log.Info("waiting for reservations to become ready", + "notReady", len(reservationsToWatch), + "total", len(reservations), + "timeRemaining", time.Until(deadline).Round(time.Second)) + + // Wait before next poll + select { + case <-time.After(pollInterval): + // Continue polling + case <-ctx.Done(): + return failedReservations, append(errors, fmt.Errorf("context cancelled while waiting for reservations: %w", ctx.Err())) + } + } +} diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_test.go new file mode 100644 index 000000000..871e72b54 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_change_commitments_test.go @@ -0,0 +1,1720 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +//nolint:unparam,unused // test helper functions have fixed parameters for simplicity +package commitments + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/http/httptest" + "os" + "sort" + "strconv" + "strings" + "sync" + "testing" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/sapcc/go-api-declarations/liquid" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" +) + +// ============================================================================ +// Integration Tests +// ============================================================================ + +func 
TestCommitmentChangeIntegration(t *testing.T) { + m1Tiny := &TestFlavor{Name: "m1.tiny", Group: "gp_1", MemoryMB: 256, VCPUs: 1} + m1Small := &TestFlavor{Name: "m1.small", Group: "hana_1", MemoryMB: 1024, VCPUs: 4} + m1Large := &TestFlavor{Name: "m1.large", Group: "hana_1", MemoryMB: 4096, VCPUs: 16} + m1XL := &TestFlavor{Name: "m1.xl", Group: "hana_1", MemoryMB: 8192, VCPUs: 32} + + testCases := []CommitmentChangeTestCase{ + { + Name: "Shrinking CR - unused reservations removed, used reservations untouched", + VMs: []*TestVM{{UUID: "vm-a1", Flavor: m1Large, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}}, + Flavors: []*TestFlavor{m1Small, m1Large}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-a1"}}, + {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-123", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("ram_hana_1", "project-A", "uuid-123", "confirmed", 2)), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-a1"}}, + {CommitmentID: "uuid-123", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Insufficient capacity when increasing CR", + VMs: []*TestVM{}, + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("ram_hana_1", "project-A", "uuid-456", "confirmed", 3)), + AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 1024, "host-2": 0}}, + ExpectedReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "", Flavor: m1Small, 
ProjectID: "project-A"}}, + ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment uuid-456: not sufficient capacity"), + }, + { + Name: "Swap capacity between CRs - order dependent - delete-first succeeds", + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-456", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-456", "confirmed", 0), + createCommitment("ram_hana_1", "project-B", "uuid-123", "confirmed", 2), + ), + AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0}}, + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-B"}, + {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}}, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Swap capacity between CRs - order dependent - create-first fails", + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-B"}, + {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-456", "confirmed", 2), + createCommitment("ram_hana_1", "project-B", "uuid-123", "confirmed", 0), + ), + AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0}}, + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-B"}, + {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}}, + ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", 
"commitment uuid-456: not sufficient capacity"), + }, + { + Name: "Flavor bin-packing - mixed sizes when largest doesn't fit", + // Greedy selection: 10GB request with 8/4/1GB flavors β†’ picks 1Γ—8GB + 2Γ—1GB + Flavors: []*TestFlavor{m1XL, m1Large, m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-binpack", "confirmed", 10), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-binpack", Flavor: m1XL, ProjectID: "project-A"}, + {CommitmentID: "uuid-binpack", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-binpack", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Version mismatch - request rejected with 409 Conflict", + // InfoVersion validation prevents stale requests (1233 vs 1234) + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1233, + createCommitment("ram_hana_1", "project-A", "uuid-version", "confirmed", 2), + ), + EnvInfoVersion: 1234, + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: APIResponseExpectation{StatusCode: 409}, + }, + { + Name: "Multi-project rollback - one failure rolls back all", + // Transactional: project-B fails (insufficient capacity) β†’ both projects rollback + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-project-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-project-a", "confirmed", 2), + createCommitment("ram_hana_1", "project-B", "uuid-project-b", "confirmed", 2), + ), + AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 1024, "host-2": 0}}, + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-project-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + }, + 
ExpectedAPIResponse: newAPIResponse("uuid-project-b", "not sufficient capacity"), + }, + { + Name: "Rollback with VMs allocated - limitation: VM allocations not rolled back", + // Controller will eventually clean up and repair inconsistent state + VMs: []*TestVM{{UUID: "vm-rollback", Flavor: m1Small, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}}, + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "commitment-A", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-rollback"}}, + {CommitmentID: "commitment-A", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "commitment-A", "confirmed", 0), + createCommitment("ram_hana_1", "project-B", "commitment-B", "confirmed", 6), + ), + AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0}}, + ExpectedReservations: []*TestReservation{ + // Rollback creates unscheduled reservations (empty Host accepts any in matching) + {CommitmentID: "commitment-A", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "commitment-A", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse("commitment-B", "not sufficient capacity"), + }, + { + Name: "New commitment creation - from zero to N reservations", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-new", "confirmed", 3), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-new", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-new", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-new", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "New commitment creation - large batch", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: 
newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-new", "confirmed", 200), + ), + ExpectedReservations: func() []*TestReservation { + var reservations []*TestReservation + for range 200 { + reservations = append(reservations, &TestReservation{ + CommitmentID: "uuid-new", + Flavor: m1Small, + ProjectID: "project-A", + }) + } + return reservations + }(), + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "With reservations of custom size - total unchanged", + // Preserves custom-sized reservations when total matches (2Γ—2GB = 4GB) + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-custom", "confirmed", 4), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "With reservations of custom size - increase total", + // 4GB (2Γ—2GB custom) β†’ 6GB: preserves custom sizes, adds standard-sized reservations + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-custom", "confirmed", 6), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: 
"uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "With reservations of custom size - decrease total", + // 4GB (2Γ—2GB custom) β†’ 3GB: removes 1Γ—2GB custom, adds 1Γ—1GB standard + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-custom", "confirmed", 3), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, + {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Complete commitment deletion - N to zero reservations", + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-delete", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-delete", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-delete", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-b-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"}, + {CommitmentID: "uuid-a-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-delete", "confirmed", 0), + ), + 
ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-b-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"}, + {CommitmentID: "uuid-a-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "VM allocation preservation - keep VMs during growth", + VMs: []*TestVM{{UUID: "vm-existing", Flavor: m1Small, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}}, + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-growth", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-existing"}}, + {CommitmentID: "uuid-growth", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-growth", "confirmed", 3), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-growth", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-existing"}}, + {CommitmentID: "uuid-growth", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-growth", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Multi-project success - both projects succeed", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-a", "confirmed", 2), + createCommitment("ram_hana_1", "project-B", "uuid-b", "confirmed", 2), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-a", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-a", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-b", Flavor: m1Small, ProjectID: "project-B"}, + {CommitmentID: "uuid-b", Flavor: m1Small, ProjectID: "project-B"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Multiple flavor groups - ram_hana_1 and 
ram_hana_2", + // Amount in multiples of smallest flavor: hana_1 (2Γ—1GB), hana_2 (2Γ—2GB) + Flavors: []*TestFlavor{ + m1Small, + {Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-hana1", "confirmed", 2), + createCommitment("ram_hana_2", "project-A", "uuid-hana2", "confirmed", 2), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-hana1", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-hana1", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-hana2", Flavor: &TestFlavor{Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8}, ProjectID: "project-A"}, + {CommitmentID: "uuid-hana2", Flavor: &TestFlavor{Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8}, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Unknown flavor group - clear rejection message", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_nonexistent", "project-A", "uuid-unknown", "confirmed", 2), + ), + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: newAPIResponse("flavor group not found"), + }, + { + Name: "Three-way capacity swap - complex reallocation", + // A:2β†’0, B:1β†’0, C:0β†’3 in single transaction + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-a", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + {CommitmentID: "uuid-b", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-a", "confirmed", 0), + createCommitment("ram_hana_1", "project-B", "uuid-b", "confirmed", 0), + createCommitment("ram_hana_1", 
"project-C", "uuid-c", "confirmed", 3), + ), + AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0, "host-3": 0}}, + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-c", Host: "host-1", Flavor: m1Small, ProjectID: "project-C"}, + {CommitmentID: "uuid-c", Host: "host-2", Flavor: m1Small, ProjectID: "project-C"}, + {CommitmentID: "uuid-c", Host: "host-3", Flavor: m1Small, ProjectID: "project-C"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Reservation repair - existing reservations with wrong metadata", + Flavors: []*TestFlavor{m1Small, m1Large}, + ExistingReservations: []*TestReservation{ + {CommitmentID: "uuid-repair", Host: "host-preserved", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Host: "host-1", Flavor: m1Small, ProjectID: "wrong-project", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Host: "host-2", Flavor: &TestFlavor{Name: "m1.small", Group: "hana_13", MemoryMB: 1024, VCPUs: 4}, ProjectID: "project-A", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Host: "host-4", Flavor: m1Small, ProjectID: "project-A", AZ: "wrong-az"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-repair", "confirmed", 8, "az-a"), + ), + ExpectedReservations: []*TestReservation{ + {CommitmentID: "uuid-repair", Host: "host-preserved", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, + {CommitmentID: "uuid-repair", Flavor: m1Large, ProjectID: "project-A", AZ: "az-a"}, + }, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Empty request - no commitment changes", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: 
newCommitmentRequest("az-a", false, 1234), + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: newAPIResponse(), + }, + { + Name: "Dry run request - feature not yet implemented", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", true, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-dryrun", "confirmed", 2), + ), + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: newAPIResponse("Dry run not supported"), + }, + { + Name: "Knowledge not ready - clear rejection with RetryAt", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-knowledge", "confirmed", 2), + ), + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: APIResponseExpectation{ + StatusCode: 200, + RejectReasonSubstrings: []string{"caches not ready"}, + RetryAtPresent: true, + }, + EnvInfoVersion: -1, // Skip Knowledge CRD creation + }, + { + Name: "Multiple commitments insufficient capacity - all listed in error", + // Tests that multiple failed commitments are all mentioned in the rejection reason + Flavors: []*TestFlavor{m1Small, m1Tiny}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-multi-fail-1", "confirmed", 3), + createCommitment("ram_hana_1", "project-B", "uuid-multi-fail-2", "confirmed", 3), + createCommitment("ram_gp_1", "project-C", "uuid-would-not-fail", "confirmed", 1), // would be rolled back, but not part of the reject reason + ), + AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 256}}, + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: newAPIResponse("2 commitment(s) failed", "commitment uuid-multi-fail-1: not sufficient capacity", "commitment uuid-multi-fail-2: not sufficient capacity"), + }, + { + Name: "Watch timeout with custom config - triggers rollback with timeout error", + Flavors: 
[]*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-timeout", "confirmed", 2), + ), + // With 0ms timeout, the watch will timeout immediately before reservations become ready + CustomConfig: &Config{ + ChangeAPIWatchReservationsTimeout: 0 * time.Millisecond, + ChangeAPIWatchReservationsPollInterval: 100 * time.Millisecond, + }, + ExpectedReservations: []*TestReservation{}, // Rollback removes all reservations + ExpectedAPIResponse: newAPIResponse("timeout reached while processing commitment changes"), + }, + } + + for _, tc := range testCases { + t.Run(tc.Name, func(t *testing.T) { + runCommitmentChangeTest(t, tc) + }) + } +} + +// runCommitmentChangeTest executes a single commitment change integration test case. +func runCommitmentChangeTest(t *testing.T, tc CommitmentChangeTestCase) { + t.Helper() + + // Convert test types to actual types + var vms []VM + for _, testVM := range tc.VMs { + vms = append(vms, testVM.ToVM()) + } + + var flavorInGroups []compute.FlavorInGroup + for _, testFlavor := range tc.Flavors { + flavorInGroups = append(flavorInGroups, testFlavor.ToFlavorInGroup()) + } + + // Use EnvInfoVersion if specified (non-zero), otherwise default to CommitmentRequest.InfoVersion + envInfoVersion := tc.CommitmentRequest.InfoVersion + if tc.EnvInfoVersion != 0 { + envInfoVersion = tc.EnvInfoVersion + } + + flavorGroups := TestFlavorGroup{ + infoVersion: envInfoVersion, + flavors: flavorInGroups, + }.ToFlavorGroupsKnowledge() + + // Convert existing reservations with auto-numbering per commitment + var existingReservations []*v1alpha1.Reservation + numberCounters := make(map[string]int) + for _, testRes := range tc.ExistingReservations { + number := numberCounters[testRes.CommitmentID] + numberCounters[testRes.CommitmentID]++ + existingReservations = append(existingReservations, testRes.toReservation(number)) + } + + // Create test environment with available resources and 
custom config if provided + env := newCommitmentTestEnv(t, vms, nil, existingReservations, flavorGroups, tc.AvailableResources, tc.CustomConfig) + defer env.Close() + + t.Log("Initial state:") + env.LogStateSummary() + + // Call commitment change API + reqJSON := buildRequestJSON(tc.CommitmentRequest) + resp, respJSON, statusCode := env.CallChangeCommitmentsAPI(reqJSON) + + t.Log("After API call:") + env.LogStateSummary() + + // Verify API response + env.VerifyAPIResponse(tc.ExpectedAPIResponse, resp, respJSON, statusCode) + + // Verify reservations using content-based matching + env.VerifyReservationsMatch(tc.ExpectedReservations) + + // Log final test result + if t.Failed() { + t.Log("❌ Test FAILED") + } else { + t.Log("βœ… Test PASSED") + } +} + +// ============================================================================ +// Test Types & Constants +// ============================================================================ + +const ( + defaultFlavorDiskGB = 40 + flavorGroupsKnowledgeName = "flavor-groups" + knowledgeRecencyDuration = 60 * time.Second + defaultCommitmentExpiryYears = 1 +) + +type CommitmentChangeTestCase struct { + Name string + VMs []*TestVM + Flavors []*TestFlavor + ExistingReservations []*TestReservation + CommitmentRequest CommitmentChangeRequest + ExpectedReservations []*TestReservation + ExpectedAPIResponse APIResponseExpectation + AvailableResources *AvailableResources // If nil, all reservations accepted without checks + EnvInfoVersion int64 // Override InfoVersion for version mismatch tests + CustomConfig *Config // Override default config for testing timeout behavior +} + +// AvailableResources defines available memory per host (MB). +// Scheduler uses first-come-first-serve. CPU is ignored. 
+type AvailableResources struct { + PerHost map[string]int64 // host -> available memory MB +} + +type TestFlavorGroup struct { + infoVersion int64 + flavors []compute.FlavorInGroup +} + +func (tfg TestFlavorGroup) ToFlavorGroupsKnowledge() FlavorGroupsKnowledge { + groupMap := make(map[string][]compute.FlavorInGroup) + + for _, flavor := range tfg.flavors { + groupName := flavor.ExtraSpecs["quota:hw_version"] + if groupName == "" { + panic("Flavor " + flavor.Name + " is missing quota:hw_version in extra specs") + } + groupMap[groupName] = append(groupMap[groupName], flavor) + } + + var groups []compute.FlavorGroupFeature + for groupName, groupFlavors := range groupMap { + if len(groupFlavors) == 0 { + continue + } + + // Sort descending: required by reservation manager's flavor selection + sort.Slice(groupFlavors, func(i, j int) bool { + return groupFlavors[i].MemoryMB > groupFlavors[j].MemoryMB + }) + + smallest := groupFlavors[len(groupFlavors)-1] + largest := groupFlavors[0] + + groups = append(groups, compute.FlavorGroupFeature{ + Name: groupName, + Flavors: groupFlavors, + SmallestFlavor: smallest, + LargestFlavor: largest, + }) + } + + return FlavorGroupsKnowledge{ + InfoVersion: tfg.infoVersion, + Groups: groups, + } +} + +type FlavorGroupsKnowledge struct { + InfoVersion int64 + Groups []compute.FlavorGroupFeature +} + +type CommitmentChangeRequest struct { + AZ string + DryRun bool + InfoVersion int64 + Commitments []TestCommitment +} + +type TestCommitment struct { + ResourceName liquid.ResourceName + ProjectID string + ConfirmationID string + State string + Amount uint64 +} + +type APIResponseExpectation struct { + StatusCode int + RejectReasonSubstrings []string + RetryAtPresent bool +} + +type ReservationVerification struct { + Host string + Allocations map[string]string +} + +type VM struct { + UUID string + FlavorName string + ProjectID string + CurrentHypervisor string + AvailabilityZone string + Resources map[string]int64 + FlavorExtraSpecs 
map[string]string +} + +type TestFlavor struct { + Name string + Group string + MemoryMB int64 + VCPUs int64 + DiskGB uint64 +} + +func (f *TestFlavor) ToFlavorInGroup() compute.FlavorInGroup { + diskGB := f.DiskGB + if diskGB == 0 { + diskGB = defaultFlavorDiskGB + } + return compute.FlavorInGroup{ + Name: f.Name, + MemoryMB: uint64(f.MemoryMB), //nolint:gosec // test values are always positive + VCPUs: uint64(f.VCPUs), //nolint:gosec // test values are always positive + DiskGB: diskGB, + ExtraSpecs: map[string]string{ + "quota:hw_version": f.Group, + }, + } +} + +type TestVM struct { + UUID string + Flavor *TestFlavor + ProjectID string + Host string + AZ string +} + +func (vm *TestVM) ToVM() VM { + return VM{ + UUID: vm.UUID, + FlavorName: vm.Flavor.Name, + ProjectID: vm.ProjectID, + CurrentHypervisor: vm.Host, + AvailabilityZone: vm.AZ, + Resources: map[string]int64{ + "memory": vm.Flavor.MemoryMB, + "vcpus": vm.Flavor.VCPUs, + }, + FlavorExtraSpecs: map[string]string{ + "quota:hw_version": vm.Flavor.Group, + }, + } +} + +type TestReservation struct { + CommitmentID string + Host string // Empty = any host accepted in matching + Flavor *TestFlavor + ProjectID string + VMs []string // VM UUIDs + MemoryMB int64 // If 0, uses Flavor.MemoryMB; else custom size + AZ string +} + +func (tr *TestReservation) toReservation(number int) *v1alpha1.Reservation { + name := fmt.Sprintf("commitment-%s-%d", tr.CommitmentID, number) + + memoryMB := tr.MemoryMB + if memoryMB == 0 { + memoryMB = tr.Flavor.MemoryMB + } + + specAllocations := make(map[string]v1alpha1.CommittedResourceAllocation) + statusAllocations := make(map[string]string) + for _, vmUUID := range tr.VMs { + specAllocations[vmUUID] = v1alpha1.CommittedResourceAllocation{ + CreationTimestamp: metav1.Now(), + Resources: map[hv1.ResourceName]resource.Quantity{ + "memory": resource.MustParse(strconv.FormatInt(memoryMB, 10) + "Mi"), + "cpu": resource.MustParse(strconv.FormatInt(tr.Flavor.VCPUs, 10)), + }, + } + 
statusAllocations[vmUUID] = tr.Host + } + + spec := v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + TargetHost: tr.Host, + Resources: map[hv1.ResourceName]resource.Quantity{ + "memory": resource.MustParse(strconv.FormatInt(memoryMB, 10) + "Mi"), + "cpu": resource.MustParse(strconv.FormatInt(tr.Flavor.VCPUs, 10)), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + CommitmentUUID: tr.CommitmentID, + ProjectID: tr.ProjectID, + ResourceName: tr.Flavor.Name, + ResourceGroup: tr.Flavor.Group, + Allocations: specAllocations, + }, + } + + if tr.AZ != "" { + spec.AvailabilityZone = tr.AZ + } + + return &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: spec, + Status: v1alpha1.ReservationStatus{ + Conditions: []metav1.Condition{ + { + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionTrue, + Reason: "ReservationActive", + }, + }, + Host: tr.Host, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationStatus{ + Allocations: statusAllocations, + }, + }, + } +} + +// ============================================================================ +// Test Environment +// ============================================================================ + +type CommitmentTestEnv struct { + T *testing.T + Scheme *runtime.Scheme + K8sClient client.Client + VMSource *MockVMSource + FlavorGroups FlavorGroupsKnowledge + HTTPServer *httptest.Server + API *HTTPAPI + availableResources map[string]int64 // host -> available memory MB + processedReserv map[string]bool // track processed reservations + mu sync.Mutex // protects availableResources and processedReserv +} + +// FakeReservationController simulates synchronous reservation controller. 
+type FakeReservationController struct { + env *CommitmentTestEnv +} + +func (c *FakeReservationController) OnReservationCreated(res *v1alpha1.Reservation) { + c.env.processNewReservation(res) +} + +func (c *FakeReservationController) OnReservationDeleted(res *v1alpha1.Reservation) { + c.env.mu.Lock() + defer c.env.mu.Unlock() + + // Return memory when Delete() is called directly (before deletion timestamp is set) + if c.env.availableResources != nil && res.Status.Host != "" { + memoryQuantity := res.Spec.Resources["memory"] + memoryBytes := memoryQuantity.Value() + memoryMB := memoryBytes / (1024 * 1024) + + if _, exists := c.env.availableResources[res.Status.Host]; exists { + c.env.availableResources[res.Status.Host] += memoryMB + c.env.T.Logf("↩ Returned %d MB to %s (now %d MB available) via OnReservationDeleted for %s", + memoryMB, res.Status.Host, c.env.availableResources[res.Status.Host], res.Name) + } + } + + // Clear tracking so recreated reservations with same name are processed + delete(c.env.processedReserv, res.Name) +} + +// operationInterceptorClient routes reservation events to FakeReservationController. +type operationInterceptorClient struct { + client.Client + controller *FakeReservationController +} + +func (d *operationInterceptorClient) Create(ctx context.Context, obj client.Object, opts ...client.CreateOption) error { + err := d.Client.Create(ctx, obj, opts...) + if err != nil { + return err + } + + if res, ok := obj.(*v1alpha1.Reservation); ok { + d.controller.OnReservationCreated(res) + } + + return nil +} + +func (d *operationInterceptorClient) Delete(ctx context.Context, obj client.Object, opts ...client.DeleteOption) error { + if res, ok := obj.(*v1alpha1.Reservation); ok { + d.controller.OnReservationDeleted(res) + } + + return d.Client.Delete(ctx, obj, opts...) 
+} + +func (env *CommitmentTestEnv) Close() { + if env.HTTPServer != nil { + env.HTTPServer.Close() + } +} + +func newCommitmentTestEnv( + t *testing.T, + vms []VM, + hypervisors []*hv1.Hypervisor, + reservations []*v1alpha1.Reservation, + flavorGroups FlavorGroupsKnowledge, + resources *AvailableResources, + customConfig *Config, +) *CommitmentTestEnv { + + t.Helper() + + log.SetLogger(zap.New(zap.WriteTo(os.Stderr), zap.UseDevMode(true))) + + objects := make([]client.Object, 0, len(hypervisors)+len(reservations)) + for _, hv := range hypervisors { + objects = append(objects, hv) + } + for _, res := range reservations { + objects = append(objects, res) + } + + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 scheme: %v", err) + } + if err := hv1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add hv1 scheme: %v", err) + } + + // InfoVersion of -1 skips Knowledge CRD creation (tests "not ready" scenario) + if flavorGroups.InfoVersion != -1 { + knowledgeCRD := createKnowledgeCRD(flavorGroups) + objects = append(objects, knowledgeCRD) + } + + baseK8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objects...). + WithStatusSubresource(&v1alpha1.Reservation{}). + WithStatusSubresource(&v1alpha1.Knowledge{}). + WithIndex(&v1alpha1.Reservation{}, "spec.type", func(obj client.Object) []string { + res := obj.(*v1alpha1.Reservation) + return []string{string(res.Spec.Type)} + }). 
+ Build() + + var availableResources map[string]int64 + if resources != nil && resources.PerHost != nil { + availableResources = make(map[string]int64) + for host, memMB := range resources.PerHost { + availableResources[host] = memMB + } + } + + env := &CommitmentTestEnv{ + T: t, + Scheme: scheme, + K8sClient: nil, // Will be set below + VMSource: NewMockVMSource(vms), + FlavorGroups: flavorGroups, + HTTPServer: nil, // Will be set below + API: nil, // Will be set below + availableResources: availableResources, + processedReserv: make(map[string]bool), + } + + controller := &FakeReservationController{env: env} + wrappedClient := &operationInterceptorClient{ + Client: baseK8sClient, + controller: controller, + } + env.K8sClient = wrappedClient + + // Use custom config if provided, otherwise use default + var api *HTTPAPI + if customConfig != nil { + api = NewAPIWithConfig(wrappedClient, *customConfig) + } else { + api = NewAPI(wrappedClient) + } + mux := http.NewServeMux() + api.Init(mux) + httpServer := httptest.NewServer(mux) + + env.HTTPServer = httpServer + env.API = api + + return env +} + +// ============================================================================ +// Environment Helper Methods +// ============================================================================ + +// ListVMs returns all VMs from the VMSource. +func (env *CommitmentTestEnv) ListVMs() []VM { + vms, err := env.VMSource.ListVMs(context.Background()) + if err != nil { + env.T.Fatalf("Failed to list VMs: %v", err) + } + return vms +} + +// ListReservations returns all reservations. +func (env *CommitmentTestEnv) ListReservations() []v1alpha1.Reservation { + var list v1alpha1.ReservationList + if err := env.K8sClient.List(context.Background(), &list); err != nil { + env.T.Fatalf("Failed to list reservations: %v", err) + } + return list.Items +} + +// ListHypervisors returns all hypervisors. 
+func (env *CommitmentTestEnv) ListHypervisors() []hv1.Hypervisor { + var list hv1.HypervisorList + if err := env.K8sClient.List(context.Background(), &list); err != nil { + env.T.Fatalf("Failed to list hypervisors: %v", err) + } + return list.Items +} + +// LogStateSummary logs a summary of the current state. +func (env *CommitmentTestEnv) LogStateSummary() { + env.T.Helper() + + hypervisors := env.ListHypervisors() + vms := env.ListVMs() + reservations := env.ListReservations() + + env.T.Log("=== State Summary ===") + env.T.Logf("Hypervisors: %d", len(hypervisors)) + env.T.Logf("VMs: %d", len(vms)) + env.T.Logf("Reservations: %d", len(reservations)) + + for _, res := range reservations { + allocCount := 0 + if res.Status.CommittedResourceReservation != nil { + allocCount = len(res.Status.CommittedResourceReservation.Allocations) + } + env.T.Logf(" - %s (host: %s, allocations: %d)", res.Name, res.Status.Host, allocCount) + } + env.T.Log("=====================") +} + +// CallChangeCommitmentsAPI calls the change commitments API endpoint with JSON. +// It uses a hybrid approach: fast polling during API execution + synchronous final pass. 
+func (env *CommitmentTestEnv) CallChangeCommitmentsAPI(reqJSON string) (resp liquid.CommitmentChangeResponse, respJSON string, statusCode int) { + env.T.Helper() + + // Start fast polling in background to handle reservations during API execution + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + + go func() { + ticker := time.NewTicker(5 * time.Millisecond) // Very fast - 5ms + defer ticker.Stop() + defer close(done) + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + env.processReservations() + } + } + }() + + // Make HTTP request + url := env.HTTPServer.URL + "/v1/commitments/change-commitments" + httpResp, err := http.Post(url, "application/json", bytes.NewReader([]byte(reqJSON))) //nolint:gosec,noctx // test server URL, not user input + if err != nil { + cancel() + <-done + env.T.Fatalf("Failed to make HTTP request: %v", err) + } + defer httpResp.Body.Close() + + // Read response body + respBytes, err := io.ReadAll(httpResp.Body) + if err != nil { + cancel() + <-done + env.T.Fatalf("Failed to read response body: %v", err) + } + + respJSON = string(respBytes) + + // Parse response - only for 200 OK responses + // Non-200 responses (like 409 Conflict for version mismatch) use plain text via http.Error() + if httpResp.StatusCode == http.StatusOK { + if err := json.Unmarshal(respBytes, &resp); err != nil { + cancel() + <-done + env.T.Fatalf("Failed to unmarshal response: %v", err) + } + } + + // Stop background polling + cancel() + <-done + + // Final synchronous pass to ensure all reservations are processed + // This eliminates any race conditions + env.processReservations() + + statusCode = httpResp.StatusCode + return resp, respJSON, statusCode +} + +// processReservations handles all reservation lifecycle events synchronously. +// This includes marking reservations as Ready/Failed and removing finalizers from deleted reservations. 
+func (env *CommitmentTestEnv) processReservations() { + ctx := context.Background() + reservations := env.ListReservations() + + for _, res := range reservations { + // Handle deletion - return memory to host and remove finalizers + if !res.DeletionTimestamp.IsZero() { + env.T.Logf("Processing deletion for reservation %s (host: %s)", res.Name, res.Status.Host) + + env.mu.Lock() + // Return memory to host if resource tracking is enabled + if env.availableResources != nil { + env.T.Logf("Resource tracking enabled, returning memory for %s", res.Name) + memoryQuantity := res.Spec.Resources["memory"] + memoryBytes := memoryQuantity.Value() + memoryMB := memoryBytes / (1024 * 1024) + + env.T.Logf("Reservation %s has host=%s, memory=%d MB", res.Name, res.Status.Host, memoryMB) + + // Check if host exists in our tracking + if _, exists := env.availableResources[res.Status.Host]; !exists { + env.mu.Unlock() + env.T.Fatalf("Host %s not found in available resources for reservation %s - this indicates an inconsistency", + res.Status.Host, res.Name) + } + + // Return memory to host + env.availableResources[res.Status.Host] += memoryMB + env.T.Logf("↩ Returned %d MB to %s (now %d MB available) from deleted reservation %s", + memoryMB, res.Status.Host, env.availableResources[res.Status.Host], res.Name) + } else { + env.T.Logf("Resource tracking NOT enabled for %s", res.Name) + } + + // Clear tracking so recreated reservations with same name are processed + delete(env.processedReserv, res.Name) + env.mu.Unlock() + + // Remove finalizers to allow deletion + if len(res.Finalizers) > 0 { + res.Finalizers = []string{} + if err := env.K8sClient.Update(ctx, &res); err != nil { + // Ignore errors - might be already deleted + continue + } + } + continue + } + + // Skip if already processed (has a condition set) + if env.hasCondition(&res) { + continue + } + + env.mu.Lock() + alreadyProcessed := env.processedReserv[res.Name] + env.mu.Unlock() + + // Skip if already tracked as processed + 
if alreadyProcessed { + continue + } + + // Process new reservation with resource-based scheduling + env.processNewReservation(&res) + } +} + +// hasCondition checks if a reservation has any Ready condition set. +func (env *CommitmentTestEnv) hasCondition(res *v1alpha1.Reservation) bool { + for _, cond := range res.Status.Conditions { + if cond.Type == v1alpha1.ReservationConditionReady { + return true + } + } + return false +} + +// processNewReservation implements first-come-first-serve scheduling based on available resources. +// It tries to find a host with enough memory capacity and assigns the reservation to that host. +func (env *CommitmentTestEnv) processNewReservation(res *v1alpha1.Reservation) { + env.mu.Lock() + defer env.mu.Unlock() + + env.processedReserv[res.Name] = true + + // If no available resources configured, accept all reservations without host assignment + if env.availableResources == nil { + env.markReservationReady(res) + return + } + + // Get required memory from reservation spec + memoryQuantity := res.Spec.Resources["memory"] + memoryBytes := memoryQuantity.Value() + memoryMB := memoryBytes / (1024 * 1024) + + // First-come-first-serve: find first host with enough capacity + // Sort hosts to ensure deterministic behavior (Go map iteration is random) + hosts := make([]string, 0, len(env.availableResources)) + for host := range env.availableResources { + hosts = append(hosts, host) + } + sort.Strings(hosts) + + var selectedHost string + for _, host := range hosts { + if env.availableResources[host] >= memoryMB { + selectedHost = host + break + } + } + + if selectedHost != "" { + // SUCCESS: Schedule on this host + env.availableResources[selectedHost] -= memoryMB + + // Update reservation with selected host + ctx := context.Background() + + // Update spec (TargetHost) + res.Spec.TargetHost = selectedHost + if err := env.K8sClient.Update(ctx, res); err != nil { + env.T.Logf("Warning: Failed to update reservation spec: %v", err) + } + + // 
Update status (Host) - requires Status().Update + res.Status.Host = selectedHost + if err := env.K8sClient.Status().Update(ctx, res); err != nil { + env.T.Logf("Warning: Failed to update reservation status host: %v", err) + } + + env.markReservationReady(res) + env.T.Logf("βœ“ Scheduled reservation %s on %s (%d MB used, %d MB remaining)", + res.Name, selectedHost, memoryMB, env.availableResources[selectedHost]) + } else { + // FAILURE: No host has enough capacity + env.markReservationFailed(res, "Insufficient capacity on all hosts") + env.T.Logf("βœ— Failed to schedule reservation %s (needs %d MB, no host has capacity)", + res.Name, memoryMB) + } +} + +// markReservationReady updates a reservation to have Ready=True status. +func (env *CommitmentTestEnv) markReservationReady(res *v1alpha1.Reservation) { + res.Status.Conditions = []metav1.Condition{ + { + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionTrue, + Reason: "ReservationActive", + Message: "Reservation is ready (set by test controller)", + LastTransitionTime: metav1.Now(), + }, + } + + if err := env.K8sClient.Status().Update(context.Background(), res); err != nil { + // Ignore errors - might be deleted during update + return + } +} + +// markReservationFailed updates a reservation to have Ready=False status (scheduling failed). +func (env *CommitmentTestEnv) markReservationFailed(res *v1alpha1.Reservation, reason string) { + res.Status.Conditions = []metav1.Condition{ + { + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionFalse, + Reason: "SchedulingFailed", + Message: reason, + LastTransitionTime: metav1.Now(), + }, + } + + if err := env.K8sClient.Status().Update(context.Background(), res); err != nil { + // Ignore errors - might be deleted during update + return + } +} + +// VerifyAPIResponse verifies the API response matches expectations. +// For rejection reasons, it checks if ALL expected substrings are present in the actual rejection reason. 
+func (env *CommitmentTestEnv) VerifyAPIResponse(expected APIResponseExpectation, actual liquid.CommitmentChangeResponse, respJSON string, statusCode int) {
+	env.T.Helper()
+
+	// Status code must match exactly.
+	if statusCode != expected.StatusCode {
+		env.T.Errorf("Expected status code %d, got %d", expected.StatusCode, statusCode)
+	}
+
+	// Rejection reason: when substrings are expected, every one of them must
+	// appear in the actual reason; otherwise the reason must be empty.
+	if len(expected.RejectReasonSubstrings) > 0 {
+		if actual.RejectionReason == "" {
+			env.T.Errorf("Expected rejection reason containing substrings %v, got none", expected.RejectReasonSubstrings)
+		} else {
+			// Check that ALL expected substrings are present
+			for _, substring := range expected.RejectReasonSubstrings {
+				if !strings.Contains(actual.RejectionReason, substring) {
+					env.T.Errorf("Expected rejection reason to contain %q, but got %q", substring, actual.RejectionReason)
+				}
+			}
+		}
+	} else {
+		if actual.RejectionReason != "" {
+			env.T.Errorf("Expected no rejection reason, got %q", actual.RejectionReason)
+		}
+	}
+
+	// Check RetryAt field presence in JSON (avoids dealing with option.Option type)
+	// NOTE(review): raw substring sniffing would also match "retryAt" occurring
+	// inside some other string value of the response; acceptable for these
+	// controlled test payloads, but confirm if response shapes grow.
+	retryAtPresent := strings.Contains(respJSON, `"retryAt"`)
+	if expected.RetryAtPresent {
+		if !retryAtPresent {
+			env.T.Error("Expected retryAt field to be present in JSON response, but it was not found")
+		}
+	} else {
+		if retryAtPresent {
+			env.T.Error("Expected retryAt field to be absent from JSON response, but it was found")
+		}
+	}
+}
+
+// VerifyReservationsMatch verifies that actual reservations match expected reservations by content.
+func (env *CommitmentTestEnv) VerifyReservationsMatch(expected []*TestReservation) { + env.T.Helper() + + actualReservations := env.ListReservations() + + // Make copies of both lists so we can remove matched items + expectedCopy := make([]*TestReservation, len(expected)) + copy(expectedCopy, expected) + + actualCopy := make([]v1alpha1.Reservation, len(actualReservations)) + copy(actualCopy, actualReservations) + + // Track unmatched items for detailed reporting + var unmatchedExpected []*TestReservation + var unmatchedActual []v1alpha1.Reservation + + // Greedy matching: while there are expected items, find matches and remove + for len(expectedCopy) > 0 { + exp := expectedCopy[0] + found := false + + // Find first actual that matches this expected + for i, actual := range actualCopy { + if env.reservationMatches(exp, &actual) { + expectedCopy = expectedCopy[1:] + actualCopy = append(actualCopy[:i], actualCopy[i+1:]...) + found = true + break + } + } + + if !found { + unmatchedExpected = append(unmatchedExpected, exp) + expectedCopy = expectedCopy[1:] + } + } + + unmatchedActual = actualCopy + + // If there are any mismatches, print detailed comparison + if len(unmatchedExpected) > 0 || len(unmatchedActual) > 0 { + env.T.Error("❌ Reservation mismatch detected!") + env.T.Log("") + env.T.Log("═══════════════════════════════════════════════════════════════") + env.T.Log("EXPECTED RESERVATIONS:") + env.T.Log("═══════════════════════════════════════════════════════════════") + env.printExpectedReservations(expected, unmatchedExpected) + + env.T.Log("") + env.T.Log("═══════════════════════════════════════════════════════════════") + env.T.Log("ACTUAL RESERVATIONS:") + env.T.Log("═══════════════════════════════════════════════════════════════") + env.printActualReservations(actualReservations, unmatchedActual) + + env.T.Log("") + env.T.Log("═══════════════════════════════════════════════════════════════") + env.T.Log("DIFF SUMMARY:") + 
env.T.Log("═══════════════════════════════════════════════════════════════")
+		env.printDiffSummary(unmatchedExpected, unmatchedActual)
+		env.T.Log("═══════════════════════════════════════════════════════════════")
+	}
+}
+
+// String returns a compact string representation of a TestReservation.
+// Empty host/AZ fields are rendered as "<none>" so they stay visible in
+// mismatch logs (previously these branches were no-ops that reassigned "").
+// Used only by the diff printers, never by reservationMatches.
+func (tr *TestReservation) String() string {
+	flavorName := ""
+	flavorGroup := ""
+	if tr.Flavor != nil {
+		flavorName = tr.Flavor.Name
+		flavorGroup = tr.Flavor.Group
+	}
+
+	// Substitute a visible placeholder for empty debug-output fields.
+	host := tr.Host
+	if host == "" {
+		host = "<none>"
+	}
+
+	az := tr.AZ
+	if az == "" {
+		az = "<none>"
+	}
+
+	vmInfo := ""
+	if len(tr.VMs) > 0 {
+		vmInfo = fmt.Sprintf(" VMs=%v", tr.VMs)
+	}
+
+	return fmt.Sprintf("%s/%s/%s(%s)/%s/az=%s%s", tr.CommitmentID, tr.ProjectID, flavorName, flavorGroup, host, az, vmInfo)
+}
+
+// compactReservationString returns a compact string representation of an actual Reservation.
+// Mirrors TestReservation.String() formatting so expected/actual lines align in
+// the diff output; empty host/AZ render as "<none>" (the previous code
+// reassigned "" in both branches, a no-op).
+func compactReservationString(res *v1alpha1.Reservation) string {
+	commitmentID := ""
+	projectID := ""
+	flavorName := ""
+	flavorGroup := ""
+	vmCount := 0
+
+	if res.Spec.CommittedResourceReservation != nil {
+		commitmentID = res.Spec.CommittedResourceReservation.CommitmentUUID
+		projectID = res.Spec.CommittedResourceReservation.ProjectID
+		flavorName = res.Spec.CommittedResourceReservation.ResourceName
+		flavorGroup = res.Spec.CommittedResourceReservation.ResourceGroup
+		if res.Status.CommittedResourceReservation != nil {
+			vmCount = len(res.Status.CommittedResourceReservation.Allocations)
+		}
+	}
+
+	// Substitute a visible placeholder for empty debug-output fields.
+	host := res.Status.Host
+	if host == "" {
+		host = "<none>"
+	}
+
+	az := res.Spec.AvailabilityZone
+	if az == "" {
+		az = "<none>"
+	}
+
+	vmInfo := ""
+	if vmCount > 0 {
+		vmInfo = fmt.Sprintf(" VMs=%d", vmCount)
+	}
+
+	return fmt.Sprintf("%s/%s/%s(%s)/%s/az=%s%s", commitmentID, projectID, flavorName, flavorGroup, host, az, vmInfo)
+}
+
+// printExpectedReservations prints all expected reservations with markers for unmatched ones.
+func (env *CommitmentTestEnv) printExpectedReservations(all, unmatched []*TestReservation) { + env.T.Helper() + + unmatchedMap := make(map[*TestReservation]bool) + for _, res := range unmatched { + unmatchedMap[res] = true + } + + if len(all) == 0 { + env.T.Log(" (none)") + return + } + + for i, res := range all { + marker := "βœ“" + if unmatchedMap[res] { + marker = "βœ—" + } + env.T.Logf(" %s [%d] %s", marker, i+1, res.String()) + } + + env.T.Logf(" Total: %d (%d matched, %d missing)", + len(all), len(all)-len(unmatched), len(unmatched)) +} + +// printActualReservations prints all actual reservations with markers for unmatched ones. +func (env *CommitmentTestEnv) printActualReservations(all, unmatched []v1alpha1.Reservation) { + env.T.Helper() + + unmatchedMap := make(map[string]bool) + for _, res := range unmatched { + unmatchedMap[res.Name] = true + } + + if len(all) == 0 { + env.T.Log(" (none)") + return + } + + for i, res := range all { + marker := "βœ“" + if unmatchedMap[res.Name] { + marker = "βŠ•" + } + env.T.Logf(" %s [%d] %s", marker, i+1, compactReservationString(&res)) + } + + env.T.Logf(" Total: %d (%d matched, %d unexpected)", + len(all), len(all)-len(unmatched), len(unmatched)) +} + +// printDiffSummary prints a summary of differences between expected and actual. 
+func (env *CommitmentTestEnv) printDiffSummary(unmatchedExpected []*TestReservation, unmatchedActual []v1alpha1.Reservation) { + env.T.Helper() + + if len(unmatchedExpected) > 0 { + env.T.Logf(" MISSING (%d expected, not found):", len(unmatchedExpected)) + for _, res := range unmatchedExpected { + env.T.Logf(" β€’ %s", res.String()) + } + } + + if len(unmatchedActual) > 0 { + env.T.Logf(" UNEXPECTED (%d found, not expected):", len(unmatchedActual)) + for _, res := range unmatchedActual { + env.T.Logf(" β€’ %s", compactReservationString(&res)) + } + } + + if len(unmatchedExpected) == 0 && len(unmatchedActual) == 0 { + env.T.Log(" βœ“ All match!") + } +} + +// reservationMatches checks if an actual reservation matches an expected one. +// All fields are checked comprehensively for complete validation. +func (env *CommitmentTestEnv) reservationMatches(expected *TestReservation, actual *v1alpha1.Reservation) bool { + // Check CommitmentID (from reservation name prefix) + if !strings.HasPrefix(actual.Name, "commitment-"+expected.CommitmentID+"-") { + return false + } + + // Check that CommittedResourceReservation spec exists + if actual.Spec.CommittedResourceReservation == nil { + return false + } + + // Check CommitmentUUID in spec matches + if actual.Spec.CommittedResourceReservation.CommitmentUUID != expected.CommitmentID { + return false + } + + // Check ProjectID + if actual.Spec.CommittedResourceReservation.ProjectID != expected.ProjectID { + return false + } + + // Check ResourceName (flavor name) + if expected.Flavor != nil { + if actual.Spec.CommittedResourceReservation.ResourceName != expected.Flavor.Name { + return false + } + } + + // Check ResourceGroup (flavor group) + if expected.Flavor != nil { + if actual.Spec.CommittedResourceReservation.ResourceGroup != expected.Flavor.Group { + return false + } + } + + // Check Host (if specified in expected) + if expected.Host != "" && actual.Status.Host != expected.Host { + return false + } + + // Check AZ (if 
specified in expected) + if expected.AZ != "" && actual.Spec.AvailabilityZone != expected.AZ { + return false + } + + // Check Memory (use custom MemoryMB if non-zero, otherwise use flavor size) + expectedMemoryMB := expected.MemoryMB + if expectedMemoryMB == 0 && expected.Flavor != nil { + expectedMemoryMB = expected.Flavor.MemoryMB + } + memoryQuantity := actual.Spec.Resources["memory"] + actualMemoryBytes := memoryQuantity.Value() + actualMemoryMB := actualMemoryBytes / (1024 * 1024) + if actualMemoryMB != expectedMemoryMB { + return false + } + + // Check CPU (from flavor if available) + if expected.Flavor != nil { + cpuQuantity := actual.Spec.Resources["cpu"] + actualCPU := cpuQuantity.Value() + if actualCPU != expected.Flavor.VCPUs { + return false + } + } + + // Check VM allocations (set comparison - order doesn't matter) + if !env.vmAllocationsMatch(expected.VMs, actual) { + return false + } + + // Check reservation type + if actual.Spec.Type != v1alpha1.ReservationTypeCommittedResource { + return false + } + + return true +} + +// vmAllocationsMatch checks if VM allocations match (set comparison). +func (env *CommitmentTestEnv) vmAllocationsMatch(expectedVMs []string, actual *v1alpha1.Reservation) bool { + if actual.Status.CommittedResourceReservation == nil { + return len(expectedVMs) == 0 + } + + actualVMs := make(map[string]bool) + for vmUUID := range actual.Status.CommittedResourceReservation.Allocations { + actualVMs[vmUUID] = true + } + + // Check counts match + if len(expectedVMs) != len(actualVMs) { + return false + } + + // Check all expected VMs are in actual + for _, vmUUID := range expectedVMs { + if !actualVMs[vmUUID] { + return false + } + } + + return true +} + +// ============================================================================ +// Mock VM Source +// ============================================================================ + +// MockVMSource implements VMSource for testing. 
+type MockVMSource struct {
+	// VMs is the fixed list returned by every ListVMs call.
+	VMs []VM
+}
+
+// NewMockVMSource creates a new MockVMSource with the given VMs.
+func NewMockVMSource(vms []VM) *MockVMSource {
+	return &MockVMSource{VMs: vms}
+}
+
+// ListVMs returns the configured VMs. The context is ignored and no error is
+// ever returned; this satisfies the VMSource interface for tests.
+func (s *MockVMSource) ListVMs(_ context.Context) ([]VM, error) {
+	return s.VMs, nil
+}
+
+// ============================================================================
+// Helper Functions
+// ============================================================================
+
+// newHypervisorWithAZ creates a Hypervisor CRD with the given parameters including availability zone.
+// cpuCap/cpuAlloc are plain counts; memoryGi/memoryGiAlloc are in GiB (the
+// "Gi" suffix is appended below). An empty az leaves the topology label unset.
+func newHypervisorWithAZ(name string, cpuCap, memoryGi, cpuAlloc, memoryGiAlloc int, instances []hv1.Instance, traits []string, az string) *hv1.Hypervisor {
+	labels := make(map[string]string)
+	if az != "" {
+		labels[corev1.LabelTopologyZone] = az
+	}
+	return &hv1.Hypervisor{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   name,
+			Labels: labels,
+		},
+		Status: hv1.HypervisorStatus{
+			Capacity: map[hv1.ResourceName]resource.Quantity{
+				"cpu":    resource.MustParse(strconv.Itoa(cpuCap)),
+				"memory": resource.MustParse(strconv.Itoa(memoryGi) + "Gi"),
+			},
+			Allocation: map[hv1.ResourceName]resource.Quantity{
+				"cpu":    resource.MustParse(strconv.Itoa(cpuAlloc)),
+				"memory": resource.MustParse(strconv.Itoa(memoryGiAlloc) + "Gi"),
+			},
+			NumInstances: len(instances),
+			Instances:    instances,
+			Traits:       traits,
+		},
+	}
+}
+
+// createCommitment creates a TestCommitment for use in test cases.
+// The az parameter is optional - if empty string, no AZ constraint is set.
+// NOTE(review): the variadic az argument is currently discarded — the
+// TestCommitment literal below carries no AZ value at all. Confirm whether the
+// AZ constraint should be wired into TestCommitment or the parameter removed.
+func createCommitment(resourceName, projectID, confirmationID, state string, amount uint64, az ...string) TestCommitment {
+	return TestCommitment{
+		ResourceName:   liquid.ResourceName(resourceName),
+		ProjectID:      projectID,
+		ConfirmationID: confirmationID,
+		State:          state,
+		Amount:         amount,
+	}
+}
+
+// newCommitmentRequest creates a CommitmentChangeRequest with the given commitments.
+func newCommitmentRequest(az string, dryRun bool, infoVersion int64, commitments ...TestCommitment) CommitmentChangeRequest { + return CommitmentChangeRequest{ + AZ: az, + DryRun: dryRun, + InfoVersion: infoVersion, + Commitments: commitments, + } +} + +// newAPIResponse creates an APIResponseExpectation with 200 OK status. +func newAPIResponse(rejectReasonSubstrings ...string) APIResponseExpectation { + return APIResponseExpectation{ + StatusCode: 200, + RejectReasonSubstrings: rejectReasonSubstrings, + } +} + +// buildRequestJSON converts a test CommitmentChangeRequest to JSON string. +// Builds the nested JSON structure directly for simplicity. +func buildRequestJSON(req CommitmentChangeRequest) string { + // Group commitments by project and resource for nested structure + type projectResources map[liquid.ResourceName][]TestCommitment + byProject := make(map[string]projectResources) + + for _, commit := range req.Commitments { + if byProject[commit.ProjectID] == nil { + byProject[commit.ProjectID] = make(projectResources) + } + byProject[commit.ProjectID][commit.ResourceName] = append( + byProject[commit.ProjectID][commit.ResourceName], + commit, + ) + } + + // Build nested JSON structure + var projectParts []string + for projectID, resources := range byProject { + var resourceParts []string + for resourceName, commits := range resources { + var commitParts []string + for _, c := range commits { + expiryTime := time.Now().Add(time.Duration(defaultCommitmentExpiryYears) * 365 * 24 * time.Hour) + commitParts = append(commitParts, fmt.Sprintf(`{"uuid":"%s","newStatus":"%s","amount":%d,"expiresAt":"%s"}`, + c.ConfirmationID, c.State, c.Amount, expiryTime.Format(time.RFC3339))) + } + resourceParts = append(resourceParts, fmt.Sprintf(`"%s":{"commitments":[%s]}`, + resourceName, strings.Join(commitParts, ","))) + } + projectParts = append(projectParts, fmt.Sprintf(`"%s":{"byResource":{%s}}`, + projectID, strings.Join(resourceParts, ","))) + } + + return 
fmt.Sprintf(`{"az":"%s","dryRun":%t,"infoVersion":%d,"byProject":{%s}}`, + req.AZ, req.DryRun, req.InfoVersion, strings.Join(projectParts, ",")) +} + +// createKnowledgeCRD creates a Knowledge CRD populated with flavor groups. +func createKnowledgeCRD(flavorGroups FlavorGroupsKnowledge) *v1alpha1.Knowledge { + rawExt, err := v1alpha1.BoxFeatureList(flavorGroups.Groups) + if err != nil { + panic("Failed to box flavor groups: " + err.Error()) + } + + lastContentChange := time.Unix(flavorGroups.InfoVersion, 0) + + return &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: flavorGroupsKnowledgeName, + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: flavorGroupsKnowledgeName, + }, + Recency: metav1.Duration{Duration: knowledgeRecencyDuration}, + }, + Status: v1alpha1.KnowledgeStatus{ + LastExtracted: metav1.Time{Time: lastContentChange}, + LastContentChange: metav1.Time{Time: lastContentChange}, + Raw: rawExt, + RawLength: len(flavorGroups.Groups), + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + Reason: "KnowledgeReady", + Message: "Flavor groups knowledge is ready", + LastTransitionTime: metav1.Time{Time: lastContentChange}, + }, + }, + }, + } +} diff --git a/internal/scheduling/reservations/commitments/api_info.go b/internal/scheduling/reservations/commitments/api_info.go new file mode 100644 index 000000000..db02dd708 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_info.go @@ -0,0 +1,117 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "strings" + "time" + + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/go-logr/logr" + liquid "github.com/sapcc/go-api-declarations/liquid" +) + +// handles GET /v1/info requests from Limes: +// See: 
https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid +func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { + // Extract or generate request ID for tracing + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = fmt.Sprintf("req-%d", time.Now().UnixNano()) + } + log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/info") + + // Only accept GET method + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + log.V(1).Info("processing info request") + + // Build info response + info, err := api.buildServiceInfo(r.Context(), log) + if err != nil { + // Use Info level for expected conditions like knowledge not being ready yet + log.Info("service info not available yet", "error", err.Error()) + http.Error(w, "Service temporarily unavailable: "+err.Error(), + http.StatusServiceUnavailable) + return + } + + // Return response + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(info); err != nil { + log.Error(err, "failed to encode service info") + return + } +} + +// buildServiceInfo constructs the ServiceInfo response with metadata for all flavor groups. 
+func (api *HTTPAPI) buildServiceInfo(ctx context.Context, log logr.Logger) (liquid.ServiceInfo, error) { + // Get all flavor groups from Knowledge CRDs + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: api.client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + // Return -1 as version when knowledge is not ready + return liquid.ServiceInfo{ + Version: -1, + Resources: make(map[liquid.ResourceName]liquid.ResourceInfo), + }, err + } + + // Build resources map + resources := make(map[liquid.ResourceName]liquid.ResourceInfo) + for groupName, groupData := range flavorGroups { + resourceName := liquid.ResourceName("ram_" + groupName) + + flavorNames := make([]string, 0, len(groupData.Flavors)) + for _, flavor := range groupData.Flavors { + flavorNames = append(flavorNames, flavor.Name) + } + displayName := fmt.Sprintf( + "multiples of %d MiB (usable by: %s)", + groupData.SmallestFlavor.MemoryMB, + strings.Join(flavorNames, ", "), + ) + + resources[resourceName] = liquid.ResourceInfo{ + DisplayName: displayName, + Unit: liquid.UnitNone, // Countable: multiples of smallest flavor instances + Topology: liquid.AZAwareTopology, // Commitments are per-AZ + NeedsResourceDemand: false, // Capacity planning out of scope for now + HasCapacity: true, // We report capacity via /v1/report-capacity + HasQuota: false, // No quota enforcement as of now + HandlesCommitments: true, // We handle commitment changes via /v1/change-commitments + } + + log.V(1).Info("registered flavor group resource", + "resourceName", resourceName, + "flavorGroup", groupName, + "displayName", displayName, + "smallestFlavor", groupData.SmallestFlavor.Name, + "smallestRamMB", groupData.SmallestFlavor.MemoryMB) + } + + // Get last content changed from flavor group knowledge and treat it as version + var version int64 = -1 + if knowledgeCRD, err := knowledge.Get(ctx); err == nil && knowledgeCRD != nil && !knowledgeCRD.Status.LastContentChange.IsZero() { + version = 
knowledgeCRD.Status.LastContentChange.Unix() + } + + log.Info("built service info", + "resourceCount", len(resources), + "version", version) + + return liquid.ServiceInfo{ + Version: version, + Resources: resources, + }, nil +} diff --git a/internal/scheduling/reservations/commitments/api_info_test.go b/internal/scheduling/reservations/commitments/api_info_test.go new file mode 100644 index 000000000..71c560c19 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_info_test.go @@ -0,0 +1,78 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "net/http" + "net/http/httptest" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestHandleInfo_KnowledgeNotReady(t *testing.T) { + // Test when flavor groups knowledge is not available + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + // No Knowledge CRD created - simulates knowledge not ready + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + api := &HTTPAPI{ + client: k8sClient, + } + + req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody) + w := httptest.NewRecorder() + + api.HandleInfo(w, req) + + resp := w.Result() + defer resp.Body.Close() + + // Should return 503 Service Unavailable when knowledge is not ready + if resp.StatusCode != http.StatusServiceUnavailable { + t.Errorf("expected status code %d (Service Unavailable), got %d", http.StatusServiceUnavailable, resp.StatusCode) + } + + // Verify Content-Type is text/plain (set by http.Error) + contentType := resp.Header.Get("Content-Type") + if contentType != "text/plain; charset=utf-8" { + t.Errorf("expected Content-Type 'text/plain; charset=utf-8', got %q", contentType) + } +} + +func TestHandleInfo_MethodNotAllowed(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + Build() + + api := &HTTPAPI{ + client: k8sClient, + } + + // Use POST instead of GET + req := httptest.NewRequest(http.MethodPost, "/v1/info", http.NoBody) + w := httptest.NewRecorder() + + api.HandleInfo(w, req) + + resp := w.Result() + defer resp.Body.Close() + + if resp.StatusCode != http.StatusMethodNotAllowed { + t.Errorf("expected status code %d (Method Not Allowed), got %d", http.StatusMethodNotAllowed, resp.StatusCode) + } +} diff --git a/internal/scheduling/reservations/commitments/api_report_capacity.go b/internal/scheduling/reservations/commitments/api_report_capacity.go new file mode 100644 index 000000000..0ec1f5e7d --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_report_capacity.go @@ -0,0 +1,61 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "encoding/json" + "fmt" + "net/http" + "time" + + "github.com/sapcc/go-api-declarations/liquid" +) + +// handles POST /v1/report-capacity requests from Limes: +// See: 
https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go
+// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid
+// Reports available capacity across all flavor group resources. Note, unit is specified in the Info API response with multiple of the smallest memory resource unit within a flavor group.
+func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) {
+	// Extract or generate request ID for tracing
+	requestID := r.Header.Get("X-Request-ID")
+	if requestID == "" {
+		requestID = fmt.Sprintf("req-%d", time.Now().UnixNano())
+	}
+	log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/report-capacity")
+
+	// Only accept POST method
+	if r.Method != http.MethodPost {
+		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+		return
+	}
+
+	log.V(1).Info("processing report capacity request")
+
+	// Parse request body (may be empty or contain ServiceCapacityRequest)
+	// NOTE(review): the decoded request is never consulted below —
+	// CalculateCapacity receives only the context, so any AZ/demand filters a
+	// caller sends are ignored. Confirm this is intentional.
+	var req liquid.ServiceCapacityRequest
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		// Empty body is acceptable for capacity reports
+		req = liquid.ServiceCapacityRequest{}
+	}
+
+	// Calculate capacity
+	calculator := NewCapacityCalculator(api.client)
+	report, err := calculator.CalculateCapacity(r.Context())
+	if err != nil {
+		log.Error(err, "failed to calculate capacity")
+		http.Error(w, "Failed to calculate capacity: "+err.Error(),
+			http.StatusInternalServerError)
+		return
+	}
+
+	log.Info("calculated capacity report", "resourceCount", len(report.Resources))
+
+	// Return response
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(http.StatusOK)
+	if err := json.NewEncoder(w).Encode(report); err != nil {
+		// Headers already sent; nothing to recover beyond logging.
+		log.Error(err, "failed to encode capacity report")
+		return
+	}
+}
diff --git a/internal/scheduling/reservations/commitments/api_report_capacity_test.go b/internal/scheduling/reservations/commitments/api_report_capacity_test.go
new file mode 100644
index
000000000..76140e218 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_report_capacity_test.go @@ -0,0 +1,285 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/sapcc/go-api-declarations/liquid" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" +) + +func TestHandleReportCapacity(t *testing.T) { + // Setup fake client + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create empty flavor groups knowledge so capacity calculation doesn't fail + emptyKnowledge := createEmptyFlavorGroupKnowledge() + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(emptyKnowledge). + Build() + + api := NewAPI(fakeClient) + + tests := []struct { + name string + method string + body interface{} + expectedStatus int + checkResponse func(*testing.T, *liquid.ServiceCapacityReport) + }{ + { + name: "POST request succeeds", + method: http.MethodPost, + body: liquid.ServiceCapacityRequest{}, + expectedStatus: http.StatusOK, + checkResponse: func(t *testing.T, resp *liquid.ServiceCapacityReport) { + // Resources may be nil or empty for empty capacity + if len(resp.Resources) != 0 { + t.Errorf("Expected empty or nil Resources, got %d resources", len(resp.Resources)) + } + }, + }, + { + name: "POST with empty body succeeds", + method: http.MethodPost, + body: nil, + expectedStatus: http.StatusOK, + checkResponse: func(t *testing.T, resp *liquid.ServiceCapacityReport) { + // Resources may be nil or empty for empty capacity + if len(resp.Resources) != 0 { + t.Errorf("Expected empty or nil Resources, got %d resources", len(resp.Resources)) + } + }, + }, + { + name: "GET request fails", + method: 
http.MethodGet, + body: nil, + expectedStatus: http.StatusMethodNotAllowed, + checkResponse: nil, + }, + { + name: "PUT request fails", + method: http.MethodPut, + body: nil, + expectedStatus: http.StatusMethodNotAllowed, + checkResponse: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create request + var req *http.Request + if tt.body != nil { + bodyBytes, err := json.Marshal(tt.body) + if err != nil { + t.Fatal(err) + } + req = httptest.NewRequest(tt.method, "/v1/report-capacity", bytes.NewReader(bodyBytes)) + } else { + req = httptest.NewRequest(tt.method, "/v1/report-capacity", http.NoBody) + } + req = req.WithContext(context.Background()) + + // Create response recorder + rr := httptest.NewRecorder() + + // Call handler + api.HandleReportCapacity(rr, req) + + // Check status code + if rr.Code != tt.expectedStatus { + t.Errorf("Expected status %d, got %d", tt.expectedStatus, rr.Code) + } + + // Check response if applicable + if tt.checkResponse != nil && rr.Code == http.StatusOK { + var resp liquid.ServiceCapacityReport + if err := json.NewDecoder(rr.Body).Decode(&resp); err != nil { + t.Fatalf("Failed to decode response: %v", err) + } + tt.checkResponse(t, &resp) + } + }) + } +} + +func TestCapacityCalculator(t *testing.T) { + // Setup fake client with Knowledge CRD + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + t.Run("CalculateCapacity returns error when no flavor groups knowledge exists", func(t *testing.T) { + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + calculator := NewCapacityCalculator(fakeClient) + _, err := calculator.CalculateCapacity(context.Background()) + if err == nil { + t.Fatal("Expected error when flavor groups knowledge doesn't exist, got nil") + } + if !strings.Contains(err.Error(), "not found") { + t.Errorf("Expected 'not found' error, got: %v", err) + } + }) + + t.Run("CalculateCapacity returns empty report when flavor groups knowledge exists but is empty", func(t *testing.T) { + // Create empty flavor groups knowledge + emptyKnowledge := createEmptyFlavorGroupKnowledge() + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(emptyKnowledge). + Build() + + calculator := NewCapacityCalculator(fakeClient) + report, err := calculator.CalculateCapacity(context.Background()) + if err != nil { + t.Fatalf("Expected no error, got: %v", err) + } + + if report.Resources == nil { + t.Error("Expected Resources map to be initialized") + } + + if len(report.Resources) != 0 { + t.Errorf("Expected 0 resources, got %d", len(report.Resources)) + } + }) + + t.Run("CalculateCapacity returns empty perAZ when no HostDetails exist", func(t *testing.T) { + // Create a flavor group knowledge without host details + flavorGroupKnowledge := createTestFlavorGroupKnowledge(t, "test-group") + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(flavorGroupKnowledge). 
+ Build() + + calculator := NewCapacityCalculator(fakeClient) + report, err := calculator.CalculateCapacity(context.Background()) + if err != nil { + t.Fatalf("Expected no error, got: %v", err) + } + + if len(report.Resources) != 1 { + t.Fatalf("Expected 1 resource, got %d", len(report.Resources)) + } + + resource := report.Resources[liquid.ResourceName("ram_test-group")] + if resource == nil { + t.Fatal("Expected ram_test-group resource to exist") + } + + // Should have empty perAZ map when no host details + if len(resource.PerAZ) != 0 { + t.Errorf("Expected 0 AZs, got %d", len(resource.PerAZ)) + } + }) +} + +// createEmptyFlavorGroupKnowledge creates an empty flavor groups Knowledge CRD +func createEmptyFlavorGroupKnowledge() *v1alpha1.Knowledge { + // Box empty array properly + emptyFeatures := []map[string]interface{}{} + raw, err := v1alpha1.BoxFeatureList(emptyFeatures) + if err != nil { + panic(err) // Should never happen for empty slice + } + + return &v1alpha1.Knowledge{ + ObjectMeta: v1.ObjectMeta{ + Name: "flavor-groups", + // No namespace - Knowledge is cluster-scoped + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor_groups", + }, + }, + Status: v1alpha1.KnowledgeStatus{ + Conditions: []v1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: "True", + }, + }, + Raw: raw, + }, + } +} + +// createTestFlavorGroupKnowledge creates a test Knowledge CRD with flavor group data +func createTestFlavorGroupKnowledge(t *testing.T, groupName string) *v1alpha1.Knowledge { + t.Helper() + + features := []map[string]interface{}{ + { + "name": groupName, + "flavors": []map[string]interface{}{ + { + "name": "test_c8_m32", + "vcpus": 8, + "memoryMB": 32768, + "diskGB": 50, + }, + }, + "largestFlavor": map[string]interface{}{ + "name": "test_c8_m32", + "vcpus": 8, + "memoryMB": 32768, + "diskGB": 50, + }, + }, + } + + // Use BoxFeatureList to properly format 
the features + raw, err := v1alpha1.BoxFeatureList(features) + if err != nil { + t.Fatal(err) + } + + return &v1alpha1.Knowledge{ + ObjectMeta: v1.ObjectMeta{ + Name: "flavor-groups", + // No namespace - Knowledge is cluster-scoped + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor_groups", + }, + }, + Status: v1alpha1.KnowledgeStatus{ + Conditions: []v1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: "True", + }, + }, + Raw: raw, + }, + } +} diff --git a/internal/scheduling/reservations/commitments/capacity.go b/internal/scheduling/reservations/commitments/capacity.go new file mode 100644 index 000000000..04ad177e1 --- /dev/null +++ b/internal/scheduling/reservations/commitments/capacity.go @@ -0,0 +1,124 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "fmt" + "sort" + + "github.com/sapcc/go-api-declarations/liquid" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" +) + +// CapacityCalculator computes capacity reports for Limes LIQUID API. +type CapacityCalculator struct { + client client.Client +} + +func NewCapacityCalculator(client client.Client) *CapacityCalculator { + return &CapacityCalculator{client: client} +} + +// CalculateCapacity computes per-AZ capacity for all flavor groups. 
+func (c *CapacityCalculator) CalculateCapacity(ctx context.Context) (liquid.ServiceCapacityReport, error) { + // Get all flavor groups from Knowledge CRDs + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: c.client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + return liquid.ServiceCapacityReport{}, fmt.Errorf("failed to get flavor groups: %w", err) + } + + // Build capacity report per flavor group + report := liquid.ServiceCapacityReport{ + Resources: make(map[liquid.ResourceName]*liquid.ResourceCapacityReport), + } + + for groupName, groupData := range flavorGroups { + // Resource name follows pattern: ram_ + resourceName := liquid.ResourceName("ram_" + groupName) + + // Calculate per-AZ capacity and usage + azCapacity, err := c.calculateAZCapacity(ctx, groupName, groupData) + if err != nil { + return liquid.ServiceCapacityReport{}, fmt.Errorf("failed to calculate capacity for %s: %w", groupName, err) + } + + report.Resources[resourceName] = &liquid.ResourceCapacityReport{ + PerAZ: azCapacity, + } + } + + return report, nil +} + +func (c *CapacityCalculator) calculateAZCapacity( + ctx context.Context, + _ string, // groupName - reserved for future use + _ compute.FlavorGroupFeature, // groupData - reserved for future use +) (map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport, error) { + // Get list of availability zones from HostDetails Knowledge + azs, err := c.getAvailabilityZones(ctx) + if err != nil { + return nil, fmt.Errorf("failed to get availability zones: %w", err) + } + + // Create report entry for each AZ with empty capacity/usage + // Capacity and Usage are left unset (zero value of option.Option[uint64]) + // This signals to Limes: "These AZs exist, but capacity/usage not yet calculated" + result := make(map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport) + for _, az := range azs { + result[liquid.AvailabilityZone(az)] = &liquid.AZResourceCapacityReport{ + // Both Capacity and 
Usage left unset (empty optional values) + // TODO: Calculate actual capacity from Reservation CRDs or host resources + // TODO: Calculate actual usage from VM allocations + } + } + + return result, nil +} + +func (c *CapacityCalculator) getAvailabilityZones(ctx context.Context) ([]string, error) { + // List all Knowledge CRDs to find host-details knowledge + var knowledgeList v1alpha1.KnowledgeList + if err := c.client.List(ctx, &knowledgeList); err != nil { + return nil, fmt.Errorf("failed to list Knowledge CRDs: %w", err) + } + + // Find host-details knowledge and extract AZs + azSet := make(map[string]struct{}) + for _, knowledge := range knowledgeList.Items { + // Look for host-details extractor + if knowledge.Spec.Extractor.Name != "host_details" { + continue + } + + // Parse features from Raw data + features, err := v1alpha1.UnboxFeatureList[compute.HostDetails](knowledge.Status.Raw) + if err != nil { + // Skip if we can't parse this knowledge + continue + } + + // Collect unique AZ names + for _, feature := range features { + if feature.AvailabilityZone != "" { + azSet[feature.AvailabilityZone] = struct{}{} + } + } + } + + // Convert set to sorted slice + azs := make([]string, 0, len(azSet)) + for az := range azSet { + azs = append(azs, az) + } + sort.Strings(azs) + + return azs, nil +} diff --git a/internal/scheduling/reservations/commitments/client.go b/internal/scheduling/reservations/commitments/client.go index 31e79c5b0..2e5585c99 100644 --- a/internal/scheduling/reservations/commitments/client.go +++ b/internal/scheduling/reservations/commitments/client.go @@ -14,11 +14,10 @@ import ( "github.com/cobaltcore-dev/cortex/pkg/keystone" "github.com/cobaltcore-dev/cortex/pkg/sso" "github.com/gophercloud/gophercloud/v2" - "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/flavors" - "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/servers" "github.com/gophercloud/gophercloud/v2/openstack/identity/v3/projects" 
"github.com/sapcc/go-bits/jobloop" "github.com/sapcc/go-bits/must" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -28,13 +27,8 @@ type CommitmentsClient interface { Init(ctx context.Context, client client.Client, conf SyncerConfig) error // List all projects to resolve commitments. ListProjects(ctx context.Context) ([]Project, error) - // List all flavors by their name to resolve instance commitments. - ListFlavorsByName(ctx context.Context) (map[string]Flavor, error) // List all commitments with resolved metadata (e.g. project, flavor, ...). ListCommitmentsByID(ctx context.Context, projects ...Project) (map[string]Commitment, error) - // List all servers for the given projects from nova. - // The result is a map from project ID to the list of servers. - ListServersByProjectID(ctx context.Context, projects ...Project) (map[string][]Server, error) } // Commitments client fetching commitments from openstack services. @@ -49,14 +43,13 @@ type commitmentsClient struct { limes *gophercloud.ServiceClient } -// Create a new commitments client. -// By default, this client will fetch commitments from the limes API. func NewCommitmentsClient() CommitmentsClient { return &commitmentsClient{} } -// Init the client. 
func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf SyncerConfig) error { + log := ctrl.Log.WithName("CommitmentClient") + var authenticatedHTTP = http.DefaultClient if conf.SSOSecretRef != nil { var err error @@ -79,7 +72,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf Type: "identity", Availability: "public", })) - syncLog.Info("using identity endpoint", "url", url) + log.Info("using identity endpoint", "url", url) c.keystone = &gophercloud.ServiceClient{ ProviderClient: c.provider, Endpoint: url, @@ -91,7 +84,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf Type: "compute", Availability: "public", })) - syncLog.Info("using nova endpoint", "url", url) + log.Info("using nova endpoint", "url", url) c.nova = &gophercloud.ServiceClient{ ProviderClient: c.provider, Endpoint: url, @@ -104,7 +97,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf Type: "resources", Availability: "public", })) - syncLog.Info("using limes endpoint", "url", url) + log.Info("using limes endpoint", "url", url) c.limes = &gophercloud.ServiceClient{ ProviderClient: c.provider, Endpoint: url, @@ -113,32 +106,10 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf return nil } -// Get all Nova flavors by their name to resolve instance commitments. -func (c *commitmentsClient) ListFlavorsByName(ctx context.Context) (map[string]Flavor, error) { - syncLog.Info("fetching all flavors from nova") - flo := flavors.ListOpts{AccessType: flavors.AllAccess} - pages, err := flavors.ListDetail(c.nova, flo).AllPages(ctx) - if err != nil { - return nil, err - } - // Parse the json data into our custom model. 
- var data = &struct { - Flavors []Flavor `json:"flavors"` - }{} - if err := pages.(flavors.FlavorPage).ExtractInto(data); err != nil { - return nil, err - } - syncLog.Info("fetched flavors from nova", "count", len(data.Flavors)) - flavorsByName := make(map[string]Flavor, len(data.Flavors)) - for _, flavor := range data.Flavors { - flavorsByName[flavor.Name] = flavor - } - return flavorsByName, nil -} - -// Get all projects from Keystone to resolve commitments. func (c *commitmentsClient) ListProjects(ctx context.Context) ([]Project, error) { - syncLog.Info("fetching projects from keystone") + log := ctrl.Log.WithName("CommitmentClient") + + log.V(1).Info("fetching projects from keystone") allPages, err := projects.List(c.keystone, nil).AllPages(ctx) if err != nil { return nil, err @@ -149,14 +120,15 @@ func (c *commitmentsClient) ListProjects(ctx context.Context) ([]Project, error) if err := allPages.(projects.ProjectPage).ExtractInto(data); err != nil { return nil, err } - syncLog.Info("fetched projects from keystone", "count", len(data.Projects)) + log.V(1).Info("fetched projects from keystone", "count", len(data.Projects)) return data.Projects, nil } -// Get all available commitments from limes + keystone + nova. -// This function fetches the commitments for each project in parallel. +// ListCommitmentsByID fetches commitments for all projects in parallel. func (c *commitmentsClient) ListCommitmentsByID(ctx context.Context, projects ...Project) (map[string]Commitment, error) { - syncLog.Info("fetching commitments from limes", "projects", len(projects)) + log := ctrl.Log.WithName("CommitmentClient") + + log.V(1).Info("fetching commitments from limes", "projects", len(projects)) commitmentsMutex := gosync.Mutex{} commitments := make(map[string]Commitment) var wg gosync.WaitGroup @@ -189,15 +161,14 @@ func (c *commitmentsClient) ListCommitmentsByID(ctx context.Context, projects .. // Return the first error encountered, if any. 
for err := range errChan { if err != nil { - syncLog.Error(err, "failed to resolve commitments") + log.Error(err, "failed to resolve commitments") return nil, err } } - syncLog.Info("resolved commitments from limes", "count", len(commitments)) + log.V(1).Info("resolved commitments from limes", "count", len(commitments)) return commitments, nil } -// Resolve the commitments for the given project. func (c *commitmentsClient) listCommitments(ctx context.Context, project Project) ([]Commitment, error) { url := c.limes.Endpoint + "v1" + "/domains/" + project.DomainID + @@ -232,67 +203,3 @@ func (c *commitmentsClient) listCommitments(ctx context.Context, project Project } return commitments, nil } - -// Get all servers for the given project ids from nova. -// The result is a map from project ID to the list of servers. -func (c *commitmentsClient) ListServersByProjectID(ctx context.Context, projects ...Project) (map[string][]Server, error) { - syncLog.Info("fetching servers from nova") - serversByProject := make(map[string][]Server, len(projects)) - var mu gosync.Mutex - var wg gosync.WaitGroup - ctx, cancel := context.WithCancel(ctx) - defer cancel() - // Channel to communicate errors from goroutines. - errChan := make(chan error, len(projects)) - for _, project := range projects { - wg.Go(func() { - servers, err := c.listServersForProject(ctx, project) - if err != nil { - errChan <- err - cancel() - return - } - mu.Lock() - serversByProject[project.ID] = servers - mu.Unlock() - }) - time.Sleep(jobloop.DefaultJitter(50 * time.Millisecond)) // Don't overload the API. - } - // Wait for all goroutines to finish and close the error channel. - go func() { - wg.Wait() - close(errChan) - }() - // Return the first error encountered, if any. 
- for err := range errChan { - if err != nil { - syncLog.Error(err, "failed to fetch servers") - return nil, err - } - } - syncLog.Info("fetched servers from nova", "projects", len(serversByProject)) - return serversByProject, nil -} - -// Get all servers for the given project id from nova. -func (c *commitmentsClient) listServersForProject(ctx context.Context, project Project) ([]Server, error) { - lo := servers.ListOpts{ - // AllTenants must be set to fetch servers from other projects - // than the one we are authenticated with. - AllTenants: true, - TenantID: project.ID, - } - pages, err := servers.List(c.nova, lo).AllPages(ctx) - if err != nil { - return nil, err - } - // Parse the json data into our custom model. - var data = &struct { - Servers []Server `json:"servers"` - }{} - if err := pages.(servers.ServerPage).ExtractInto(data); err != nil { - return nil, err - } - syncLog.Info("fetched servers for project", "project", project.ID, "count", len(data.Servers)) - return data.Servers, nil -} diff --git a/internal/scheduling/reservations/commitments/client_test.go b/internal/scheduling/reservations/commitments/client_test.go index f3a1d0a8f..be2d66ff9 100644 --- a/internal/scheduling/reservations/commitments/client_test.go +++ b/internal/scheduling/reservations/commitments/client_test.go @@ -8,7 +8,6 @@ import ( "encoding/json" "net/http" "net/http/httptest" - "reflect" "strings" "testing" "time" @@ -127,134 +126,6 @@ func TestCommitmentsClient_ListProjects_Error(t *testing.T) { } } -func TestCommitmentsClient_ListFlavorsByName(t *testing.T) { - // Mock server for Nova compute service - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if strings.Contains(r.URL.Path, "/flavors/detail") { - // Return raw JSON string as the gophercloud pages expect - w.Header().Set("Content-Type", "application/json") - _, err := w.Write([]byte(`{ - "flavors": [ - { - "id": "flavor1", - "name": "m1.small", - "ram": 2048, - "vcpus": 1, 
- "disk": 20, - "rxtx_factor": 1.0, - "os-flavor-access:is_public": true, - "OS-FLV-EXT-DATA:ephemeral": 0, - "description": "Small flavor", - "extra_specs": {"hw:cpu_policy": "shared"} - }, - { - "id": "flavor2", - "name": "m1.medium", - "ram": 4096, - "vcpus": 2, - "disk": 40, - "rxtx_factor": 1.0, - "os-flavor-access:is_public": true, - "OS-FLV-EXT-DATA:ephemeral": 0, - "description": "Medium flavor", - "extra_specs": {"hw:cpu_policy": "dedicated"} - } - ] - }`)) - if err != nil { - t.Fatalf("failed to write response: %v", err) - } - return - } - http.NotFound(w, r) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - Microversion: "2.61", - }, - } - - ctx := context.Background() - flavorsByName, err := client.ListFlavorsByName(ctx) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - expectedFlavors := map[string]Flavor{ - "m1.small": { - ID: "flavor1", - Name: "m1.small", - RAM: 2048, - VCPUs: 1, - Disk: 20, - RxTxFactor: 1.0, - IsPublic: true, - Ephemeral: 0, - Description: "Small flavor", - ExtraSpecs: map[string]string{"hw:cpu_policy": "shared"}, - }, - "m1.medium": { - ID: "flavor2", - Name: "m1.medium", - RAM: 4096, - VCPUs: 2, - Disk: 40, - RxTxFactor: 1.0, - IsPublic: true, - Ephemeral: 0, - Description: "Medium flavor", - ExtraSpecs: map[string]string{"hw:cpu_policy": "dedicated"}, - }, - } - - if len(flavorsByName) != len(expectedFlavors) { - t.Fatalf("expected %d flavors, got %d", len(expectedFlavors), len(flavorsByName)) - } - - for name, expected := range expectedFlavors { - actual, exists := flavorsByName[name] - if !exists { - t.Errorf("expected flavor %s to exist", name) - continue - } - if !reflect.DeepEqual(actual, expected) { - t.Errorf("flavor %s: expected %+v, got %+v", name, expected, actual) - } - } -} - -func TestCommitmentsClient_ListFlavorsByName_Error(t 
*testing.T) { - // Mock server that returns an error - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "Service Unavailable", http.StatusServiceUnavailable) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - }, - } - - ctx := context.Background() - flavors, err := client.ListFlavorsByName(ctx) - if err == nil { - t.Fatal("expected error, got nil") - } - if flavors != nil { - t.Errorf("expected nil flavors, got %+v", flavors) - } -} - func TestCommitmentsClient_ListCommitmentsByID(t *testing.T) { // Mock server for Limes service server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -342,147 +213,6 @@ func TestCommitmentsClient_ListCommitmentsByID(t *testing.T) { } } -func TestCommitmentsClient_ListCommitmentsByID_Error(t *testing.T) { - // Mock server that returns an error - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "Unauthorized", http.StatusUnauthorized) - })) - defer server.Close() - - client := &commitmentsClient{ - limes: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - TokenID: "test-token", - }, - Endpoint: server.URL + "/", - }, - } - - projects := []Project{ - {ID: "project1", DomainID: "domain1"}, - } - - ctx := context.Background() - commitments, err := client.ListCommitmentsByID(ctx, projects...) 
- if err == nil { - t.Fatal("expected error, got nil") - } - if commitments != nil { - t.Errorf("expected nil commitments, got %+v", commitments) - } -} - -func TestCommitmentsClient_ListServersByProjectID(t *testing.T) { - // Mock server for Nova compute service - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if strings.Contains(r.URL.Path, "/servers/detail") { - // Parse query parameters to determine which project - tenantID := r.URL.Query().Get("tenant_id") - - // Return raw JSON string as the gophercloud pages expect - w.Header().Set("Content-Type", "application/json") - if tenantID == "project1" { - if _, err := w.Write([]byte(`{ - "servers": [ - { - "id": "server1", - "name": "test-server-1", - "status": "ACTIVE", - "tenant_id": "project1", - "flavor": {"original_name": "m1.small"} - } - ] - }`)); err != nil { - t.Fatalf("failed to write response: %v", err) - } - } else { - if _, err := w.Write([]byte(`{"servers": []}`)); err != nil { - t.Fatalf("failed to write response: %v", err) - } - } - return - } - http.NotFound(w, r) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - }, - } - - projects := []Project{ - {ID: "project1", Name: "Test Project 1"}, - {ID: "project2", Name: "Test Project 2"}, - } - - ctx := context.Background() - serversByProject, err := client.ListServersByProjectID(ctx, projects...) 
- if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if len(serversByProject) != 2 { - t.Fatalf("expected 2 project entries, got %d", len(serversByProject)) - } - - // Check project1 has 1 server - servers1, exists := serversByProject["project1"] - if !exists { - t.Fatal("expected project1 to exist in results") - } - if len(servers1) != 1 { - t.Fatalf("expected 1 server for project1, got %d", len(servers1)) - } - if servers1[0].ID != "server1" { - t.Errorf("expected server ID server1, got %s", servers1[0].ID) - } - - // Check project2 has 0 servers - servers2, exists := serversByProject["project2"] - if !exists { - t.Fatal("expected project2 to exist in results") - } - if len(servers2) != 0 { - t.Fatalf("expected 0 servers for project2, got %d", len(servers2)) - } -} - -func TestCommitmentsClient_ListServersByProjectID_Error(t *testing.T) { - // Mock server that returns an error - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "Forbidden", http.StatusForbidden) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - }, - } - - projects := []Project{ - {ID: "project1"}, - } - - ctx := context.Background() - servers, err := client.ListServersByProjectID(ctx, projects...) 
- if err == nil { - t.Fatal("expected error, got nil") - } - if servers != nil { - t.Errorf("expected nil servers, got %+v", servers) - } -} - func TestCommitmentsClient_listCommitments(t *testing.T) { // Mock server for Limes service server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -625,136 +355,6 @@ func TestCommitmentsClient_listCommitments_JSONError(t *testing.T) { } } -func TestCommitmentsClient_listServersForProject(t *testing.T) { - // Mock server for Nova compute service - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if !strings.Contains(r.URL.Path, "/servers/detail") { - http.NotFound(w, r) - return - } - - // Verify query parameters - query := r.URL.Query() - if query.Get("all_tenants") != "true" { - t.Errorf("expected all_tenants=true, got %s", query.Get("all_tenants")) - } - if query.Get("tenant_id") != "test-project" { - t.Errorf("expected tenant_id=test-project, got %s", query.Get("tenant_id")) - } - - // Return raw JSON string as the gophercloud pages expect - w.Header().Set("Content-Type", "application/json") - if _, err := w.Write([]byte(`{ - "servers": [ - { - "id": "server1", - "name": "test-server", - "status": "ACTIVE", - "tenant_id": "test-project", - "flavor": {"original_name": "m1.small"} - }, - { - "id": "server2", - "name": "another-server", - "status": "ACTIVE", - "tenant_id": "test-project", - "flavor": {"original_name": "m1.medium"} - } - ] - }`)); err != nil { - t.Fatalf("failed to write response: %v", err) - } - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - }, - } - - project := Project{ - ID: "test-project", - Name: "Test Project", - } - - ctx := context.Background() - servers, err := client.listServersForProject(ctx, project) - if err != nil { - t.Fatalf("unexpected 
error: %v", err) - } - - if len(servers) != 2 { - t.Fatalf("expected 2 servers, got %d", len(servers)) - } - - expectedServers := []Server{ - { - ID: "server1", - Name: "test-server", - Status: "ACTIVE", - TenantID: "test-project", - FlavorName: "m1.small", - }, - { - ID: "server2", - Name: "another-server", - Status: "ACTIVE", - TenantID: "test-project", - FlavorName: "m1.medium", - }, - } - - for i, expected := range expectedServers { - if servers[i].ID != expected.ID { - t.Errorf("server %d: expected ID %s, got %s", i, expected.ID, servers[i].ID) - } - if servers[i].Name != expected.Name { - t.Errorf("server %d: expected Name %s, got %s", i, expected.Name, servers[i].Name) - } - if servers[i].Status != expected.Status { - t.Errorf("server %d: expected Status %s, got %s", i, expected.Status, servers[i].Status) - } - if servers[i].TenantID != expected.TenantID { - t.Errorf("server %d: expected TenantID %s, got %s", i, expected.TenantID, servers[i].TenantID) - } - if servers[i].FlavorName != expected.FlavorName { - t.Errorf("server %d: expected FlavorName %s, got %s", i, expected.FlavorName, servers[i].FlavorName) - } - } -} - -func TestCommitmentsClient_listServersForProject_Error(t *testing.T) { - // Mock server that returns an error - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "Internal Server Error", http.StatusInternalServerError) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL, - }, - } - - project := Project{ID: "test-project"} - - ctx := context.Background() - servers, err := client.listServersForProject(ctx, project) - if err == nil { - t.Fatal("expected error, got nil") - } - if servers != nil { - t.Errorf("expected nil servers, got %+v", servers) - } -} - func TestCommitmentsClient_ContextCancellation(t *testing.T) { // Test 
context cancellation handling slowServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/internal/scheduling/reservations/commitments/config.go b/internal/scheduling/reservations/commitments/config.go new file mode 100644 index 000000000..95dc904d8 --- /dev/null +++ b/internal/scheduling/reservations/commitments/config.go @@ -0,0 +1,22 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import "time" + +// Config defines the configuration for the commitments HTTP API. +type Config struct { + // how long to wait for reservations to become ready before timing out and rolling back. + ChangeAPIWatchReservationsTimeout time.Duration `json:"changeAPIWatchReservationsTimeout"` + + // how frequently to poll reservation status during watch. + ChangeAPIWatchReservationsPollInterval time.Duration `json:"changeAPIWatchReservationsPollInterval"` +} + +func DefaultConfig() Config { + return Config{ + ChangeAPIWatchReservationsTimeout: 2 * time.Second, + ChangeAPIWatchReservationsPollInterval: 100 * time.Millisecond, + } +} diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go new file mode 100644 index 000000000..21ee1fee1 --- /dev/null +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -0,0 +1,318 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "fmt" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/go-logr/logr" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// ReservationManager handles CRUD 
operations for Reservation CRDs. +type ReservationManager struct { + client.Client +} + +func NewReservationManager(k8sClient client.Client) *ReservationManager { + return &ReservationManager{ + Client: k8sClient, + } +} + +// ApplyCommitmentState synchronizes Reservation CRDs to match the desired commitment state. +// This function performs CRUD operations (create/update/delete) on reservation slots to align +// with the capacity specified in desiredState. +// +// Entry points: +// - from Syncer - periodic sync with Limes state +// - from API ChangeCommitmentsHandler - batch processing of commitment changes +// +// The function is idempotent and handles: +// - Repairing inconsistent slots (wrong flavor group/project) +// - Creating new reservation slots when capacity increases +// - Deleting unused/excess slots when capacity decreases +// - Syncing reservation metadata for all remaining slots +// +// Returns touched reservations (created/updated) and removed reservations for caller tracking. 
+func (m *ReservationManager) ApplyCommitmentState( + ctx context.Context, + log logr.Logger, + desiredState *CommitmentState, + flavorGroups map[string]compute.FlavorGroupFeature, + creator string, +) (touchedReservations, removedReservations []v1alpha1.Reservation, err error) { + + log = log.WithName("ReservationManager") + + // Phase 1: List and filter existing reservations for this commitment + var allReservations v1alpha1.ReservationList + if err := m.List(ctx, &allReservations, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + return nil, nil, fmt.Errorf("failed to list reservations: %w", err) + } + + // Filter by name prefix to find reservations for this commitment + namePrefix := fmt.Sprintf("commitment-%s-", desiredState.CommitmentUUID) + var existing []v1alpha1.Reservation + for _, res := range allReservations.Items { + if len(res.Name) >= len(namePrefix) && res.Name[:len(namePrefix)] == namePrefix { + existing = append(existing, res) + } + } + + // Phase 2: Calculate memory delta (desired - current) + flavorGroup, exists := flavorGroups[desiredState.FlavorGroupName] + + if !exists { + return nil, nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName) + } + deltaMemoryBytes := desiredState.TotalMemoryBytes + for _, res := range existing { + memoryQuantity := res.Spec.Resources[hv1.ResourceMemory] + deltaMemoryBytes -= memoryQuantity.Value() + } + + log.Info("applying commitment state", + "commitmentUUID", desiredState.CommitmentUUID, + "desiredMemoryBytes", desiredState.TotalMemoryBytes, + "deltaMemoryBytes", deltaMemoryBytes, + "existingSlots", len(existing), + ) + + nextSlotIndex := GetNextSlotIndex(existing) + + // Phase 3 (DELETE): Delete inconsistent reservations (wrong flavor group/project) + // They will be recreated with correct metadata in subsequent phases. 
+ var validReservations []v1alpha1.Reservation + for _, res := range existing { + if res.Spec.CommittedResourceReservation.ResourceGroup != desiredState.FlavorGroupName || + res.Spec.CommittedResourceReservation.ProjectID != desiredState.ProjectID { + log.Info("Found a reservation with wrong flavor group or project, delete and recreate afterward", + "commitmentUUID", desiredState.CommitmentUUID, + "name", res.Name, + "expectedFlavorGroup", desiredState.FlavorGroupName, + "actualFlavorGroup", res.Spec.CommittedResourceReservation.ResourceGroup, + "expectedProjectID", desiredState.ProjectID, + "actualProjectID", res.Spec.CommittedResourceReservation.ProjectID) + removedReservations = append(removedReservations, res) + memValue := res.Spec.Resources[hv1.ResourceMemory] + deltaMemoryBytes += memValue.Value() + + if err := m.Delete(ctx, &res); err != nil { + return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", res.Name, err) + } + } else { + validReservations = append(validReservations, res) + } + } + existing = validReservations + + // Phase 4 (DELETE): Remove reservations (capacity decreased) + for deltaMemoryBytes < 0 && len(existing) > 0 { + // prefer unused reservation slot or simply remove last one + var reservationToDelete *v1alpha1.Reservation + for i, res := range existing { + if len(res.Spec.CommittedResourceReservation.Allocations) == 0 { + reservationToDelete = &res + existing = append(existing[:i], existing[i+1:]...) 
// remove from existing list + break + } + } + if reservationToDelete == nil { + reservationToDelete = &existing[len(existing)-1] + existing = existing[:len(existing)-1] // remove from existing list + } + removedReservations = append(removedReservations, *reservationToDelete) + memValue := reservationToDelete.Spec.Resources[hv1.ResourceMemory] + deltaMemoryBytes += memValue.Value() + + log.Info("deleting reservation (capacity decrease)", + "commitmentUUID", desiredState.CommitmentUUID, + "deltaMemoryBytes", deltaMemoryBytes, + "name", reservationToDelete.Name, + "numAllocations", len(reservationToDelete.Spec.CommittedResourceReservation.Allocations), + "memoryBytes", memValue.Value()) + + if err := m.Delete(ctx, reservationToDelete); err != nil { + return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err) + } + } + + // Phase 5 (CREATE): Create new reservations (capacity increased) + for deltaMemoryBytes > 0 { + // Need to create new reservation slots, always prefer largest flavor within the group + // TODO more sophisticated flavor selection, especially with flavors of different cpu/memory ratio + reservation := m.newReservation(desiredState, nextSlotIndex, deltaMemoryBytes, flavorGroup, creator) + touchedReservations = append(touchedReservations, *reservation) + memValue := reservation.Spec.Resources[hv1.ResourceMemory] + deltaMemoryBytes -= memValue.Value() + + log.Info("creating reservation", + "commitmentUUID", desiredState.CommitmentUUID, + "deltaMemoryBytes", deltaMemoryBytes, + "name", reservation.Name, + "memoryBytes", memValue.Value()) + + if err := m.Create(ctx, reservation); err != nil { + if apierrors.IsAlreadyExists(err) { + return touchedReservations, removedReservations, fmt.Errorf( + "reservation %s already exists (collision detected): %w", + reservation.Name, err) + } + return touchedReservations, removedReservations, fmt.Errorf( + "failed to create reservation slot %d: %w", + 
nextSlotIndex, err) + } + + nextSlotIndex++ + } + + // Phase 6 (UPDATE): Sync metadata for remaining reservations + for i := range existing { + updated, err := m.syncReservationMetadata(ctx, log, &existing[i], desiredState) + if err != nil { + return touchedReservations, removedReservations, err + } + if updated != nil { + touchedReservations = append(touchedReservations, *updated) + } + } + + log.Info("completed commitment state sync", + "commitmentUUID", desiredState.CommitmentUUID, + "totalReservations", len(existing), + "created", len(touchedReservations)-len(existing), + "deleted", len(removedReservations)) + + return touchedReservations, removedReservations, nil +} + +// syncReservationMetadata updates reservation metadata if it differs from desired state. +func (m *ReservationManager) syncReservationMetadata( + ctx context.Context, + log logr.Logger, + reservation *v1alpha1.Reservation, + state *CommitmentState, +) (*v1alpha1.Reservation, error) { + + // if any of CommitmentUUID, AZ, StarTime, EndTime differ from desired state, need to patch + if (state.CommitmentUUID != "" && reservation.Spec.CommittedResourceReservation.CommitmentUUID != state.CommitmentUUID) || + (state.AvailabilityZone != "" && reservation.Spec.AvailabilityZone != state.AvailabilityZone) || + (state.StartTime != nil && (reservation.Spec.StartTime == nil || !reservation.Spec.StartTime.Time.Equal(*state.StartTime))) || + (state.EndTime != nil && (reservation.Spec.EndTime == nil || !reservation.Spec.EndTime.Time.Equal(*state.EndTime))) { + // Apply patch + log.Info("syncing reservation metadata", + "reservation", reservation, + "desired commitmentUUID", state.CommitmentUUID, + "desired availabilityZone", state.AvailabilityZone, + "desired startTime", state.StartTime, + "desired endTime", state.EndTime) + + patch := client.MergeFrom(reservation.DeepCopy()) + + if state.CommitmentUUID != "" { + reservation.Spec.CommittedResourceReservation.CommitmentUUID = state.CommitmentUUID + } + + if 
state.AvailabilityZone != "" { + reservation.Spec.AvailabilityZone = state.AvailabilityZone + } + if state.StartTime != nil { + reservation.Spec.StartTime = &metav1.Time{Time: *state.StartTime} + } + if state.EndTime != nil { + reservation.Spec.EndTime = &metav1.Time{Time: *state.EndTime} + } + + if err := m.Patch(ctx, reservation, patch); err != nil { + return nil, fmt.Errorf("failed to patch reservation %s: %w", + reservation.Name, err) + } + + return reservation, nil + } else { + return nil, nil // No changes needed + } +} + +func (m *ReservationManager) newReservation( + state *CommitmentState, + slotIndex int, + deltaMemoryBytes int64, + flavorGroup compute.FlavorGroupFeature, + creator string, +) *v1alpha1.Reservation { + + name := fmt.Sprintf("commitment-%s-%d", state.CommitmentUUID, slotIndex) + + // Select first flavor that fits remaining memory (flavors sorted descending by size) + flavorInGroup := flavorGroup.Flavors[len(flavorGroup.Flavors)-1] // default to smallest + memoryBytes := deltaMemoryBytes + cpus := int64(flavorInGroup.VCPUs) //nolint:gosec // VCPUs from flavor specs, realistically bounded + + for _, flavor := range flavorGroup.Flavors { + flavorMemoryBytes := int64(flavor.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory from specs, realistically bounded + if flavorMemoryBytes <= deltaMemoryBytes { + flavorInGroup = flavor + memoryBytes = flavorMemoryBytes + cpus = int64(flavorInGroup.VCPUs) //nolint:gosec // VCPUs from flavor specs, realistically bounded + break + } + } + + spec := v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity( + memoryBytes, + resource.BinarySI, + ), + hv1.ResourceCPU: *resource.NewQuantity( + cpus, + resource.DecimalSI, + ), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: state.ProjectID, + CommitmentUUID: state.CommitmentUUID, + DomainID: 
state.DomainID, + ResourceGroup: state.FlavorGroupName, + ResourceName: flavorInGroup.Name, + Creator: creator, + Allocations: nil, + }, + } + + // Set AvailabilityZone if specified + if state.AvailabilityZone != "" { + spec.AvailabilityZone = state.AvailabilityZone + } + + // Set validity times if specified + if state.StartTime != nil { + spec.StartTime = &metav1.Time{Time: *state.StartTime} + } + if state.EndTime != nil { + spec.EndTime = &metav1.Time{Time: *state.EndTime} + } + + return &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: spec, + } +} diff --git a/internal/scheduling/reservations/commitments/reservation_manager_test.go b/internal/scheduling/reservations/commitments/reservation_manager_test.go new file mode 100644 index 000000000..8022999fb --- /dev/null +++ b/internal/scheduling/reservations/commitments/reservation_manager_test.go @@ -0,0 +1,541 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/go-logr/logr" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestApplyCommitmentState_CreatesNewReservations(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state: 3 multiples of smallest flavor (24 GiB) + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 3 * 8192 * 1024 * 1024, + } + + touched, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(removed) != 0 { + t.Errorf("expected 0 removed reservations, got %d", len(removed)) + } + + // Should create reservations to fulfill the commitment + if len(touched) == 0 { + t.Fatal("expected at least one reservation to be created") + } + + // Verify created reservations sum to desired state + totalMemory := int64(0) + for _, res := range touched { + memQuantity := res.Spec.Resources[hv1.ResourceMemory] + totalMemory += memQuantity.Value() + } + + if totalMemory != desiredState.TotalMemoryBytes { + t.Errorf("expected total memory %d, got %d", desiredState.TotalMemoryBytes, totalMemory) + } +} + +func TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create existing reservations (32 GiB total) + existingReservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + 
ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, + }, + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(&existingReservations[0], &existingReservations[1]). + Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state: only 8 GiB (need to reduce) + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 8 * 1024 * 1024 * 1024, + } + + _, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Note: May create a new 8GiB reservation while removing the two 16GiB ones + // This is expected behavior based on the slot sizing algorithm + + // Should remove excess reservations + if len(removed) == 0 { + t.Fatal("expected reservations to be removed") + } + + // Verify remaining capacity matches desired state + var remainingList v1alpha1.ReservationList + if err := client.List(context.Background(), &remainingList); err != nil { + t.Fatal(err) + } + + totalMemory := int64(0) + for _, res := range remainingList.Items 
{ + memQuantity := res.Spec.Resources[hv1.ResourceMemory] + totalMemory += memQuantity.Value() + } + + if totalMemory != desiredState.TotalMemoryBytes { + t.Errorf("expected remaining memory %d, got %d", desiredState.TotalMemoryBytes, totalMemory) + } +} + +func TestApplyCommitmentState_PreservesAllocatedReservations(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create reservations: one with allocation, one without + existingReservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{ + "vm-123": {}, // Has allocation + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, // No allocation + }, + }, + }, + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(&existingReservations[0], &existingReservations[1]). 
+ Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state: only 16 GiB (need to reduce by one slot) + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 16 * 1024 * 1024 * 1024, + } + + _, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Should remove the unallocated reservation, not the allocated one + if len(removed) != 1 { + t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + } + + // Verify the removed one had no allocations + if len(removed[0].Spec.CommittedResourceReservation.Allocations) != 0 { + t.Error("expected unallocated reservation to be removed first") + } + + // Verify the allocated reservation still exists + var remainingList v1alpha1.ReservationList + if err := client.List(context.Background(), &remainingList); err != nil { + t.Fatal(err) + } + + if len(remainingList.Items) != 1 { + t.Fatalf("expected 1 remaining reservation, got %d", len(remainingList.Items)) + } + + // Verify the remaining one has the allocation + if len(remainingList.Items[0].Spec.CommittedResourceReservation.Allocations) == 0 { + t.Error("expected allocated reservation to be preserved") + } +} + +func TestApplyCommitmentState_HandlesZeroCapacity(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create existing reservation + existingReservation := v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: 
map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(&existingReservation). + Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state: zero capacity (commitment expired or canceled) + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 0, + } + + touched, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(touched) != 0 { + t.Errorf("expected 0 new reservations, got %d", len(touched)) + } + + // Should remove all reservations + if len(removed) != 1 { + t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + } + + // Verify no reservations remain + var remainingList v1alpha1.ReservationList + if err := client.List(context.Background(), &remainingList); err != nil { + t.Fatal(err) + } + + if len(remainingList.Items) != 0 { + t.Errorf("expected 0 remaining reservations, got %d", len(remainingList.Items)) + } +} + +func TestApplyCommitmentState_FixesWrongFlavorGroup(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create reservation with wrong flavor group + existingReservation := v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + 
v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "wrong-group", // Wrong flavor group + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(&existingReservation). + Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state with correct flavor group + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 8 * 1024 * 1024 * 1024, + } + + touched, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Should remove wrong reservation and create new one + if len(removed) != 1 { + t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + } + + if len(touched) != 1 { + t.Fatalf("expected 1 new reservation, got %d", len(touched)) + } + + // Verify new reservation has correct flavor group + if touched[0].Spec.CommittedResourceReservation.ResourceGroup != "test-group" { + t.Errorf("expected flavor group test-group, got %s", + touched[0].Spec.CommittedResourceReservation.ResourceGroup) + } +} + +func TestApplyCommitmentState_UnknownFlavorGroup(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + manager := NewReservationManager(client) + flavorGroups := map[string]compute.FlavorGroupFeature{} // Empty + + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "unknown-group", + TotalMemoryBytes: 8 * 1024 * 1024 * 1024, + } + + _, _, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err == nil { + t.Fatal("expected error for unknown flavor group, got nil") + } +} + +func TestNewReservation_SelectsAppropriateFlavor(t *testing.T) { + manager := &ReservationManager{} + flavorGroup := testFlavorGroup() + + tests := []struct { + name string + deltaMemory int64 + expectedName string + expectedCores int64 + }{ + { + name: "fits large flavor", + deltaMemory: 32768 * 1024 * 1024, // 32 GiB + expectedName: "large", + expectedCores: 16, + }, + { + name: "fits medium flavor", + deltaMemory: 16384 * 1024 * 1024, // 16 GiB + expectedName: "medium", + expectedCores: 8, + }, + { + name: "fits small flavor", + deltaMemory: 8192 * 1024 * 1024, // 8 GiB + expectedName: "small", + expectedCores: 4, + }, + { + name: "oversized uses largest available flavor", + deltaMemory: 100 * 1024 * 1024 * 1024, // 100 GiB (larger than any flavor) + expectedName: "large", // Will use largest available + expectedCores: 16, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + state := &CommitmentState{ + CommitmentUUID: "test-uuid", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: tt.deltaMemory, + } + + reservation := manager.newReservation( + state, + 0, + tt.deltaMemory, + flavorGroup, + "syncer", + ) + + // Verify flavor selection + if reservation.Spec.CommittedResourceReservation.ResourceName != tt.expectedName { + t.Errorf("expected flavor %s, got %s", + tt.expectedName, + reservation.Spec.CommittedResourceReservation.ResourceName) + } + + // Verify CPU allocation + cpuQuantity := 
reservation.Spec.Resources[hv1.ResourceCPU] + if cpuQuantity.Value() != tt.expectedCores { + t.Errorf("expected %d cores, got %d", + tt.expectedCores, cpuQuantity.Value()) + } + }) + } +} diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go new file mode 100644 index 000000000..50108beef --- /dev/null +++ b/internal/scheduling/reservations/commitments/state.go @@ -0,0 +1,202 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "errors" + "fmt" + "strings" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/sapcc/go-api-declarations/liquid" + ctrl "sigs.k8s.io/controller-runtime" +) + +var stateLog = ctrl.Log.WithName("commitment_state") + +// Limes LIQUID resource naming convention: ram_ +const commitmentResourceNamePrefix = "ram_" + +func getFlavorGroupNameFromResource(resourceName string) (string, error) { + if !strings.HasPrefix(resourceName, commitmentResourceNamePrefix) { + return "", fmt.Errorf("invalid resource name: %s", resourceName) + } + return strings.TrimPrefix(resourceName, commitmentResourceNamePrefix), nil +} + +// CommitmentState represents desired or current commitment resource allocation. +type CommitmentState struct { + // CommitmentUUID is the UUID of the commitment this state corresponds to. 
+ CommitmentUUID string + // ProjectID is the OpenStack project this commitment belongs to + ProjectID string + // DomainID is the OpenStack domain this commitment belongs to + DomainID string + // FlavorGroupName identifies the flavor group (e.g., "hana_medium_v2") + FlavorGroupName string + // the total memory in bytes across all reservation slots + TotalMemoryBytes int64 + // AvailabilityZone specifies the availability zone for this commitment + AvailabilityZone string + // StartTime is when the commitment becomes active + StartTime *time.Time + // EndTime is when the commitment expires + EndTime *time.Time +} + +// FromCommitment converts Limes commitment to CommitmentState. +func FromCommitment( + commitment Commitment, + flavorGroup compute.FlavorGroupFeature, +) (*CommitmentState, error) { + + flavorGroupName, err := getFlavorGroupNameFromResource(commitment.ResourceName) + if err != nil { + return nil, err + } + + // Calculate total memory from commitment amount (amount = multiples of smallest flavor) + smallestFlavorMemoryBytes := int64(flavorGroup.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory from specs, realistically bounded + totalMemoryBytes := int64(commitment.Amount) * smallestFlavorMemoryBytes //nolint:gosec // commitment amount from Limes API, bounded by quota limits + + // Set start time: use ConfirmedAt if available, otherwise CreatedAt + var startTime *time.Time + if commitment.ConfirmedAt != nil { + t := time.Unix(int64(*commitment.ConfirmedAt), 0) //nolint:gosec // timestamp from Limes API, realistically bounded + startTime = &t + } else { + t := time.Unix(int64(commitment.CreatedAt), 0) //nolint:gosec // timestamp from Limes API, realistically bounded + startTime = &t + } + + // Set end time from ExpiresAt + var endTime *time.Time + if commitment.ExpiresAt > 0 { + t := time.Unix(int64(commitment.ExpiresAt), 0) //nolint:gosec // timestamp from Limes API, realistically bounded + endTime = &t + } + + return 
&CommitmentState{ + CommitmentUUID: commitment.UUID, + ProjectID: commitment.ProjectID, + DomainID: commitment.DomainID, + FlavorGroupName: flavorGroupName, + TotalMemoryBytes: totalMemoryBytes, + AvailabilityZone: commitment.AvailabilityZone, + StartTime: startTime, + EndTime: endTime, + }, nil +} + +// FromChangeCommitmentTargetState converts LIQUID API request to CommitmentState. +func FromChangeCommitmentTargetState( + commitment liquid.Commitment, + projectID string, + flavorGroupName string, + flavorGroup compute.FlavorGroupFeature, + az string, +) (*CommitmentState, error) { + + amountMultiple := uint64(0) + var startTime *time.Time + var endTime *time.Time + + switch commitment.NewStatus.UnwrapOr("none") { + // guaranteed and confirmed commitments are honored with start time now + case liquid.CommitmentStatusGuaranteed, liquid.CommitmentStatusConfirmed: + amountMultiple = commitment.Amount + // Set start time to now for active commitments + now := time.Now() + startTime = &now + } + + // ConfirmBy is ignored for now + // TODO do more sophisticated handling of guaranteed commitments + + // Set end time if not zero (commitments can have no expiry) + if !commitment.ExpiresAt.IsZero() { + endTime = &commitment.ExpiresAt + // check expiry time + if commitment.ExpiresAt.Before(time.Now()) || commitment.ExpiresAt.Equal(time.Now()) { + // commitment is already expired, ignore capacity + amountMultiple = 0 + } + } + + // Flavors are sorted by size descending, so the last one is the smallest + smallestFlavor := flavorGroup.SmallestFlavor + smallestFlavorMemoryBytes := int64(smallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory from specs, realistically bounded + + // Amount represents multiples of the smallest flavor in the group + totalMemoryBytes := int64(amountMultiple) * smallestFlavorMemoryBytes //nolint:gosec // commitment amount from Limes API, bounded by quota limits + + return &CommitmentState{ + CommitmentUUID: string(commitment.UUID), + 
ProjectID: projectID, + FlavorGroupName: flavorGroupName, + TotalMemoryBytes: totalMemoryBytes, + AvailabilityZone: az, + StartTime: startTime, + EndTime: endTime, + }, nil +} + +// FromReservations reconstructs CommitmentState from existing Reservation CRDs. +func FromReservations(reservations []v1alpha1.Reservation) (*CommitmentState, error) { + if len(reservations) == 0 { + return nil, errors.New("no reservations provided") + } + + // Extract commitment metadata from first reservation + first := reservations[0] + if first.Spec.CommittedResourceReservation == nil { + return nil, errors.New("not a committed resource reservation") + } + + state := &CommitmentState{ + CommitmentUUID: extractCommitmentUUID(first.Name), + ProjectID: first.Spec.CommittedResourceReservation.ProjectID, + DomainID: first.Spec.CommittedResourceReservation.DomainID, + FlavorGroupName: first.Spec.CommittedResourceReservation.ResourceGroup, + TotalMemoryBytes: 0, + AvailabilityZone: first.Spec.AvailabilityZone, + } + + if first.Spec.StartTime != nil { + state.StartTime = &first.Spec.StartTime.Time + } + if first.Spec.EndTime != nil { + state.EndTime = &first.Spec.EndTime.Time + } + + // Sum memory across all reservations + for _, res := range reservations { + if res.Spec.CommittedResourceReservation == nil { + return nil, errors.New("unexpected reservation type of reservation " + res.Name) + } + // check if it belongs to the same commitment + if extractCommitmentUUID(res.Name) != state.CommitmentUUID { + return nil, errors.New("reservation " + res.Name + " does not belong to commitment " + state.CommitmentUUID) + } + // check flavor group consistency, ignore if not matching to repair corrupted state in k8s + if res.Spec.CommittedResourceReservation.ResourceGroup != state.FlavorGroupName { + // log message + stateLog.Error(errors.New("inconsistent flavor group in reservation"), + "reservation belongs to same commitment but has different flavor group - ignoring reservation for capacity 
calculation", + "reservationName", res.Name, + "expectedFlavorGroup", state.FlavorGroupName, + "actualFlavorGroup", res.Spec.CommittedResourceReservation.ResourceGroup, + ) + continue + } + + memoryQuantity := res.Spec.Resources["memory"] + memoryBytes := memoryQuantity.Value() + state.TotalMemoryBytes += memoryBytes + } + + return state, nil +} diff --git a/internal/scheduling/reservations/commitments/state_test.go b/internal/scheduling/reservations/commitments/state_test.go new file mode 100644 index 000000000..7060300db --- /dev/null +++ b/internal/scheduling/reservations/commitments/state_test.go @@ -0,0 +1,253 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Test helper: creates a minimal flavor group for testing +func testFlavorGroup() compute.FlavorGroupFeature { + return compute.FlavorGroupFeature{ + Name: "test-group", + Flavors: []compute.FlavorInGroup{ + {Name: "large", VCPUs: 16, MemoryMB: 32768, DiskGB: 100}, + {Name: "medium", VCPUs: 8, MemoryMB: 16384, DiskGB: 50}, + {Name: "small", VCPUs: 4, MemoryMB: 8192, DiskGB: 25}, + }, + SmallestFlavor: compute.FlavorInGroup{ + Name: "small", VCPUs: 4, MemoryMB: 8192, DiskGB: 25, + }, + LargestFlavor: compute.FlavorInGroup{ + Name: "large", VCPUs: 16, MemoryMB: 32768, DiskGB: 100, + }, + } +} + +func TestFromCommitment_CalculatesMemoryCorrectly(t *testing.T) { + flavorGroup := testFlavorGroup() + commitment := Commitment{ + UUID: "test-uuid", + ProjectID: "project-1", + ResourceName: "ram_test-group", + Amount: 5, // 5 multiples of smallest flavor + } + + state, err := FromCommitment(commitment, flavorGroup) + if err != nil { + t.Fatalf("unexpected 
error: %v", err)
	}

	// Verify basic fields
	if state.CommitmentUUID != "test-uuid" {
		t.Errorf("expected UUID test-uuid, got %s", state.CommitmentUUID)
	}
	if state.ProjectID != "project-1" {
		t.Errorf("expected ProjectID project-1, got %s", state.ProjectID)
	}
	if state.FlavorGroupName != "test-group" {
		t.Errorf("expected FlavorGroupName test-group, got %s", state.FlavorGroupName)
	}

	// Verify memory calculation: 5 * 8192 MB = 40960 MB = 42949672960 bytes
	expectedMemory := int64(5 * 8192 * 1024 * 1024)
	if state.TotalMemoryBytes != expectedMemory {
		t.Errorf("expected memory %d, got %d", expectedMemory, state.TotalMemoryBytes)
	}
}

func TestFromCommitment_InvalidResourceName(t *testing.T) {
	flavorGroup := testFlavorGroup()
	commitment := Commitment{
		UUID:         "test-uuid",
		ProjectID:    "project-1",
		ResourceName: "invalid_resource_name", // missing "ram_" prefix
		Amount:       1,
	}

	_, err := FromCommitment(commitment, flavorGroup)
	if err == nil {
		t.Fatal("expected error for invalid resource name, got nil")
	}
}

func TestFromReservations_SumsMemoryCorrectly(t *testing.T) {
	reservations := []v1alpha1.Reservation{
		{
			ObjectMeta: metav1.ObjectMeta{
				Name: "commitment-abc123-0",
			},
			Spec: v1alpha1.ReservationSpec{
				Resources: map[hv1.ResourceName]resource.Quantity{
					hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), // 8 GiB
				},
				CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
					ProjectID:     "project-1",
					ResourceGroup: "test-group",
				},
			},
		},
		{
			ObjectMeta: metav1.ObjectMeta{
				Name: "commitment-abc123-1",
			},
			Spec: v1alpha1.ReservationSpec{
				Resources: map[hv1.ResourceName]resource.Quantity{
					hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), // 16 GiB
				},
				CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
					ProjectID:     "project-1",
					ResourceGroup: "test-group",
				},
			},
		},
	}

	state, err
:= FromReservations(reservations) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Verify fields extracted from first reservation + if state.CommitmentUUID != "abc123" { + t.Errorf("expected UUID abc123, got %s", state.CommitmentUUID) + } + if state.ProjectID != "project-1" { + t.Errorf("expected ProjectID project-1, got %s", state.ProjectID) + } + if state.FlavorGroupName != "test-group" { + t.Errorf("expected FlavorGroupName test-group, got %s", state.FlavorGroupName) + } + + // Verify memory is summed correctly: 8 GiB + 16 GiB = 24 GiB + expectedMemory := int64(24 * 1024 * 1024 * 1024) + if state.TotalMemoryBytes != expectedMemory { + t.Errorf("expected memory %d, got %d", expectedMemory, state.TotalMemoryBytes) + } +} + +func TestFromReservations_EmptyList(t *testing.T) { + _, err := FromReservations([]v1alpha1.Reservation{}) + if err == nil { + t.Fatal("expected error for empty reservation list, got nil") + } +} + +func TestFromReservations_SkipsInconsistentFlavorGroup(t *testing.T) { + reservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "wrong-group", // Different flavor group + }, + }, + }, + } + + state, err := FromReservations(reservations) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } 
+ + // Should only count first reservation with matching flavor group + expectedMemory := int64(8 * 1024 * 1024 * 1024) + if state.TotalMemoryBytes != expectedMemory { + t.Errorf("expected memory %d (ignoring inconsistent reservation), got %d", + expectedMemory, state.TotalMemoryBytes) + } +} + +func TestFromReservations_MixedCommitmentUUIDs(t *testing.T) { + reservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + }, + Spec: v1alpha1.ReservationSpec{ + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-xyz789-0", // Different commitment UUID + }, + Spec: v1alpha1.ReservationSpec{ + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + }, + }, + }, + } + + _, err := FromReservations(reservations) + if err == nil { + t.Fatal("expected error for mixed commitment UUIDs, got nil") + } +} + +func TestFromReservations_NonCommittedResourceType(t *testing.T) { + reservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, // Wrong type + }, + }, + } + + _, err := FromReservations(reservations) + if err == nil { + t.Fatal("expected error for non-CR reservation type, got nil") + } +} + +func TestGetFlavorGroupNameFromResource_Valid(t *testing.T) { + name, err := getFlavorGroupNameFromResource("ram_hana_medium_v2") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if name != "hana_medium_v2" { + t.Errorf("expected hana_medium_v2, got %s", name) + } +} + +func TestGetFlavorGroupNameFromResource_Invalid(t *testing.T) { + _, err := getFlavorGroupNameFromResource("invalid_resource") + if err == nil { + t.Fatal("expected error for invalid resource name, got nil") 
+ } +} diff --git a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go index 970a44b26..b9e6fe3b4 100644 --- a/internal/scheduling/reservations/commitments/syncer.go +++ b/internal/scheduling/reservations/commitments/syncer.go @@ -5,24 +5,20 @@ package commitments import ( "context" - "errors" "fmt" - "slices" - "sort" "strings" - - ctrl "sigs.k8s.io/controller-runtime" + "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" - k8serrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/api/resource" - "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) var ( - syncLog = ctrl.Log.WithName("sync") // CreatorValue identifies reservations created by this syncer. CreatorValue = "commitments-syncer" ) @@ -35,13 +31,12 @@ type SyncerConfig struct { } type Syncer struct { - // Client to fetch commitments. + // Client to fetch commitments from Limes CommitmentsClient - // Client for the kubernetes API. + // Kubernetes client for CRD operations client.Client } -// Create a new compute reservation syncer. func NewSyncer(k8sClient client.Client) *Syncer { return &Syncer{ CommitmentsClient: NewCommitmentsClient(), @@ -49,233 +44,175 @@ func NewSyncer(k8sClient client.Client) *Syncer { } } -// Initialize the syncer. func (s *Syncer) Init(ctx context.Context, config SyncerConfig) error { - // Initialize the syncer. if err := s.CommitmentsClient.Init(ctx, s.Client, config); err != nil { return err } return nil } -// Helper struct to unify the commitment with metadata needed for reservation creation. 
-type resolvedCommitment struct { - Commitment - Flavor Flavor -} - -// Get all compute commitments that should be converted to reservations. -func (s *Syncer) resolveUnusedCommitments(ctx context.Context) ([]resolvedCommitment, error) { - // Get all data we need from the openstack services. +func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavorGroups map[string]compute.FlavorGroupFeature) ([]*CommitmentState, error) { allProjects, err := s.ListProjects(ctx) if err != nil { return nil, err } - flavors, err := s.ListFlavorsByName(ctx) - if err != nil { - return nil, err - } commitments, err := s.ListCommitmentsByID(ctx, allProjects...) if err != nil { return nil, err } - // Remove non-compute/non-instance commitments or commitments we can't resolve. - var resolvedCommitments []resolvedCommitment + // Filter for compute commitments with RAM flavor group resources + var commitmentStates []*CommitmentState for id, commitment := range commitments { if commitment.ServiceType != "compute" { - delete(commitments, id) - syncLog.Info("skipping non-compute commitment", "id", id, "serviceType", commitment.ServiceType) - continue - } - if !strings.HasPrefix(commitment.ResourceName, "instances_") { - syncLog.Info("skipping non-instance commitment", "id", id, "resourceName", commitment.ResourceName) - delete(commitments, id) + log.Info("skipping non-compute commitment", "id", id, "serviceType", commitment.ServiceType) continue } - flavorName := strings.TrimPrefix(commitment.ResourceName, "instances_") - flavor, ok := flavors[flavorName] - if !ok { - syncLog.Info("skipping commitment without known flavor", "id", id, "flavorName", flavorName) - delete(commitments, id) + if !strings.HasPrefix(commitment.ResourceName, commitmentResourceNamePrefix) { + log.Info("skipping non-RAM-flavor-group commitment", "id", id, "resourceName", commitment.ResourceName) continue } - // We only support cloud-hypervisor and qemu hypervisors for commitments. 
- hvType, ok := flavor.ExtraSpecs["capabilities:hypervisor_type"] - if !ok || !slices.Contains([]string{"ch", "qemu"}, strings.ToLower(hvType)) { - syncLog.Info("skipping commitment with unsupported hv type", "commitmentID", commitment.UUID, "hypervisorType", hvType) - delete(commitments, id) + + // Extract flavor group name from resource name + flavorGroupName, err := getFlavorGroupNameFromResource(commitment.ResourceName) + if err != nil { + log.Info("skipping commitment with invalid resource name", + "id", id, + "resourceName", commitment.ResourceName, + "error", err) continue } - resolvedCommitments = append(resolvedCommitments, resolvedCommitment{ - Commitment: commitment, - Flavor: flavor, - }) - } - // Remove all commitments which are currently actively in use by a vm. - projectsWithCommitments := make([]Project, 0, len(resolvedCommitments)) - projectIDs := make(map[string]bool) - for _, commitment := range resolvedCommitments { - projectIDs[commitment.ProjectID] = true - } - for _, project := range allProjects { - if _, exists := projectIDs[project.ID]; exists { - projectsWithCommitments = append(projectsWithCommitments, project) + // Validate flavor group exists in Knowledge + flavorGroup, exists := flavorGroups[flavorGroupName] + if !exists { + log.Info("skipping commitment with unknown flavor group", + "id", id, + "flavorGroup", flavorGroupName) + continue } - } - // List all servers, not only the active ones, like limes when it calculates - // subresource usage: https://github.com/sapcc/limes/blob/c146c82/internal/liquids/nova/subresources.go#L94 - servers, err := s.ListServersByProjectID(ctx, projectsWithCommitments...) 
- if err != nil { - return nil, err - } - sort.Slice(resolvedCommitments, func(i, j int) bool { - return resolvedCommitments[i].ID < resolvedCommitments[j].ID - }) - mappedServers := map[string]struct{}{} // Servers subtracted from a commitment - var unusedCommitments []resolvedCommitment - for _, commitment := range resolvedCommitments { - matchingServerCount := uint64(0) - activeServers, ok := servers[commitment.ProjectID] - if !ok || len(activeServers) == 0 { - // No active servers in this project, keep the commitment. - unusedCommitments = append(unusedCommitments, commitment) + // Skip commitments with empty UUID + if commitment.UUID == "" { + log.Info("skipping commitment with empty UUID", + "id", id) continue } - // Some active servers, subtract them from the commitment amount. - sort.Slice(activeServers, func(i, j int) bool { - return activeServers[i].ID < activeServers[j].ID - }) - for _, server := range activeServers { - if _, exists := mappedServers[server.ID]; exists { - // This server is already subtracted from another commitment. - continue - } - if server.FlavorName != commitment.Flavor.Name { - // This server is of a different flavor, skip it. 
- continue - } - mappedServers[server.ID] = struct{}{} - matchingServerCount++ - syncLog.Info("subtracting server from commitment", "commitmentID", commitment.UUID, "serverID", server.ID, "remainingAmount", commitment.Amount) - } - if matchingServerCount >= commitment.Amount { - syncLog.Info("skipping commitment that is fully used by active servers", "id", commitment.UUID, "project", commitment.ProjectID) + + // Convert commitment to state using FromCommitment + state, err := FromCommitment(commitment, flavorGroup) + if err != nil { + log.Error(err, "failed to convert commitment to state", + "id", id, + "uuid", commitment.UUID) continue } - commitment.Amount -= matchingServerCount - unusedCommitments = append(unusedCommitments, commitment) + + log.Info("resolved commitment to state", + "commitmentID", commitment.UUID, + "flavorGroup", flavorGroupName, + "amount", commitment.Amount, + "totalMemoryBytes", state.TotalMemoryBytes) + + commitmentStates = append(commitmentStates, state) } - return unusedCommitments, nil + return commitmentStates, nil } -// Fetch commitments and update/create reservations for each of them. +// SyncReservations fetches commitments from Limes and synchronizes Reservation CRDs. func (s *Syncer) SyncReservations(ctx context.Context) error { - // Get all commitments that should be converted to reservations. 
- // TODO keep all commitments, not only the unused ones, propagate allocation correctly - commitments, err := s.resolveUnusedCommitments(ctx) + // TODO handle concurrency with change API: consider creation time of reservations and status ready + + // Create logger with run ID for this sync execution + runID := fmt.Sprintf("sync-%d", time.Now().Unix()) + log := ctrl.Log.WithName("CommitmentSyncer").WithValues("runID", runID) + + log.Info("starting commitment sync") + + // Check if flavor group knowledge is ready + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: s.Client} + knowledgeCRD, err := knowledge.Get(ctx) if err != nil { - syncLog.Error(err, "failed to get compute commitments") + log.Error(err, "failed to check flavor group knowledge readiness") return err } - // Map commitments to reservations. - var reservationsByName = make(map[string]v1alpha1.Reservation) - for _, commitment := range commitments { - // Get only the 5 first characters from the uuid. This should be safe enough. - if len(commitment.UUID) < 5 { - err := errors.New("commitment UUID is too short") - syncLog.Error(err, "uuid is less than 5 characters", "uuid", commitment.UUID) - continue - } - commitmentUUIDShort := commitment.UUID[:5] - spec := v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeCommittedResource, - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity(int64(commitment.Flavor.RAM)*1024*1024, resource.BinarySI), - "cpu": *resource.NewQuantity(int64(commitment.Flavor.VCPUs), resource.DecimalSI), - // Disk is currently not considered. 
- }, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - ProjectID: commitment.ProjectID, - DomainID: commitment.DomainID, - ResourceName: commitment.Flavor.Name, - ResourceGroup: commitment.Flavor.ExtraSpecs["hw_version"], - Allocations: make(map[string]v1alpha1.CommittedResourceAllocation), - Creator: CreatorValue, - }, - } - for n := range commitment.Amount { // N instances - meta := ctrl.ObjectMeta{ - Name: fmt.Sprintf("commitment-%s-%d", commitmentUUIDShort, n), - } - if _, exists := reservationsByName[meta.Name]; exists { - syncLog.Error(errors.New("duplicate reservation name"), - "reservation name already exists", - "name", meta.Name, - "commitmentUUID", commitment.UUID, - ) - continue - } - reservationsByName[meta.Name] = v1alpha1.Reservation{ - ObjectMeta: meta, - Spec: spec, - } - } + if knowledgeCRD == nil { + log.Info("skipping commitment sync - flavor group knowledge not ready yet") + return nil } - // Create new reservations or update existing ones. - for _, res := range reservationsByName { - // Check if the reservation already exists. - nn := types.NamespacedName{Name: res.Name, Namespace: res.Namespace} - var existing v1alpha1.Reservation - if err := s.Get(ctx, nn, &existing); err != nil { - if !k8serrors.IsNotFound(err) { - syncLog.Error(err, "failed to get reservation", "name", nn.Name) - return err - } - // Reservation does not exist, create it. 
- if err := s.Create(ctx, &res); err != nil { - return err - } - syncLog.Info("created reservation", "name", nn.Name) + // Get flavor groups using the CRD we already fetched + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, knowledgeCRD) + if err != nil { + log.Error(err, "failed to get flavor groups from knowledge") + return err + } + + // Get all commitments as states + commitmentStates, err := s.getCommitmentStates(ctx, log, flavorGroups) + if err != nil { + log.Error(err, "failed to get compute commitments") + return err + } + + // Create ReservationManager to handle state application + manager := NewReservationManager(s.Client) + + // Apply each commitment state using the manager + for _, state := range commitmentStates { + log.Info("applying commitment state", + "commitmentUUID", state.CommitmentUUID, + "projectID", state.ProjectID, + "flavorGroup", state.FlavorGroupName, + "totalMemoryBytes", state.TotalMemoryBytes) + + _, _, err := manager.ApplyCommitmentState(ctx, log, state, flavorGroups, CreatorValue) + if err != nil { + log.Error(err, "failed to apply commitment state", + "commitmentUUID", state.CommitmentUUID) + // Continue with other commitments even if one fails continue } - // Reservation exists, update it. - old := existing.DeepCopy() - existing.Spec = res.Spec - patch := client.MergeFrom(old) - if err := s.Patch(ctx, &existing, patch); err != nil { - syncLog.Error(err, "failed to patch reservation", "name", nn.Name) - return err - } - syncLog.Info("updated reservation", "name", nn.Name) } - // Delete reservations that are not in the commitments anymore. 
+ // Delete reservations that are no longer in commitments + // Only query committed resource reservations using labels for efficiency var existingReservations v1alpha1.ReservationList - if err := s.List(ctx, &existingReservations); err != nil { - syncLog.Error(err, "failed to list existing reservations") + if err := s.List(ctx, &existingReservations, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + log.Error(err, "failed to list existing committed resource reservations") return err } + + // Build set of commitment UUIDs we should have + activeCommitments := make(map[string]bool) + for _, state := range commitmentStates { + activeCommitments[state.CommitmentUUID] = true + } + + // Delete reservations for commitments that no longer exist for _, existing := range existingReservations.Items { - // Only manage reservations created by this syncer (identified by Creator field). - if existing.Spec.CommittedResourceReservation == nil || - existing.Spec.CommittedResourceReservation.Creator != CreatorValue { + // Extract commitment UUID from reservation name + commitmentUUID := extractCommitmentUUID(existing.Name) + if commitmentUUID == "" { + log.Info("skipping reservation with unparseable name", "name", existing.Name) continue } - if _, found := reservationsByName[existing.Name]; !found { - // Reservation not found in commitments, delete it. 
+ + if !activeCommitments[commitmentUUID] { + // This commitment no longer exists, delete the reservation if err := s.Delete(ctx, &existing); err != nil { - syncLog.Error(err, "failed to delete reservation", "name", existing.Name) + log.Error(err, "failed to delete reservation", "name", existing.Name) return err } - syncLog.Info("deleted reservation", "name", existing.Name) + log.Info("deleted reservation for expired commitment", + "name", existing.Name, + "commitmentUUID", commitmentUUID) } } - syncLog.Info("synced reservations", "count", len(reservationsByName)) + log.Info("synced reservations", "commitmentCount", len(commitmentStates)) return nil } diff --git a/internal/scheduling/reservations/commitments/syncer_test.go b/internal/scheduling/reservations/commitments/syncer_test.go index 4db74801b..75512299a 100644 --- a/internal/scheduling/reservations/commitments/syncer_test.go +++ b/internal/scheduling/reservations/commitments/syncer_test.go @@ -7,15 +7,90 @@ import ( "context" "testing" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" ) +// FlavorGroupData holds test data for creating a flavor group +type FlavorGroupData struct { + LargestFlavorName string + LargestFlavorVCPUs uint64 + LargestFlavorMemoryMB uint64 + SmallestFlavorName string + SmallestFlavorVCPUs uint64 + SmallestFlavorMemoryMB uint64 +} + +// createFlavorGroupKnowledge creates a Knowledge CRD with flavor group data for testing +func createFlavorGroupKnowledge(t *testing.T, groups map[string]FlavorGroupData) *v1alpha1.Knowledge { + t.Helper() 
+ + // Build flavor group features + features := make([]compute.FlavorGroupFeature, 0, len(groups)) + for groupName, data := range groups { + features = append(features, compute.FlavorGroupFeature{ + Name: groupName, + Flavors: []compute.FlavorInGroup{ + { + Name: data.LargestFlavorName, + VCPUs: data.LargestFlavorVCPUs, + MemoryMB: data.LargestFlavorMemoryMB, + }, + { + Name: data.SmallestFlavorName, + VCPUs: data.SmallestFlavorVCPUs, + MemoryMB: data.SmallestFlavorMemoryMB, + }, + }, + LargestFlavor: compute.FlavorInGroup{ + Name: data.LargestFlavorName, + VCPUs: data.LargestFlavorVCPUs, + MemoryMB: data.LargestFlavorMemoryMB, + }, + SmallestFlavor: compute.FlavorInGroup{ + Name: data.SmallestFlavorName, + VCPUs: data.SmallestFlavorVCPUs, + MemoryMB: data.SmallestFlavorMemoryMB, + }, + }) + } + + // Box the features + rawFeatures, err := v1alpha1.BoxFeatureList(features) + if err != nil { + t.Fatalf("Failed to box flavor group features: %v", err) + } + + return &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flavor-groups", + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor_groups", + }, + }, + Status: v1alpha1.KnowledgeStatus{ + Raw: rawFeatures, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + Reason: "ExtractorSucceeded", + }, + }, + }, + } +} + // Mock CommitmentsClient for testing type mockCommitmentsClient struct { initFunc func(ctx context.Context, client client.Client, conf SyncerConfig) error @@ -123,19 +198,32 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { t.Fatalf("Failed to add scheme: %v", err) } + // Create flavor group knowledge CRD + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": { + LargestFlavorName: "test-flavor", + LargestFlavorVCPUs: 2, + LargestFlavorMemoryMB: 1024, + SmallestFlavorName: 
"test-flavor", + SmallestFlavorVCPUs: 2, + SmallestFlavorMemoryMB: 1024, + }, + }) + k8sClient := fake.NewClientBuilder(). WithScheme(scheme). + WithObjects(flavorGroupsKnowledge). Build() - // Create mock commitments with instance flavors + // Create mock commitments with flavor group resources (using ram_ prefix) mockCommitments := []Commitment{ { ID: 1, UUID: "12345-67890-abcdef", ServiceType: "compute", - ResourceName: "instances_test-flavor", + ResourceName: "ram_test_group_v1", AvailabilityZone: "az1", - Amount: 2, // 2 instances + Amount: 2, // 2 multiples of smallest flavor (2 * 1024MB = 2048MB total) Unit: "", ProjectID: "test-project-1", DomainID: "test-domain-1", @@ -150,23 +238,6 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { } return result, nil }, - listFlavorsByNameFunc: func(ctx context.Context) (map[string]Flavor, error) { - return map[string]Flavor{ - "test-flavor": { - ID: "flavor-1", - Name: "test-flavor", - RAM: 1024, // 1GB in MB - VCPUs: 2, - Disk: 20, // 20GB - ExtraSpecs: map[string]string{ - "hw:cpu_policy": "dedicated", - "hw:numa_nodes": "1", - "aggregate_instance_extra_specs:pinned": "true", - "capabilities:hypervisor_type": "qemu", - }, - }, - }, nil - }, listProjectsFunc: func(ctx context.Context) ([]Project, error) { return []Project{ {ID: "test-project-1", DomainID: "test-domain-1", Name: "Test Project 1"}, @@ -200,7 +271,7 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { return } - // Should have 2 reservations (Amount = 2) + // Should have 2 reservations (Amount = 2, each for smallest flavor) if len(reservations.Items) != 2 { t.Errorf("Expected 2 reservations, got %d", len(reservations.Items)) return @@ -216,19 +287,20 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { t.Errorf("Expected project ID test-project-1, got %v", res.Spec.CommittedResourceReservation.ProjectID) } - if res.Spec.CommittedResourceReservation.ResourceName != "test-flavor" { - 
t.Errorf("Expected flavor test-flavor, got %v", res.Spec.CommittedResourceReservation.ResourceName) + if res.Spec.CommittedResourceReservation.ResourceGroup != "test_group_v1" { + t.Errorf("Expected resource group test_group_v1, got %v", res.Spec.CommittedResourceReservation.ResourceGroup) } - // Check resource values + // Check resource values - should be sized for the flavor that fits + // With 2048MB total capacity, we can fit 2x 1024MB flavors expectedMemory := resource.MustParse("1073741824") // 1024MB in bytes - if !res.Spec.Resources["memory"].Equal(expectedMemory) { - t.Errorf("Expected memory %v, got %v", expectedMemory, res.Spec.Resources["memory"]) + if !res.Spec.Resources[hv1.ResourceMemory].Equal(expectedMemory) { + t.Errorf("Expected memory %v, got %v", expectedMemory, res.Spec.Resources[hv1.ResourceMemory]) } expectedVCPUs := resource.MustParse("2") - if !res.Spec.Resources["cpu"].Equal(expectedVCPUs) { - t.Errorf("Expected vCPUs %v, got %v", expectedVCPUs, res.Spec.Resources["cpu"]) + if !res.Spec.Resources[hv1.ResourceCPU].Equal(expectedVCPUs) { + t.Errorf("Expected vCPUs %v, got %v", expectedVCPUs, res.Spec.Resources[hv1.ResourceCPU]) } } @@ -238,37 +310,54 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { t.Fatalf("Failed to add scheme: %v", err) } - // Create an existing reservation + // Create flavor group knowledge CRD + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "new_group_v1": { + LargestFlavorName: "new-flavor", + LargestFlavorVCPUs: 4, + LargestFlavorMemoryMB: 2048, + SmallestFlavorName: "new-flavor-small", + SmallestFlavorVCPUs: 2, + SmallestFlavorMemoryMB: 1024, + }, + }) + + // Create an existing reservation with mismatched project/flavor group + // The ReservationManager will delete this and create a new one existingReservation := &v1alpha1.Reservation{ ObjectMeta: ctrl.ObjectMeta{ - Name: "commitment-12345-0", // Instance commitments have -0 suffix + Name: 
"commitment-12345-67890-abcdef-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, }, Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeCommittedResource, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - ProjectID: "old-project", - ResourceName: "old-flavor", - Creator: CreatorValue, + ProjectID: "old-project", + ResourceName: "old-flavor", + ResourceGroup: "old_group", + Creator: CreatorValue, }, - Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("512Mi"), - "cpu": resource.MustParse("1"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("512Mi"), + hv1.ResourceCPU: resource.MustParse("1"), }, }, } k8sClient := fake.NewClientBuilder(). WithScheme(scheme). - WithObjects(existingReservation). + WithObjects(existingReservation, flavorGroupsKnowledge). Build() - // Create mock commitment that should update the existing reservation + // Create mock commitment that will replace the existing reservation mockCommitments := []Commitment{ { ID: 1, UUID: "12345-67890-abcdef", ServiceType: "compute", - ResourceName: "instances_new-flavor", + ResourceName: "ram_new_group_v1", AvailabilityZone: "az1", Amount: 1, Unit: "", @@ -285,23 +374,6 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { } return result, nil }, - listFlavorsByNameFunc: func(ctx context.Context) (map[string]Flavor, error) { - return map[string]Flavor{ - "new-flavor": { - ID: "flavor-2", - Name: "new-flavor", - RAM: 2048, // 2GB in MB - VCPUs: 4, - Disk: 40, // 40GB - ExtraSpecs: map[string]string{ - "hw:cpu_policy": "shared", - "hw:numa_nodes": "2", - "aggregate_instance_extra_specs:pinned": "false", - "capabilities:hypervisor_type": "qemu", - }, - }, - }, nil - }, listProjectsFunc: func(ctx context.Context) ([]Project, error) { return []Project{ {ID: "new-project", DomainID: "new-domain", Name: "New Project"}, @@ -327,45 
+399,66 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { return } - // Verify that the reservation was updated - var updatedReservation v1alpha1.Reservation - err = k8sClient.Get(context.Background(), client.ObjectKey{Name: "commitment-12345-0"}, &updatedReservation) + // Verify that reservations were updated (old one deleted, new one created) + // The new reservation will be at index 0 since the old one was deleted first + var reservations v1alpha1.ReservationList + err = k8sClient.List(context.Background(), &reservations) if err != nil { - t.Errorf("Failed to get updated reservation: %v", err) + t.Errorf("Failed to list reservations: %v", err) + return + } + + if len(reservations.Items) != 1 { + t.Errorf("Expected 1 reservation, got %d", len(reservations.Items)) return } - // Verify the reservation was updated with new values - if updatedReservation.Spec.CommittedResourceReservation == nil { + newReservation := reservations.Items[0] + + // Verify the new reservation has correct values + if newReservation.Spec.CommittedResourceReservation == nil { t.Errorf("Expected CommittedResourceReservation to be set") return } - if updatedReservation.Spec.CommittedResourceReservation.ProjectID != "new-project" { - t.Errorf("Expected project ID new-project, got %v", updatedReservation.Spec.CommittedResourceReservation.ProjectID) + if newReservation.Spec.CommittedResourceReservation.ProjectID != "new-project" { + t.Errorf("Expected project ID new-project, got %v", newReservation.Spec.CommittedResourceReservation.ProjectID) } - if updatedReservation.Spec.CommittedResourceReservation.ResourceName != "new-flavor" { - t.Errorf("Expected flavor new-flavor, got %v", updatedReservation.Spec.CommittedResourceReservation.ResourceName) + if newReservation.Spec.CommittedResourceReservation.ResourceGroup != "new_group_v1" { + t.Errorf("Expected resource group new_group_v1, got %v", newReservation.Spec.CommittedResourceReservation.ResourceGroup) } } -func 
TestSyncer_SyncReservations_ShortUUID(t *testing.T) { +func TestSyncer_SyncReservations_EmptyUUID(t *testing.T) { scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { t.Fatalf("Failed to add scheme: %v", err) } + // Create flavor group knowledge CRD + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": { + LargestFlavorName: "test-flavor", + LargestFlavorVCPUs: 2, + LargestFlavorMemoryMB: 1024, + SmallestFlavorName: "test-flavor", + SmallestFlavorVCPUs: 2, + SmallestFlavorMemoryMB: 1024, + }, + }) + k8sClient := fake.NewClientBuilder(). WithScheme(scheme). + WithObjects(flavorGroupsKnowledge). Build() - // Create mock commitment with short UUID (should be skipped) + // Create mock commitment with empty UUID (should be skipped) mockCommitments := []Commitment{ { ID: 1, - UUID: "123", // Too short + UUID: "", // Empty UUID ServiceType: "compute", - ResourceName: "instances_test-flavor", + ResourceName: "ram_test_group_v1", AvailabilityZone: "az1", Amount: 1, Unit: "", @@ -382,23 +475,6 @@ func TestSyncer_SyncReservations_ShortUUID(t *testing.T) { } return result, nil }, - listFlavorsByNameFunc: func(ctx context.Context) (map[string]Flavor, error) { - return map[string]Flavor{ - "test-flavor": { - ID: "flavor-1", - Name: "test-flavor", - RAM: 1024, // 1GB in MB - VCPUs: 2, - Disk: 20, // 20GB - ExtraSpecs: map[string]string{ - "hw:cpu_policy": "dedicated", - "hw:numa_nodes": "1", - "aggregate_instance_extra_specs:pinned": "true", - "capabilities:hypervisor_type": "qemu", - }, - }, - }, nil - }, listProjectsFunc: func(ctx context.Context) ([]Project, error) { return []Project{ {ID: "test-project", DomainID: "test-domain", Name: "Test Project"}, @@ -424,7 +500,7 @@ func TestSyncer_SyncReservations_ShortUUID(t *testing.T) { return } - // Verify that no reservations were created due to short UUID + // Verify that no reservations were created due to empty UUID var reservations 
v1alpha1.ReservationList err = k8sClient.List(context.Background(), &reservations) if err != nil { @@ -433,6 +509,6 @@ func TestSyncer_SyncReservations_ShortUUID(t *testing.T) { } if len(reservations.Items) != 0 { - t.Errorf("Expected 0 reservations due to short UUID, got %d", len(reservations.Items)) + t.Errorf("Expected 0 reservations due to empty UUID, got %d", len(reservations.Items)) } } diff --git a/internal/scheduling/reservations/commitments/utils.go b/internal/scheduling/reservations/commitments/utils.go new file mode 100644 index 000000000..0afb3ab67 --- /dev/null +++ b/internal/scheduling/reservations/commitments/utils.go @@ -0,0 +1,46 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "strconv" + "strings" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" +) + +func GetMaxSlotIndex(reservations []v1alpha1.Reservation) int { + maxIndex := -1 + for _, res := range reservations { + // Parse slot index from name: "commitment--" + parts := strings.Split(res.Name, "-") + if len(parts) >= 3 { + if index, err := strconv.Atoi(parts[len(parts)-1]); err == nil { + if index > maxIndex { + maxIndex = index + } + } + } + } + return maxIndex +} + +// Always continue counting slots from max, instead of filling gaps. +func GetNextSlotIndex(reservations []v1alpha1.Reservation) int { + maxIndex := GetMaxSlotIndex(reservations) + return maxIndex + 1 +} + +// extractCommitmentUUID parses UUID from reservation name (commitment--). 
+func extractCommitmentUUID(name string) string { + // Remove "commitment-" prefix + withoutPrefix := strings.TrimPrefix(name, "commitment-") + // Split by "-" and take all but the last part (which is the slot index) + parts := strings.Split(withoutPrefix, "-") + if len(parts) > 1 { + // Rejoin all parts except the last one (slot index) + return strings.Join(parts[:len(parts)-1], "-") + } + return withoutPrefix +} diff --git a/internal/scheduling/reservations/commitments/utils_test.go b/internal/scheduling/reservations/commitments/utils_test.go new file mode 100644 index 000000000..b16268b2f --- /dev/null +++ b/internal/scheduling/reservations/commitments/utils_test.go @@ -0,0 +1,84 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestGetMaxSlotIndex_FindsHighestIndex(t *testing.T) { + reservations := []v1alpha1.Reservation{ + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123-0"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123-5"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123-2"}}, + } + + maxIndex := GetMaxSlotIndex(reservations) + if maxIndex != 5 { + t.Errorf("expected max index 5, got %d", maxIndex) + } +} + +func TestGetMaxSlotIndex_EmptyList(t *testing.T) { + maxIndex := GetMaxSlotIndex([]v1alpha1.Reservation{}) + if maxIndex != -1 { + t.Errorf("expected -1 for empty list, got %d", maxIndex) + } +} + +func TestGetMaxSlotIndex_InvalidNames(t *testing.T) { + reservations := []v1alpha1.Reservation{ + {ObjectMeta: metav1.ObjectMeta{Name: "invalid-name"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123"}}, // Missing slot index + } + + maxIndex := GetMaxSlotIndex(reservations) + if maxIndex != -1 { + t.Errorf("expected -1 when no valid indices found, got %d", maxIndex) + } +} + +func TestGetNextSlotIndex_IncrementsByOne(t *testing.T) { + reservations 
:= []v1alpha1.Reservation{ + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123-0"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123-3"}}, + } + + nextIndex := GetNextSlotIndex(reservations) + if nextIndex != 4 { + t.Errorf("expected next index 4, got %d", nextIndex) + } +} + +func TestGetNextSlotIndex_EmptyList(t *testing.T) { + nextIndex := GetNextSlotIndex([]v1alpha1.Reservation{}) + if nextIndex != 0 { + t.Errorf("expected 0 for empty list, got %d", nextIndex) + } +} + +func TestExtractCommitmentUUID_SimpleUUID(t *testing.T) { + uuid := extractCommitmentUUID("commitment-abc123-0") + if uuid != "abc123" { + t.Errorf("expected abc123, got %s", uuid) + } +} + +func TestExtractCommitmentUUID_ComplexUUID(t *testing.T) { + // UUID with dashes (like standard UUID format) + uuid := extractCommitmentUUID("commitment-550e8400-e29b-41d4-a716-446655440000-5") + if uuid != "550e8400-e29b-41d4-a716-446655440000" { + t.Errorf("expected full UUID, got %s", uuid) + } +} + +func TestExtractCommitmentUUID_NoSlotIndex(t *testing.T) { + uuid := extractCommitmentUUID("commitment-abc123") + if uuid != "abc123" { + t.Errorf("expected abc123, got %s", uuid) + } +} diff --git a/internal/scheduling/reservations/controller/client.go b/internal/scheduling/reservations/controller/client.go deleted file mode 100644 index a57428dc9..000000000 --- a/internal/scheduling/reservations/controller/client.go +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package controller - -import ( - "context" - "encoding/json" - "fmt" - "net/http" - - "github.com/cobaltcore-dev/cortex/pkg/keystone" - "github.com/cobaltcore-dev/cortex/pkg/sso" - "github.com/gophercloud/gophercloud/v2" - "github.com/sapcc/go-bits/must" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -var ( - syncLog = ctrl.Log.WithName("sync") -) - -// OpenStack hypervisor model as returned by the Nova API under /os-hypervisors/detail. 
-// See: https://docs.openstack.org/api-ref/compute/#list-hypervisors-details -type Hypervisor struct { - ID string `json:"id"` - Hostname string `json:"hypervisor_hostname"` - Service struct { - Host string `json:"host"` - } `json:"service"` - Type string `json:"hypervisor_type"` -} - -// Client to fetch hypervisor data. -type HypervisorClient interface { - // Init the client. - Init(ctx context.Context, client client.Client, conf Config) error - // List all hypervisors. - ListHypervisors(ctx context.Context) ([]Hypervisor, error) -} - -// Hypervisor client fetching commitments from openstack services. -type hypervisorClient struct { - // Providerclient authenticated against openstack. - provider *gophercloud.ProviderClient - // Nova service client for OpenStack. - nova *gophercloud.ServiceClient -} - -// Create a new hypervisor client. -// By default, this client will fetch hypervisors from the nova API. -func NewHypervisorClient() HypervisorClient { - return &hypervisorClient{} -} - -// Init the client. -func (c *hypervisorClient) Init(ctx context.Context, client client.Client, conf Config) error { - var authenticatedHTTP = http.DefaultClient - if conf.SSOSecretRef != nil { - var err error - authenticatedHTTP, err = sso.Connector{Client: client}. - FromSecretRef(ctx, *conf.SSOSecretRef) - if err != nil { - return err - } - } - authenticatedKeystone, err := keystone. - Connector{Client: client, HTTPClient: authenticatedHTTP}. - FromSecretRef(ctx, conf.KeystoneSecretRef) - if err != nil { - return err - } - // Automatically fetch the nova endpoint from the keystone service catalog. - c.provider = authenticatedKeystone.Client() - - // Get the nova endpoint. 
- url := must.Return(c.provider.EndpointLocator(gophercloud.EndpointOpts{ - Type: "compute", - Availability: "public", - })) - syncLog.Info("using nova endpoint", "url", url) - c.nova = &gophercloud.ServiceClient{ - ProviderClient: c.provider, - Endpoint: url, - Type: "compute", - Microversion: "2.61", - } - return nil -} - -func (c *hypervisorClient) ListHypervisors(ctx context.Context) ([]Hypervisor, error) { - // Note: currently we need to fetch this without gophercloud. - // Gophercloud will just assume the request is a single page even when - // the response is paginated, returning only the first page. - initialURL := c.nova.Endpoint + "os-hypervisors/detail" - var nextURL = &initialURL - var hypervisors []Hypervisor - for nextURL != nil { - req, err := http.NewRequestWithContext(ctx, http.MethodGet, *nextURL, http.NoBody) - if err != nil { - return nil, err - } - req.Header.Set("X-Auth-Token", c.provider.Token()) - req.Header.Set("X-OpenStack-Nova-API-Version", c.nova.Microversion) - resp, err := c.nova.HTTPClient.Do(req) - if err != nil { - return nil, err - } - defer resp.Body.Close() - if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode) - } - var list struct { - Hypervisors []Hypervisor `json:"hypervisors"` - Links []struct { - Rel string `json:"rel"` - Href string `json:"href"` - } `json:"hypervisors_links"` - } - err = json.NewDecoder(resp.Body).Decode(&list) - if err != nil { - return nil, err - } - hypervisors = append(hypervisors, list.Hypervisors...) 
- nextURL = nil - for _, link := range list.Links { - if link.Rel == "next" { - nextURL = &link.Href - break - } - } - } - return hypervisors, nil -} diff --git a/internal/scheduling/reservations/controller/client_test.go b/internal/scheduling/reservations/controller/client_test.go deleted file mode 100644 index f2b5582bc..000000000 --- a/internal/scheduling/reservations/controller/client_test.go +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package controller - -import ( - "context" - - "sigs.k8s.io/controller-runtime/pkg/client" -) - -type mockHypervisorClient struct { - hypervisorsToReturn []Hypervisor - errToReturn error -} - -func (m *mockHypervisorClient) Init(ctx context.Context, client client.Client, conf Config) error { - return nil -} - -func (m *mockHypervisorClient) ListHypervisors(ctx context.Context) ([]Hypervisor, error) { - return m.hypervisorsToReturn, m.errToReturn -} diff --git a/internal/scheduling/reservations/controller/controller.go b/internal/scheduling/reservations/controller/controller.go index 4eae4cfc2..17177770c 100644 --- a/internal/scheduling/reservations/controller/controller.go +++ b/internal/scheduling/reservations/controller/controller.go @@ -10,6 +10,7 @@ import ( "fmt" "net/http" "strings" + "time" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" @@ -23,10 +24,21 @@ import ( schedulerdelegationapi "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/cobaltcore-dev/cortex/pkg/multicluster" corev1 "k8s.io/api/core/v1" ) +const ( + // RequeueIntervalActive is the interval for requeueing active 
reservations for verification. + RequeueIntervalActive = 5 * time.Minute + // RequeueIntervalRetry is the interval for requeueing when retrying after knowledge is not ready. + RequeueIntervalRetry = 1 * time.Minute +) + // Endpoints for the reservations operator. type EndpointsConfig struct { // The nova external scheduler endpoint. @@ -42,18 +54,21 @@ type Config struct { // Secret ref to keystone credentials stored in a k8s secret. KeystoneSecretRef corev1.SecretReference `json:"keystoneSecretRef"` + + // Secret ref to the database credentials for querying VM state. + DatabaseSecretRef *corev1.SecretReference `json:"databaseSecretRef,omitempty"` } // ReservationReconciler reconciles a Reservation object type ReservationReconciler struct { - // Client to fetch hypervisors. - HypervisorClient // Client for the kubernetes API. client.Client // Kubernetes scheme to use for the reservations. Scheme *runtime.Scheme // Configuration for the controller. Conf Config + // Database connection for querying VM state from Knowledge cache. + DB *db.DB } // Reconcile is part of the main kubernetes reconciliation loop which aims to @@ -63,16 +78,60 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Fetch the reservation object. var res v1alpha1.Reservation if err := r.Get(ctx, req.NamespacedName, &res); err != nil { - // Can happen when the resource was just deleted. - return ctrl.Result{}, err + // Ignore not-found errors, since they can't be fixed by an immediate requeue + return ctrl.Result{}, client.IgnoreNotFound(err) } - // If the reservation is already active (Ready=True), skip it. + if meta.IsStatusConditionTrue(res.Status.Conditions, v1alpha1.ReservationConditionReady) { - log.Info("reservation is already active, skipping", "reservation", req.Name) - return ctrl.Result{}, nil // Don't need to requeue. 
+ log.Info("reservation is active, verifying allocations", "reservation", req.Name) + + // Verify all allocations in Spec against actual VM state from database + if err := r.reconcileAllocations(ctx, &res); err != nil { + log.Error(err, "failed to reconcile allocations") + return ctrl.Result{}, err + } + + // Requeue periodically to keep verifying allocations + return ctrl.Result{RequeueAfter: RequeueIntervalActive}, nil } - // Sync Spec values to Status fields + // TODO trigger re-placement of unused reservations over time + + // Check if this is a pre-allocated reservation with allocations + if res.Spec.CommittedResourceReservation != nil && + len(res.Spec.CommittedResourceReservation.Allocations) > 0 && + res.Spec.TargetHost != "" { + // mark as ready without calling the placement API + log.Info("detected pre-allocated reservation", + "reservation", req.Name, + "targetHost", res.Spec.TargetHost, + "allocatedVMs", len(res.Spec.CommittedResourceReservation.Allocations)) + + old := res.DeepCopy() + res.Status.Host = res.Spec.TargetHost + meta.SetStatusCondition(&res.Status.Conditions, metav1.Condition{ + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionTrue, + Reason: "PreAllocated", + Message: "reservation pre-allocated with VM allocations", + }) + patch := client.MergeFrom(old) + if err := r.Status().Patch(ctx, &res, patch); err != nil { + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to patch pre-allocated reservation status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil + } + + log.Info("marked pre-allocated reservation as ready", "reservation", req.Name, "host", res.Status.Host) + // Requeue immediately to run verification in next reconcile loop + return ctrl.Result{Requeue: true}, nil + } + + // Sync Spec values to Status fields for non-pre-allocated reservations // This ensures the observed state reflects 
the desired state from Spec needsStatusUpdate := false if res.Spec.TargetHost != "" && res.Status.Host != res.Spec.TargetHost { @@ -83,13 +142,18 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) old := res.DeepCopy() patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { - log.Error(err, "failed to sync spec to status") - return ctrl.Result{}, err + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to sync spec to status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil } log.Info("synced spec to status", "reservation", req.Name, "host", res.Status.Host) } - // Currently we can only reconcile nova CommittedResourceReservations (those with ResourceName set). + // filter for CR reservations resourceName := "" if res.Spec.CommittedResourceReservation != nil { resourceName = res.Spec.CommittedResourceReservation.ResourceName @@ -105,8 +169,13 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) }) patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { - log.Error(err, "failed to patch reservation status") - return ctrl.Result{}, err + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to patch reservation status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil } return ctrl.Result{}, nil // Don't need to requeue. } @@ -130,49 +199,67 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) cpu = uint64(cpuValue) } - // Get all hosts and assign zero-weights to them. 
- hypervisors, err := r.ListHypervisors(ctx) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to list hypervisors: %w", err) + // Get project ID from CommittedResourceReservation spec if available. + projectID := "" + if res.Spec.CommittedResourceReservation != nil { + projectID = res.Spec.CommittedResourceReservation.ProjectID } - var eligibleHosts []schedulerdelegationapi.ExternalSchedulerHost - for _, hv := range hypervisors { - eligibleHosts = append(eligibleHosts, schedulerdelegationapi.ExternalSchedulerHost{ - ComputeHost: hv.Service.Host, - HypervisorHostname: hv.Hostname, - }) + + // Get AvailabilityZone from reservation if available + availabilityZone := "" + if res.Spec.AvailabilityZone != "" { + availabilityZone = res.Spec.AvailabilityZone } - if len(eligibleHosts) == 0 { - log.Info("no eligible hosts found for reservation", "reservation", req.Name) - return ctrl.Result{}, errors.New("no eligible hosts found for reservation") + + // Get flavor details from flavor group knowledge CRD + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: r.Client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + log.Info("flavor knowledge not ready, requeueing", + "resourceName", resourceName, + "error", err) + return ctrl.Result{RequeueAfter: RequeueIntervalRetry}, nil } - weights := make(map[string]float64, len(eligibleHosts)) - for _, host := range eligibleHosts { - weights[host.ComputeHost] = 0.0 + + // Search for the flavor across all flavor groups + var flavorDetails *compute.FlavorInGroup + for _, fg := range flavorGroups { + for _, flavor := range fg.Flavors { + if flavor.Name == resourceName { + flavorDetails = &flavor + break + } + } + if flavorDetails != nil { + break + } } - // Get project ID from CommittedResourceReservation spec if available. 
- projectID := "" - if res.Spec.CommittedResourceReservation != nil { - projectID = res.Spec.CommittedResourceReservation.ProjectID + // Check if flavor was found + if flavorDetails == nil { + log.Error(errors.New("flavor not found"), "flavor not found in any flavor group", + "resourceName", resourceName) + return ctrl.Result{RequeueAfter: 5 * time.Minute}, nil } // Call the external scheduler delegation API to get a host for the reservation. + // Cortex will fetch candidate hosts and weights itself from its knowledge state. externalSchedulerRequest := schedulerdelegationapi.ExternalSchedulerRequest{ Reservation: true, - Hosts: eligibleHosts, - Weights: weights, Spec: schedulerdelegationapi.NovaObject[schedulerdelegationapi.NovaSpec]{ Data: schedulerdelegationapi.NovaSpec{ - InstanceUUID: res.Name, - NumInstances: 1, // One for each reservation. - ProjectID: projectID, + InstanceUUID: res.Name, + NumInstances: 1, // One for each reservation. + ProjectID: projectID, + AvailabilityZone: availabilityZone, Flavor: schedulerdelegationapi.NovaObject[schedulerdelegationapi.NovaFlavor]{ Data: schedulerdelegationapi.NovaFlavor{ - Name: resourceName, - MemoryMB: memoryMB, - VCPUs: cpu, + Name: flavorDetails.Name, + MemoryMB: memoryMB, // take the memory from the reservation spec, not from the flavor - reservation might be bigger + VCPUs: cpu, // take the cpu from the reservation spec, not from the flavor - reservation might be bigger + ExtraSpecs: flavorDetails.ExtraSpecs, // Disk is currently not considered. 
+ }, }, }, @@ -187,13 +274,26 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) } response, err := httpClient.Post(url, "application/json", strings.NewReader(string(reqBody))) if err != nil { - log.Error(err, "failed to send external scheduler request") + log.Error(err, "failed to send external scheduler request", "url", url) return ctrl.Result{}, err } defer response.Body.Close() + + // Check HTTP status code before attempting to decode JSON + if response.StatusCode != http.StatusOK { + err := fmt.Errorf("unexpected HTTP status code: %d", response.StatusCode) + log.Error(err, "external scheduler returned non-OK status", + "url", url, + "statusCode", response.StatusCode, + "status", response.Status) + return ctrl.Result{}, err + } + var externalSchedulerResponse schedulerdelegationapi.ExternalSchedulerResponse if err := json.NewDecoder(response.Body).Decode(&externalSchedulerResponse); err != nil { - log.Error(err, "failed to decode external scheduler response") + log.Error(err, "failed to decode external scheduler response", + "url", url, + "statusCode", response.StatusCode) return ctrl.Result{}, err } if len(externalSchedulerResponse.Hosts) == 0 { @@ -207,8 +307,13 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) }) patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { - log.Error(err, "failed to patch reservation status") - return ctrl.Result{}, err + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to patch reservation status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil } return ctrl.Result{}, nil // No need to requeue, we didn't find a host. 
} @@ -226,12 +331,141 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) res.Status.Host = host patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { - log.Error(err, "failed to patch reservation status") - return ctrl.Result{}, err + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to patch reservation status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil } return ctrl.Result{}, nil // No need to requeue, the reservation is now active. } +// reconcileAllocations verifies all allocations in Spec against actual Nova VM state. +// It updates Status.Allocations based on the actual host location of each VM. +func (r *ReservationReconciler) reconcileAllocations(ctx context.Context, res *v1alpha1.Reservation) error { + log := logf.FromContext(ctx) + + // Skip if no CommittedResourceReservation + if res.Spec.CommittedResourceReservation == nil { + return nil + } + + // TODO trigger migrations of unused reservations (to PAYG VMs) + + // Skip if no allocations to verify + if len(res.Spec.CommittedResourceReservation.Allocations) == 0 { + log.Info("no allocations to verify", "reservation", res.Name) + return nil + } + + // Query all VMs for this project from the database + projectID := res.Spec.CommittedResourceReservation.ProjectID + serverMap, err := r.listServersByProjectID(ctx, projectID) + if err != nil { + return fmt.Errorf("failed to list servers for project %s: %w", projectID, err) + } + + // initialize + if res.Status.CommittedResourceReservation == nil { + res.Status.CommittedResourceReservation = &v1alpha1.CommittedResourceReservationStatus{} + } + + // Build new Status.Allocations map based on actual VM locations + newStatusAllocations := make(map[string]string) + + for vmUUID := range res.Spec.CommittedResourceReservation.Allocations { + server, exists := 
serverMap[vmUUID] + if exists { + // VM found - record its actual host location + actualHost := server.OSEXTSRVATTRHost + newStatusAllocations[vmUUID] = actualHost + + log.Info("verified VM allocation", + "vm", vmUUID, + "reservation", res.Name, + "actualHost", actualHost, + "expectedHost", res.Status.Host) + } else { + // VM not found in database + log.Info("VM not found in database", + "vm", vmUUID, + "reservation", res.Name, + "projectID", projectID) + + // TODO handle entering and leave event + } + } + + // Patch the reservation status + old := res.DeepCopy() + + // Update Status.Allocations + res.Status.CommittedResourceReservation.Allocations = newStatusAllocations + + patch := client.MergeFrom(old) + if err := r.Status().Patch(ctx, res, patch); err != nil { + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) == nil { + // Object was deleted, no need to continue + return nil + } + return fmt.Errorf("failed to patch reservation status: %w", err) + } + + log.Info("reconciled allocations", + "reservation", res.Name, + "specAllocations", len(res.Spec.CommittedResourceReservation.Allocations), + "statusAllocations", len(newStatusAllocations)) + + return nil +} + +// Init initializes the reconciler with required clients and DB connection. +func (r *ReservationReconciler) Init(ctx context.Context, client client.Client, conf Config) error { + // Initialize database connection if DatabaseSecretRef is provided. 
+ if conf.DatabaseSecretRef != nil { + var err error + r.DB, err = db.Connector{Client: client}.FromSecretRef(ctx, *conf.DatabaseSecretRef) + if err != nil { + return fmt.Errorf("failed to initialize database connection: %w", err) + } + logf.FromContext(ctx).Info("database connection initialized for reservation controller") + } + + return nil +} + +func (r *ReservationReconciler) listServersByProjectID(ctx context.Context, projectID string) (map[string]*nova.Server, error) { + if r.DB == nil { + return nil, errors.New("database connection not initialized") + } + + log := logf.FromContext(ctx) + + // Query servers from the database cache. + var servers []nova.Server + _, err := r.DB.Select(&servers, + "SELECT * FROM openstack_servers WHERE tenant_id = $1", + projectID) + if err != nil { + return nil, fmt.Errorf("failed to query servers from database: %w", err) + } + + log.V(1).Info("queried servers from database", + "projectID", projectID, + "serverCount", len(servers)) + + // Build lookup map for O(1) access by VM UUID. + serverMap := make(map[string]*nova.Server, len(servers)) + for i := range servers { + serverMap[servers[i].ID] = &servers[i] + } + + return serverMap, nil +} + // SetupWithManager sets up the controller with the Manager. 
func (r *ReservationReconciler) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error { if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { diff --git a/internal/scheduling/reservations/controller/controller_test.go b/internal/scheduling/reservations/controller/controller_test.go index d716c0b63..0ef3e253c 100644 --- a/internal/scheduling/reservations/controller/controller_test.go +++ b/internal/scheduling/reservations/controller/controller_test.go @@ -10,6 +10,7 @@ import ( "net/http/httptest" "testing" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -36,7 +37,7 @@ func TestReservationReconciler_Reconcile(t *testing.T) { shouldRequeue bool }{ { - name: "skip already active reservation", + name: "expect already active reservation", reservation: &v1alpha1.Reservation{ ObjectMeta: ctrl.ObjectMeta{ Name: "test-reservation", @@ -59,7 +60,7 @@ func TestReservationReconciler_Reconcile(t *testing.T) { }, }, expectedReady: true, - shouldRequeue: false, + shouldRequeue: true, }, { name: "skip reservation without resource name", @@ -148,17 +149,78 @@ func TestReservationReconciler_reconcileInstanceReservation_Success(t *testing.T ProjectID: "test-project", ResourceName: "test-flavor", }, - Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("1Gi"), - "cpu": resource.MustParse("2"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("1Gi"), + hv1.ResourceCPU: resource.MustParse("2"), + }, + }, + } + + // Create flavor group knowledge CRD for the test + // Need to import compute package for FlavorGroupFeature + flavorGroups := []struct { + Name string `json:"name"` + Flavors []struct { + Name string `json:"name"` + MemoryMB uint64 `json:"memoryMB"` + VCPUs uint64 `json:"vcpus"` + ExtraSpecs map[string]string `json:"extraSpecs"` + } 
`json:"flavors"` + }{ + { + Name: "test-group", + Flavors: []struct { + Name string `json:"name"` + MemoryMB uint64 `json:"memoryMB"` + VCPUs uint64 `json:"vcpus"` + ExtraSpecs map[string]string `json:"extraSpecs"` + }{ + { + Name: "test-flavor", + MemoryMB: 1024, + VCPUs: 2, + ExtraSpecs: map[string]string{}, + }, + }, + }, + } + + // Marshal flavor groups into runtime.RawExtension + flavorGroupsJSON, err := json.Marshal(map[string]interface{}{ + "features": flavorGroups, + }) + if err != nil { + t.Fatalf("Failed to marshal flavor groups: %v", err) + } + + flavorGroupKnowledge := &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flavor-groups", + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor_groups", // Note: underscore not hyphen + }, + Recency: metav1.Duration{Duration: 0}, + }, + Status: v1alpha1.KnowledgeStatus{ + Raw: runtime.RawExtension{Raw: flavorGroupsJSON}, + RawLength: 1, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + Reason: "TestReady", + }, }, }, } client := fake.NewClientBuilder(). WithScheme(scheme). - WithObjects(reservation). - WithStatusSubresource(&v1alpha1.Reservation{}). + WithObjects(reservation, flavorGroupKnowledge). + WithStatusSubresource(&v1alpha1.Reservation{}, &v1alpha1.Knowledge{}). 
Build() // Create a mock server that returns a successful response @@ -196,28 +258,6 @@ func TestReservationReconciler_reconcileInstanceReservation_Success(t *testing.T Client: client, Scheme: scheme, Conf: config, - HypervisorClient: &mockHypervisorClient{ - hypervisorsToReturn: []Hypervisor{ - { - Hostname: "test-host-1", - Type: "qemu", - Service: struct { - Host string `json:"host"` - }{ - Host: "compute1", - }, - }, - { - Hostname: "test-host-2", - Type: "qemu", - Service: struct { - Host string `json:"host"` - }{ - Host: "compute2", - }, - }, - }, - }, } req := ctrl.Request{ diff --git a/internal/scheduling/reservations/controller/monitor.go b/internal/scheduling/reservations/controller/monitor.go index 3e6c6dae6..0c0ad2875 100644 --- a/internal/scheduling/reservations/controller/monitor.go +++ b/internal/scheduling/reservations/controller/monitor.go @@ -101,7 +101,7 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) { resourcesByLabels[key] = map[string]uint64{} } for resourceName, resourceQuantity := range reservation.Spec.Resources { - resourcesByLabels[key][resourceName] += resourceQuantity.AsDec().UnscaledBig().Uint64() + resourcesByLabels[key][string(resourceName)] += resourceQuantity.AsDec().UnscaledBig().Uint64() } } for key, resources := range resourcesByLabels { diff --git a/internal/scheduling/reservations/controller/monitor_test.go b/internal/scheduling/reservations/controller/monitor_test.go index fef88e35e..eef11892e 100644 --- a/internal/scheduling/reservations/controller/monitor_test.go +++ b/internal/scheduling/reservations/controller/monitor_test.go @@ -14,6 +14,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" ) @@ -98,9 +99,9 @@ func TestMonitor_Collect_WithReservations(t *testing.T) { 
CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ResourceName: "test-flavor", }, - Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("1Gi"), - "cpu": resource.MustParse("2"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("1Gi"), + hv1.ResourceCPU: resource.MustParse("2"), }, }, Status: v1alpha1.ReservationStatus{ @@ -123,9 +124,9 @@ func TestMonitor_Collect_WithReservations(t *testing.T) { CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ResourceName: "test-flavor", }, - Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("2Gi"), - "cpu": resource.MustParse("4"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("2Gi"), + hv1.ResourceCPU: resource.MustParse("4"), }, }, Status: v1alpha1.ReservationStatus{ @@ -148,9 +149,9 @@ func TestMonitor_Collect_WithReservations(t *testing.T) { CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ResourceName: "test-flavor", }, - Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("4Gi"), - "cpu": resource.MustParse("4"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("4Gi"), + hv1.ResourceCPU: resource.MustParse("4"), }, }, Status: v1alpha1.ReservationStatus{ @@ -244,9 +245,9 @@ func TestMonitor_Collect_ResourceMetrics(t *testing.T) { CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ ResourceName: "test-flavor", }, - Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("1000Mi"), - "cpu": resource.MustParse("2"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("1000Mi"), + hv1.ResourceCPU: resource.MustParse("2"), }, }, Status: v1alpha1.ReservationStatus{ @@ -367,9 +368,9 @@ func TestMonitor_Collect_LabelSanitization(t *testing.T) { CommittedResourceReservation: 
&v1alpha1.CommittedResourceReservationSpec{ ResourceName: "test-flavor", }, - Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("1Gi"), - "cpu": resource.MustParse("2"), + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("1Gi"), + hv1.ResourceCPU: resource.MustParse("2"), }, }, Status: v1alpha1.ReservationStatus{ diff --git a/internal/scheduling/reservations/flavor_groups.go b/internal/scheduling/reservations/flavor_groups.go new file mode 100644 index 000000000..197406eac --- /dev/null +++ b/internal/scheduling/reservations/flavor_groups.go @@ -0,0 +1,74 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package reservations + +import ( + "context" + "errors" + "fmt" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// FlavorGroupKnowledgeClient accesses flavor group data from Knowledge CRDs. +type FlavorGroupKnowledgeClient struct { + client.Client +} + +// Get retrieves the flavor groups Knowledge CRD and returns it if ready. +// Returns nil, nil if not ready yet. +func (c *FlavorGroupKnowledgeClient) Get(ctx context.Context) (*v1alpha1.Knowledge, error) { + knowledge := &v1alpha1.Knowledge{} + err := c.Client.Get(ctx, types.NamespacedName{ + Name: "flavor-groups", + // Namespace is empty as Knowledge is cluster-scoped + }, knowledge) + + if err != nil { + return nil, fmt.Errorf("failed to get flavor groups knowledge: %w", err) + } + + if meta.IsStatusConditionTrue(knowledge.Status.Conditions, v1alpha1.KnowledgeConditionReady) { + return knowledge, nil + } + + // Found but not ready yet + return nil, nil +} + +// GetAllFlavorGroups returns all flavor groups as a map. +// If knowledgeCRD is provided, uses it directly. Otherwise fetches the Knowledge CRD. 
+func (c *FlavorGroupKnowledgeClient) GetAllFlavorGroups(ctx context.Context, knowledgeCRD *v1alpha1.Knowledge) (map[string]compute.FlavorGroupFeature, error) { + // If no CRD provided, fetch it + if knowledgeCRD == nil { + var err error + knowledgeCRD, err = c.Get(ctx) + if err != nil { + return nil, err + } + if knowledgeCRD == nil { + return nil, errors.New("flavor groups knowledge is not ready") + } + } + + // Unbox the features from the raw extension + features, err := v1alpha1.UnboxFeatureList[compute.FlavorGroupFeature]( + knowledgeCRD.Status.Raw, + ) + if err != nil { + return nil, fmt.Errorf("failed to unbox flavor group features: %w", err) + } + + // Build map for efficient lookups + flavorGroupMap := make(map[string]compute.FlavorGroupFeature, len(features)) + for _, feature := range features { + flavorGroupMap[feature.Name] = feature + } + + return flavorGroupMap, nil +} diff --git a/pkg/conf/conf.go b/pkg/conf/conf.go index 595b33bf2..b0feb02c2 100644 --- a/pkg/conf/conf.go +++ b/pkg/conf/conf.go @@ -17,35 +17,51 @@ import ( // // The values read from secrets.json will override the values in conf.json func GetConfigOrDie[C any]() C { + c, err := GetConfig[C]() + if err != nil { + panic(err) + } + return c +} + +// Create a new configuration from the default config json file. +// Return an error if the config cannot be read or parsed. +// +// This will read two files: +// - /etc/config/conf.json +// - /etc/secrets/secrets.json +// +// The values read from secrets.json will override the values in conf.json +func GetConfig[C any]() (C, error) { // Note: We need to read the config as a raw map first, to avoid golang // unmarshalling default values for the fields. // Read the base config from the configmap (not including secrets). cmConf, err := readRawConfig("/etc/config/conf.json") if err != nil { - panic(err) + return *new(C), err } // Read the secrets config from the kubernetes secret. 
secretConf, err := readRawConfig("/etc/secrets/secrets.json") if err != nil { - panic(err) + return *new(C), err } return newConfigFromMaps[C](cmConf, secretConf) } -func newConfigFromMaps[C any](base, override map[string]any) C { +func newConfigFromMaps[C any](base, override map[string]any) (C, error) { // Merge the base config with the override config. mergedConf := mergeMaps(base, override) // Marshal again, and then unmarshal into the config struct. mergedBytes, err := json.Marshal(mergedConf) if err != nil { - panic(err) + return *new(C), err } var c C if err := json.Unmarshal(mergedBytes, &c); err != nil { - panic(err) + return *new(C), err } - return c + return c, nil } // Read the json as a map from the given file path. diff --git a/postgres/Dockerfile b/postgres/Dockerfile index 796069d8b..09c049295 100644 --- a/postgres/Dockerfile +++ b/postgres/Dockerfile @@ -1,5 +1,5 @@ -# Last updated: 19 Feb 2026 -FROM debian:trixie-slim +# Last updated: 17 Mar 2026 +FROM debian:trixie-slim@sha256:26f98ccd92fd0a44d6928ce8ff8f4921b4d2f535bfa07555ee5d18f61429cf0c # explicitly set user/group IDs RUN set -eux; \ @@ -194,4 +194,4 @@ STOPSIGNAL SIGINT # that even 90 seconds may not be long enough in many instances. 
EXPOSE 5432 -CMD ["postgres"] \ No newline at end of file +CMD ["postgres"] diff --git a/tools/plutono/provisioning/dashboards/cortex-status.json b/tools/plutono/provisioning/dashboards/cortex-status.json index 043d07983..2481aded0 100644 --- a/tools/plutono/provisioning/dashboards/cortex-status.json +++ b/tools/plutono/provisioning/dashboards/cortex-status.json @@ -2124,9 +2124,9 @@ "targets": [ { "exemplar": true, - "expr": "sum by(error, skipped) (rate(cortex_descheduler_pipeline_vm_descheduling_duration_seconds_count{}[2m]))", + "expr": "sum by(error) (rate(cortex_detector_pipeline_run_duration_seconds_count{}[2m]))", "interval": "", - "legendFormat": "Error: {{error}}, Skipped: {{skipped}}", + "legendFormat": "Error: {{error}}", "refId": "A" } ], @@ -2321,11 +2321,11 @@ "targets": [ { "exemplar": true, - "expr": "sum by (error, skipped, source_host, target_host, vm_id) (delta(cortex_descheduler_pipeline_vm_descheduling_duration_seconds_count{}[2m]))", + "expr": "sum by (error) (delta(cortex_detector_pipeline_run_duration_seconds_count{}[2m]))", "format": "time_series", "instant": false, "interval": "", - "legendFormat": "{{vm_id}}: ({{source_host}}) -> ({{target_host}}), Error: {{error}}, Skipped: {{skipped}}", + "legendFormat": "Error: {{error}}", "refId": "A" } ], diff --git a/tools/spawner/cli/cli.go b/tools/spawner/cli/cli.go index 0caeaf8d0..57ebfb059 100644 --- a/tools/spawner/cli/cli.go +++ b/tools/spawner/cli/cli.go @@ -12,6 +12,7 @@ import ( "strings" "github.com/cobaltcore-dev/cortex/tools/spawner/defaults" + "github.com/cobaltcore-dev/cortex/tools/spawner/types" "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/flavors" "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/hypervisors" "github.com/gophercloud/gophercloud/v2/openstack/identity/v3/domains" @@ -28,6 +29,8 @@ type CLI interface { ChooseImage([]images.Image) images.Image ChooseHypervisorType([]string) string ChooseHypervisor([]hypervisors.Hypervisor) 
hypervisors.Hypervisor + ChooseServerGroupPolicy([]string) string + ChooseServerGroup([]types.ServerGroup) types.ServerGroup } type cli struct { @@ -92,6 +95,20 @@ func (c *cli) ChooseHypervisor(hs []hypervisors.Hypervisor) hypervisors.Hypervis return choose(c.defaults, "WS_HYPERVISOR", "πŸ“‚ Hypervisors", hs, f) } +func (c *cli) ChooseServerGroupPolicy(ps []string) string { + f := func(p string) string { + return p + } + return choose(c.defaults, "WS_SERVER_GROUP_POLICY", "πŸ“‚ Server Group Policies", ps, f) +} + +func (c *cli) ChooseServerGroup(sgs []types.ServerGroup) types.ServerGroup { + f := func(sg types.ServerGroup) string { + return fmt.Sprintf("%s (%s) id:%s", sg.Name, sg.Policy, sg.ID[:5]) + } + return choose(c.defaults, "WS_SERVER_GROUP", "πŸ“‚ Server Groups", sgs, f) +} + // Choose asks the user to choose one of the given options. // The user can choose by index or by name. The user can also choose the default value. // If the user chooses to input a name, the mapping is done by the displayname function. diff --git a/tools/spawner/main.go b/tools/spawner/main.go index c16b66a1b..2a9bcf12b 100644 --- a/tools/spawner/main.go +++ b/tools/spawner/main.go @@ -19,6 +19,7 @@ import ( "github.com/cobaltcore-dev/cortex/tools/spawner/cli" "github.com/cobaltcore-dev/cortex/tools/spawner/defaults" + "github.com/cobaltcore-dev/cortex/tools/spawner/types" "github.com/gophercloud/gophercloud/v2" "github.com/gophercloud/gophercloud/v2/openstack" "github.com/gophercloud/gophercloud/v2/openstack/blockstorage/v3/volumes" @@ -326,8 +327,8 @@ func main() { var network *networks.Network if len(networksAll) == 1 { fmt.Printf("❓ Delete existing network %s [y/N, default: \033[1;34mN\033[0m]: ", networkName) - reader := bufio.NewReader(os.Stdin) - input := must.Return(reader.ReadString('\n')) + reader = bufio.NewReader(os.Stdin) + input = must.Return(reader.ReadString('\n')) input = strings.TrimSpace(input) if input == "y" { // Delete the subnets. 
@@ -384,8 +385,8 @@ func main() { // Delete all existing keypairs with the same name. if len(keypairsFiltered) > 0 { fmt.Printf("❓ Delete existing keypairs %v? [y/N, default: \033[1;34my\033[0m]: ", keyName) - reader := bufio.NewReader(os.Stdin) - input := must.Return(reader.ReadString('\n')) + reader = bufio.NewReader(os.Stdin) + input = must.Return(reader.ReadString('\n')) input = strings.TrimSpace(input) if input == "" { input = "y" @@ -412,6 +413,89 @@ func main() { keypair := must.Return(keypairs.Create(ctx, projectCompute, kpo).Extract()) fmt.Printf("πŸ›œ Using keypair %s\n", keyName) + // Check if there are existing server groups and check if the user wants to delete them. + fmt.Println("πŸ”„ Looking up existing server groups") + // Gophercloud doesn't support server groups, so we have to do a raw API call here. + var getServerGroupsResponse struct { + ServerGroups []types.ServerGroup `json:"server_groups"` + } + _ = must.Return(projectCompute.Get(ctx, projectCompute.Endpoint+"/os-server-groups", &getServerGroupsResponse, nil)) + if len(getServerGroupsResponse.ServerGroups) > 0 { + fmt.Printf("❓ Delete existing server groups with name prefix %s [y/N, default: \033[1;34my\033[0m]: ", prefix) + reader = bufio.NewReader(os.Stdin) + input = must.Return(reader.ReadString('\n')) + input = strings.TrimSpace(input) + if input == "" { + input = "y" + } + if input == "y" { + var wg sync.WaitGroup + for _, sg := range getServerGroupsResponse.ServerGroups { + if strings.HasPrefix(sg.Name, prefix) { + wg.Go(func() { + fmt.Printf("🧨 Deleting server group %s\n", sg.Name) + _ = must.Return(projectCompute.Delete(ctx, projectCompute.Endpoint+"/os-server-groups/"+sg.ID, nil)) + fmt.Printf("πŸ’₯ Deleted server group %s\n", sg.Name) + }) + } + } + wg.Wait() + fmt.Println("🧨 Deleted all existing server groups") + } + } + + var selectedServerGroupID string + + // Get the server groups again and check if the user wants to use an existing one or create a new one. 
+ fmt.Println("πŸ”„ Checking existing server groups again") + _ = must.Return(projectCompute.Get(ctx, projectCompute.Endpoint+"/os-server-groups", &getServerGroupsResponse, nil)) + if len(getServerGroupsResponse.ServerGroups) > 0 { + // Ask the user if they want to use an existing server group. + fmt.Printf("❓ Use existing server group for affinity rules? [y/N, default: \033[1;34mN\033[0m]: ") + reader = bufio.NewReader(os.Stdin) + input = must.Return(reader.ReadString('\n')) + input = strings.TrimSpace(input) + if input == "y" { + selectedServerGroupID = cli.ChooseServerGroup(getServerGroupsResponse.ServerGroups).ID + } + } + // If the user doesn't want to use an existing server group, ask if they want to create a new one. + if selectedServerGroupID == "" { + fmt.Printf("❓ Create a server group for affinity rules? [y/N, default: \033[1;34mN\033[0m]: ") + reader = bufio.NewReader(os.Stdin) + input = must.Return(reader.ReadString('\n')) + input = strings.TrimSpace(input) + if input == "y" { + policies := []string{"anti-affinity", "affinity", "soft-anti-affinity", "soft-affinity"} + policy := cli.ChooseServerGroupPolicy(policies) + serverGroupName := prefix + "-server-group" + fmt.Printf("πŸ†• Creating server group %s with policy %s\n", serverGroupName, policy) + createServerGroupRequest := struct { + ServerGroup struct { + Name string `json:"name"` + Policy string `json:"policy"` + // For simplicity, we don't include rules for now. 
+ } `json:"server_group"` + }{} + createServerGroupRequest.ServerGroup.Name = serverGroupName + createServerGroupRequest.ServerGroup.Policy = policy + var createServerGroupResponse struct { + ServerGroup struct { + ID string `json:"id"` + } `json:"server_group"` + } + _ = must.Return(projectCompute.Post(ctx, projectCompute.Endpoint+"/os-server-groups", &createServerGroupRequest, &createServerGroupResponse, &gophercloud.RequestOpts{ + OkCodes: []int{200, 201, 202}, + })) + selectedServerGroupID = createServerGroupResponse.ServerGroup.ID + } + } + if selectedServerGroupID != "" { + fmt.Printf("πŸ›œ Using server group with id %s\n", selectedServerGroupID) + } else { + fmt.Printf("🚫 Not using a server group for affinity rules\n") + } + // Load the script template tmpl, err := template.ParseFiles("tools/spawner/script.sh.tpl") must.Succeed(err) @@ -473,7 +557,7 @@ func main() { KeyName: keyName, CreateOptsBuilder: sco, } - ho := servers.SchedulerHintOpts{} + ho := servers.SchedulerHintOpts{Group: selectedServerGroupID} serverCreateResult, err := servers.Create(ctx, projectCompute, so, ho).Extract() baseMsg := fmt.Sprintf( "... (%d/%d) Spawning VM %s on %s with flavor %s, image %s ", diff --git a/tools/spawner/types/server_group.go b/tools/spawner/types/server_group.go new file mode 100644 index 000000000..da7716db9 --- /dev/null +++ b/tools/spawner/types/server_group.go @@ -0,0 +1,11 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package types + +// Not supported by gophercloud. 
+type ServerGroup struct { + ID string `json:"id"` + Policy string `json:"policy"` + Name string `json:"name"` +} diff --git a/tools/visualizer/Dockerfile b/tools/visualizer/Dockerfile deleted file mode 100644 index af7c859dd..000000000 --- a/tools/visualizer/Dockerfile +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright SAP SE -# SPDX-License-Identifier: Apache-2.0 - -FROM nginx - -COPY nova.html /usr/share/nginx/html/nova.html -COPY shared.css /usr/share/nginx/html/shared.css -COPY favicon.ico /usr/share/nginx/html/favicon.ico -COPY nginx.conf /etc/nginx/conf.d/default.conf diff --git a/tools/visualizer/app.yaml b/tools/visualizer/app.yaml deleted file mode 100644 index 5697571e3..000000000 --- a/tools/visualizer/app.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright SAP SE -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: cortex-visualizer - labels: - app: cortex-visualizer -spec: - replicas: 1 - selector: - matchLabels: - app: cortex-visualizer - template: - metadata: - labels: - app: cortex-visualizer - spec: - serviceAccountName: cortex-visualizer - containers: - - name: cortex-visualizer - image: cortex-visualizer - ports: - - containerPort: 80 - - name: kubectl-proxy - image: alpine:latest - command: ["/bin/sh"] - args: - - -c - - | - apk add --no-cache curl - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" - chmod +x kubectl - mv kubectl /usr/local/bin/ - kubectl proxy --port=8001 --address=0.0.0.0 --accept-hosts=.* - ports: - - containerPort: 8001 ---- -apiVersion: v1 -kind: Service -metadata: - name: cortex-visualizer -spec: - selector: - app: cortex-visualizer - ports: - - name: http - port: 80 - targetPort: 80 - - name: kubectl-proxy - port: 8001 - targetPort: 8001 - type: ClusterIP \ No newline at end of file diff --git a/tools/visualizer/favicon.ico b/tools/visualizer/favicon.ico deleted file mode 100644 index b4f9d5fb8..000000000 Binary files 
a/tools/visualizer/favicon.ico and /dev/null differ diff --git a/tools/visualizer/nginx.conf b/tools/visualizer/nginx.conf deleted file mode 100644 index 7be253e28..000000000 --- a/tools/visualizer/nginx.conf +++ /dev/null @@ -1,18 +0,0 @@ -server { - listen 80; - - # Serve static files - location / { - root /usr/share/nginx/html; - index nova.html; - } - - # Proxy API requests to kubectl proxy - location /k8s/ { - proxy_pass http://127.0.0.1:8001/; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_http_version 1.1; - proxy_set_header Connection ""; - } -} \ No newline at end of file diff --git a/tools/visualizer/nova.html b/tools/visualizer/nova.html deleted file mode 100644 index 455b8a00c..000000000 --- a/tools/visualizer/nova.html +++ /dev/null @@ -1,503 +0,0 @@ - - - - - Cortex Nova Visualizer - - - - - - - - -
-
- -
Cortex Nova Visualizer
-
- - - -
-
-
-
Loading...
- -
-
-
- - - - - - \ No newline at end of file diff --git a/tools/visualizer/role.yaml b/tools/visualizer/role.yaml deleted file mode 100644 index e497f3b88..000000000 --- a/tools/visualizer/role.yaml +++ /dev/null @@ -1,29 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: cortex-visualizer - namespace: default - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: cortex-visualizer-decision-reader -rules: -- apiGroups: ["cortex.cloud"] - resources: ["decisions"] - verbs: ["get", "list", "watch"] - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: cortex-visualizer-decision-reader-binding -subjects: -- kind: ServiceAccount - name: cortex-visualizer - namespace: default -roleRef: - kind: ClusterRole - name: cortex-visualizer-decision-reader - apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/tools/visualizer/shared.css b/tools/visualizer/shared.css deleted file mode 100644 index b3a933375..000000000 --- a/tools/visualizer/shared.css +++ /dev/null @@ -1,289 +0,0 @@ -/* Copyright SAP SE */ -/* SPDX-License-Identifier: Apache-2.0 */ - -body { - font-family: Arial, Helvetica, sans-serif; - - --color-primary: rgb(255, 165, 2); - --color-on-primary: rgb(255, 255, 255); - --color-secondary: rgb(112, 161, 255); - --color-on-secondary: rgb(255, 255, 255); - --color-tertiary: rgb(255, 71, 87); - --color-on-tertiary: rgb(255, 255, 255); - --color-background: rgb(241, 242, 246); - --color-on-background: rgb(74, 74, 74); - --color-surface: rgb(255, 255, 255); - --color-on-surface: rgb(74, 74, 74); - - color: var(--color-text); - background: var(--color-background); - /* Remove the default margin and padding from the body. */ - margin: 0; -} - -/* Nice animated progress bar on top of the page. 
*/ -.progress { - position: fixed; - top: 0; - left: 0; - right: 0; - height: 0.5em; - background: var(--color-primary); - z-index: 1000; -} - -.progress::before { - content: ''; - position: absolute; - top: 0; - left: 0; - right: 0; - height: 0.5em; - background: var(--color-secondary); - animation: progress 2s infinite; -} - -@keyframes progress { - 0% { - left: -100%; - right: 100%; - } - - 100% { - left: 100%; - right: -100%; - } -} - -.progress-text { - position: fixed; - top: 2em; - left: 0; - right: 0; - text-align: center; - font-weight: bold; -} - -/* Navbar that shows information. */ -nav { - padding-left: 0.25em; - background: var(--color-surface); - box-shadow: 0 0 1em rgba(0, 0, 0, 0.1); - z-index: 1; -} - -nav div.element { - display: inline-block; - padding-top: 1em; - padding-bottom: 2em; - padding-left: 1em; - padding-right: 1em; - margin: 0; - background: var(--color-surface); - color: var(--color-on-surface); - border-right: 2px solid var(--color-background); - font-size: 1em; -} - -nav div.element p.highlight { - font-size: 1.25em; - font-weight: bold; -} - -table { - /* Revert the default spacing used by the browser. */ - border-spacing: 0; -} - -/* Table cell showing the weight during scheduling. */ -td.weight { - text-align: center; - position: relative; - animation: weightAnimation 0.25s ease-in-out; -} - -td.weight div { - border-radius: 0.5em; - padding: 0.5em; - margin: 0.5em; - border: 2px solid var(--color-surface); -} - -/* Backdrop white for the weight cells */ -td.weight::after { - content: ''; - position: absolute; - --m: 0.6em; - top: var(--m); - bottom: var(--m); - left: var(--m); - right: var(--m); - border-radius: 0.5em; - background: var(--color-surface); - z-index: -1; -} - -/* Animation for weights when they first appear */ -@keyframes weightAnimation { - 0% { - opacity: 0; - transform: scale(0.5); - } - - 100% { - opacity: 1; - transform: scale(1); - } -} - -/* Table cell showing the hostname/name. 
*/ -th.hostname { - text-align: center; - position: relative; -} - -th.hostname div { - position: relative; - padding: 0.1em; - padding-top: 0.5em; - padding-bottom: 0.5em; - margin: 0.1em; - width: 6em; - height: 6em; - overflow: hidden; -} - -/* Table cell showing additional information. */ -th.metainfo { - text-align: center; - position: relative; -} - -th.metainfo div p { - width: 6em; - overflow: hidden; -} - -th.metainfo div p.issue { - color: var(--color-tertiary); - border-radius: 0.5em; - font-size: 0.8em; -} - -/* Table row showing the name of a step in the pipeline. */ -th.stepkey { - text-align: left; - font-weight: bold; - padding-left: 0.75em; - padding-top: 0.5em; - padding-bottom: 0.25em; -} - -/* Highlighted rows in the table. */ -tr.highlight { - background: var(--color-surface); - /* tr doesn't support border-radius */ - clip-path: xywh(0 0 100% 100% round 0.75em); -} - -/* Chart showing usage statistics. */ -td.chart { - position: relative; - height: 24em; -} - -td.chart div.barsbefore, -td.chart div.barsafter, -td.chart div.backdrop, -td.chart div.stats { - position: absolute; - top: 0; - left: 0; - right: 0; - bottom: 0; - display: flex; - margin-top: 1.5em; - margin-bottom: 0.5em; - padding-left: 0.5em; - padding-right: 0.5em; - flex-direction: row; - justify-content: center; - align-items: flex-end; -} - -td.chart div.barsbefore p, -td.chart div.barsafter p, -td.chart div.backdrop p, -td.chart div.stats p { - margin-left: 0.1em; - margin-right: 0.1em; - display: flex; - border-radius: 0.2em; -} - -td.chart div.backdrop p { - height: 100%; - border-radius: 0.2em; - border: 1px solid rgba(0, 0, 0, 0.05); - background: white; -} - -td.chart div.stats { - text-align: center; - display: flex; - justify-content: center; - align-items: flex-start; -} - -td.chart div.stats p { - writing-mode: vertical-lr; - text-orientation: mixed; - display: flex; - font-size: 1em; - font-weight: bold; - margin-left: 0.1em; - margin-right: 0.1em; - justify-content: 
center; - align-items: center; -} - -/* Animation for chart bars */ -td.chart div.barsafter p, -td.chart div.barsbefore p { - animation: barAnim 0.25s ease-in-out; - overflow: hidden; -} - -@keyframes barAnim { - 0% { - transform: scaleY(0); - } - - 100% { - transform: scaleY(1); - } -} - -td.chart div.barsafter p.cpu { - background: var(--color-primary); -} - -td.chart div.barsafter p.mem { - background: var(--color-primary); -} - -td.chart div.barsafter p.disk { - background: var(--color-primary); -} - -td.chart div.barsbefore p.cpu { - background: var(--color-secondary); -} - -td.chart div.barsbefore p.mem { - background: var(--color-secondary); -} - -td.chart div.barsbefore p.disk { - background: var(--color-secondary); -}