Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion .github/renovate.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,19 @@
"matchStrings": ["GOTESTSUM_VERSION \\?= (?<currentValue>v[\\d.]+)"],
"depNameTemplate": "gotest.tools/gotestsum",
"datasourceTemplate": "go"
},
{
"fileMatch": ["^postgres/Dockerfile$"],
"matchStrings": ["FROM (?<depName>[^:\\n]+):(?<currentValue>[^@\\n]+)@sha256:(?<currentDigest>[a-f0-9]+)"],
"datasourceTemplate": "docker"
},
{
"fileMatch": ["^postgres/Dockerfile$"],
"matchStrings": ["ENV PG_VERSION (?<currentValue>[\\d]+\\.[\\d]+)-[^\\n]+"],
"depNameTemplate": "postgres",
"datasourceTemplate": "docker",
"versioningTemplate": "semver-coerced",
"autoReplaceStringTemplate": "ENV PG_VERSION {{{newValue}}}-1.pgdg13+1"
}
],
"packageRules": [
Expand All @@ -49,6 +62,17 @@
],
"allowedVersions": "1.26.x"
},
{
"matchPackageNames": [
"postgres"
],
"matchFileNames": [
"postgres/Dockerfile"
],
"allowedVersions": "17.x",
"automerge": true,
"groupName": "postgres Dockerfile"
},
{
"matchPackageNames": [
"/^github\\.com\\/sapcc\\/.*/"
Expand Down Expand Up @@ -80,4 +104,4 @@
"before 8am on Friday"
],
"semanticCommits": "disabled"
}
}
40 changes: 36 additions & 4 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,14 @@ import (
"context"
"crypto/tls"
"flag"
"log/slog"
"net/http"

uberzap "go.uber.org/zap"
"os"
"path/filepath"
"slices"
"strings"
"time"

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
Expand Down Expand Up @@ -143,12 +147,39 @@ func main() {
flag.BoolVar(&enableHTTP2, "enable-http2", false,
"If set, HTTP/2 will be enabled for the metrics and webhook servers")
opts := zap.Options{
Development: true,
Development: false,
}
opts.BindFlags(flag.CommandLine)
flag.Parse()

ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
ctrl.SetLogger(zap.New(
zap.UseFlagOptions(&opts),
zap.RawZapOpts(uberzap.WrapCore(monitoring.WrapCoreWithLogMetrics)),
))

// Configure slog (used across internal packages) with JSON output and
// level control via the LOG_LEVEL environment variable.
// Supported values (case-insensitive): debug, info (default), warn
// (or warning), error. Any other value leaves the level at info.
slogLevel := new(slog.LevelVar)
slogLevel.Set(slog.LevelInfo)
if lvl := os.Getenv("LOG_LEVEL"); lvl != "" {
switch strings.ToLower(lvl) {
case "debug":
slogLevel.Set(slog.LevelDebug)
case "info":
slogLevel.Set(slog.LevelInfo)
case "warn", "warning":
slogLevel.Set(slog.LevelWarn)
case "error":
slogLevel.Set(slog.LevelError)
}
}
slog.SetDefault(slog.New(monitoring.NewMetricsSlogHandler(
slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
Level: slogLevel,
}),
)))
slog.Info("slog configured", "level", slogLevel.Level().String())

// Log the main configuration
setupLog.Info("loaded main configuration",
Expand Down Expand Up @@ -301,6 +332,7 @@ func main() {
// This is useful to distinguish metrics from different deployments.
metricsConfig := conf.GetConfigOrDie[monitoring.Config]()
metrics.Registry = monitoring.WrapRegistry(metrics.Registry, metricsConfig)
metrics.Registry.MustRegister(monitoring.LogMessagesTotal)

// TODO: Remove me after scheduling pipeline steps don't require DB connections anymore.
metrics.Registry.MustRegister(&db.Monitor)
Expand Down Expand Up @@ -652,10 +684,10 @@ func main() {
os.Exit(1)
}

syncerMonitor := commitments.NewSyncerMonitor()
must.Succeed(metrics.Registry.Register(syncerMonitor))
if slices.Contains(mainConfig.EnabledTasks, "commitments-sync-task") {
setupLog.Info("starting commitments syncer")
syncerMonitor := commitments.NewSyncerMonitor()
must.Succeed(metrics.Registry.Register(syncerMonitor))
syncer := commitments.NewSyncer(multiclusterClient, syncerMonitor)
syncerConfig := conf.GetConfigOrDie[commitments.SyncerConfig]()
syncerConfig.ApplyDefaults()
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ require (
go.opentelemetry.io/otel/trace v1.43.0 // indirect
go.opentelemetry.io/proto/otlp v1.8.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.1 // indirect
go.uber.org/zap v1.27.1
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
go4.org/netipx v0.0.0-20231129151722-fdeea329fbba // indirect
Expand Down
2 changes: 1 addition & 1 deletion helm/bundles/cortex-cinder/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ dependencies:
# from: file://../../library/cortex-postgres
- name: cortex-postgres
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
version: 0.5.13
version: 0.5.14

# from: file://../../library/cortex
- name: cortex
Expand Down
2 changes: 1 addition & 1 deletion helm/bundles/cortex-manila/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ dependencies:
# from: file://../../library/cortex-postgres
- name: cortex-postgres
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
version: 0.5.13
version: 0.5.14

# from: file://../../library/cortex
- name: cortex
Expand Down
2 changes: 1 addition & 1 deletion helm/bundles/cortex-nova/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ dependencies:
# from: file://../../library/cortex-postgres
- name: cortex-postgres
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
version: 0.5.13
version: 0.5.14

# from: file://../../library/cortex
- name: cortex
Expand Down
2 changes: 1 addition & 1 deletion helm/bundles/cortex-nova/alerts/nova.alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -731,4 +731,4 @@ groups:
The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes.
This may indicate issues with the webhook logic, connectivity problems, or
external factors causing failures. Check the webhook server logs for error
details and investigate the affected resources.
details and investigate the affected resources.
1 change: 1 addition & 0 deletions helm/bundles/cortex-nova/templates/knowledges_kvm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ metadata:
name: kvm-libvirt-domain-cpu-steal-pct
spec:
schedulingDomain: nova
recency: "60s"
extractor:
name: kvm_libvirt_domain_cpu_steal_pct_extractor
description: |
Expand Down
2 changes: 1 addition & 1 deletion helm/library/cortex-postgres/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ apiVersion: v2
name: cortex-postgres
description: Postgres setup for Cortex.
type: application
version: 0.5.13
version: 0.5.14
appVersion: "sha-6db36b81"
7 changes: 4 additions & 3 deletions internal/knowledge/extractor/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@ func (r *KnowledgeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
// Sanity checks.
lastExtracted := knowledge.Status.LastExtracted.Time
recency := knowledge.Spec.Recency.Duration
if lastExtracted.Add(recency).After(time.Now()) && knowledge.Status.RawLength != 0 {
log.Info("skipping knowledge extraction, not yet time", "name", knowledge.Name)
return ctrl.Result{RequeueAfter: time.Until(lastExtracted.Add(recency))}, nil
if lastExtracted.Add(recency).After(time.Now()) {
waitFor := time.Until(lastExtracted.Add(recency))
log.Info("skipping knowledge extraction, not yet time", "name", knowledge.Name, "waitFor", waitFor)
return ctrl.Result{RequeueAfter: waitFor}, nil
}

extractor, ok := supportedExtractors[knowledge.Spec.Extractor.Name]
Expand Down
8 changes: 4 additions & 4 deletions internal/scheduling/lib/filter_weigher_pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func InitNewFilterWeigherPipeline[RequestType FilterWeigherPipelineRequest](
unknownFilters := []string{}
for _, filterConfig := range confedFilters {
slog.Info("scheduler: configuring filter", "name", filterConfig.Name)
slog.Info("supported:", "filters", maps.Keys(supportedFilters))
slog.Info("supported:", "filters", slices.Sorted(maps.Keys(supportedFilters)))
makeFilter, ok := supportedFilters[filterConfig.Name]
if !ok {
slog.Error("scheduler: unsupported filter", "name", filterConfig.Name)
Expand All @@ -73,7 +73,7 @@ func InitNewFilterWeigherPipeline[RequestType FilterWeigherPipelineRequest](
filter = validateFilter(filter)
filter = monitorFilter(filter, filterConfig.Name, pipelineMonitor)
if err := filter.Init(ctx, client, filterConfig); err != nil {
slog.Error("scheduler: failed to initialize filter", "name", filterConfig.Name, "error", err)
slog.Warn("scheduler: failed to initialize filter", "name", filterConfig.Name, "error", err)
filterErrors[filterConfig.Name] = errors.New("failed to initialize filter: " + err.Error())
continue
}
Expand All @@ -90,7 +90,7 @@ func InitNewFilterWeigherPipeline[RequestType FilterWeigherPipelineRequest](
unknownWeighers := []string{}
for _, weigherConfig := range confedWeighers {
slog.Info("scheduler: configuring weigher", "name", weigherConfig.Name)
slog.Info("supported:", "weighers", maps.Keys(supportedWeighers))
slog.Info("supported:", "weighers", slices.Sorted(maps.Keys(supportedWeighers)))
makeWeigher, ok := supportedWeighers[weigherConfig.Name]
if !ok {
slog.Error("scheduler: unsupported weigher", "name", weigherConfig.Name)
Expand All @@ -102,7 +102,7 @@ func InitNewFilterWeigherPipeline[RequestType FilterWeigherPipelineRequest](
weigher = validateWeigher(weigher)
weigher = monitorWeigher(weigher, weigherConfig.Name, pipelineMonitor)
if err := weigher.Init(ctx, client, weigherConfig); err != nil {
slog.Error("scheduler: failed to initialize weigher", "name", weigherConfig.Name, "error", err)
slog.Warn("scheduler: failed to initialize weigher", "name", weigherConfig.Name, "error", err)
weigherErrors[weigherConfig.Name] = errors.New("failed to initialize weigher: " + err.Error())
continue
}
Expand Down
4 changes: 2 additions & 2 deletions internal/scheduling/nova/external_scheduler_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,12 @@ func (httpAPI *httpAPI) canRunScheduler(requestData api.ExternalSchedulerRequest
func (httpAPI *httpAPI) inferPipelineName(requestData api.ExternalSchedulerRequest) (string, error) {
hvType, err := requestData.GetHypervisorType()
if err != nil {
slog.Info("failed to determine hypervisor type, cannot infer pipeline name", "error", err)
slog.Warn("failed to determine hypervisor type, cannot infer pipeline name", "error", err)
return "", errors.New("failed to determine hypervisor type from request data")
}
flavorType, err := requestData.GetFlavorType()
if err != nil {
slog.Info("failed to determine flavor type, cannot infer pipeline name", "error", err)
slog.Warn("failed to determine flavor type, cannot infer pipeline name", "error", err)
return "", errors.New("failed to determine flavor type from request data")
}
switch hvType {
Expand Down
12 changes: 6 additions & 6 deletions internal/scheduling/nova/hypervisor_overcommit_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ type HypervisorOvercommitController struct {
// - https://ahmet.im/blog/controller-pitfalls/#reconcile-method-shape
func (c *HypervisorOvercommitController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := ctrl.LoggerFrom(ctx)
log.Info("Reconciling resource")
log.V(1).Info("Reconciling resource")

obj := new(hv1.Hypervisor)
if err := c.Get(ctx, req.NamespacedName, obj); err != nil {
Expand All @@ -130,7 +130,7 @@ func (c *HypervisorOvercommitController) Reconcile(ctx context.Context, req ctrl
// non-overlapping resources from previous mappings.
desiredOvercommit := make(map[hv1.ResourceName]float64)
for _, mapping := range c.config.OvercommitMappings {
log.Info("Processing overcommit mapping",
log.V(1).Info("Processing overcommit mapping",
"mapping", mapping,
"hypervisorTraits", obj.Status.Traits)
var applyMapping bool
Expand All @@ -142,21 +142,21 @@ func (c *HypervisorOvercommitController) Reconcile(ctx context.Context, req ctrl
applyMapping = !slices.Contains(obj.Status.Traits, *mapping.HasntTrait)
default:
// This should never happen due to validation, but we check it just in case.
log.Info("Skipping overcommit mapping with no trait specified",
log.V(1).Info("Skipping overcommit mapping with no trait specified",
"overcommit", mapping.Overcommit)
continue
}
if !applyMapping {
continue
}
log.Info("Applying overcommit mapping on hypervisor",
log.V(1).Info("Applying overcommit mapping on hypervisor",
"overcommit", mapping.Overcommit)
maps.Copy(desiredOvercommit, mapping.Overcommit)
}
log.Info("Desired overcommit ratios based on traits",
log.V(1).Info("Desired overcommit ratios based on traits",
"desiredOvercommit", desiredOvercommit)
if maps.Equal(desiredOvercommit, obj.Spec.Overcommit) {
log.Info("Overcommit ratios are up to date, no update needed")
log.V(1).Info("Overcommit ratios are up to date, no update needed")
return ctrl.Result{}, nil
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,12 @@ func (s *FilterCapabilitiesStep) Run(traceLog *slog.Logger, request api.External

hvCaps := make(map[string]map[string]string)
for _, hv := range hvs.Items {
var err error
if hvCaps[hv.Name], err = hvToNovaCapabilities(hv); err != nil {
traceLog.Error("failed to get nova capabilities from hypervisor", "host", hv.Name, "error", err)
return nil, err
caps, err := hvToNovaCapabilities(hv)
if err != nil {
traceLog.Warn("skipping hypervisor with unknown capabilities", "host", hv.Name, "error", err)
continue
}
hvCaps[hv.Name] = caps
}
traceLog.Info("looking for capabilities", "capabilities", hvCaps)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ func (s *FilterExternalCustomerStep) Run(traceLog *slog.Logger, request api.Exte
result := s.IncludeAllHostsFromRequest(request)
domainName, err := request.Spec.Data.GetSchedulerHintStr("domain_name")
if err != nil {
traceLog.Error("failed to get domain_name scheduler hint, skipping filter", "error", err)
traceLog.Warn("failed to get domain_name scheduler hint, skipping filter", "error", err)
return result, nil
}
if slices.Contains(s.Options.CustomerIgnoredDomainNames, domainName) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa
}
freeCPU, ok := free["cpu"]
if !ok || freeCPU.Value() < 0 {
traceLog.Error(
traceLog.Warn(
"host with invalid CPU capacity",
"host", host, "freeCPU", freeCPU.String(),
)
Expand Down
Loading
Loading