Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions helm/bundles/cortex-nova/alerts/nova.alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -592,3 +592,20 @@ groups:
corruption, bugs in reservation creation, or external modifications.
Reservations are automatically repaired, but the root cause should be
investigated if this alert persists.

- alert: CortexNovaDoesntFindValidKVMHosts
expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0
for: 5m
labels:
Comment thread
coderabbitai[bot] marked this conversation as resolved.
context: scheduling
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
Comment thread
PhilippMatthes marked this conversation as resolved.
annotations:
summary: "Nova scheduling cannot find valid KVM hosts"
description: >
Cortex is seeing faulty VMs in `{{$labels.az}}` where Nova scheduling
failed to find a valid `{{$labels.hvtype}}` host. This may indicate
capacity issues, misconfigured filters, or resource constraints in the
datacenter. Investigate the affected VMs and hypervisor availability.
Comment thread
PhilippMatthes marked this conversation as resolved.
17 changes: 17 additions & 0 deletions helm/bundles/cortex-nova/templates/kpis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,23 @@ spec:
---
apiVersion: cortex.cloud/v1alpha1
kind: KPI
metadata:
name: vm-faults
spec:
schedulingDomain: nova
impl: vm_faults_kpi
dependencies:
datasources:
- name: nova-servers
- name: nova-flavors
description: |
This KPI tracks VM faults in the datacenter. It exposes helpful information
about the faults, such as the availability zone, hypervisor type, VM state,
and error info if available. This can be used to identify issues in the
datacenter and to monitor the overall health of the VMs.
---
apiVersion: cortex.cloud/v1alpha1
kind: KPI
metadata:
name: cortex-nova-datasource-state
spec:
Expand Down
122 changes: 109 additions & 13 deletions internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,24 @@ type Server struct {
OSEXTSTSVmState string `json:"OS-EXT-STS:vm_state" db:"os_ext_sts_vm_state"`
OSEXTSTSPowerState int `json:"OS-EXT-STS:power_state" db:"os_ext_sts_power_state"`

// From nested JSON
// From nested server.flavor JSON
FlavorName string `json:"-" db:"flavor_name"`

// From nested server.fault JSON

// The error response code.
FaultCode *uint `json:"-" db:"fault_code"`
// The date and time when the exception was raised. The date and time stamp
// format is ISO 8601 (CCYY-MM-DDThh:mm:ss±hh:mm). For example,
// 2015-08-27T09:49:58-05:00. The ±hh:mm value if included, is the time zone
// as an offset from UTC. In the previous example, the offset value is -05:00.
FaultCreated *string `json:"-" db:"fault_created"`
// The error message.
FaultMessage *string `json:"-" db:"fault_message"`
// The stack trace. It is available if the response code is not 500 or you
// have the administrator privilege.
FaultDetails *string `json:"-" db:"fault_details"`

// Note: there are some more fields that are omitted. To include them again, add
// custom unmarshalers and marshalers for the struct below.
}
Expand All @@ -119,7 +134,8 @@ type Server struct {
func (s *Server) UnmarshalJSON(data []byte) error {
type Alias Server
aux := &struct {
Flavor json.RawMessage `json:"flavor"`
Flavor json.RawMessage `json:"flavor"`
Fault *json.RawMessage `json:"fault,omitempty"`
*Alias
}{
Alias: (*Alias)(s),
Expand All @@ -135,31 +151,63 @@ func (s *Server) UnmarshalJSON(data []byte) error {
return err
}
s.FlavorName = flavor.Name
var fault struct {
Code uint `json:"code"`
Created string `json:"created"`
Message string `json:"message"`
Details *string `json:"details,omitempty"`
}
if aux.Fault != nil {
if err := json.Unmarshal(*aux.Fault, &fault); err != nil {
return err
}
s.FaultCode = &fault.Code
s.FaultCreated = &fault.Created
s.FaultMessage = &fault.Message
s.FaultDetails = fault.Details
}
return nil
}

// MarshalJSON is the custom marshaler for Server. It re-nests the flattened
// FlavorName and Fault* fields into the "flavor" and "fault" JSON objects that
// UnmarshalJSON reads them from.
//
// The "fault" object is only emitted when FaultCode, FaultCreated and
// FaultMessage are all set; FaultDetails is optional and omitted when nil.
func (s *Server) MarshalJSON() ([]byte, error) {
	// Alias has the same fields as Server but none of its methods, so
	// marshaling it does not call MarshalJSON recursively.
	type Alias Server
	type flavor struct {
		// Starting in microversion 2.47, "id" was removed...
		Name string `json:"original_name"`
	}
	type fault struct {
		Code    uint    `json:"code"`
		Created string  `json:"created"`
		Message string  `json:"message"`
		Details *string `json:"details,omitempty"`
	}

	// Leave the fault nil (and therefore omitted) unless all mandatory
	// parts are present.
	var faultVal *fault
	if s.FaultCode != nil && s.FaultCreated != nil && s.FaultMessage != nil {
		faultVal = &fault{
			Code:    *s.FaultCode,
			Created: *s.FaultCreated,
			Message: *s.FaultMessage,
			Details: s.FaultDetails,
		}
	}

	aux := &struct {
		Flavor flavor `json:"flavor"`
		Fault  *fault `json:"fault,omitempty"`
		*Alias
	}{
		Flavor: flavor{Name: s.FlavorName},
		Fault:  faultVal,
		Alias:  (*Alias)(s),
	}
	return json.Marshal(aux)
}

// TableName returns the table in which the openstack model is stored.
// The "_v2" suffix goes with the schema that includes the fault_* columns.
func (Server) TableName() string { return "openstack_servers_v2" }

// Indexes returns the index definitions for the openstack model.
// It returns nil (presumably meaning no indexes are declared for this
// table; confirm against the caller that consumes this map).
func (Server) Indexes() map[string][]string {
	return nil
}
Expand Down Expand Up @@ -285,6 +333,54 @@ type Flavor struct {
ExtraSpecs string `json:"extra_specs" db:"extra_specs"`
}

// FlavorHypervisorType is a defined string type (not a type alias) that
// represents the specific values the hypervisor type contained in flavor
// extra specs may have. GetHypervisorType returns one of the constants below.
type FlavorHypervisorType string

const (
// FlavorHypervisorTypeQEMU maps a flavor for QEMU/KVM hypervisors.
FlavorHypervisorTypeQEMU FlavorHypervisorType = "QEMU"
// FlavorHypervisorTypeCH maps flavors to Cloud-Hypervisor/KVM hypervisors.
FlavorHypervisorTypeCH FlavorHypervisorType = "CH"
// FlavorHypervisorTypeVMware maps flavors to VMware hypervisors. Note that
// the value is the full product string, not just "VMware".
FlavorHypervisorTypeVMware FlavorHypervisorType = "VMware vCenter Server"
// FlavorHypervisorTypeIronic maps flavors to Ironic baremetal instances.
FlavorHypervisorTypeIronic FlavorHypervisorType = "Ironic"
// FlavorHypervisorTypeOther is a flavor for which the hypervisor type
// is set in the extra specs but has an unknown value. GetHypervisorType
// returns it for any value that matches none of the types above.
FlavorHypervisorTypeOther FlavorHypervisorType = "Other"
// FlavorHypervisorTypeUnspecified is a flavor for which the hypervisor type
// is not set in the extra specs. GetHypervisorType returns it when the
// hypervisor type key is absent.
FlavorHypervisorTypeUnspecified FlavorHypervisorType = "Unspecified"
)

// GetHypervisorType returns the hypervisor type of the flavor based on its
// extra specs.
func (f Flavor) GetHypervisorType() (FlavorHypervisorType, error) {
var extraSpecs map[string]string
if f.ExtraSpecs == "" {
extraSpecs = map[string]string{}
} else if err := json.Unmarshal([]byte(f.ExtraSpecs), &extraSpecs); err != nil {
return "", err // Return an error if the extra specs cannot be parsed.
}
Comment thread
PhilippMatthes marked this conversation as resolved.
hypervisorType, ok := extraSpecs["capabilities:hypervisor_type"]
if !ok {
return FlavorHypervisorTypeUnspecified, nil
}
switch hypervisorType {
case string(FlavorHypervisorTypeQEMU):
return FlavorHypervisorTypeQEMU, nil
case string(FlavorHypervisorTypeCH):
return FlavorHypervisorTypeCH, nil
case string(FlavorHypervisorTypeVMware):
return FlavorHypervisorTypeVMware, nil
case string(FlavorHypervisorTypeIronic):
return FlavorHypervisorTypeIronic, nil
default:
return FlavorHypervisorTypeOther, nil
}
}

// Custom unmarshaler for OpenStackFlavor to handle nested JSON.
func (f *Flavor) UnmarshalJSON(data []byte) error {
type Alias Flavor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ SELECT
os.os_ext_srv_attr_host AS host,
MAX(value) AS max_steal_time_pct
FROM kvm_libvirt_domain_metrics kvm
JOIN openstack_servers os ON os.os_ext_srv_attr_instance_name = kvm.domain
JOIN openstack_servers_v2 os ON os.os_ext_srv_attr_instance_name = kvm.domain
WHERE kvm.name = 'kvm_libvirt_domain_steal_pct' AND os.id IS NOT NULL
GROUP BY os.os_ext_srv_attr_host, os.id;
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ func TestLibvirtDomainCPUStealPctExtractor_Extract(t *testing.T) {
t.Fatalf("expected no error, got %v", err)
}

// Insert mock data into the openstack_servers table
// Insert mock data into the openstack servers table
servers := []any{
&nova.Server{
ID: "uuid-1",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ WITH durations AS (
)) AS BIGINT)
) AS duration
FROM openstack_migrations AS migrations
LEFT JOIN openstack_servers AS servers ON servers.id = migrations.instance_uuid
LEFT JOIN openstack_servers_v2 AS servers ON servers.id = migrations.instance_uuid
LEFT JOIN openstack_flavors_v2 AS flavors ON flavors.name = servers.flavor_name
)
SELECT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ running_servers AS (
EXTRACT(EPOCH FROM (NOW()::timestamp - servers.created::timestamp))::BIGINT AS duration,
COALESCE(flavors.name, 'unknown')::TEXT AS flavor_name,
false::BOOLEAN AS deleted
FROM openstack_servers servers
FROM openstack_servers_v2 servers
LEFT JOIN openstack_flavors_v2 flavors ON flavors.name = servers.flavor_name
WHERE servers.created IS NOT NULL
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ SELECT DISTINCT
m.hostsystem AS vrops_hostsystem,
s.os_ext_srv_attr_host AS nova_compute_host
FROM vrops_vm_metrics m
LEFT JOIN openstack_servers s ON m.instance_uuid = s.id
LEFT JOIN openstack_servers_v2 s ON m.instance_uuid = s.id
WHERE s.os_ext_srv_attr_host IS NOT NULL;
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ host_cpu_usage AS (
s.tenant_id,
h.service_host,
AVG(p.avg_cpu) AS avg_cpu_of_project
FROM openstack_servers s
FROM openstack_servers_v2 s
JOIN vrops_vm_metrics m ON s.id = m.instance_uuid
JOIN projects_avg_cpu p ON s.tenant_id = p.tenant_id
JOIN openstack_hypervisors h ON s.os_ext_srv_attr_hypervisor_hostname = h.hostname
Expand Down
Loading
Loading