Alerts


/etc/prometheus/rules/ansible_managed.rules > ansible managed alert rules
CriticalCPULoad (1 active)
alert: CriticalCPULoad
expr: instance:node_cpu:load > 98
for: 30m
labels:
  severity: critical
annotations:
  description: '{{ $labels.instance }} of job {{ $labels.job }} has Critical CPU load for more than 30 minutes.'
  summary: Instance {{ $labels.instance }} - Critical CPU load
Labels State Active Since Value
alertname="CriticalCPULoad" instance="192.168.93.202:9100" severity="critical" firing 2020-09-14 11:19:29.318691705 +0000 UTC 98.92770629169613
Watchdog (1 active)
alert: Watchdog
expr: vector(1)
for: 10m
labels:
  severity: warning
annotations:
  description: This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing; therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example, the "DeadMansSnitch" integration in PagerDuty.
  summary: Ensure entire alerting pipeline is functional
Labels State Active Since Value
alertname="Watchdog" severity="warning" firing 2020-08-04 10:39:59 +0000 UTC 1
ClockSkewDetected (0 active)
alert: ClockSkewDetected
expr: abs(node_timex_offset_seconds) * 1000 > 30
for: 2m
labels:
  severity: warning
annotations:
  description: Clock skew detected on {{ $labels.instance }}. Ensure NTP is configured correctly on this host.
  summary: Instance {{ $labels.instance }} - Clock skew detected
CriticalDiskSpace (0 active)
alert: CriticalDiskSpace
expr: instance:node_fs:disk_space < 20
for: 4m
labels:
  severity: critical
annotations:
  description: '{{ $labels.instance }} of job {{ $labels.job }} has less than 20% disk space remaining.'
  summary: Instance {{ $labels.instance }} - Critical disk space usage
CriticalRAMUsage (0 active)
alert: CriticalRAMUsage
expr: instance:node_ram:usage > 98
for: 5m
labels:
  severity: critical
annotations:
  description: '{{ $labels.instance }} has had Critical Memory Usage for more than 5 minutes.'
  summary: Instance {{ $labels.instance }} has Critical Memory Usage
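
As with the CPU alert, CriticalDiskSpace and CriticalRAMUsage reference recording rules defined elsewhere in the Ansible-managed rules. Plausible definitions, assuming both are percentages (consistent with the 20 and 98 thresholds above, but still an assumption), would be:

groups:
  - name: node_recording_rules          # hypothetical group name
    rules:
      # Assumed: percentage of filesystem space still available per instance.
      - record: instance:node_fs:disk_space
        expr: node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.*"} / node_filesystem_size_bytes{fstype!~"tmpfs|fuse.*"} * 100
      # Assumed: percentage of memory in use per instance.
      - record: instance:node_ram:usage
        expr: 100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
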
InstanceDown (0 active)
alert: InstanceDown
expr: up == 0
for: 5m
labels:
  severity: critical
annotations:
  description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
  summary: Instance {{ $labels.instance }} down
NumberRestartPerInitContainerContainer (0 active)
alert: NumberRestartPerInitContainerContainer
expr: kube_pod_init_container_status_restarts_total > 10
for: 1m
labels:
  severity: critical
annotations:
  description: 'Container {{ $labels.container }} of pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} keeps restarting.'
  summary: Container {{ $labels.container }} - Too many restarts
PendingJobs (0 active)
alert: PendingJobs
expr: gitlab_runner_jobs{state="pending"} > 0
for: 10m
labels:
  severity: critical
annotations:
  description: '{{ $labels.instance }} of job {{ $labels.job }} has had pending jobs for more than 10 minutes.'
  summary: Instance {{ $labels.instance }} - Gitlab runner pending jobs
SshServiceDown (0 active)
alert: SshServiceDown
expr: probe_success == 0
for: 5m
labels:
  severity: critical
annotations:
  description: '{{ $labels.target }} of job {{ $labels.job }} has been down for more than 5 minutes.'
  summary: SSH service {{ $labels.target }} down
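
SshServiceDown keys off probe_success and a target label, which points to a blackbox_exporter style probe job. A sketch of the kind of scrape configuration that produces these series (module name, probed host and exporter address are illustrative assumptions):

scrape_configs:
  - job_name: ssh
    metrics_path: /probe
    params:
      module: [ssh_banner]                    # blackbox_exporter module that checks the SSH banner
    static_configs:
      - targets: ['192.168.93.202:22']        # hypothetical probed host
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: target                  # exposes the probed host as the `target` label used in the alert
      - target_label: __address__
        replacement: 127.0.0.1:9115           # hypothetical blackbox_exporter address
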
/etc/prometheus/rules/ceph_dashboard.rules > ceph dashboard rules
Ceph Health Error (0 active)
alert: Ceph Health Error
expr: ceph_health_status > 1
for: 1m
labels:
  severity: page
annotations:
  description: The Ceph cluster health is in an error state
  summary: Ceph Health Error
Ceph Health Warning (0 active)
alert: Ceph Health Warning
expr: ceph_health_status == 1
for: 1m
labels:
  severity: page
annotations:
  description: The Ceph cluster health is in a warning state
  summary: Ceph Health Warning
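
The two health alerts above work because the Ceph mgr prometheus module encodes overall health as a number: ceph_health_status is 0 for HEALTH_OK, 1 for HEALTH_WARN and 2 for HEALTH_ERR, which is why the expressions test == 1 and > 1 respectively. An ad-hoc query that lists any cluster not in HEALTH_OK:

ceph_health_status > 0
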
Cluster Capacity Low (0 active)
alert: Cluster Capacity Low
expr: sum(ceph_osd_stat_bytes_used) / sum(ceph_osd_stat_bytes) > 0.85
for: 1m
labels:
  severity: page
annotations:
  description: This indicates that raw used space has crossed the 85% capacity threshold of the Ceph cluster.
  summary: Cluster Capacity Low
Disk(s) Near Full (0 active)
alert: Disk(s) Near Full
expr: (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) * 100 > 85
for: 1m
labels:
  severity: page
annotations:
  description: This shows how many disks are at or above 85% full. Performance may degrade beyond this threshold on filestore (XFS) backed OSDs.
  summary: Disk(s) Near Full
MON(s) Down (0 active)
alert: MON(s) Down
expr: ceph_mon_quorum_status != 1
for: 1m
labels:
  severity: page
annotations:
  description: This indicates that one or more MON(s) is down.
  summary: MON(s) down
Network Errors (0 active)
OSD Host(s) Down (0 active)
alert: OSD Host(s) Down
expr: count by(instance) (ceph_disk_occupation * on(ceph_daemon) group_right(instance) ceph_osd_up == 0) - count by(instance) (ceph_disk_occupation) == 0
for: 1m
labels:
  severity: page
annotations:
  description: This indicates that one or more OSD hosts is currently down in the cluster.
  summary: OSD Host(s) Down
OSD(s) Down (0 active)
alert: OSD(s) Down
expr: ceph_osd_up < 0.5
for: 1m
labels:
  severity: page
annotations:
  description: This indicates that one or more OSDs is currently marked down in the cluster.
  summary: OSD(s) Down
OSD(s) with High PG Count (0 active)
alert: OSD(s) with High PG Count
expr: ceph_osd_numpg > 275
for: 1m
labels:
  severity: page
annotations:
  description: This indicates there are some OSDs with high PG count (275+).
  summary: OSD(s) with High PG Count
PG(s) Stuck (0 active)
alert: PG(s) Stuck
expr: max(ceph_osd_numpg) > scalar(ceph_pg_active)
for: 1m
labels:
  severity: page
annotations:
  description: This indicates there are PGs in a stuck state; manual intervention is needed to resolve this.
  summary: PG(s) Stuck
Pool Capacity Low (0 active)
alert: Pool Capacity Low
expr: (ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail) * 100 + on(pool_id) group_left(name) (ceph_pool_metadata * 0)) > 85
for: 1m
labels:
  severity: page
annotations:
  description: This indicates a low capacity in a pool.
  summary: Pool Capacity Low
Slow OSD Responses (0 active)
/etc/prometheus/rules/elasticsearch.rules > elasticsearch
ElasticsearchHeapTooHigh (0 active)
alert: ElasticsearchHeapTooHigh
expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.9
for: 15m
labels:
  severity: critical
annotations:
  description: The heap usage has been over 90% for 15 minutes
  summary: ElasticSearch node {{$labels.node}} heap usage is high
ElasticsearchTooFewNodesRunning (0 active)
alert: ElasticsearchTooFewNodesRunning
expr: elasticsearch_cluster_health_number_of_nodes < 3
for: 5m
labels:
  severity: critical
annotations:
  description: Only {{$value}} ElasticSearch nodes are running (fewer than the expected 3)
  summary: ElasticSearch running on less than 3 nodes
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kube-apiserver-slos
KubeAPIErrorBudgetBurn (1 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate3d) > (1 * 0.01) and sum(apiserver_request:burnrate6h) > (1 * 0.01)
for: 3h
labels:
  severity: warning
annotations:
  message: The API server is burning too much error budget
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
Labels State Active Since Value
alertname="KubeAPIErrorBudgetBurn" severity="warning" firing 2020-08-26 16:06:55.425266593 +0000 UTC 0.026657561111998924
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate1h) > (14.4 * 0.01) and sum(apiserver_request:burnrate5m) > (14.4 * 0.01)
for: 2m
labels:
  severity: critical
annotations:
  message: The API server is burning too much error budget
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate6h) > (6 * 0.01) and sum(apiserver_request:burnrate30m) > (6 * 0.01)
for: 15m
labels:
  severity: critical
annotations:
  message: The API server is burning too much error budget
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate1d) > (3 * 0.01) and sum(apiserver_request:burnrate2h) > (3 * 0.01)
for: 1h
labels:
  severity: warning
annotations:
  message: The API server is burning too much error budget
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
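
The apiserver_request:burnrateXX series in these four alerts are recording rules generated by the kubernetes-mixin: each measures how quickly the API server is consuming its availability error budget over a given window, and every alert pairs a long window with a short one so it only fires when the burn is both sustained and still ongoing. A simplified sketch of one such rule (the real mixin definition also counts slow read requests against the budget, so treat this as an approximation):

groups:
  - name: kube-apiserver-burnrate-sketch      # illustrative only
    rules:
      # Approximate: fraction of API server requests that returned a 5xx over the last hour.
      - record: apiserver_request:burnrate1h
        expr: sum(rate(apiserver_request_total{job="kube-apiserver",code=~"5.."}[1h])) / sum(rate(apiserver_request_total{job="kube-apiserver"}[1h]))
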
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-apps
KubeJobCompletion (8 active)
alert: KubeJobCompletion
expr: kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
for: 1h
labels:
  severity: warning
annotations:
  message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
Labels State Active Since Value
alertname="KubeJobCompletion" instance="192.168.93.99:32080" job="kube-state-metrics" job_name="proc-pb-mvp01-20200325-20055-workflow" namespace="integration-sdp" severity="warning" firing 2020-09-20 04:21:20.091496482 +0000 UTC 1
alertname="KubeJobCompletion" instance="192.168.93.118:32080" job="kube-state-metrics" job_name="proc-pb-mvp01-20200325-20055-workflow" namespace="integration-sdp" severity="warning" firing 2020-09-20 04:21:20.091496482 +0000 UTC 1
alertname="KubeJobCompletion" instance="192.168.93.108:32080" job="kube-state-metrics" job_name="proc-pb-mvp01-20200325-20055-workflow" namespace="integration-sdp" severity="warning" firing 2020-09-20 04:21:20.091496482 +0000 UTC 1
alertname="KubeJobCompletion" instance="192.168.93.123:32080" job="kube-state-metrics" job_name="proc-pb-mvp01-20200325-20055-workflow" namespace="integration-sdp" severity="warning" firing 2020-09-20 04:21:20.091496482 +0000 UTC 1
alertname="KubeJobCompletion" instance="192.168.93.109:32080" job="kube-state-metrics" job_name="proc-pb-mvp01-20200325-20055-workflow" namespace="integration-sdp" severity="warning" firing 2020-09-20 04:21:20.091496482 +0000 UTC 1
alertname="KubeJobCompletion" instance="192.168.93.92:32080" job="kube-state-metrics" job_name="proc-pb-mvp01-20200325-20055-workflow" namespace="integration-sdp" severity="warning" firing 2020-09-20 04:21:20.091496482 +0000 UTC 1
alertname="KubeJobCompletion" instance="192.168.93.122:32080" job="kube-state-metrics" job_name="proc-pb-mvp01-20200325-20055-workflow" namespace="integration-sdp" severity="warning" firing 2020-09-20 04:21:20.091496482 +0000 UTC 1
alertname="KubeJobCompletion" instance="192.168.93.110:32080" job="kube-state-metrics" job_name="proc-pb-mvp01-20200325-20055-workflow" namespace="integration-sdp" severity="warning" firing 2020-09-20 04:21:05.091496482 +0000 UTC 1
KubeContainerWaiting (0 active)
alert: KubeContainerWaiting
expr: sum by(namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
for: 1h
labels:
  severity: warning
annotations:
  message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
KubeCronJobRunning (0 active)
alert: KubeCronJobRunning
expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
for: 1h
labels:
  severity: warning
annotations:
  message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
KubeDaemonSetMisScheduled (0 active)
alert: KubeDaemonSetMisScheduled
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
for: 15m
labels:
  severity: warning
annotations:
  message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
KubeDaemonSetNotScheduled (0 active)
alert: KubeDaemonSetNotScheduled
expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
for: 10m
labels:
  severity: warning
annotations:
  message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
KubeDaemonSetRolloutStuck (0 active)
alert: KubeDaemonSetRolloutStuck
expr: kube_daemonset_status_number_ready{job="kube-state-metrics"} / kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1
for: 15m
labels:
  severity: critical
annotations:
  message: Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
KubeDeploymentGenerationMismatch (0 active)
alert: KubeDeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation{job="kube-state-metrics"} != kube_deployment_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
  severity: critical
annotations:
  message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match its metadata generation; this indicates that the Deployment has failed but has not been rolled back.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
KubeDeploymentReplicasMismatch (0 active)
alert: KubeDeploymentReplicasMismatch
expr: (kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_replicas_available{job="kube-state-metrics"}) and (changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m]) == 0)
for: 15m
labels:
  severity: critical
annotations:
  message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
KubeHpaMaxedOut (0 active)
alert: KubeHpaMaxedOut
expr: kube_hpa_status_current_replicas{job="kube-state-metrics"} == kube_hpa_spec_max_replicas{job="kube-state-metrics"}
for: 15m
labels:
  severity: warning
annotations:
  message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
KubeHpaReplicasMismatch (0 active)
alert: KubeHpaReplicasMismatch
expr: (kube_hpa_status_desired_replicas{job="kube-state-metrics"} != kube_hpa_status_current_replicas{job="kube-state-metrics"}) and changes(kube_hpa_status_current_replicas[15m]) == 0
for: 15m
labels:
  severity: warning
annotations:
  message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
KubeJobFailed (0 active)
alert: KubeJobFailed
expr: kube_job_failed{job="kube-state-metrics"} > 0
for: 15m
labels:
  severity: warning
annotations:
  message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
KubePodCrashLooping (0 active)
alert: KubePodCrashLooping
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
for: 15m
labels:
  severity: critical
annotations:
  message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
KubePodNotReady (0 active)
alert: KubePodNotReady
expr: sum by(namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
for: 15m
labels:
  severity: critical
annotations:
  message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
KubeStatefulSetGenerationMismatch (0 active)
alert: KubeStatefulSetGenerationMismatch
expr: kube_statefulset_status_observed_generation{job="kube-state-metrics"} != kube_statefulset_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
  severity: critical
annotations:
  message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match its metadata generation; this indicates that the StatefulSet has failed but has not been rolled back.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
KubeStatefulSetReplicasMismatch (0 active)
alert: KubeStatefulSetReplicasMismatch
expr: (kube_statefulset_status_replicas_ready{job="kube-state-metrics"} != kube_statefulset_status_replicas{job="kube-state-metrics"}) and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) == 0)
for: 15m
labels:
  severity: critical
annotations:
  message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
KubeStatefulSetUpdateNotRolledOut (0 active)
alert: KubeStatefulSetUpdateNotRolledOut
expr: max without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics"} unless kube_statefulset_status_update_revision{job="kube-state-metrics"}) * (kube_statefulset_replicas{job="kube-state-metrics"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics"})
for: 15m
labels:
  severity: critical
annotations:
  message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-resources
CPUThrottlingHigh (4 active)
alert: CPUThrottlingHigh
expr: sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > (25 / 100)
for: 15m
labels:
  severity: warning
annotations:
  message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
Labels State Active Since Value
alertname="CPUThrottlingHigh" container="deviceserver" namespace="integration" pod="cbf-proto-fsp01-0" severity="warning" firing 2020-09-20 04:13:42.4702334 +0000 UTC 0.3519163763066202
alertname="CPUThrottlingHigh" container="deviceserver" namespace="integration" pod="cbf-proto-fsp02-0" severity="warning" firing 2020-09-20 04:13:42.4702334 +0000 UTC 0.3596638655462185
alertname="CPUThrottlingHigh" container="deviceserver" namespace="integration" pod="cbf-proto-fsp04-0" severity="warning" firing 2020-09-20 04:13:42.4702334 +0000 UTC 0.30755711775043937
alertname="CPUThrottlingHigh" container="deviceserver" namespace="integration" pod="cbf-proto-fsp03-0" severity="warning" firing 2020-09-20 04:13:42.4702334 +0000 UTC 0.35371900826446284
KubeCPUOvercommit (0 active)
alert: KubeCPUOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum) / sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores) - 1) / count(kube_node_status_allocatable_cpu_cores)
for: 5m
labels:
  severity: warning
annotations:
  message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
KubeCPUQuotaOvercommit (0 active)
alert: KubeCPUQuotaOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"}) / sum(kube_node_status_allocatable_cpu_cores) > 1.5
for: 5m
labels:
  severity: warning
annotations:
  message: Cluster has overcommitted CPU resource requests for Namespaces.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
KubeMemoryOvercommit (0 active)
alert: KubeMemoryOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum) / sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes) - 1) / count(kube_node_status_allocatable_memory_bytes)
for: 5m
labels:
  severity: warning
annotations:
  message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
KubeMemoryQuotaOvercommit (0 active)
alert: KubeMemoryQuotaOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"}) / sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) > 1.5
for: 5m
labels:
  severity: warning
annotations:
  message: Cluster has overcommitted memory resource requests for Namespaces.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
KubeQuotaExceeded (0 active)
alert: KubeQuotaExceeded
expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 0.9
for: 15m
labels:
  severity: warning
annotations:
  message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-storage
KubePersistentVolumeErrors (0 active)
alert: KubePersistentVolumeErrors
expr: kube_persistentvolume_status_phase{job="kube-state-metrics",phase=~"Failed|Pending"} > 0
for: 5m
labels:
  severity: critical
annotations:
  message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
KubePersistentVolumeFillingUp (0 active)
alert: KubePersistentVolumeFillingUp
expr: kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} < 0.03
for: 1m
labels:
  severity: critical
annotations:
  message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
KubePersistentVolumeFillingUp (0 active)
alert: KubePersistentVolumeFillingUp
expr: (kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"}) < 0.15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
for: 1h
labels:
  severity: warning
annotations:
  message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-system
KubeClientErrors (0 active)
alert: KubeClientErrors
expr: (sum by(instance, job) (rate(rest_client_requests_total{code=~"5.."}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) > 0.01
for: 15m
labels:
  severity: warning
annotations:
  message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
KubeVersionMismatch (0 active)
alert: KubeVersionMismatch
expr: count(count by(gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"}, "gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
  severity: warning
annotations:
  message: There are {{ $value }} different semantic versions of Kubernetes components running.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-system-apiserver
AggregatedAPIDown (0 active)
alert: AggregatedAPIDown
expr: sum by(name, namespace) (sum_over_time(aggregator_unavailable_apiservice[5m])) > 0
for: 5m
labels:
  severity: warning
annotations:
  message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} is down. It has not been available for at least the past five minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
AggregatedAPIErrors (0 active)
alert: AggregatedAPIErrors
expr: sum by(name, namespace) (increase(aggregator_unavailable_apiservice_count[5m])) > 2
labels:
  severity: warning
annotations:
  message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors for it has increased in the past five minutes. High values indicate that the availability of the service changes too often.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
KubeAPIDown (0 active)
alert: KubeAPIDown
expr: absent(up{job="kube-apiserver"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  message: KubeAPI has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
KubeAPIErrorsHigh (0 active)
alert: KubeAPIErrorsHigh
expr: sum by(resource, subresource, verb) (rate(apiserver_request_total{code=~"5..",job="kube-apiserver"}[5m])) / sum by(resource, subresource, verb) (rate(apiserver_request_total{job="kube-apiserver"}[5m])) > 0.05
for: 10m
labels:
  severity: warning
annotations:
  message: API server is returning errors for {{ $value | humanizePercentage }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
KubeAPIErrorsHigh (0 active)
alert: KubeAPIErrorsHigh
expr: sum by(resource, subresource, verb) (rate(apiserver_request_total{code=~"5..",job="kube-apiserver"}[5m])) / sum by(resource, subresource, verb) (rate(apiserver_request_total{job="kube-apiserver"}[5m])) > 0.1
for: 10m
labels:
  severity: critical
annotations:
  message: API server is returning errors for {{ $value | humanizePercentage }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
KubeAPILatencyHigh (0 active)
alert: KubeAPILatencyHigh
expr: (cluster:apiserver_request_duration_seconds:mean5m{job="kube-apiserver"} > on(verb) group_left() (avg by(verb) (cluster:apiserver_request_duration_seconds:mean5m{job="kube-apiserver"} >= 0) + 2 * stddev by(verb) (cluster:apiserver_request_duration_seconds:mean5m{job="kube-apiserver"} >= 0))) > on(verb) group_left() 1.2 * avg by(verb) (cluster:apiserver_request_duration_seconds:mean5m{job="kube-apiserver"} >= 0) and on(verb, resource) cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="kube-apiserver",quantile="0.99"} > 1
for: 5m
labels:
  severity: warning
annotations:
  message: The API server has an abnormal latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
KubeAPILatencyHigh (0 active)
alert: KubeAPILatencyHigh
expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="kube-apiserver",quantile="0.99"} > 4
for: 10m
labels:
  severity: critical
annotations:
  message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
KubeClientCertificateExpiration (0 active)
alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
labels:
  severity: warning
annotations:
  message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
KubeClientCertificateExpiration (0 active)
alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
labels:
  severity: critical
annotations:
  message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-system-controller-manager
KubeControllerManagerDown (0 active)
alert: KubeControllerManagerDown
expr: absent(up{job="kube-controller-manager"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  message: KubeControllerManager has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-system-kubelet
KubeNodeNotReady (0 active)
alert: KubeNodeNotReady
expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} == 0
for: 15m
labels:
  severity: warning
annotations:
  message: '{{ $labels.node }} has been unready for more than 15 minutes.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
KubeNodeReadinessFlapping (0 active)
alert: KubeNodeReadinessFlapping
expr: sum by(node) (changes(kube_node_status_condition{condition="Ready",status="true"}[15m])) > 2
for: 15m
labels:
  severity: warning
annotations:
  message: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
KubeNodeUnreachable (0 active)
alert: KubeNodeUnreachable
expr: kube_node_spec_taint{effect="NoSchedule",job="kube-state-metrics",key="node.kubernetes.io/unreachable"} == 1
for: 2m
labels:
  severity: warning
annotations:
  message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
KubeletDown (0 active)
alert: KubeletDown
expr: absent(up{job="kubelet"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  message: Kubelet has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
KubeletPlegDurationHigh (0 active)
alert: KubeletPlegDurationHigh
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
for: 5m
labels:
  severity: warning
annotations:
  message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
KubeletPodStartUpLatencyHigh (0 active)
alert: KubeletPodStartUpLatencyHigh
expr: histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m]))) > 60
for: 15m
labels:
  severity: warning
annotations:
  message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
KubeletTooManyPods (0 active)
alert: KubeletTooManyPods
expr: max by(node) (max by(instance) (kubelet_running_pod_count{job="kubelet"}) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) / max by(node) (kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) > 0.95
for: 15m
labels:
  severity: warning
annotations:
  message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-system-scheduler
KubeSchedulerDown (0 active)
alert: KubeSchedulerDown
expr: absent(up{job="kube-scheduler"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  message: KubeScheduler has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
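
For reference, all of the rule files listed on this page are loaded by Prometheus through the rule_files section of prometheus.yml. A sketch of the corresponding fragment, with the paths taken from the group headers above and the rest of prometheus.yml omitted:

rule_files:
  - /etc/prometheus/rules/ansible_managed.rules
  - /etc/prometheus/rules/ceph_dashboard.rules
  - /etc/prometheus/rules/elasticsearch.rules
  - /etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules
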