Alerts


/etc/prometheus/rules/ansible_managed.rules > ansible managed alert rules
InstanceDown (2 active)
alert: InstanceDown
expr: up == 0
for: 5m
labels:
  severity: critical
annotations:
  message: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
  summary: Instance {{ $labels.instance }} down
Active alerts:
  - Labels: alertname="InstanceDown" instance="192.168.93.102:10251" job="kube-scheduler" severity="critical"
    State: firing    Active since: 2020-11-27 13:09:44.318691705 +0000 UTC    Value: 0
  - Labels: alertname="InstanceDown" instance="192.168.93.102:10252" job="kube-controller-manager" severity="critical"
    State: firing    Active since: 2020-11-27 13:09:44.318691705 +0000 UTC    Value: 0
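
The listing above flattens each rule for display; on disk they are ordinary Prometheus rule-group files. As a minimal sketch, the InstanceDown rule would sit in /etc/prometheus/rules/ansible_managed.rules roughly as follows (the group name is taken from the page header above; the layout itself is illustrative):

  groups:
    - name: ansible managed alert rules   # group name as shown in the header above
      rules:
        - alert: InstanceDown
          expr: up == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            message: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
            summary: Instance {{ $labels.instance }} down
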
NumberRestartPerInitContainerContainer (1 active)
alert: NumberRestartPerInitContainerContainer
expr: kube_pod_init_container_status_restarts_total > 10
for: 1m
labels:
  severity: critical
annotations:
  message: ' Container {{ $labels.container }} of pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} keeps restarting.'
  summary: Container {{ $labels.container }} - Too many restarts
Active alerts:
  - Labels: alertname="NumberRestartPerInitContainerContainer" container="check-dependencies-0" instance="k8s-syscore-loadbalancer-0:32080" job="kube-state-metrics" namespace="team-mvp" pod="tangotest-tango-base-test-test-0" severity="critical"
    State: firing    Active since: 2020-12-05 16:56:59.318691705 +0000 UTC    Value: 1475
Watchdog (1 active)
alert: Watchdog
expr: vector(1)
for: 10m
labels:
  severity: warning
annotations:
  message: This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing; therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example, the "DeadMansSnitch" integration in PagerDuty.
  summary: Ensure entire alerting pipeline is functional
Active alerts:
  - Labels: alertname="Watchdog" severity="warning"
    State: firing    Active since: 2020-10-15 16:37:44 +0000 UTC    Value: 1
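
The Watchdog only does its job if Alertmanager forwards it to a receiver that an external monitor (such as the Dead Man's Snitch integration mentioned above) expects to hear from continuously. A minimal routing sketch, assuming a webhook receiver; the receiver name, URL and repeat interval are placeholders rather than values from this deployment:

  route:
    routes:
      - match:
          alertname: Watchdog
        receiver: watchdog-heartbeat        # placeholder receiver name
        repeat_interval: 5m                 # keep re-notifying so that silence signals a broken pipeline
  receivers:
    - name: watchdog-heartbeat
      webhook_configs:
        - url: https://example.org/heartbeat   # placeholder endpoint polled by the external monitor
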
ClockSkewDetected (0 active)
alert: ClockSkewDetected
expr: abs(node_timex_offset_seconds) * 1000 > 30
for: 2m
labels:
  severity: warning
annotations:
  message: Clock skew detected on {{ $labels.instance }}. Ensure NTP is configured correctly on this host.
  summary: Instance {{ $labels.instance }} - Clock skew detected
CriticalCPULoad (0 active)
alert: CriticalCPULoad
expr: instance:node_cpu:load > 98
for: 30m
labels:
  severity: critical
annotations:
  message: '{{ $labels.instance }} of job {{ $labels.job }} has Critical CPU load for more than 30 minutes.'
  summary: Instance {{ $labels.instance }} - Critical CPU load
CriticalDiskSpace (0 active)
alert: CriticalDiskSpace
expr: instance:node_fs:disk_space < 20
for: 4m
labels:
  severity: critical
annotations:
  message: '{{ $labels.instance }} of job {{ $labels.job }} has less than 20% disk space remaining.'
  summary: Instance {{ $labels.instance }} - Critical disk space usage
CriticalRAMUsage (0 active)
alert: CriticalRAMUsage
expr: instance:node_ram:usage > 98
for: 5m
labels:
  severity: critical
annotations:
  message: '{{ $labels.instance }} has Critical Memory Usage for more than 5 minutes.'
  summary: Instance {{ $labels.instance }} has Critical Memory Usage
NumberOfPVStandardClass (0 active)
alert: NumberOfPVStandardClass
expr: sum(kube_persistentvolume_info{storageclass="standard"}) > 0
for: 1m
labels:
  severity: critical
annotations:
  message: The cluster contains more than zero PVs of the standard StorageClass.
  summary: Presence of PVs of the standard StorageClass
PendingJobs (0 active)
alert: PendingJobs
expr: gitlab_runner_jobs{state="pending"} > 0
for: 10m
labels:
  severity: critical
annotations:
  message: '{{ $labels.instance }} of job {{ $labels.job }} has pending jobs for more than 10 minutes.'
  summary: Instance {{ $labels.instance }} - Gitlab runner pending jobs
SshServiceDown (0 active)
alert: SshServiceDown
expr: probe_success == 0
for: 5m
labels:
  severity: critical
annotations:
  message: '{{ $labels.target }} of job {{ $labels.job }} has been down for more than 5 minutes.'
  summary: SSH service {{ $labels.target }} down
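
probe_success is produced by a prober rather than by the SSH hosts themselves, presumably the blackbox exporter probing the SSH port in this deployment. A minimal sketch of a TCP module that checks for the SSH banner (module name and timeout are illustrative, not taken from this configuration):

  modules:
    ssh_banner:
      prober: tcp
      timeout: 5s
      tcp:
        query_response:
          - expect: "^SSH-2.0-"     # an SSH server announces itself with this banner
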
/etc/prometheus/rules/ceph_dashboard.rules > ceph dashboard rules
Ceph Health Error (0 active)
alert: Ceph Health Error
expr: ceph_health_status > 1
for: 1m
labels:
  severity: page
annotations:
  description: The Ceph cluster health is in an error state
  summary: Ceph Health Error
Ceph Health Warning (0 active)
alert: Ceph Health Warning
expr: ceph_health_status == 1
for: 1m
labels:
  severity: page
annotations:
  description: Overall Ceph Health
  summary: Ceph Health Warning
Cluster Capacity Low (0 active)
alert: Cluster Capacity Low
expr: sum(ceph_osd_stat_bytes_used) / sum(ceph_osd_stat_bytes) > 0.85
for: 1m
labels:
  severity: page
annotations:
  description: This indicates raw used space crosses the 85% capacity threshold of the ceph cluster.
  summary: Cluster Capacity Low
Disk(s) Near Full (0 active)
alert: Disk(s) Near Full
expr: (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) * 100 > 85
for: 1m
labels:
  severity: page
annotations:
  description: This shows how many disks are at or above 85% full. Performance may degrade beyond this threshold on filestore (XFS) backed OSDs.
  summary: Disk(s) Near Full
MON(s) Down (0 active)
alert: MON(s) Down
expr: ceph_mon_quorum_status != 1
for: 1m
labels:
  severity: page
annotations:
  description: This indicates that one or more MON(s) is down.
  summary: MON(s) down
Network Errors (0 active)
OSD Host(s) Down (0 active)
alert: OSD Host(s) Down
expr: count by(instance) (ceph_disk_occupation * on(ceph_daemon) group_right(instance) ceph_osd_up == 0) - count by(instance) (ceph_disk_occupation) == 0
for: 1m
labels:
  severity: page
annotations:
  description: This indicates that one or more OSD hosts is currently down in the cluster.
  summary: OSD Host(s) Down
OSD(s) Down (0 active)
alert: OSD(s) Down
expr: ceph_osd_up < 0.5
for: 1m
labels:
  severity: page
annotations:
  description: This indicates that one or more OSDs is currently marked down in the cluster.
  summary: OSD(s) Down
OSD(s) with High PG Count (0 active)
alert: OSD(s) with High PG Count
expr: ceph_osd_numpg > 275
for: 1m
labels:
  severity: page
annotations:
  description: This indicates there are some OSDs with high PG count (275+).
  summary: OSD(s) with High PG Count
PG(s) Stuck (0 active)
alert: PG(s) Stuck
expr: max(ceph_osd_numpg) > scalar(ceph_pg_active)
for: 1m
labels:
  severity: page
annotations:
  description: This indicates there are PGs in a stuck state; manual intervention is needed to resolve them.
  summary: PG(s) Stuck
Pool Capacity Low (0 active)
alert: Pool Capacity Low
expr: (ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail) * 100 + on(pool_id) group_left(name) (ceph_pool_metadata * 0)) > 85
for: 1m
labels:
  severity: page
annotations:
  description: This indicates a low capacity in a pool.
  summary: Pool Capacity Low
Slow OSD Responses (0 active)
/etc/prometheus/rules/elasticsearch.rules > elasticsearch
ElasticsearchHeapTooHigh (0 active)
alert: ElasticsearchHeapTooHigh
expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.9
for: 15m
labels:
  severity: critical
annotations:
  description: The heap usage is over 90% for 15m
  summary: ElasticSearch node {{$labels.node}} heap usage is high
ElasticsearchTooFewNodesRunning (0 active)
alert: ElasticsearchTooFewNodesRunning
expr: elasticsearch_cluster_health_number_of_nodes < 3
for: 5m
labels:
  severity: critical
annotations:
  description: There are only {{$value}} < 3 ElasticSearch nodes running
  summary: ElasticSearch running on less than 3 nodes
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kube-apiserver-slos
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate1h) > (14.4 * 0.01) and sum(apiserver_request:burnrate5m) > (14.4 * 0.01)
for: 2m
labels:
  long: 1h
  severity: critical
  short: 5m
annotations:
  instance: '{{ $labels.instance }}'
  message: The API server is burning too much error budget.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
  summary: The API server is burning too much error budget.
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate6h) > (6 * 0.01) and sum(apiserver_request:burnrate30m) > (6 * 0.01)
for: 15m
labels:
  long: 6h
  severity: critical
  short: 30m
annotations:
  instance: '{{ $labels.instance }}'
  message: The API server is burning too much error budget.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
  summary: The API server is burning too much error budget.
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate1d) > (3 * 0.01) and sum(apiserver_request:burnrate2h) > (3 * 0.01)
for: 1h
labels:
  long: 1d
  severity: warning
  short: 2h
annotations:
  instance: '{{ $labels.instance }}'
  message: The API server is burning too much error budget.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
  summary: The API server is burning too much error budget.
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate3d) > (1 * 0.01) and sum(apiserver_request:burnrate6h) > (1 * 0.01)
for: 3h
labels:
  long: 3d
  severity: warning
  short: 6h
annotations:
  instance: '{{ $labels.instance }}'
  message: The API server is burning too much error budget.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
  summary: The API server is burning too much error budget.
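
The apiserver_request:burnrate* series used in these expressions are recording rules shipped with the kubernetes-mixin; they combine error rates and slow requests over several windows. A simplified sketch of the idea for the 1h window, counting errors only (the real mixin rules are more elaborate, so treat this as an illustration rather than the deployed definition):

  groups:
    - name: kube-apiserver-burnrate.rules     # illustrative group name
      rules:
        - record: apiserver_request:burnrate1h
          expr: |
            sum(rate(apiserver_request_total{job="kube-apiserver",code=~"5.."}[1h]))
            /
            sum(rate(apiserver_request_total{job="kube-apiserver"}[1h]))
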
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-apps
KubePodNotReady (1 active)
alert: KubePodNotReady
expr: sum by(namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}))) > 0
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
  summary: Pod has been in a non-ready state for more than 15 minutes.
Active alerts:
  - Labels: alertname="KubePodNotReady" namespace="team-mvp" pod="tangotest-tango-base-test-test-0" severity="warning"
    State: firing    Active since: 2020-12-05 16:56:50.091496482 +0000 UTC    Value: 1
KubeStatefulSetReplicasMismatch (2 active)
alert: KubeStatefulSetReplicasMismatch
expr: (kube_statefulset_status_replicas_ready{job="kube-state-metrics"} != kube_statefulset_status_replicas{job="kube-state-metrics"}) and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) == 0)
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
  summary: Deployment has not matched the expected number of replicas.
Active alerts:
  - Labels: alertname="KubeStatefulSetReplicasMismatch" instance="k8s-syscore-loadbalancer-0:32080" job="kube-state-metrics" namespace="team-mvp" severity="warning" statefulset="databaseds-tango-base-test"
    State: pending    Active since: 2020-12-05 18:26:35.091496482 +0000 UTC    Value: 0
  - Labels: alertname="KubeStatefulSetReplicasMismatch" instance="k8s-syscore-loadbalancer-0:32080" job="kube-state-metrics" namespace="team-mvp" severity="warning" statefulset="tangotest-tango-base-test-test"
    State: firing    Active since: 2020-12-05 16:56:50.091496482 +0000 UTC    Value: 0
KubeContainerWaiting (1 active)
alert: KubeContainerWaiting
expr: sum by(namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
for: 1h
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
  summary: Pod container waiting longer than 1 hour
Active alerts:
  - Labels: alertname="KubeContainerWaiting" container="databaseds" namespace="team-mvp" pod="databaseds-tango-base-test-0" severity="warning"
    State: pending    Active since: 2020-12-05 18:26:50.091496482 +0000 UTC    Value: 1
KubeJobCompletion (1 active)
alert: KubeJobCompletion
expr: kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
for: 12h
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
  summary: Job did not complete in time
Active alerts:
  - Labels: alertname="KubeJobCompletion" instance="k8s-syscore-loadbalancer-0:32080" job="kube-state-metrics" job_name="proc-pb-mvp01-20200325-12013-workflow" namespace="integration-mid-sdp" severity="warning"
    State: pending    Active since: 2020-12-05 16:56:50.091496482 +0000 UTC    Value: 1
KubePodCrashLooping (1 active)
alert: KubePodCrashLooping
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
  summary: Pod is crash looping.
Active alerts:
  - Labels: alertname="KubePodCrashLooping" container="databaseds" instance="k8s-syscore-loadbalancer-0:32080" job="kube-state-metrics" namespace="team-mvp" pod="databaseds-tango-base-test-0" severity="warning"
    State: pending    Active since: 2020-12-05 18:26:05.091496482 +0000 UTC    Value: 1.0526315789473684
KubeDaemonSetMisScheduled (0 active)
alert: KubeDaemonSetMisScheduled
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
  summary: DaemonSet pods are misscheduled.
KubeDaemonSetNotScheduled (0 active)
alert: KubeDaemonSetNotScheduled
expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
for: 10m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
  summary: DaemonSet pods are not scheduled.
KubeDaemonSetRolloutStuck (0 active)
alert: KubeDaemonSetRolloutStuck
expr: ((kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) or (kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} != 0) or (kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) or (kube_daemonset_status_number_available{job="kube-state-metrics"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"})) and (changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) == 0)
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
  summary: DaemonSet rollout is stuck.
KubeDeploymentGenerationMismatch (0 active)
alert: KubeDeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation{job="kube-state-metrics"} != kube_deployment_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match; this indicates that the Deployment has failed but has not been rolled back.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
  summary: Deployment generation mismatch due to possible roll-back
KubeDeploymentReplicasMismatch (0 active)
alert: KubeDeploymentReplicasMismatch
expr: (kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_replicas_available{job="kube-state-metrics"}) and (changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m]) == 0)
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
  summary: Deployment has not matched the expected number of replicas.
KubeHpaMaxedOut (0 active)
alert: KubeHpaMaxedOut
expr: kube_hpa_status_current_replicas{job="kube-state-metrics"} == kube_hpa_spec_max_replicas{job="kube-state-metrics"}
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
  summary: HPA is running at max replicas
KubeHpaReplicasMismatch (0 active)
alert: KubeHpaReplicasMismatch
expr: (kube_hpa_status_desired_replicas{job="kube-state-metrics"} != kube_hpa_status_current_replicas{job="kube-state-metrics"}) and (kube_hpa_status_current_replicas{job="kube-state-metrics"} > kube_hpa_spec_min_replicas{job="kube-state-metrics"}) and (kube_hpa_status_current_replicas{job="kube-state-metrics"} < kube_hpa_spec_max_replicas{job="kube-state-metrics"}) and changes(kube_hpa_status_current_replicas[15m]) == 0
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
  summary: HPA has not matched desired number of replicas.
KubeJobFailed (0 active)
alert: KubeJobFailed
expr: kube_job_failed{job="kube-state-metrics"} > 0
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
  summary: Job failed to complete.
KubeStatefulSetGenerationMismatch (0 active)
alert: KubeStatefulSetGenerationMismatch
expr: kube_statefulset_status_observed_generation{job="kube-state-metrics"} != kube_statefulset_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
  summary: StatefulSet generation mismatch due to possible roll-back
KubeStatefulSetUpdateNotRolledOut (0 active)
alert: KubeStatefulSetUpdateNotRolledOut
expr: (max without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics"} unless kube_statefulset_status_update_revision{job="kube-state-metrics"}) * (kube_statefulset_replicas{job="kube-state-metrics"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics"})) and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) == 0)
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
  summary: StatefulSet update has not been rolled out.
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-resources
CPUThrottlingHigh (1 active)
alert: CPUThrottlingHigh
expr: sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > (25 / 100)
for: 15m
labels:
  severity: info
annotations:
  instance: '{{ $labels.instance }}'
  message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
  summary: Processes experience elevated CPU throttling.
Active alerts:
  - Labels: alertname="CPUThrottlingHigh" container="archiverdb" namespace="public-integration" pod="archiverdb-archiver-0" severity="info"
    State: pending    Active since: 2020-12-05 18:20:12.4702334 +0000 UTC    Value: 0.2965159377316531
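
CPU throttling is governed by the container's CPU limit rather than by overall node load, so the usual mitigation is to raise (or remove) the limit on the affected container, archiverdb in the instance above. An illustrative resources stanza for that container's spec; the values are placeholders to be tuned against observed usage, not settings from this deployment:

  resources:
    requests:
      cpu: 500m       # placeholder request
    limits:
      cpu: "2"        # raising or removing the limit reduces CFS throttling
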
KubeCPUOvercommit (0 active)
alert: KubeCPUOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum) / sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores) - 1) / count(kube_node_status_allocatable_cpu_cores)
for: 5m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
  summary: Cluster has overcommitted CPU resource requests.
KubeCPUQuotaOvercommit (0 active)
alert: KubeCPUQuotaOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"}) / sum(kube_node_status_allocatable_cpu_cores) > 1.5
for: 5m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Cluster has overcommitted CPU resource requests for Namespaces.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
  summary: Cluster has overcommitted CPU resource requests.
KubeMemoryOvercommit (0 active)
alert: KubeMemoryOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum) / sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes) - 1) / count(kube_node_status_allocatable_memory_bytes)
for: 5m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
  summary: Cluster has overcommitted memory resource requests.
KubeMemoryQuotaOvercommit (0 active)
alert: KubeMemoryQuotaOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"}) / sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) > 1.5
for: 5m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Cluster has overcommitted memory resource requests for Namespaces.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
  summary: Cluster has overcommitted memory resource requests.
KubeQuotaAlmostFull (0 active)
alert: KubeQuotaAlmostFull
expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 0.9 < 1
for: 15m
labels:
  severity: info
annotations:
  instance: '{{ $labels.instance }}'
  message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull
  summary: Namespace quota is going to be full.
KubeQuotaExceeded (0 active)
alert: KubeQuotaExceeded
expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 1
for: 15m
labels:
  severity: warning
annotations:
  message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
  summary: Namespace quota has exceeded the limits.
KubeQuotaFullyUsed (0 active)
alert: KubeQuotaFullyUsed
expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) == 1
for: 15m
labels:
  severity: info
annotations:
  message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
  summary: Namespace quota is fully used.
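
The kube_resourcequota series with type="hard" and type="used" mirror the ResourceQuota objects defined in each namespace. An illustrative ResourceQuota manifest showing where those hard values come from; the name and quantities are placeholders, and only the namespace is taken from the tables above:

  apiVersion: v1
  kind: ResourceQuota
  metadata:
    name: compute-quota          # placeholder name
    namespace: team-mvp          # example namespace from the tables above
  spec:
    hard:
      requests.cpu: "8"
      requests.memory: 16Gi
      limits.cpu: "16"
      limits.memory: 32Gi
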
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-storage
KubePersistentVolumeErrors (0 active)
alert: KubePersistentVolumeErrors
expr: kube_persistentvolume_status_phase{job="kube-state-metrics",phase=~"Failed|Pending"} > 0
for: 5m
labels:
  severity: critical
annotations:
  instance: '{{ $labels.instance }}'
  message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
  summary: PersistentVolume is having issues with provisioning.
KubePersistentVolumeFillingUp (0 active)
alert: KubePersistentVolumeFillingUp
expr: kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} < 0.03
for: 1m
labels:
  severity: critical
annotations:
  instance: '{{ $labels.instance }}'
  message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
  summary: PersistentVolume is filling up.
KubePersistentVolumeFillingUp (0 active)
alert: KubePersistentVolumeFillingUp
expr: (kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"}) < 0.15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
for: 1h
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
  summary: PersistentVolume is filling up.
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-system
KubeClientErrors (0 active)
alert: KubeClientErrors
expr: (sum by(instance, job) (rate(rest_client_requests_total{code=~"5.."}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) > 0.01
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
  summary: Kubernetes API server client is experiencing errors.
KubeVersionMismatch (0 active)
alert: KubeVersionMismatch
expr: count(count by(gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"}, "gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: There are {{ $value }} different semantic versions of Kubernetes components running.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
  summary: Different semantic versions of Kubernetes components running.
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-system-apiserver
AggregatedAPIDown (0 active)
alert: AggregatedAPIDown
expr: (1 - max by(name, namespace) (avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
  summary: An aggregated API is down.
AggregatedAPIErrors (0 active)
alert: AggregatedAPIErrors
expr: sum by(name, namespace) (increase(aggregator_unavailable_apiservice_count[5m])) > 2
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors has increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
  summary: An aggregated API has reported errors.
KubeAPIDown (0 active)
alert: KubeAPIDown
expr: absent(up{job="kube-apiserver"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  instance: '{{ $labels.instance }}'
  message: KubeAPI has disappeared from Prometheus target discovery.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
  summary: Target disappeared from Prometheus target discovery.
KubeClientCertificateExpiration (0 active)
alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
  summary: Client certificate is about to expire.
KubeClientCertificateExpiration (0 active)
alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
labels:
  severity: critical
annotations:
  instance: '{{ $labels.instance }}'
  message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
  summary: Client certificate is about to expire.
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-system-controller-manager
KubeControllerManagerDown (1 active)
alert: KubeControllerManagerDown
expr: absent(up{job="kube-controller-manager"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  instance: '{{ $labels.instance }}'
  message: KubeControllerManager has disappeared from Prometheus target discovery.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
  summary: Target disappeared from Prometheus target discovery.
Active alerts:
  - Labels: alertname="KubeControllerManagerDown" severity="critical"
    State: firing    Active since: 2020-11-27 13:09:39.090680169 +0000 UTC    Value: 1
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-system-kubelet
KubeNodeNotReady (0 active)
alert: KubeNodeNotReady
expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} == 0
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: '{{ $labels.node }} has been unready for more than 15 minutes.'
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
  summary: Node is not ready.
KubeNodeReadinessFlapping (0 active)
alert: KubeNodeReadinessFlapping
expr: sum by(node) (changes(kube_node_status_condition{condition="Ready",status="true"}[15m])) > 2
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
  summary: Node readiness status is flapping.
KubeNodeUnreachable (0 active)
alert: KubeNodeUnreachable
expr: (kube_node_spec_taint{effect="NoSchedule",job="kube-state-metrics",key="node.kubernetes.io/unreachable"} unless ignoring(key, value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
  summary: Node is unreachable.
KubeletClientCertificateExpiration (0 active)
alert: KubeletClientCertificateExpiration
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
labels:
  severity: critical
annotations:
  message: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
  summary: Kubelet client certificate is about to expire.
KubeletClientCertificateExpiration (0 active)
alert: KubeletClientCertificateExpiration
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
labels:
  severity: warning
annotations:
  message: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
  summary: Kubelet client certificate is about to expire.
KubeletClientCertificateRenewalErrors (0 active)
alert: KubeletClientCertificateRenewalErrors
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
for: 15m
labels:
  severity: warning
annotations:
  message: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors
  summary: Kubelet has failed to renew its client certificate.
KubeletDown (0 active)
alert: KubeletDown
expr: absent(up{job="kubelet"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  instance: '{{ $labels.instance }}'
  message: Kubelet has disappeared from Prometheus target discovery.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
  summary: Target disappeared from Prometheus target discovery.
KubeletPlegDurationHigh (0 active)
alert: KubeletPlegDurationHigh
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
for: 5m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
  summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
KubeletPodStartUpLatencyHigh (0 active)
alert: KubeletPodStartUpLatencyHigh
expr: histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m]))) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
  summary: Kubelet Pod startup latency is too high.
KubeletServerCertificateExpiration (0 active)
alert: KubeletServerCertificateExpiration
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
labels:
  severity: warning
annotations:
  message: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
  summary: Kubelet server certificate is about to expire.
KubeletServerCertificateExpiration (0 active)
alert: KubeletServerCertificateExpiration
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
labels:
  severity: critical
annotations:
  message: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
  summary: Kubelet server certificate is about to expire.
KubeletServerCertificateRenewalErrors (0 active)
alert: KubeletServerCertificateRenewalErrors
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
for: 15m
labels:
  severity: warning
annotations:
  message: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors
  summary: Kubelet has failed to renew its server certificate.
KubeletTooManyPods (0 active)
alert: KubeletTooManyPods
expr: count by(node) ((kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance, pod, namespace, cluster) group_left(node) topk by(instance, pod, namespace, cluster) (1, kube_pod_info{job="kube-state-metrics"})) / max by(node) (kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) > 0.95
for: 15m
labels:
  severity: warning
annotations:
  instance: '{{ $labels.instance }}'
  message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
  summary: Kubelet is running at capacity.
/etc/prometheus/rules/kubernetes_prometheus_alerts.yaml.rules > kubernetes-system-scheduler
KubeSchedulerDown (1 active)
alert: KubeSchedulerDown
expr: absent(up{job="kube-scheduler"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  instance: '{{ $labels.instance }}'
  message: KubeScheduler has disappeared from Prometheus target discovery.
  namespace: '{{ $labels.namespace }}'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
  summary: Target disappeared from Prometheus target discovery.
Active alerts:
  - Labels: alertname="KubeSchedulerDown" severity="critical"
    State: firing    Active since: 2020-11-27 13:09:44.679100949 +0000 UTC    Value: 1
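
KubeSchedulerDown, KubeControllerManagerDown and the InstanceDown alerts at the top of this page appear to point at the same problem: the scheduler and controller-manager metrics endpoints (192.168.93.102:10251 and :10252) are not scrapeable. Whatever discovery mechanism this deployment actually uses, the scrape configuration reduces to something like the following static sketch (job names and addresses are taken from the firing alerts; everything else is an assumption):

  scrape_configs:
    - job_name: kube-scheduler
      static_configs:
        - targets: ["192.168.93.102:10251"]
    - job_name: kube-controller-manager
      static_configs:
        - targets: ["192.168.93.102:10252"]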