# Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
# Do not change in-place! In order to change this file, first read the following link:
# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
{{- if and .Values.defaultRules.create .Values.defaultRules.rules.prometheus }}
{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }}
{{- $namespace := .Release.Namespace }}
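# The two template variables above build the job/namespace selector used in every rule below,
# so the alerts only match the Prometheus instance deployed by this chart in this release's namespace.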
apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }}
  labels:
    app: {{ template "prometheus-operator.name" . }}
{{ include "prometheus-operator.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: prometheus.rules
    rules:
    - alert: PrometheusConfigReloadFailed
      annotations:
        description: Reloading Prometheus' configuration has failed for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
        summary: Reloading Prometheus' configuration failed
      expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} == 0
      for: 10m
      labels:
        severity: warning
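    # predict_linear() extrapolates the 5m trend of the notification queue length 30 minutes
    # (60 * 30 seconds) ahead; the alert fires when that projection exceeds the queue capacity.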
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
        description: Prometheus' alert notification queue is running full for {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}}
        summary: Prometheus' alert notification queue is running full
      expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}
      for: 10m
      labels:
        severity: warning
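    # The next two rules divide the rate of failed notifications by the rate of attempted
    # notifications over the last five minutes; the first fires above a 1% error ratio, the second above 3%.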
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}}
        summary: Errors while sending alerts from Prometheus
      expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.01
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}}
        summary: Errors while sending alerts from Prometheus
      expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.03
      for: 10m
      labels:
        severity: critical
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
        description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} is not connected to any Alertmanagers
        summary: Prometheus is not connected to any Alertmanagers
      expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} < 1
      for: 10m
      labels:
        severity: warning
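    # increase(...[2h]) in the two TSDB rules below counts new reload/compaction failures over the
    # trailing two-hour window, so the alerts resolve once no new failures have been recorded for two hours.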
    - alert: PrometheusTSDBReloadsFailing
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} reload failures over the last four hours.'
        summary: Prometheus has issues reloading data blocks from disk
      expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} compaction failures over the last four hours.'
        summary: Prometheus has issues compacting sample blocks
      expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
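    # Note: unlike the prometheus_tsdb_* metrics above, the WAL corruption counter was exposed
    # without the prometheus_ prefix on the Prometheus releases these generated rules target.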
    - alert: PrometheusTSDBWALCorruptions
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} has a corrupted write-ahead log (WAL).'
        summary: Prometheus write-ahead log is corrupted
      expr: tsdb_wal_corruptions_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} > 0
      for: 4h
      labels:
        severity: warning
    - alert: PrometheusNotIngestingSamples
      annotations:
        description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} isn't ingesting samples.
        summary: Prometheus isn't ingesting samples
      expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
      for: 10m
      labels:
        severity: warning
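    # Samples carrying an already-seen timestamp with a different value are rejected by the TSDB;
    # a growing rejection counter usually points at duplicated scrape configs or clashing relabeling.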
    - alert: PrometheusTargetScrapesDuplicate
      annotations:
        description: '{{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} has many samples rejected due to duplicate timestamps but different values'
        summary: Prometheus has many samples rejected
      expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
{{- end }}