# Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
{{- if .Values.defaultRules.create }}
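# Note: $prometheusJob is assumed to match the job label under which this chart's own
# Prometheus instance is scraped (service "<fullname>-prometheus"); every expression
# below selects Prometheus self-metrics by that job.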
{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }}
apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }}
  labels:
    app: {{ template "prometheus-operator.name" . }}
{{ include "prometheus-operator.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: prometheus.rules
    rules:
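    # PrometheusConfigReloadFailed: prometheus_config_last_reload_successful is 1 after a
    # successful reload and 0 after a failed one, so == 0 for 10m means the last attempted
    # configuration reload failed and Prometheus is still running the previous config.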
    - alert: PrometheusConfigReloadFailed
      annotations:
        description: Reloading Prometheus' configuration has failed for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
        summary: Reloading Prometheus' configuration failed
      expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}"} == 0
      for: 10m
      labels:
        severity: warning
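    # PrometheusNotificationQueueRunningFull: predict_linear extrapolates the notification
    # queue length 30 minutes (60 * 30 seconds) ahead from the last 5 minutes of samples;
    # the alert fires when the forecast exceeds the queue capacity, i.e. the queue is on
    # track to overflow and notifications would start being dropped.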
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
        description: Prometheus' alert notification queue is running full for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
        summary: Prometheus' alert notification queue is running full
      expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}"}
      for: 10m
      labels:
        severity: warning
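    # PrometheusErrorSendingAlerts (warning): ratio of failed to attempted alert notifications
    # over 5 minutes; more than 1% of notifications to an Alertmanager are failing.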
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} to Alertmanager {{`{{$labels.alertmanager}}`}}
        summary: Errors while sending alerts from Prometheus
      expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}"}[5m]) > 0.01
      for: 10m
      labels:
        severity: warning
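    # Same error ratio as above with a 3% threshold; the two rules intentionally share the
    # alert name and differ only in threshold and severity.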
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} to Alertmanager {{`{{$labels.alertmanager}}`}}
        summary: Errors while sending alerts from Prometheus
      expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}"}[5m]) > 0.03
      for: 10m
      labels:
        severity: critical
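    # PrometheusNotConnectedToAlertmanagers: fewer than one Alertmanager discovered via
    # service discovery, so firing alerts have nowhere to be delivered.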
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
        description: Prometheus {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} is not connected to any Alertmanagers
        summary: Prometheus is not connected to any Alertmanagers
      expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}"} < 1
      for: 10m
      labels:
        severity: warning
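    # PrometheusTSDBReloadsFailing: any increase in TSDB reload failures within a 2h window,
    # sustained for 12h before firing.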
    - alert: PrometheusTSDBReloadsFailing
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} reload failures over the last two hours.'
        summary: Prometheus has issues reloading data blocks from disk
      expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
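    # PrometheusTSDBCompactionsFailing: any increase in failed TSDB compactions within a 2h
    # window, sustained for 12h before firing.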
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} compaction failures over the last two hours.'
        summary: Prometheus has issues compacting sample blocks
      expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
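    # PrometheusTSDBWALCorruptions: the corruption counter only ever increases, so any
    # non-zero value means at least one corrupted write-ahead-log segment has been seen
    # since the Prometheus process started.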
    - alert: PrometheusTSDBWALCorruptions
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} has a corrupted write-ahead log (WAL).'
        summary: Prometheus write-ahead log is corrupted
      expr: tsdb_wal_corruptions_total{job="{{ $prometheusJob }}"} > 0
      for: 4h
      labels:
        severity: warning
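    # PrometheusNotIngestingSamples: no samples appended to the TSDB head over 5 minutes,
    # which usually means no targets are being scraped or ingestion is broken.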
    - alert: PrometheusNotIngestingSamples
      annotations:
        description: Prometheus {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} isn't ingesting samples.
        summary: Prometheus isn't ingesting samples
      expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}"}[5m]) <= 0
      for: 10m
      labels:
        severity: warning
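    # PrometheusTargetScrapesDuplicate: scraped samples are being rejected because they carry
    # the same timestamp as an already-ingested sample but a different value.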
    - alert: PrometheusTargetScrapesDuplicate
      annotations:
        description: '{{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} has many samples rejected due to duplicate timestamps but different values'
        summary: Prometheus has many samples rejected
      expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
{{- end }}