# Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
{{- if .Values.defaultRules.create }}
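# Note: $prometheusJob is assumed to match the job label under which this chart's own
# Prometheus instance is scraped (service "<fullname>-prometheus"); every expression
# below selects Prometheus self-metrics by that job.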
{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }}
apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }}
  labels:
    app: {{ template "prometheus-operator.name" . }}
{{ include "prometheus-operator.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: prometheus.rules
    rules:
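    # PrometheusConfigReloadFailed: prometheus_config_last_reload_successful is 1 after a
    # successful reload and 0 after a failed one, so == 0 for 10m means the last attempted
    # configuration reload failed and Prometheus is still running the previous config.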
    - alert: PrometheusConfigReloadFailed
      annotations:
        description: Reloading Prometheus' configuration has failed for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
        summary: Reloading Prometheus' configuration failed
      expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}"} == 0
      for: 10m
      labels:
        severity: warning
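    # PrometheusNotificationQueueRunningFull: predict_linear extrapolates the notification
    # queue length 30 minutes (60 * 30 seconds) ahead from the last 5 minutes of samples;
    # the alert fires when the forecast exceeds the queue capacity, i.e. the queue is on
    # track to overflow and notifications would start being dropped.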
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
        description: Prometheus' alert notification queue is running full for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
        summary: Prometheus' alert notification queue is running full
      expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}"}
      for: 10m
      labels:
        severity: warning
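    # PrometheusErrorSendingAlerts (warning): ratio of failed to attempted alert notifications
    # over 5 minutes; more than 1% of notifications to an Alertmanager are failing.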
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} to Alertmanager {{`{{$labels.alertmanager}}`}}
        summary: Errors while sending alerts from Prometheus
      expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}"}[5m]) > 0.01
      for: 10m
      labels:
        severity: warning
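    # Same error ratio as above with a 3% threshold; the two rules intentionally share the
    # alert name and differ only in threshold and severity.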
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} to Alertmanager {{`{{$labels.alertmanager}}`}}
        summary: Errors while sending alerts from Prometheus
      expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}"}[5m]) > 0.03
      for: 10m
      labels:
        severity: critical
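    # PrometheusNotConnectedToAlertmanagers: fewer than one Alertmanager discovered via
    # service discovery, so firing alerts have nowhere to be delivered.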
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
        description: Prometheus {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} is not connected to any Alertmanagers
        summary: Prometheus is not connected to any Alertmanagers
      expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}"} < 1
      for: 10m
      labels:
        severity: warning
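    # PrometheusTSDBReloadsFailing: any increase in TSDB reload failures within a 2h window,
    # sustained for 12h before firing.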
    - alert: PrometheusTSDBReloadsFailing
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} reload failures over the last two hours.'
        summary: Prometheus has issues reloading data blocks from disk
      expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
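    # PrometheusTSDBCompactionsFailing: any increase in failed TSDB compactions within a 2h
    # window, sustained for 12h before firing.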
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} compaction failures over the last two hours.'
        summary: Prometheus has issues compacting sample blocks
      expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
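    # PrometheusTSDBWALCorruptions: the corruption counter only ever increases, so any
    # non-zero value means at least one corrupted write-ahead-log segment has been seen
    # since the Prometheus process started.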
    - alert: PrometheusTSDBWALCorruptions
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} has a corrupted write-ahead log (WAL).'
        summary: Prometheus write-ahead log is corrupted
      expr: tsdb_wal_corruptions_total{job="{{ $prometheusJob }}"} > 0
      for: 4h
      labels:
        severity: warning
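    # PrometheusNotIngestingSamples: no samples appended to the TSDB head over 5 minutes,
    # which usually means no targets are being scraped or ingestion is broken.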
    - alert: PrometheusNotIngestingSamples
      annotations:
        description: Prometheus {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} isn't ingesting samples.
        summary: Prometheus isn't ingesting samples
      expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}"}[5m]) <= 0
      for: 10m
      labels:
        severity: warning
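    # PrometheusTargetScrapesDuplicate: scraped samples are being rejected because they carry
    # the same timestamp as an already-ingested sample but a different value.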
    - alert: PrometheusTargetScrapesDuplicate
      annotations:
        description: '{{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} has many samples rejected due to duplicate timestamps but different values'
        summary: Prometheus has many samples rejected
      expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
{{- end }}