# Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
# Do not change in-place! In order to change this file, first read the following link:
# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
{{- if and .Values.defaultRules.create .Values.defaultRules.rules.prometheus }}
{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }}
{{- $namespace := .Release.Namespace }}
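# The two template variables above build the job/namespace selector used in every rule below,
# so the alerts only match the Prometheus instance deployed by this chart in this release's namespace.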
apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }}
  labels:
    app: {{ template "prometheus-operator.name" . }}
{{ include "prometheus-operator.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: prometheus.rules
    rules:
    - alert: PrometheusConfigReloadFailed
      annotations:
        description: Reloading Prometheus' configuration has failed for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
        summary: Reloading Prometheus' configuration failed
      expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} == 0
      for: 10m
      labels:
        severity: warning
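    # predict_linear() extrapolates the 5m trend of the notification queue length 30 minutes
    # (60 * 30 seconds) ahead; the alert fires when that projection exceeds the queue capacity.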
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
        description: Prometheus' alert notification queue is running full for {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}}
        summary: Prometheus' alert notification queue is running full
      expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}
      for: 10m
      labels:
        severity: warning
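    # The next two rules divide the rate of failed notifications by the rate of attempted
    # notifications over the last five minutes; the first fires above a 1% error ratio, the second above 3%.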
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}}
        summary: Errors while sending alerts from Prometheus
      expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.01
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}}
        summary: Errors while sending alerts from Prometheus
      expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.03
      for: 10m
      labels:
        severity: critical
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
        description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} is not connected to any Alertmanagers
        summary: Prometheus is not connected to any Alertmanagers
      expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} < 1
      for: 10m
      labels:
        severity: warning
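    # increase(...[2h]) in the two TSDB rules below counts new reload/compaction failures over the
    # trailing two-hour window, so the alerts resolve once no new failures have been recorded for two hours.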
    - alert: PrometheusTSDBReloadsFailing
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} reload failures over the last four hours.'
        summary: Prometheus has issues reloading data blocks from disk
      expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} compaction failures over the last four hours.'
        summary: Prometheus has issues compacting sample blocks
      expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
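    # Note: unlike the prometheus_tsdb_* metrics above, the WAL corruption counter was exposed
    # without the prometheus_ prefix on the Prometheus releases these generated rules target.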
    - alert: PrometheusTSDBWALCorruptions
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} has a corrupted write-ahead log (WAL).'
        summary: Prometheus write-ahead log is corrupted
      expr: tsdb_wal_corruptions_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} > 0
      for: 4h
      labels:
        severity: warning
    - alert: PrometheusNotIngestingSamples
      annotations:
        description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} isn't ingesting samples.
        summary: Prometheus isn't ingesting samples
      expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
      for: 10m
      labels:
        severity: warning
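    # Samples carrying an already-seen timestamp with a different value are rejected by the TSDB;
    # a growing rejection counter usually points at duplicated scrape configs or clashing relabeling.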
    - alert: PrometheusTargetScrapesDuplicate
      annotations:
        description: '{{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} has many samples rejected due to duplicate timestamps but different values'
        summary: Prometheus has many samples rejected
      expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
{{- end }}