# main.yml
  1. ---
  2. alertmanager_version: "0.24.0"
  3. alertmanager_tarball: "alertmanager-{{ alertmanager_version }}.linux-amd64"
  4. alertmanager_url: "https://github.com/prometheus/alertmanager/releases/download/v{{ alertmanager_version }}"
  5. alertmanager_skip_install: false
  6. prometheus_version: 2.37.1
  7. prometheus_tarball: "prometheus-{{ prometheus_version }}.linux-amd64"
  8. prometheus_url: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}"
  9. prometheus_skip_install: false
  10. prometheus_binary_local_dir: ''
  11. prometheus_binary_install_dir: '/usr/local/bin'
  12. prometheus_config_dir: /etc/prometheus
  13. prometheus_db_dir: /var/lib/prometheus
  14. prometheus_read_only_dirs: []
  15. prometheus_web_listen_address: "0.0.0.0:9090"
  16. prometheus_web_external_url: ''
  17. # See https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md
  18. prometheus_web_config:
  19. tls_server_config: {}
  20. http_server_config: {}
  21. basic_auth_users: {}
  22. prometheus_storage_retention: "30d"
  23. # Available since Prometheus 2.7.0
  24. # [EXPERIMENTAL] Maximum number of bytes that can be stored for blocks. Units
  25. # supported: KB, MB, GB, TB, PB.
  26. prometheus_storage_retention_size: "0"
  27. prometheus_config_flags_extra: {}
  28. # prometheus_config_flags_extra:
  29. # storage.tsdb.retention: 15d
  30. # alertmanager.timeout: 10s
  31. prometheus_alertmanager_config: []
  32. # prometheus_alertmanager_config:
  33. # - scheme: https
  34. # path_prefix: alertmanager/
  35. # basic_auth:
  36. # username: user
  37. # password: pass
  38. # static_configs:
  39. # - targets: ["127.0.0.1:9093"]
  40. # proxy_url: "127.0.0.2"
  41. prometheus_alert_relabel_configs: []
  42. # prometheus_alert_relabel_configs:
  43. # - action: labeldrop
  44. # regex: replica
  45. prometheus_global:
  46. scrape_interval: 15s
  47. scrape_timeout: 10s
  48. evaluation_interval: 15s
  49. prometheus_remote_write: []
  50. # prometheus_remote_write:
  51. # - url: https://dev.kausal.co/prom/push
  52. # basic_auth:
  53. # password: FOO
  54. prometheus_remote_read: []
  55. # prometheus_remote_read:
  56. # - url: https://demo.cloudalchemy.org:9201/read
  57. # basic_auth:
  58. # password: FOO
  59. prometheus_external_labels:
  60. environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"
  61. prometheus_targets: {}
  62. # node:
  63. # - targets:
  64. # - localhost:9100
  65. # labels:
  66. # env: test
  67. prometheus_scrape_configs:
  68. - job_name: "prometheus"
  69. metrics_path: "{{ prometheus_metrics_path }}"
  70. static_configs:
  71. - targets:
  72. - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090"
  73. - job_name: "node"
  74. file_sd_configs:
  75. - files:
  76. - "{{ prometheus_config_dir }}/file_sd/node.yml"
  77. # Alternative config file name, searched in ansible templates path.
  78. prometheus_config_file: 'prometheus.yml.j2'
  79. prometheus_alert_rules_files:
  80. - prometheus/rules/*.rules
  81. prometheus_static_targets_files:
  82. - prometheus/targets/*.yml
  83. - prometheus/targets/*.json
  84. prometheus_alert_rules:
  85. - alert: Watchdog
  86. expr: vector(1)
  87. for: 10m
  88. labels:
  89. severity: warning
  90. annotations:
  91. description: "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty."
  92. summary: 'Ensure entire alerting pipeline is functional'
  93. - alert: InstanceDown
  94. expr: 'up == 0'
  95. for: 5m
  96. labels:
  97. severity: critical
  98. annotations:
  99. description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}'
  100. summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}'
  101. - alert: RebootRequired
  102. expr: 'node_reboot_required > 0'
  103. labels:
  104. severity: warning
  105. annotations:
  106. description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}'
  107. summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}'
  # Warning tier: less than 40% space available, the 6h linear trend predicts
  # a value below zero within 24h, and the filesystem is not read-only.
  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}'
      summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
    expr: |
      (
       node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 40
      and
       predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 24*60*60) < 0
      and
       node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  # Critical tier: less than 20% space available and the 6h linear trend
  # predicts a value below zero within 4h.
  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}'
      summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
    expr: |
      (
       node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 20
      and
       predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 4*60*60) < 0
      and
       node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  # Warning tier: less than 5% space available on a filesystem that is not
  # read-only, held for 1h.
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
      summary: 'Filesystem has less than 5% space left.'
    expr: |
      (
       node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
      and
       node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  # Critical tier: same condition with a 3% threshold.
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
      summary: 'Filesystem has less than 3% space left.'
    expr: |
      (
       node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
      and
       node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  # Warning tier: less than 40% of inodes free, the 6h linear trend predicts
  # a value below zero within 24h, and the filesystem is not read-only.
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}'
      summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
    expr: |
      (
       node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
      and
       predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
      and
       node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  # Critical tier: less than 20% of inodes free and the 6h linear trend
  # predicts a value below zero within 4h.
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}'
      summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
    expr: |
      (
       node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
      and
       predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
      and
       node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  # Warning tier: less than 5% of inodes free on a filesystem that is not
  # read-only, held for 1h.
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
      summary: 'Filesystem has less than 5% inodes left.'
    expr: |
      (
       node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
      and
       node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  # Critical tier: same condition with a 3% threshold.
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
      summary: 'Filesystem has less than 3% inodes left.'
    expr: |
      (
       node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
      and
       node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  # More than 10 receive errors counted over a 2m window, held for 1h.
  - alert: NodeNetworkReceiveErrs
    annotations:
      description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}'
      summary: 'Network interface is reporting many receive errors.'
    expr: "increase(node_network_receive_errs_total[2m]) > 10\n"
    for: 1h
    labels:
      severity: warning
  # More than 10 transmit errors counted over a 2m window, held for 1h.
  - alert: NodeNetworkTransmitErrs
    annotations:
      description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}'
      summary: 'Network interface is reporting many transmit errors.'
    expr: "increase(node_network_transmit_errs_total[2m]) > 10\n"
    for: 1h
    labels:
      severity: warning
  # Conntrack entries above 75% of the configured limit; no "for" hold period
  # is set.
  - alert: NodeHighNumberConntrackEntriesUsed
    annotations:
      description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}'
      summary: 'Number of conntrack are getting close to the limit'
    expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n"
    labels:
      severity: warning
  # Clock offset beyond +/-0.05s whose 5m derivative is not moving back
  # towards zero, held for 10m. The message states the same 0.05s threshold
  # that the expression uses.
  - alert: NodeClockSkewDetected
    annotations:
      message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.{% endraw %}'
      summary: 'Clock skew detected.'
    expr: |
      (
       node_timex_offset_seconds > 0.05
      and
       deriv(node_timex_offset_seconds[5m]) >= 0
      )
      or
      (
       node_timex_offset_seconds < -0.05
      and
       deriv(node_timex_offset_seconds[5m]) <= 0
      )
    for: 10m
    labels:
      severity: warning
  # node_timex_sync_status has had a 5m minimum of 0, held for 10m.
  - alert: NodeClockNotSynchronising
    annotations:
      message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}'
      summary: 'Clock not synchronising.'
    expr: "min_over_time(node_timex_sync_status[5m]) == 0\n"
    for: 10m
    labels:
      severity: warning