diff options
| author | Max Audron <audron@cocaine.farm> | 2025-08-05 14:53:56 +0200 |
|---|---|---|
| committer | Max Audron <audron@cocaine.farm> | 2025-08-05 14:53:56 +0200 |
| commit | 482e058f57ff14f6293e9011fa43f5e9db3723fc (patch) | |
| tree | 899366cad885726e75bcd097cd1c3e979caed843 | |
| parent | add homepage dashboard (diff) | |
add prometheus alerting rules
| -rw-r--r-- | modules/monitoring/default.nix | 40 | ||||
| -rw-r--r-- | modules/monitoring/rules.nix | 97 | ||||
| -rw-r--r-- | modules/monitoring/scrape.nix | 45 |
3 files changed, 144 insertions, 38 deletions
```diff
diff --git a/modules/monitoring/default.nix b/modules/monitoring/default.nix
index d2e4bc3..46c8d2f 100644
--- a/modules/monitoring/default.nix
+++ b/modules/monitoring/default.nix
@@ -4,6 +4,8 @@ with self.lib.nginx;
 with self.lib.mon;
 let exp = config.services.prometheus.exporters;
 in {
+  imports = [ ./scrape.nix ./rules.nix ];
+
   services.prometheus = {
     enable = true;
     enableReload = true;
@@ -15,44 +17,6 @@ in {
     globalConfig = {
       scrape_interval = "10s";
     };
-
-    scrapeConfigs = [
-      (mkScrapeConfig "node" [ "ettves" "phaenn" "fra01" "nyc01" "sin01" ] exp.node.port)
-      (mkScrapeConfig "zfs" [ "ettves" "phaenn" ] exp.zfs.port)
-      (mkScrapeConfig "smartctl" [ "ettves" "phaenn" ] exp.smartctl.port)
-      (mkScrapeConfig "nginx" [ "ettves" "phaenn" "fra01" "nyc01" "sin01" ] exp.nginx.port)
-
-      (mkScrapeConfig "postgres" [ "ettves" ] exp.postgres.port)
-      (mkScrapeConfig "quassel" [ "localhost" ] config.services.quassel.settings.metrics.port)
-
-      {
-        job_name = "authentik";
-        static_configs = [
-          { targets = [ "ettves:9300" "ettves:9303" "ettves:9304" ]; }
-        ];
-        relabel_configs = relabelConfig;
-      }
-
-      (mkScrapeConfig "garage" [ "fra01" "nyc01" "sin01" ] 3903)
-      (mkScrapeConfig "pdns" [ "ettves" "fra01" "nyc01" "sin01" ] 8081)
-
-      ((mkScrape "minecraft" [ "ettves:25585" "ettves:9150" "ettves:9225" ]) // {
-        relabel_configs = [
-          {
-            source_labels = ["__address__"];
-            target_label = "server";
-            regex = "(ettves:25585)|(ettves:9150)";
-            replacement = "dungeons";
-          }
-          {
-            source_labels = ["__address__"];
-            target_label = "server";
-            regex = "(ettves:9225)";
-            replacement = "vanilla";
-          }
-        ];
-      })
-    ];
   };

   services.udev.extraRules = ''
diff --git a/modules/monitoring/rules.nix b/modules/monitoring/rules.nix
new file mode 100644
index 0000000..bff5aa8
--- /dev/null
+++ b/modules/monitoring/rules.nix
@@ -0,0 +1,97 @@
+{ self, config, lib, pkgs, ... }:
+
+{
+  services.prometheus = {
+    rules = [(builtins.toJSON {
+      groups = [
+        { name = "disk";
+          rules = [
+            { alert = "smartctl self-test failed to pass";
+              expr = "smartctl_device_smart_status != 1";
+            }
+            { alert = "smartctl uncorrectable errors";
+              expr = ''smartctl_device_attribute{attribute_value_type="raw", attribute_name="Offline_Uncorrectable"} > 10'';
+            }
+            { alert = "smartctl sectors pending";
+              expr = ''smartctl_device_attribute{attribute_value_type="raw", attribute_name="Current_Pending_Sector"} > 10'';
+              for = "10m";
+            }
+            { alert = "ZFS Pool over 90% full";
+              expr = "round((zfs_pool_allocated_bytes / zfs_pool_size_bytes) * 100) > 90";
+              for = "10m";
+            }
+          ];
+        }
+        { name = "Machine Resources";
+          rules = [
+            { alert = "CPU Load over 100%";
+              expr = ''round((node_load5 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) * 100, 0.1) > 100'';
+              for = "10m";
+            }
+            { alert = "Memory usage over 90%";
+              expr = ''
+                round(
+                  (node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes)
+                  / node_memory_MemTotal_bytes * 100
+                , 0.1) > 90
+              '';
+              for = "10m";
+            }
+          ];
+        }
+        { name = "systemd";
+          rules = [
+            { alert = "systemd unit failed";
+              expr = ''node_systemd_unit_state{state="failed"} >= 1'';
+              for = "5m";
+            }
+          ];
+        }
+        { name = "Authentik";
+          rules = [
+            { alert = "authentik outpost down";
+              expr = ''authentik_outpost_connection != 1'';
+              for = "5m";
+            }
+          ];
+        }
+        { name = "nginx";
+          rules = [
+            { alert = "nginx down";
+              expr = ''nginx_up != 1'';
+              for = "5m";
+            }
+            { alert = "nginx down";
+              expr = ''nginx_up != 1'';
+              for = "5m";
+            }
+          ];
+        }
+        { name = "minecraft";
+          rules = [
+            { alert = "minecraft tps low";
+              expr = ''(mc_tps or minecraft_tps) < 16'';
+              for = "5m";
+            }
+          ];
+        }
+        { name = "PowerDNS";
+          rules = [
+            { alert = "pdns latency high";
+              expr = ''pdns_auth_latency > 350'';
+              for = "5m";
+            }
+            { alert = "pdns send latency high";
+              expr = ''pdns_auth_send_latency > 85'';
+              for = "5m";
+            }
+            { alert = "pdns backend overloaded";
+              expr = ''pdns_auth_overload_drops > 10'';
+              for = "5m";
+            }
+          ];
+        }
+      ];
+    })];
+  };
+}
diff --git a/modules/monitoring/scrape.nix b/modules/monitoring/scrape.nix
new file mode 100644
index 0000000..69ea001
--- /dev/null
+++ b/modules/monitoring/scrape.nix
@@ -0,0 +1,45 @@
+{ self, config, lib, pkgs, ... }:
+
+with self.lib.mon;
+let exp = config.services.prometheus.exporters;
+in {
+  services.prometheus = {
+    scrapeConfigs = [
+      (mkScrapeConfig "node" [ "ettves" "phaenn" "fra01" "nyc01" "sin01" ] exp.node.port)
+      (mkScrapeConfig "zfs" [ "ettves" "phaenn" ] exp.zfs.port)
+      (mkScrapeConfig "smartctl" [ "ettves" "phaenn" ] exp.smartctl.port)
+      (mkScrapeConfig "nginx" [ "ettves" "phaenn" "fra01" "nyc01" "sin01" ] exp.nginx.port)
+
+      (mkScrapeConfig "postgres" [ "ettves" ] exp.postgres.port)
+      (mkScrapeConfig "quassel" [ "localhost" ] config.services.quassel.settings.metrics.port)
+
+      {
+        job_name = "authentik";
+        static_configs = [
+          { targets = [ "ettves:9300" "ettves:9303" "ettves:9304" ]; }
+        ];
+        relabel_configs = relabelConfig;
+      }
+
+      (mkScrapeConfig "garage" [ "fra01" "nyc01" "sin01" ] 3903)
+      (mkScrapeConfig "pdns" [ "ettves" "fra01" "nyc01" "sin01" ] 8081)
+
+      ((mkScrape "minecraft" [ "ettves:25585" "ettves:9150" "ettves:9225" ]) // {
+        relabel_configs = [
+          {
+            source_labels = ["__address__"];
+            target_label = "server";
+            regex = "(ettves:25585)|(ettves:9150)";
+            replacement = "dungeons";
+          }
+          {
+            source_labels = ["__address__"];
+            target_label = "server";
+            regex = "(ettves:9225)";
+            replacement = "vanilla";
+          }
+        ];
+      })
+    ];
+  };
+}
```
