aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Max Audron <audron@cocaine.farm> 2025-08-05 14:53:56 +0200
committer Max Audron <audron@cocaine.farm> 2025-08-05 14:53:56 +0200
commit 482e058f57ff14f6293e9011fa43f5e9db3723fc (patch)
tree 899366cad885726e75bcd097cd1c3e979caed843
parent add homepage dashboard (diff)
add prometheus alerting rules
Diffstat (limited to '')
-rw-r--r-- modules/monitoring/default.nix 40
-rw-r--r-- modules/monitoring/rules.nix 97
-rw-r--r-- modules/monitoring/scrape.nix 45
3 files changed, 144 insertions, 38 deletions
diff --git a/modules/monitoring/default.nix b/modules/monitoring/default.nix
index d2e4bc3..46c8d2f 100644
--- a/modules/monitoring/default.nix
+++ b/modules/monitoring/default.nix
@@ -4,6 +4,8 @@ with self.lib.nginx;
with self.lib.mon;
let exp = config.services.prometheus.exporters;
in {
+ imports = [ ./scrape.nix ./rules.nix ];
+
services.prometheus = {
enable = true;
enableReload = true;
@@ -15,44 +17,6 @@ in {
globalConfig = {
scrape_interval = "10s";
};
-
- scrapeConfigs = [
- (mkScrapeConfig "node" [ "ettves" "phaenn" "fra01" "nyc01" "sin01" ] exp.node.port)
- (mkScrapeConfig "zfs" [ "ettves" "phaenn" ] exp.zfs.port)
- (mkScrapeConfig "smartctl" [ "ettves" "phaenn" ] exp.smartctl.port)
- (mkScrapeConfig "nginx" [ "ettves" "phaenn" "fra01" "nyc01" "sin01" ] exp.nginx.port)
-
- (mkScrapeConfig "postgres" [ "ettves" ] exp.postgres.port)
- (mkScrapeConfig "quassel" [ "localhost" ] config.services.quassel.settings.metrics.port)
-
- {
- job_name = "authentik";
- static_configs = [
- { targets = [ "ettves:9300" "ettves:9303" "ettves:9304" ]; }
- ];
- relabel_configs = relabelConfig;
- }
-
- (mkScrapeConfig "garage" [ "fra01" "nyc01" "sin01" ] 3903)
- (mkScrapeConfig "pdns" [ "ettves" "fra01" "nyc01" "sin01" ] 8081)
-
- ((mkScrape "minecraft" [ "ettves:25585" "ettves:9150" "ettves:9225" ]) // {
- relabel_configs = [
- {
- source_labels = ["__address__"];
- target_label = "server";
- regex = "(ettves:25585)|(ettves:9150)";
- replacement = "dungeons";
- }
- {
- source_labels = ["__address__"];
- target_label = "server";
- regex = "(ettves:9225)";
- replacement = "vanilla";
- }
- ];
- })
- ];
};
services.udev.extraRules = ''
diff --git a/modules/monitoring/rules.nix b/modules/monitoring/rules.nix
new file mode 100644
index 0000000..bff5aa8
--- /dev/null
+++ b/modules/monitoring/rules.nix
@@ -0,0 +1,97 @@
+{ self, config, lib, pkgs, ... }:
+
+{
+ services.prometheus = {
+ rules = [(builtins.toJSON {
+ groups = [
+ { name = "disk";
+ rules = [
+ { alert = "smartctl self-test failed to pass";
+ expr = "smartctl_device_smart_status != 1";
+ }
+ { alert = "smartctl uncorrectable errors";
+ expr = ''smartctl_device_attribute{attribute_value_type="raw", attribute_name="Offline_Uncorrectable"} > 10'';
+ }
+ { alert = "smartctl sectors pending";
+ expr = ''smartctl_device_attribute{attribute_value_type="raw", attribute_name="Current_Pending_Sector"} > 10'';
+ for = "10m";
+ }
+ { alert = "ZFS Pool over 90% full";
+ expr = "round((zfs_pool_allocated_bytes / zfs_pool_size_bytes) * 100) > 90";
+ for = "10m";
+ }
+ ];
+ }
+ { name = "Machine Resources";
+ rules = [
+ { alert = "CPU Load over 100%";
+ expr = ''round((node_load5 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) * 100, 0.1) > 100'';
+ for = "10m";
+ }
+ { alert = "Memory usage over 90%";
+ expr = ''
+ round(
+ (node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes)
+ / node_memory_MemTotal_bytes * 100
+ , 0.1) > 90
+ '';
+ for = "10m";
+ }
+ ];
+ }
+ { name = "systemd";
+ rules = [
+ { alert = "systemd unit failed";
+ expr = ''node_systemd_unit_state{state="failed"} >= 1'';
+ for = "5m";
+ }
+ ];
+ }
+ { name = "Authentik";
+ rules = [
+ { alert = "authentik outpost down";
+ expr = ''authentik_outpost_connection != 1'';
+ for = "5m";
+ }
+ ];
+ }
+ { name = "nginx";
+ rules = [
+ { alert = "nginx down";
+ expr = ''nginx_up != 1'';
+ for = "5m";
+ }
+ { alert = "nginx down";
+ expr = ''nginx_up != 1'';
+ for = "5m";
+ }
+ ];
+ }
+ { name = "minecraft";
+ rules = [
+ { alert = "minecraft tps low";
+ expr = ''(mc_tps or minecraft_tps) < 16'';
+ for = "5m";
+ }
+ ];
+ }
+ { name = "PowerDNS";
+ rules = [
+ { alert = "pdns latency high";
+ expr = ''pdns_auth_latency > 350'';
+ for = "5m";
+ }
+ { alert = "pdns send latency high";
+ expr = ''pdns_auth_send_latency > 85'';
+ for = "5m";
+ }
+ { alert = "pdns backend overloaded";
+ expr = ''pdns_auth_overload_drops > 10'';
+ for = "5m";
+ }
+ ];
+ }
+ ];
+ })];
+ };
+}
diff --git a/modules/monitoring/scrape.nix b/modules/monitoring/scrape.nix
new file mode 100644
index 0000000..69ea001
--- /dev/null
+++ b/modules/monitoring/scrape.nix
@@ -0,0 +1,45 @@
+{ self, config, lib, pkgs, ... }:
+
+with self.lib.mon;
+let exp = config.services.prometheus.exporters;
+in {
+ services.prometheus = {
+ scrapeConfigs = [
+ (mkScrapeConfig "node" [ "ettves" "phaenn" "fra01" "nyc01" "sin01" ] exp.node.port)
+ (mkScrapeConfig "zfs" [ "ettves" "phaenn" ] exp.zfs.port)
+ (mkScrapeConfig "smartctl" [ "ettves" "phaenn" ] exp.smartctl.port)
+ (mkScrapeConfig "nginx" [ "ettves" "phaenn" "fra01" "nyc01" "sin01" ] exp.nginx.port)
+
+ (mkScrapeConfig "postgres" [ "ettves" ] exp.postgres.port)
+ (mkScrapeConfig "quassel" [ "localhost" ] config.services.quassel.settings.metrics.port)
+
+ {
+ job_name = "authentik";
+ static_configs = [
+ { targets = [ "ettves:9300" "ettves:9303" "ettves:9304" ]; }
+ ];
+ relabel_configs = relabelConfig;
+ }
+
+ (mkScrapeConfig "garage" [ "fra01" "nyc01" "sin01" ] 3903)
+ (mkScrapeConfig "pdns" [ "ettves" "fra01" "nyc01" "sin01" ] 8081)
+
+ ((mkScrape "minecraft" [ "ettves:25585" "ettves:9150" "ettves:9225" ]) // {
+ relabel_configs = [
+ {
+ source_labels = ["__address__"];
+ target_label = "server";
+ regex = "(ettves:25585)|(ettves:9150)";
+ replacement = "dungeons";
+ }
+ {
+ source_labels = ["__address__"];
+ target_label = "server";
+ regex = "(ettves:9225)";
+ replacement = "vanilla";
+ }
+ ];
+ })
+ ];
+ };
+}