aboutsummaryrefslogtreecommitdiff
path: root/modules/monitoring/rules.nix
blob: bff5aa84752bae29ad48897e86c63371998a5661 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Prometheus alerting rules.
#
# Sets `services.prometheus.rules` to a single rule file: the attribute set
# below is serialised with `builtins.toJSON` and contains one entry per alert
# group. Each rule has an `alert` name, a PromQL `expr`, and optionally a
# `for` duration that the expression must keep matching before the alert
# fires (rules without `for` fire on the first matching evaluation).
{ self, config, lib, pkgs, ... }:

{
  services.prometheus = {
    rules = [(builtins.toJSON {
      groups = [
        # Disk health: SMART status/attributes and ZFS pool usage.
        { name = "disk";
          rules = [
            # Overall SMART status is anything other than 1; no hold time.
            { alert = "smartctl self-test failed to pass";
              expr = "smartctl_device_smart_status != 1";
            }
            # Raw Offline_Uncorrectable attribute above 10; no hold time.
            { alert = "smartctl uncorrectable errors";
              expr = ''smartctl_device_attribute{attribute_value_type="raw", attribute_name="Offline_Uncorrectable"} > 10'';
            }
            # Raw Current_Pending_Sector attribute above 10 for 10 minutes.
            { alert = "smartctl sectors pending";
              expr = ''smartctl_device_attribute{attribute_value_type="raw", attribute_name="Current_Pending_Sector"} > 10'';
              for = "10m";
            }
            # allocated / size, as a percentage rounded to the nearest integer,
            # above 90 for 10 minutes.
            { alert = "ZFS Pool over 90% full";
              expr = "round((zfs_pool_allocated_bytes / zfs_pool_size_bytes) * 100) > 90";
              for = "10m";
            }
          ];
        }
        # Host-level CPU load and memory pressure.
        { name = "Machine Resources";
          rules = [
            # node_load5 divided by the number of idle-mode
            # node_cpu_seconds_total series (aggregating away `cpu` and
            # `mode`), as a percentage rounded to the nearest 0.1.
            { alert = "CPU Load over 100%";
              expr = ''round((node_load5 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) * 100, 0.1) > 100'';
              for = "10m";
            }
            # (MemTotal - MemFree - Buffers - Cached) / MemTotal, as a
            # percentage rounded to the nearest 0.1.
            { alert = "Memory usage over 90%";
              expr = ''
                round(
                  (node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes)
                  / node_memory_MemTotal_bytes * 100
                , 0.1) > 90
              '';
              for = "10m";
            }
          ];
        }
        { name = "systemd";
          rules = [
            # Any unit whose state="failed" series is >= 1 for 5 minutes.
            { alert = "systemd unit failed";
              expr = ''node_systemd_unit_state{state="failed"} >= 1'';
              for = "5m";
            }
          ];
        }
        { name = "Authentik";
          rules = [
            # authentik_outpost_connection is not 1 for 5 minutes.
            { alert = "authentik outpost down";
              expr = ''authentik_outpost_connection != 1'';
              for = "5m";
            }
          ];
        }
        { name = "nginx";
          rules = [
            # nginx_up is not 1 for 5 minutes.
            { alert = "nginx down";
              expr = ''nginx_up != 1'';
              for = "5m";
            }
          ];
        }
        { name = "minecraft";
          rules = [
            # Uses mc_tps where present, otherwise minecraft_tps; fires when
            # the value stays below 16 for 5 minutes.
            { alert = "minecraft tps low";
              expr = ''(mc_tps or minecraft_tps) < 16'';
              for = "5m";
            }
          ];
        }
        # PowerDNS authoritative server. Thresholds are in whatever unit the
        # exporter reports for each metric.
        { name = "PowerDNS";
          rules = [
            { alert = "pdns latency high";
              expr = ''pdns_auth_latency > 350'';
              for = "5m";
            }
            { alert = "pdns send latency high";
              expr = ''pdns_auth_send_latency > 85'';
              for = "5m";
            }
            { alert = "pdns backend overloaded";
              expr = ''pdns_auth_overload_drops > 10'';
              for = "5m";
            }
          ];
        }
      ];
    })];
  };
}