summary refs log tree commit diff stats
path: root/krebs/2configs/shack/prometheus
diff options
context:
space:
mode:
Diffstat (limited to 'krebs/2configs/shack/prometheus')
-rw-r--r-- krebs/2configs/shack/prometheus/alert-rules.nix 21
-rw-r--r-- krebs/2configs/shack/prometheus/irc-alerts.py 207
-rw-r--r-- krebs/2configs/shack/prometheus/irc-hooks.nix 59
-rw-r--r-- krebs/2configs/shack/prometheus/server.nix 5
4 files changed, 271 insertions, 21 deletions
diff --git a/krebs/2configs/shack/prometheus/alert-rules.nix b/krebs/2configs/shack/prometheus/alert-rules.nix
index 5ba49ede6..4cefdc3e5 100644
--- a/krebs/2configs/shack/prometheus/alert-rules.nix
+++ b/krebs/2configs/shack/prometheus/alert-rules.nix
@@ -1,6 +1,6 @@
{ lib,... }:
let
- disk_free_threshold = "10"; # at least this much free disk percentage
+ disk_free_threshold = "5"; # at least this much free disk percentage
in {
services.prometheus.rules = [(builtins.toJSON
{
@@ -8,22 +8,6 @@ in {
{ name = "shack-env";
rules = [
{
- alert = "Wolf RootPartitionFull";
- for = "30m";
- expr = ''(node_filesystem_avail_bytes{alias="wolf.shack",mountpoint="/"} * 100) / node_filesystem_size_bytes{alias="wolf.shack",mountpoint="/"} < ${disk_free_threshold}'';
- labels.severity = "warning";
- annotations.summary = "{{ $labels.alias }} root disk full";
- annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=wolf";
- annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%). CI for deploying new configuration will seize working. Log in to the system and try to clean up the obsolete files on the machine. There are a couple of things you can do:
-1. `nix-collect-garbage -d`
-2. clean up the shack share folder in `/home/share`
-3. check `du -hs /var/ | sort -h`.
-4. run `docker system prune`
-5. `find /var/lib/containers/news/var/lib/htgen-go/items -mtime +7 -delete;` to clean up the link shortener data
-5. If you are really desperate run `du -hs / | sort -h` and go through the folders recursively until you've found something to delete
-6. as a last resort the root disk can be expanded via `lvresize -L +10G /dev/pool/root && btrfs filesystem resize max /` '';
- }
- {
alert = "Puyak RootPartitionFull";
for = "30m";
expr = ''(node_filesystem_avail_bytes{alias="puyak.shack",mountpoint="/"} * 100) / node_filesystem_size_bytes{alias="puyak.shack",mountpoint="/"} < ${disk_free_threshold}'';
@@ -32,9 +16,8 @@ in {
annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=puyak";
annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%).Prometheus will not be able to create new alerts and CI for deploying new configuration will also seize working. Log in to the system and run `nix-collect-garbage -d` and if this does not help you can check `du -hs /var/ | sort -h`, run `docker system prune` or if you are really desperate run `du -hs / | sort -h` and go through the folders recursively until you've found something to delete'';
}
- # wolf.shack is not worth supervising anymore
{
- alert = "HostDown";
+ alert = "Infra01 down";
expr = ''up{alias="infra01.shack"} == 0'';
for = "5m";
labels.severity = "page";
diff --git a/krebs/2configs/shack/prometheus/irc-alerts.py b/krebs/2configs/shack/prometheus/irc-alerts.py
new file mode 100644
index 000000000..005a2013b
--- /dev/null
+++ b/krebs/2configs/shack/prometheus/irc-alerts.py
@@ -0,0 +1,207 @@
+import base64
+import cgi
+import json
+import os
+import re
+import socket
+import ssl
+import sys
+from http.server import BaseHTTPRequestHandler
+from typing import List, Optional, Tuple
+from urllib.parse import urlparse
+
+# Verbose tracing is enabled whenever the DEBUG environment variable exists,
+# whatever its value (even an empty string switches it on).
+DEBUG = os.environ.get("DEBUG") is not None
+
+
+def _irc_send(
+ server: str,
+ nick: str,
+ channel: str,
+ sasl_password: Optional[str] = None,
+ server_password: Optional[str] = None,
+ tls: bool = True,
+ port: int = 6697,
+ messages: List[str] = [],
+) -> None:
+ if not messages:
+ return
+
+ sock = socket.socket()
+ if tls:
+ sock = ssl.wrap_socket(
+ sock, cert_reqs=ssl.CERT_NONE, ssl_version=ssl.PROTOCOL_TLSv1_2
+ )
+
+ def _send(command: str) -> int:
+ if DEBUG:
+ print(command)
+ return sock.send((f"{command}\r\n").encode())
+
+ def _pong(ping: str):
+ if ping.startswith("PING"):
+ sock.send(ping.replace("PING", "PONG").encode("ascii"))
+
+ recv_file = sock.makefile(mode="r")
+
+ print(f"connect {server}:{port}")
+ sock.connect((server, port))
+ if server_password:
+ _send(f"PASS {server_password}")
+ _send(f"USER {nick} 0 * :{nick}")
+ _send(f"NICK {nick}")
+ for line in recv_file.readline():
+ if re.match(r"^:[^ ]* (MODE|221|376|422) ", line):
+ break
+ else:
+ _pong(line)
+
+ if sasl_password:
+ _send("CAP REQ :sasl")
+ _send("AUTHENTICATE PLAIN")
+ auth = base64.encodebytes(f"{nick}\0{nick}\0{sasl_password}".encode("utf-8"))
+ _send(f"AUTHENTICATE {auth.decode('ascii')}")
+ _send("CAP END")
+ _send(f"JOIN :{channel}")
+
+ for m in messages:
+ _send(f"PRIVMSG {channel} :{m}")
+
+ _send("INFO")
+ for line in recv_file:
+ if DEBUG:
+ print(line, end="")
+ # Assume INFO reply means we are done
+ if "End of /INFO" in line:
+ break
+ else:
+ _pong(line)
+
+ sock.send(b"QUIT")
+ print("disconnect")
+ sock.close()
+
+
+def irc_send(
+ url: str, notifications: List[str], password: Optional[str] = None
+) -> None:
+ parsed = urlparse(f"{url}")
+ username = parsed.username or "prometheus"
+ server = parsed.hostname or "chat.freenode.net"
+ if parsed.fragment != "":
+ channel = f"#{parsed.fragment}"
+ else:
+ channel = "#krebs-announce"
+ port = parsed.port or 6697
+ if not password:
+ password = parsed.password
+ if len(notifications) == 0:
+ return
+ _irc_send(
+ server=server,
+ nick=username,
+ sasl_password=password,
+ channel=channel,
+ port=port,
+ messages=notifications,
+ tls=parsed.scheme == "irc+tls",
+ )
+
+
+class PrometheusWebHook(BaseHTTPRequestHandler):
+ def __init__(
+ self,
+ irc_url: str,
+ conn: socket.socket,
+ addr: Tuple[str, int],
+ password: Optional[str] = None,
+ ) -> None:
+ self.irc_url = irc_url
+ self.password = password
+ self.rfile = conn.makefile("rb")
+ self.wfile = conn.makefile("wb")
+ self.client_address = addr
+ self.handle()
+
+ # for testing
+ def do_GET(self) -> None:
+ if DEBUG:
+ print("GET: Request Received")
+ self.send_response(200)
+ self.send_header("Content-type", "text/plain")
+ self.end_headers()
+ self.wfile.write(b"ok")
+
+ def do_POST(self) -> None:
+ if DEBUG:
+ print("POST: Request Received")
+ content_type, _ = cgi.parse_header(self.headers.get("content-type"))
+
+ # refuse to receive non-json content
+ if content_type != "application/json":
+ if DEBUG:
+ print(f"POST: wrong content type {content_type}")
+ self.send_response(400)
+ self.end_headers()
+ return
+
+ length = int(self.headers.get("content-length"))
+ payload = json.loads(self.rfile.read(length))
+ messages = []
+ for alert in payload["alerts"]:
+ description = alert["annotations"]["description"]
+ messages.append(f"{alert['status']}: {description}")
+ irc_send(self.irc_url, messages, password=self.password)
+
+ self.do_GET()
+
+
+def systemd_socket_response() -> None:
+ irc_url = os.environ.get("IRC_URL", None)
+ if irc_url is None:
+ print(
+ "IRC_URL environment variable not set: i.e. IRC_URL=irc+tls://mic92-prometheus@chat.freenode.net/#krebs-announce",
+ file=sys.stderr,
+ )
+ sys.exit(1)
+
+ password = None
+ irc_password_file = os.environ.get("IRC_PASSWORD_FILE", None)
+ if irc_password_file:
+ with open(irc_password_file) as f:
+ password = f.read()
+
+ msgs = sys.argv[1:]
+
+ if msgs != []:
+ irc_send(irc_url, msgs, password=password)
+ return
+
+ nfds = os.environ.get("LISTEN_FDS", None)
+ if nfds is None:
+ print(
+ "LISTEN_FDS not set. Run me with systemd(TM) socket activation?",
+ file=sys.stderr,
+ )
+ sys.exit(1)
+ fds = range(3, 3 + int(nfds))
+
+ for fd in fds:
+ sock = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
+ sock.settimeout(0)
+
+ try:
+ while True:
+ PrometheusWebHook(irc_url, *sock.accept(), password=password)
+ except BlockingIOError:
+ # no more connections
+ pass
+
+
+if __name__ == "__main__":
+ if DEBUG:
+ print("Starting in DEBUG mode")
+ if len(sys.argv) == 3:
+ print(f"{sys.argv[1]} {sys.argv[2]}")
+ irc_send(sys.argv[1], [sys.argv[2]])
+ else:
+ systemd_socket_response()
diff --git a/krebs/2configs/shack/prometheus/irc-hooks.nix b/krebs/2configs/shack/prometheus/irc-hooks.nix
new file mode 100644
index 000000000..07bb2423b
--- /dev/null
+++ b/krebs/2configs/shack/prometheus/irc-hooks.nix
@@ -0,0 +1,59 @@
+# Socket-activated webhook receivers that relay alert notifications to IRC.
+# For every entry in `endpoints` this module defines one systemd socket unit
+# and one systemd service unit, both named irc-alerts-<name>.
+{ config
+, lib
+, pkgs
+, ...
+}:
+let
+ # irc-alerts.py packaged as an executable Python 3 script; E501 (line too
+ # long) is excluded from the lint check applied to it.
+ irc-alerts = pkgs.writers.writePython3 "irc-alerts" {
+ flakeIgnore = [ "E501" ];
+ } (builtins.readFile ./irc-alerts.py);
+ # name -> { url, port }. An optional `passwordFile` attribute is also
+ # looked at below (see hasPassword).
+ endpoints = {
+ binaergewitter = {
+ url = "irc+tls://puyak-alerts@irc.libera.chat:6697/#binaergewitter-alerts";
+ port = 9223;
+ };
+ };
+in
+{
+ # One listening socket per endpoint, bound to the endpoint's port on [::].
+ systemd.sockets =
+ lib.mapAttrs'
+ (name: opts:
+ lib.nameValuePair "irc-alerts-${name}" {
+ description = "Receive http hook and send irc message for ${name}";
+ wantedBy = [ "sockets.target" ];
+ listenStreams = [ "[::]:${builtins.toString opts.port}" ];
+ }) endpoints;
+
+ # One service per endpoint, running the script with the endpoint's URL.
+ systemd.services =
+ lib.mapAttrs'
+ (name: opts:
+ let
+ serviceName = "irc-alerts-${name}";
+ # True when the endpoint defines a non-null `passwordFile`.
+ hasPassword = opts.passwordFile or null != null;
+ in
+ lib.nameValuePair serviceName {
+ description = "Receive http hook and send irc message for ${name}";
+ requires = [ "irc-alerts-${name}.socket" ];
+ serviceConfig =
+ {
+ Environment =
+ [
+ "IRC_URL=${opts.url}"
+ # DEBUG is set unconditionally, so the script always runs with its
+ # verbose output enabled.
+ "DEBUG=y"
+ ]
+ ++ lib.optional hasPassword "IRC_PASSWORD_FILE=/run/${serviceName}/password";
+ DynamicUser = true;
+ User = serviceName;
+ ExecStart = irc-alerts;
+ }
+ # Only with a password: copy the secret to /run/<serviceName>/password
+ # (mode 0400, owned by the service user) before the script starts.
+ # NOTE(review): the source is always
+ # config.sops.secrets.prometheus-irc-password.path; the value of
+ # opts.passwordFile is only used as an on/off switch — confirm intended.
+ // lib.optionalAttrs hasPassword {
+ PermissionsStartOnly = true;
+ ExecStartPre =
+ "${pkgs.coreutils}/bin/install -m400 "
+ + "-o ${serviceName} -g ${serviceName} "
+ + "${config.sops.secrets.prometheus-irc-password.path} "
+ + "/run/${serviceName}/password";
+ RuntimeDirectory = serviceName;
+ };
+ }) endpoints;
+}
diff --git a/krebs/2configs/shack/prometheus/server.nix b/krebs/2configs/shack/prometheus/server.nix
index 9e4b4d1a7..7a5532027 100644
--- a/krebs/2configs/shack/prometheus/server.nix
+++ b/krebs/2configs/shack/prometheus/server.nix
@@ -3,6 +3,7 @@
{
imports = [
./alert-rules.nix
+ ./irc-hooks.nix
];
networking = {
firewall.allowedTCPPorts = [
@@ -129,11 +130,11 @@
"group_wait" = "30s";
"group_interval" = "2m";
"repeat_interval" = "4h";
- "receiver" = "team-admins";
+ "receiver" = "shack-admins";
};
"receivers" = [
{
- "name" = "team-admins";
+ "name" = "shack-admins";
"email_configs" = [ ];
"webhook_configs" = [
{