summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorlassulus <lassulus@lassul.us>2022-11-17 11:58:59 +0100
committerlassulus <lassulus@lassul.us>2022-11-17 12:23:28 +0100
commit17fc13592935fc7c8119e24980ea3a83f9773043 (patch)
tree1ba6c78663c0876c5755983c8610ff60c51183ca
parent3736bbf091a34ac9ab33b60d872a922080fd81f7 (diff)
l sync-containers3: don't kill container on lock loss
-rw-r--r--lass/3modules/sync-containers3.nix124
1 files changed, 102 insertions, 22 deletions
diff --git a/lass/3modules/sync-containers3.nix b/lass/3modules/sync-containers3.nix
index 5b6fa37e1..104b92b24 100644
--- a/lass/3modules/sync-containers3.nix
+++ b/lass/3modules/sync-containers3.nix
@@ -99,7 +99,7 @@ in {
set -efux
consul lock sync_${ctr.name} ${pkgs.writers.writeDash "${ctr.name}-sync" ''
set -efux
- if ping -c 1 ${ctr.name}.r; then
+ if /run/wrappers/bin/ping -c 1 ${ctr.name}.r; then
touch "$HOME"/incomplete
rsync -a -e "ssh -i $CREDENTIALS_DIRECTORY/ssh_key" --inplace container_sync@${ctr.name}.r:disk "$HOME"/disk
rm "$HOME"/incomplete
@@ -108,6 +108,54 @@ in {
'';
};
}; }
+ { "${ctr.name}_watcher" = {
+ path = with pkgs; [
+ coreutils
+ consul
+ cryptsetup
+ curl
+ mount
+ util-linux
+ jq
+ ];
+ serviceConfig = {
+ ExecStart = pkgs.writers.writeDash "${ctr.name}_watcher" ''
+ set -efux
+ while sleep 5; do
+ # get the payload
+ # check if the host reacted recently
+ case $(curl -s -o /dev/null --retry 10 -w '%{http_code}' http://127.0.0.1:8500/v1/kv/containers/${ctr.name}) in
+ 404)
+ echo 'got 404 from kv, should kill the container'
+ break
+ ;;
+ 500)
+ echo 'got 500 from kv, will kill container'
+ break
+ ;;
+ 200)
+ # echo 'got 200 from kv, will check payload'
+ export payload=$(consul kv get containers/${ctr.name})
+ if [ "$(jq -rn 'env.payload | fromjson.host')" = '${config.networking.hostName}' ]; then
+ # echo 'we are the host, continuing'
+ continue
+ else
+ echo 'we are not host, killing container'
+ break
+ fi
+ ;;
+ *)
+ echo 'unknown state, continuing'
+ continue
+ ;;
+ esac
+ done
+ /run/current-system/sw/bin/nixos-container stop ${ctr.name} || :
+ umount /var/lib/sync-containers3/${ctr.name}/state || :
+ cryptsetup luksClose ${ctr.name} || :
+ '';
+ };
+ }; }
{ "${ctr.name}_scheduler" = {
wantedBy = [ "multi-user.target" ];
path = with pkgs; [
@@ -116,36 +164,68 @@ in {
cryptsetup
mount
util-linux
+ curl
systemd
+ jq
retry
+ bc
];
- serviceConfig = let
- containerDirectory = lib.removeSuffix "/%i" config.systemd.services."container@${ctr.name}".environment.root;
- in {
+ serviceConfig = {
Restart = "always";
- RestartSec = "5s";
- ExecStart = "${pkgs.consul}/bin/consul lock -verbose -monitor-retry 100 container_${ctr.name} ${pkgs.writers.writeBash "${ctr.name}-start" ''
+ RestartSec = "30s";
+ ExecStart = pkgs.writers.writeDash "${ctr.name}_scheduler" ''
set -efux
+ # get the payload
+ # check if the host reacted recently
+ case $(curl -s -o /dev/null --retry 10 -w '%{http_code}' http://127.0.0.1:8500/v1/kv/containers/${ctr.name}) in
+ 404)
+ # echo 'got 404 from kv, will create container'
+ ;;
+ 500)
+ # echo 'got 500 from kv, retrying again'
+ exit 0
+ ;;
+ 200)
+ # echo 'got 200 from kv, will check payload'
+ export payload=$(consul kv get containers/${ctr.name})
+ if [ "$(jq -rn 'env.payload | fromjson.host')" = '${config.networking.hostName}' ]; then
+ echo 'we are the host, starting container'
+ else
+ # echo 'we are not host, checking timestamp'
+ # if [ $(echo "$(date +%s) - $(jq -rn 'env.payload | fromjson.time') > 100" | bc) -eq 1 ]; then
+ if [ "$(jq -rn 'env.payload | fromjson.time | now - tonumber > 100')" = 'true' ]; then
+ echo 'last beacon is more than 100s ago, taking over'
+ else
+ # echo 'last beacon was recent. trying again'
+ exit 0
+ fi
+ fi
+ ;;
+ *)
+ echo 'unknown state, bailing out'
+ exit 0
+ ;;
+ esac
if test -e /var/lib/sync-containers3/${ctr.name}/incomplete; then
echo 'data is inconistent, start aborted'
exit 1
fi
- trap ${pkgs.writers.writeDash "stop-${ctr.name}" ''
- set -efux
- /run/current-system/sw/bin/nixos-container stop ${ctr.name} || :
- umount /var/lib/sync-containers3/${ctr.name}/state || :
- cryptsetup luksClose ${ctr.name} || :
- ''} INT TERM EXIT
- consul kv put containers/${ctr.name}/host ${config.networking.hostName}
- cryptsetup luksOpen --key-file ${ctr.luksKey} /var/lib/sync-containers3/${ctr.name}/disk ${ctr.name}
- mkdir -p /var/lib/sync-containers3/${ctr.name}/state
- mount /dev/mapper/${ctr.name} /var/lib/sync-containers3/${ctr.name}/state
- /run/current-system/sw/bin/nixos-container start ${ctr.name}
- set +x
- until /run/wrappers/bin/ping -q -c 1 ${ctr.name}.r > /dev/null; do sleep 5; done
- while retry -t 5 -d 60 -- /run/wrappers/bin/ping -q -c 3 ${ctr.name}.r > /dev/null; do sleep 5; done
- echo "lost tinc connection to container, shutting down"
- ''}";
+ consul kv put containers/${ctr.name} "$(jq -cn '{host: "${config.networking.hostName}", time: now}')" >/dev/null
+ consul lock -verbose -monitor-retry=100 -timeout 30s -name container_${ctr.name} container_${ctr.name} ${pkgs.writers.writeBash "${ctr.name}-start" ''
+ set -efu
+ cryptsetup luksOpen --key-file ${ctr.luksKey} /var/lib/sync-containers3/${ctr.name}/disk ${ctr.name} || :
+ mkdir -p /var/lib/sync-containers3/${ctr.name}/state
+ mountpoint /var/lib/sync-containers3/${ctr.name}/state || mount /dev/mapper/${ctr.name} /var/lib/sync-containers3/${ctr.name}/state
+ /run/current-system/sw/bin/nixos-container start ${ctr.name}
+ # wait for system to become reachable for the first time
+ retry -t 10 -d 10 -- /run/wrappers/bin/ping -q -c 1 ${ctr.name}.r > /dev/null
+ systemctl start ${ctr.name}_watcher.service
+ while systemctl is-active container@${ctr.name}.service >/devnull && /run/wrappers/bin/ping -q -c 3 ${ctr.name}.r >/dev/null; do
+ consul kv put containers/${ctr.name} "$(jq -cn '{host: "${config.networking.hostName}", time: now}')" >/dev/null
+ sleep 10
+ done
+ ''}
+ '';
};
}; }
]) (lib.attrValues cfg.containers)));