#!/usr/bin/env bash
# dead_man.sh — independent watchdog for Cerbero Bite (docs/07-risk-controls.md).
#
# Runs from cron every 5 minutes. If the engine has not written a
# HEALTH_OK event into the JSONL log within the last
# DEAD_MAN_THRESHOLD_SECONDS (default 900 = 15 minutes), it:
#   1. Sends an alert via DEAD_MAN_ALERT_CMD (any command consuming
#      a single argument: the alert text). The alert is additionally
#      always appended to data/log/dead-man-alert.txt so an external
#      watcher can pick it up even when no command is configured.
#   2. Arms the SQLite kill switch directly (no Python required).
#   3. Appends one line to data/audit.log (best-effort hash chain;
#      verifying after recovery is the operator's job).
#
# The three steps are independent and best-effort: a failure in one is
# reported on stderr and does not prevent the following ones.
#
# Configuration via env vars:
#   PROJECT_ROOT               — repo root (default: parent of this file's dir).
#   DEAD_MAN_THRESHOLD_SECONDS — silence threshold (default 900).
#   DEAD_MAN_ALERT_CMD         — optional alert command.
# NOTE(review): earlier header text also advertised a .env file in
# PROJECT_ROOT, but nothing in this script loads one — confirm whether the
# cron wrapper exports it.
#
# Requires GNU date (uses `date -d`).
# This script intentionally avoids Python so it survives env corruption.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="${PROJECT_ROOT:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
THRESHOLD="${DEAD_MAN_THRESHOLD_SECONDS:-900}"
LOG_DIR="${PROJECT_ROOT}/data/log"
DB_PATH="${PROJECT_ROOT}/data/state.sqlite"
AUDIT_PATH="${PROJECT_ROOT}/data/audit.log"
ALERT_FILE="${LOG_DIR}/dead-man-alert.txt"

# A non-numeric threshold would make the arithmetic comparison in main fail,
# which inside `if` reads as "not exceeded" and silently disables the
# watchdog. Fall back to the default instead.
if [[ ! "$THRESHOLD" =~ ^[0-9]+$ ]]; then
  printf 'dead_man: invalid DEAD_MAN_THRESHOLD_SECONDS=%s, using 900\n' \
    "$THRESHOLD" >&2
  THRESHOLD=900
fi
# Force base 10 so a value such as "0900" is not parsed as (invalid) octal.
THRESHOLD=$(( 10#$THRESHOLD ))

# Print the path of the engine's JSONL log for the current UTC day.
# The directory is joined with printf rather than embedded in the date
# format, so a '%' in LOG_DIR cannot be misread as a date directive.
today_log() {
  printf '%s/cerbero-bite-%s.jsonl\n' "$LOG_DIR" "$(date -u +%Y-%m-%d)"
}

# Print the path of the engine's JSONL log for the previous UTC day.
yesterday_log() {
  printf '%s/cerbero-bite-%s.jsonl\n' \
    "$LOG_DIR" "$(date -u -d 'yesterday' +%Y-%m-%d)"
}

# Print the "ts" value of the last HEALTH_OK line in a log file.
# Arguments: $1 - path to a JSONL log file.
# Outputs:   the timestamp string, or nothing when the file is missing, has
#            no HEALTH_OK line, or that line carries no "ts" field.
# Returns:   0 in all of those cases.
last_health_ts() {
  local file="$1"
  [[ -f "$file" ]] || return 0
  # grep exits 1 on "no match"; neutralise that so pipefail does not turn an
  # empty result into a failure. Whitespace after the colon is optional so
  # both `"event": "HEALTH_OK"` and `"event":"HEALTH_OK"` are recognised.
  # sed -n … p prints only when the ts pattern actually matched, instead of
  # echoing the raw log line back to the caller.
  { grep -E '"event":[[:space:]]*"HEALTH_OK"' -- "$file" 2>/dev/null || true; } \
    | tail -n 1 \
    | sed -nE 's/.*"ts":[[:space:]]*"([^"]+)".*/\1/p'
}

# Deliver an alert: run DEAD_MAN_ALERT_CMD (if set) with the message as its
# single argument, then append a timestamped line to ALERT_FILE.
# Arguments: $1 - alert text.
# Returns:   0 always; delivery problems are reported on stderr only so the
#            caller can still arm the kill switch.
emit_alert() {
  local message="$1"
  if [[ -n "${DEAD_MAN_ALERT_CMD:-}" ]]; then
    "${DEAD_MAN_ALERT_CMD}" "$message" || true
  fi
  if ! {
    mkdir -p "$LOG_DIR" \
      && printf '%s | %s\n' "$(date -u +%FT%TZ)" "$message" >> "$ALERT_FILE"
  }; then
    printf 'dead_man: could not write %s\n' "$ALERT_FILE" >&2
  fi
}

# Arm the kill switch in the SQLite state database.
# Does nothing when the database file is missing or sqlite3 is not installed.
arm_kill_switch() {
  if [[ ! -f "$DB_PATH" ]] || ! command -v sqlite3 >/dev/null 2>&1; then
    return
  fi
  # NOTE(review): the SQL passed to sqlite3 on stdin (presumably a
  # here-document) was lost from the text under review. It is deliberately
  # NOT reconstructed here because the table/column names are unknown.
  # Until the original statement is restored from version control, this
  # call is a no-op.
  sqlite3 "$DB_PATH" <<'SQL'
-- NOTE(review): restore the original kill-switch statement here.
SQL
}

# Append one record to the audit log.
# NOTE(review): only the tail of the original format string
# (`threshold"}|prev_hash=manual|hash=manual\n`), the "$ts" argument and the
# append to AUDIT_PATH survived in the text under review. The timestamp
# source, the event name and the head of the JSON payload below are
# reconstructions — confirm against version control before relying on them.
append_audit_line() {
  local ts
  ts="$(date -u +%FT%TZ)"
  printf '%s|DEAD_MAN|{"reason":"no HEALTH_OK >threshold"}|prev_hash=manual|hash=manual\n' \
    "$ts" >> "$AUDIT_PATH"
}

# Run the full reaction sequence and terminate with status 1.
# Arguments: $1 - alert text.
# Each step is attempted even if the previous one failed: an unwritable log
# directory must never prevent the kill switch from being armed.
trip() {
  emit_alert "$1"
  arm_kill_switch || printf 'dead_man: failed to arm kill switch\n' >&2
  append_audit_line || printf 'dead_man: failed to append audit line\n' >&2
  exit 1
}

# Entry point: exit 0 when a recent HEALTH_OK exists, otherwise trip and
# exit 1.
main() {
  local log_file last_ts
  log_file="$(today_log)"
  last_ts="$(last_health_ts "$log_file")"

  # Just after 00:00 UTC today's log may not contain a HEALTH_OK yet even
  # though the engine is healthy; fall back to yesterday's file so the day
  # rollover does not raise a false alarm. A stale entry from yesterday is
  # still caught by the elapsed-time check below.
  if [[ -z "$last_ts" ]]; then
    last_ts="$(last_health_ts "$(yesterday_log)")"
  fi
  if [[ -z "$last_ts" ]]; then
    trip "dead_man: no HEALTH_OK in $log_file"
  fi

  local last_epoch now_epoch elapsed
  # An unparseable timestamp is treated as silence (fail-safe), but with an
  # explicit message instead of a meaningless "seconds since 1970" figure.
  if ! last_epoch="$(date -u -d "$last_ts" +%s 2>/dev/null)"; then
    trip "dead_man: unparseable HEALTH_OK timestamp '${last_ts}'"
  fi
  now_epoch="$(date -u +%s)"
  elapsed=$(( now_epoch - last_epoch ))

  if (( elapsed > THRESHOLD )); then
    trip "dead_man: ${elapsed}s since last HEALTH_OK (threshold ${THRESHOLD}s)"
  fi
  exit 0
}

main "$@"