Phase 2: persistence + safety controls

Adds SQLite persistence, the hash-chained audit log, the coordinated kill
switch, and the management CLIs documented in docs/05-data-model.md and
docs/07-risk-controls.md. 197 tests pass, 1 skipped (sqlite3 CLI missing),
97% total coverage.

State (`state/`):
- 0001_init.sql with positions, instructions, decisions, dvol_history,
  manual_actions, system_state.
- db.py: connect with WAL + foreign_keys + a transaction context manager;
  forward-only migration runner keyed on PRAGMA user_version.
- models.py: Pydantic records, Decimal preserved as TEXT.
- repository.py: typed CRUD over a single injected connection, cache
  aware, concurrent positions.
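
The forward-only runner described above can be pictured with a minimal sketch; the migration list and schema here are illustrative stand-ins, not the project's actual 0001_init.sql:

```python
import sqlite3

# Illustrative migrations; the real project loads state/*.sql files.
MIGRATIONS = [
    "CREATE TABLE positions (id INTEGER PRIMARY KEY, instrument TEXT)",
    "ALTER TABLE positions ADD COLUMN qty TEXT",
]


def migrate(conn: sqlite3.Connection) -> int:
    """Apply every migration newer than PRAGMA user_version, forward only."""
    current = conn.execute("PRAGMA user_version").fetchone()[0]
    for version, sql in enumerate(MIGRATIONS, start=1):
        if version <= current:
            continue  # already applied; never re-run, never roll back
        with conn:  # each step commits atomically
            conn.execute(sql)
            conn.execute(f"PRAGMA user_version = {version}")
    return conn.execute("PRAGMA user_version").fetchone()[0]


conn = sqlite3.connect(":memory:")
print(migrate(conn))  # → 2
```

Because the applied-version marker lives in `PRAGMA user_version` rather than a table, a half-initialized database is impossible to mistake for a current one.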

Safety (`safety/`):
- audit_log.py: append-only AuditLog with SHA-256 chain and fsync;
  verify_chain detects every form of tampering (payload, prev_hash,
  hash, JSON, separators).
- kill_switch.py: transactional, idempotent arm/disarm, coupled to the
  audit chain.
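
The hash-chain idea behind audit_log.py can be sketched as follows; this is a simplified in-memory model, not the module's actual record format or file layout:

```python
import hashlib
import json


def append_entry(chain: list[dict], payload: dict) -> dict:
    """Link each entry to its predecessor via SHA-256 over prev_hash + payload."""
    prev_hash = chain[-1]["hash"] if chain else "0" * 64
    body = json.dumps(payload, sort_keys=True, separators=(",", ":"))
    entry_hash = hashlib.sha256(f"{prev_hash}|{body}".encode()).hexdigest()
    entry = {"prev_hash": prev_hash, "payload": body, "hash": entry_hash}
    chain.append(entry)
    return entry


def verify_chain(chain: list[dict]) -> bool:
    """Recompute every link; any edit to payload, prev_hash, or hash breaks it."""
    prev_hash = "0" * 64
    for entry in chain:
        expected = hashlib.sha256(f"{prev_hash}|{entry['payload']}".encode()).hexdigest()
        if entry["prev_hash"] != prev_hash or entry["hash"] != expected:
            return False
        prev_hash = entry["hash"]
    return True


chain: list[dict] = []
append_entry(chain, {"event": "KILL_SWITCH_ARMED"})
append_entry(chain, {"event": "KILL_SWITCH_DISARMED"})
print(verify_chain(chain))  # → True
chain[0]["payload"] = chain[0]["payload"].replace("ARMED", "FORGED")
print(verify_chain(chain))  # → False
```

Tampering with any earlier entry changes its hash, which no longer matches the `prev_hash` recorded by its successor, so verification fails from that point on.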

Config (`config/loader.py` + `strategy.yaml`):
- YAML loader with deep-merge of strategy.local.yaml.
- SHA-256 config_hash verification (the config_hash line itself is
  excluded from the digest).
- Golden strategy.yaml + example override file.
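
Excluding the config_hash line from its own digest is what makes self-verification possible; a minimal sketch of the idea (the function name and the line-based exclusion are assumptions, not the loader's actual implementation):

```python
import hashlib


def compute_config_hash(text: str) -> str:
    """SHA-256 over the config text with the config_hash line itself removed."""
    kept = [line for line in text.splitlines() if not line.startswith("config_hash:")]
    return hashlib.sha256("\n".join(kept).encode()).hexdigest()


sample = "risk:\n  max_positions: 3\nconfig_hash: abc123\n"
digest = compute_config_hash(sample)
# Stable regardless of the recorded hash value, so the digest can be
# written back into the file without changing what it certifies:
assert digest == compute_config_hash(sample.replace("abc123", digest))
```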

Scripts:
- dead_man.sh: shell watchdog independent of Python.
- backup.py: hourly VACUUM INTO with 30-day retention.

CLI:
- audit verify (exit 2 on tampering).
- kill-switch arm/disarm/status against the real SQLite database.
- state inspect with an open-positions table.
- config hash, config validate.
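
A distinct exit code 2 lets shell automation react to tampering specifically; a hedged sketch where `audit_verify` is a stub, since the commit names only the exit-code convention, not the entry point:

```shell
# Stub standing in for the real `audit verify` invocation (name hypothetical).
audit_verify() { return "${SIMULATED_RC:-0}"; }

rc=0
SIMULATED_RC=2 audit_verify || rc=$?
if [ "$rc" -eq 2 ]; then
  echo "TAMPERING DETECTED"   # what a cron guard would react to
else
  echo "audit chain OK"
fi
```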

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 13:35:35 +02:00
parent fbb7753cc6
commit 263470786d
25 changed files with 3669 additions and 14 deletions
+110
@@ -0,0 +1,110 @@
"""Hourly SQLite backup utility (``docs/05-data-model.md``).
Uses ``VACUUM INTO`` so the snapshot is a self-contained, defragmented
SQLite file that can be inspected without locking the live database.
Retention is enforced by deletion: any file matching
``state-YYYYMMDD-HH.sqlite`` older than ``retention_days`` is removed.
Designed to be invoked from APScheduler in the orchestrator and from
the CLI for ad-hoc backups.
"""
from __future__ import annotations
import argparse
import re
import sqlite3
from collections.abc import Iterable
from datetime import UTC, datetime, timedelta
from pathlib import Path
__all__ = ["BACKUP_FILENAME_RE", "backup_database", "prune_backups"]
BACKUP_FILENAME_RE = re.compile(r"^state-(\d{8}-\d{2})\.sqlite$")
_DEFAULT_RETENTION_DAYS = 30
def _format_backup_name(now: datetime) -> str:
return f"state-{now.astimezone(UTC):%Y%m%d-%H}.sqlite"
def backup_database(
*,
db_path: Path | str,
backup_dir: Path | str,
now: datetime | None = None,
) -> Path:
"""Create a snapshot via ``VACUUM INTO`` and return its path."""
src = Path(db_path)
dst_dir = Path(backup_dir)
dst_dir.mkdir(parents=True, exist_ok=True)
timestamp = (now or datetime.now(UTC)).astimezone(UTC)
dst = dst_dir / _format_backup_name(timestamp)
if dst.exists():
# Idempotent at hour granularity: same hour = same target file.
dst.unlink()
conn = sqlite3.connect(str(src))
try:
conn.execute(f"VACUUM INTO '{dst.as_posix()}'")
finally:
conn.close()
return dst
def _parse_backup_timestamp(name: str) -> datetime | None:
match = BACKUP_FILENAME_RE.match(name)
if match is None:
return None
try:
return datetime.strptime(match.group(1), "%Y%m%d-%H").replace(tzinfo=UTC)
except ValueError:
return None
def prune_backups(
backup_dir: Path | str,
*,
retention_days: int = _DEFAULT_RETENTION_DAYS,
now: datetime | None = None,
) -> list[Path]:
"""Remove backups older than ``retention_days``. Returns the deleted paths."""
cutoff = (now or datetime.now(UTC)).astimezone(UTC) - timedelta(days=retention_days)
deleted: list[Path] = []
for entry in Path(backup_dir).iterdir():
if not entry.is_file():
continue
ts = _parse_backup_timestamp(entry.name)
if ts is None:
continue
if ts < cutoff:
entry.unlink()
deleted.append(entry)
return deleted
def list_backups(backup_dir: Path | str) -> Iterable[Path]:
return sorted(
(p for p in Path(backup_dir).iterdir() if BACKUP_FILENAME_RE.match(p.name)),
key=lambda p: p.name,
)
def _cli() -> None:
parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
parser.add_argument("--db", default="data/state.sqlite")
parser.add_argument("--out", default="data/backups")
parser.add_argument("--retention-days", type=int, default=_DEFAULT_RETENTION_DAYS)
args = parser.parse_args()
out = backup_database(db_path=args.db, backup_dir=args.out)
pruned = prune_backups(args.out, retention_days=args.retention_days)
print(f"backup -> {out}")
if pruned:
print(f"pruned: {', '.join(p.name for p in pruned)}")
if __name__ == "__main__":
_cli()
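
Why ``VACUUM INTO`` rather than copying the file: the snapshot is itself a complete database, readable on its own even while the live database stays open. A standalone demonstration (paths and the `BTC-PERP` row are made-up throwaway data):

```python
import sqlite3
import tempfile
from pathlib import Path

tmp = Path(tempfile.mkdtemp())
live = tmp / "state.sqlite"

# Populate a "live" database.
conn = sqlite3.connect(live)
conn.execute("CREATE TABLE positions (id INTEGER PRIMARY KEY, instrument TEXT)")
conn.execute("INSERT INTO positions (instrument) VALUES ('BTC-PERP')")
conn.commit()

# VACUUM INTO writes a self-contained, defragmented copy; the source
# connection stays open and readers are not blocked.
snapshot = tmp / "state-20260427-13.sqlite"
conn.execute(f"VACUUM INTO '{snapshot.as_posix()}'")
conn.close()

# The snapshot opens independently of the live file.
rows = sqlite3.connect(snapshot).execute("SELECT instrument FROM positions").fetchall()
print(rows)  # → [('BTC-PERP',)]
```

``VACUUM INTO`` requires SQLite 3.27 or newer; a plain file copy of a WAL-mode database, by contrast, risks capturing the main file without its WAL.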
+107
@@ -0,0 +1,107 @@
#!/usr/bin/env bash
# dead_man.sh — independent watchdog for Cerbero Bite (docs/07-risk-controls.md).
#
# Runs from cron every 5 minutes. If the engine has not written a
# HEALTH_OK event into today's JSONL log within the last
# DEAD_MAN_THRESHOLD_SECONDS (default 900 = 15 minutes), it:
# 1. Sends an alert via DEAD_MAN_ALERT_CMD (any command consuming
# a single argument: the alert text). When unset, falls back to
# writing data/log/dead-man-alert.txt so an external watcher can
# pick it up.
# 2. Arms the SQLite kill switch directly (no Python required).
# 3. Appends one line to data/audit.log (best-effort hash chain;
# verifying after recovery is the operator's job).
#
# Configuration via env vars or .env in PROJECT_ROOT:
# PROJECT_ROOT — repo root (default: parent of this file).
# DEAD_MAN_THRESHOLD_SECONDS — silence threshold (default 900).
# DEAD_MAN_ALERT_CMD — optional alert command.
#
# This script intentionally avoids Python so it survives env corruption.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="${PROJECT_ROOT:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
THRESHOLD="${DEAD_MAN_THRESHOLD_SECONDS:-900}"
LOG_DIR="${PROJECT_ROOT}/data/log"
DB_PATH="${PROJECT_ROOT}/data/state.sqlite"
AUDIT_PATH="${PROJECT_ROOT}/data/audit.log"
ALERT_FILE="${LOG_DIR}/dead-man-alert.txt"

today_log() {
  date -u +"${LOG_DIR}/cerbero-bite-%Y-%m-%d.jsonl"
}

last_health_ts() {
  local file="$1"
  if [[ ! -f "$file" ]]; then
    echo ""
    return
  fi
  grep -F '"event": "HEALTH_OK"' "$file" 2>/dev/null \
    | tail -n 1 \
    | sed -E 's/.*"ts":[[:space:]]*"([^"]+)".*/\1/' \
    || true
}

emit_alert() {
  local message="$1"
  if [[ -n "${DEAD_MAN_ALERT_CMD:-}" ]]; then
    "${DEAD_MAN_ALERT_CMD}" "$message" || true
  fi
  mkdir -p "$LOG_DIR"
  printf '%s | %s\n' "$(date -u +%FT%TZ)" "$message" >> "$ALERT_FILE"
}

arm_kill_switch() {
  if [[ ! -f "$DB_PATH" ]] || ! command -v sqlite3 >/dev/null 2>&1; then
    return
  fi
  sqlite3 "$DB_PATH" <<SQL || true
UPDATE system_state
SET kill_switch = 1,
    kill_reason = COALESCE(kill_reason, 'dead_man'),
    kill_at = COALESCE(kill_at, datetime('now')),
    last_health_check = datetime('now')
WHERE id = 1;
SQL
}

append_audit_line() {
  local ts
  ts="$(date -u +%FT%TZ)"
  mkdir -p "$(dirname "$AUDIT_PATH")"
  printf '%s|DEAD_MAN_TRIGGERED|{"reason":"silence>threshold"}|prev_hash=manual|hash=manual\n' "$ts" >> "$AUDIT_PATH"
}

main() {
  local log_file
  log_file="$(today_log)"
  local last_ts
  last_ts="$(last_health_ts "$log_file")"
  if [[ -z "$last_ts" ]]; then
    emit_alert "dead_man: no HEALTH_OK in $log_file"
    arm_kill_switch
    append_audit_line
    exit 1
  fi
  local last_epoch now_epoch elapsed
  last_epoch="$(date -u -d "$last_ts" +%s 2>/dev/null || echo 0)"
  now_epoch="$(date -u +%s)"
  elapsed=$(( now_epoch - last_epoch ))
  if (( elapsed > THRESHOLD )); then
    emit_alert "dead_man: ${elapsed}s since last HEALTH_OK (threshold ${THRESHOLD}s)"
    arm_kill_switch
    append_audit_line
    exit 1
  fi
  exit 0
}

main "$@"