Phase 2: persistence + safety controls

Aggiunge la persistenza SQLite, l'audit log a hash chain, il kill
switch coordinato e i CLI di gestione documentati in
docs/05-data-model.md e docs/07-risk-controls.md. 197 test superati,
1 saltato (CLI sqlite3 mancante), copertura totale 97%.

State (`state/`):
- 0001_init.sql con positions, instructions, decisions, dvol_history,
  manual_actions, system_state.
- db.py: connect con WAL + foreign_keys + transaction ctx, runner
  forward-only basato su PRAGMA user_version.
- models.py: record Pydantic, Decimal preservato come TEXT.
- repository.py: CRUD tipizzato con singola connessione passata dal
  chiamante, cache-aware, posizioni concorrenti.

Safety (`safety/`):
- audit_log.py: AuditLog append-only con SHA-256 chain e fsync,
  verify_chain riconosce ogni manomissione (payload, prev_hash,
  hash, JSON, separatori).
- kill_switch.py: arm/disarm transazionali, idempotenti, accoppiati
  all'audit chain.

Config (`config/loader.py` + `strategy.yaml`):
- Loader YAML con deep-merge di strategy.local.yaml.
- Verifica config_hash SHA-256 (riga config_hash esclusa).
- File golden strategy.yaml + esempio override.

Scripts:
- dead_man.sh: watchdog shell indipendente da Python.
- backup.py: VACUUM INTO orario con retention 30 giorni.

CLI:
- audit verify (exit 2 su tampering).
- kill-switch arm/disarm/status su SQLite reale.
- state inspect con tabella posizioni aperte.
- config hash, config validate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-27 13:35:35 +02:00
parent fbb7753cc6
commit 263470786d
25 changed files with 3669 additions and 14 deletions
+170
View File
@@ -0,0 +1,170 @@
"""Kill switch behaviour: SQLite + audit log stay in lock-step."""
from __future__ import annotations
from datetime import UTC, datetime, timedelta
from pathlib import Path
import pytest
from cerbero_bite.safety import AuditLog, verify_chain
from cerbero_bite.safety.kill_switch import KillSwitch, KillSwitchError
from cerbero_bite.state import Repository, connect, run_migrations, transaction
def _make_kill_switch(tmp_path: Path) -> tuple[KillSwitch, AuditLog, Path, Repository]:
    """Build a ``KillSwitch`` wired to a fresh SQLite file and audit log.

    The database ``state.sqlite`` and the log ``audit.log`` are both created
    under *tmp_path*. Migrations are applied and the ``system_state`` row is
    initialised with ``config_version="1.0.0"`` before the kill switch is
    constructed.

    The injected clock is deterministic: successive calls yield 14:10, 14:20,
    14:30, 14:40 and 14:50 UTC on 2026-04-27. A sixth call raises
    ``StopIteration``.

    Returns:
        ``(kill_switch, audit_log, audit_path, repository)``.
    """
    db_path = tmp_path / "state.sqlite"
    audit_path = tmp_path / "audit.log"
    repo = Repository()

    # Close the bootstrap connection even if a migration or the initial
    # insert fails, so a broken setup does not leak an open SQLite handle.
    conn = connect(db_path)
    try:
        run_migrations(conn)
        with transaction(conn):
            repo.init_system_state(
                conn, config_version="1.0.0", now=datetime(2026, 4, 27, 14, 0, tzinfo=UTC)
            )
    finally:
        conn.close()

    audit = AuditLog(audit_path)
    # A generator expression is already an iterator; no iter() wrapper needed.
    times = (datetime(2026, 4, 27, 14, m, tzinfo=UTC) for m in (10, 20, 30, 40, 50))
    ks = KillSwitch(
        connection_factory=lambda: connect(db_path),
        repository=repo,
        audit_log=audit,
        clock=lambda: next(times),
    )
    return ks, audit, audit_path, repo
def test_arm_persists_state_and_appends_audit(tmp_path: Path) -> None:
    """Arming stores the flag, reason and timestamp, and writes one audit line."""
    kill_switch, _, log_path, repository = _make_kill_switch(tmp_path)
    assert kill_switch.is_armed() is False

    kill_switch.arm(reason="manual test", source="manual")
    assert kill_switch.is_armed() is True

    # Re-read the row through an independent connection to prove it was persisted.
    db = connect(tmp_path / "state.sqlite")
    try:
        persisted = repository.get_system_state(db)
    finally:
        db.close()

    assert persisted is not None
    assert persisted.kill_switch == 1
    assert persisted.kill_reason == "manual test"
    assert persisted.kill_at is not None

    # Exactly one audit line was written by the single arm call.
    assert verify_chain(log_path) == 1
def test_arm_is_idempotent_on_second_call(tmp_path: Path) -> None:
    """A second ``arm`` on an already armed switch adds no audit line."""
    kill_switch, _, log_path, _ = _make_kill_switch(tmp_path)

    kill_switch.arm(reason="first", source="manual")
    # Already armed: this call is expected to short-circuit.
    kill_switch.arm(reason="second", source="manual")

    # Still a single audit line, written by the first call only.
    assert verify_chain(log_path) == 1
def test_disarm_resets_kill_switch(tmp_path: Path) -> None:
    """Disarming after arming clears the flag and ``kill_at``; two audit lines result."""
    kill_switch, _, log_path, repository = _make_kill_switch(tmp_path)

    kill_switch.arm(reason="test", source="manual")
    kill_switch.disarm(reason="cleared", source="manual")
    assert kill_switch.is_armed() is False

    # Inspect the stored row through a separate connection.
    db = connect(tmp_path / "state.sqlite")
    try:
        persisted = repository.get_system_state(db)
    finally:
        db.close()

    assert persisted is not None
    assert persisted.kill_at is None

    # One line for arm, one for disarm.
    assert verify_chain(log_path) == 2
def test_disarm_when_not_armed_is_noop(tmp_path: Path) -> None:
    """Disarming a switch that was never armed leaves the audit chain empty."""
    kill_switch, _, log_path, _ = _make_kill_switch(tmp_path)

    kill_switch.disarm(reason="nothing to do", source="manual")

    assert verify_chain(log_path) == 0
def test_arm_requires_reason(tmp_path: Path) -> None:
    """``arm`` with an empty reason raises ``KillSwitchError``."""
    kill_switch, *_ = _make_kill_switch(tmp_path)

    with pytest.raises(KillSwitchError, match="reason is required"):
        kill_switch.arm(reason="", source="manual")
def test_arm_without_initialised_state_raises(tmp_path: Path) -> None:
    """``arm`` fails when the schema exists but ``system_state`` was never initialised."""
    database = tmp_path / "state.sqlite"
    log_path = tmp_path / "audit.log"

    # Apply migrations only; deliberately skip init_system_state.
    bootstrap = connect(database)
    run_migrations(bootstrap)
    bootstrap.close()

    kill_switch = KillSwitch(
        connection_factory=lambda: connect(database),
        repository=Repository(),
        audit_log=AuditLog(log_path),
        clock=lambda: datetime(2026, 4, 27, 14, 0, tzinfo=UTC),
    )

    with pytest.raises(KillSwitchError, match="system_state singleton missing"):
        kill_switch.arm(reason="x", source="manual")
def test_audit_chain_records_event_kind(tmp_path: Path) -> None:
    """The audit file contains both the armed and the disarmed event names."""
    kill_switch, _, log_path, _ = _make_kill_switch(tmp_path)

    kill_switch.arm(reason="x", source="mcp_timeout")
    kill_switch.disarm(reason="y", source="manual")

    contents = log_path.read_text(encoding="utf-8")
    for event_name in ("KILL_SWITCH_ARMED", "KILL_SWITCH_DISARMED"):
        assert event_name in contents
def test_is_armed_returns_false_when_singleton_missing(tmp_path: Path) -> None:
    """``is_armed`` reports ``False`` when ``system_state`` was never initialised."""
    database = tmp_path / "state.sqlite"
    log_path = tmp_path / "audit.log"

    # Schema only: migrations run, but no system_state row is created.
    bootstrap = connect(database)
    run_migrations(bootstrap)
    bootstrap.close()

    kill_switch = KillSwitch(
        connection_factory=lambda: connect(database),
        repository=Repository(),
        audit_log=AuditLog(log_path),
        clock=lambda: datetime(2026, 4, 27, 14, 0, tzinfo=UTC),
    )

    assert kill_switch.is_armed() is False
def test_disarm_requires_reason(tmp_path: Path) -> None:
    """``disarm`` with an empty reason raises ``KillSwitchError``."""
    kill_switch, *_ = _make_kill_switch(tmp_path)

    with pytest.raises(KillSwitchError, match="reason is required"):
        kill_switch.disarm(reason="", source="manual")
def test_disarm_without_initialised_state_raises(tmp_path: Path) -> None:
    """``disarm`` fails when the schema exists but ``system_state`` was never initialised."""
    database = tmp_path / "state.sqlite"
    log_path = tmp_path / "audit.log"

    # Apply migrations only; deliberately skip init_system_state.
    bootstrap = connect(database)
    run_migrations(bootstrap)
    bootstrap.close()

    kill_switch = KillSwitch(
        connection_factory=lambda: connect(database),
        repository=Repository(),
        audit_log=AuditLog(log_path),
        clock=lambda: datetime(2026, 4, 27, 14, 0, tzinfo=UTC),
    )

    with pytest.raises(KillSwitchError, match="system_state singleton missing"):
        kill_switch.disarm(reason="x", source="manual")
def test_clock_is_advanced_for_each_call(tmp_path: Path) -> None:
    """After arm then disarm, ``last_health_check`` is no earlier than 14:14:59 UTC."""
    kill_switch, _, _, repository = _make_kill_switch(tmp_path)

    kill_switch.arm(reason="x", source="manual")
    kill_switch.disarm(reason="y", source="manual")

    db = connect(tmp_path / "state.sqlite")
    try:
        persisted = repository.get_system_state(db)
    finally:
        db.close()

    assert persisted is not None

    # last_health_check should reflect the disarm time (14:20 from the fake clock),
    # so it must be later than a bound placed just before 14:15.
    lower_bound = datetime(2026, 4, 27, 14, 15, tzinfo=UTC) - timedelta(seconds=1)
    assert persisted.last_health_check >= lower_bound