Hardening round 2: healthcheck, audit anchor, return_4h, exec config, signals

Sei interventi a priorità MEDIA sul sistema. 323 test passano, mypy strict
pulito, ruff clean.

1. Docker HEALTHCHECK + cerbero-bite healthcheck:
   - nuovo subcommand che esce 0 se kill_switch=0 e last_health_check
     entro --max-staleness-s (default 600s);
   - HEALTHCHECK direttiva nel Dockerfile (60s interval, 5s timeout,
     start_period 120s, retries 3);
   - healthcheck definition nel docker-compose.yml.

2. Audit hash chain anti-truncation:
   - migration 0002: nuova colonna system_state.last_audit_hash;
   - AuditLog accetta una callback on_append; dependencies.py la collega
     a repository.set_last_audit_hash;
   - Orchestrator.boot verifica che il tail del file corrisponda
     all'anchor persistito; in caso di mismatch → kill switch CRITICAL.

3. return_4h bootstrap da deribit get_historical:
   - quando dvol_history è vuoto _fetch_return_4h ricade su
     deribit.historical_close (candela 1h di 4 ore fa);
   - alert LOW se anche il fallback fallisce.

4. execution.environment + execution.eur_to_usd in strategy.yaml:
   - ExecutionConfig promosso a typed schema con i due campi
     consumati al boot;
   - CLI start preferisce i valori da config; i flag CLI hanno la
     precedenza solo quando differiscono dai default.

5. Cycle correlation ID:
   - structlog.contextvars.bind_contextvars in run_entry/run_monitor/
     run_health propaga cycle_id e cycle nei log strutturati.

6. SIGTERM/SIGINT clean shutdown:
   - run_forever installa loop.add_signal_handler per SIGTERM e
     SIGINT; il segnale imposta (set()) un asyncio.Event che termina il
     blocco principale; scheduler.shutdown e ctx.aclose finalizzano.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-28 00:37:39 +02:00
parent 411b747e93
commit b5b96f959c
15 changed files with 477 additions and 24 deletions
+84 -6
View File
@@ -17,7 +17,9 @@ from datetime import UTC, datetime
from decimal import Decimal
from pathlib import Path
from typing import Literal
from uuid import uuid4
import structlog
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from cerbero_bite.config.mcp_endpoints import McpEndpoints
@@ -29,6 +31,7 @@ from cerbero_bite.runtime.lockfile import EngineLock
from cerbero_bite.runtime.monitor_cycle import MonitorCycleResult, run_monitor_cycle
from cerbero_bite.runtime.recovery import recover_state
from cerbero_bite.runtime.scheduler import JobSpec, build_scheduler
from cerbero_bite.state import connect as connect_state
__all__ = ["Orchestrator"]
@@ -82,6 +85,7 @@ class Orchestrator:
async def boot(self) -> _BootResult:
"""Reconcile state, verify environment, run a first health probe."""
when = self._ctx.clock()
await self._verify_audit_anchor(now=when)
await recover_state(self._ctx, now=when)
info = await self._ctx.deribit.environment_info()
@@ -111,22 +115,70 @@ class Orchestrator:
# Cycle invocations (used by scheduler jobs and CLI dry-run)
# ------------------------------------------------------------------
async def _verify_audit_anchor(self, *, now: datetime) -> None:  # noqa: ARG002
    """Check the audit file's tail hash against the persisted anchor.

    ``now`` is unused — it is kept only for signature symmetry with
    the other ``boot`` helpers. The comparison is purely between the
    in-memory tail hash of the audit log and the anchor value stored
    in ``system_state`` by the previous run. On mismatch a CRITICAL
    alert is raised; the very first boot (no anchor persisted yet)
    passes silently.
    """
    conn = connect_state(self._ctx.db_path)
    try:
        persisted = self._ctx.repository.get_system_state(conn)
    finally:
        conn.close()

    anchor = None if persisted is None else persisted.last_audit_hash
    if anchor is None:
        # First boot: no anchor was ever written, nothing to compare.
        return

    file_tail = self._ctx.audit_log.last_hash
    if file_tail == anchor:
        return

    # Tail drifted from the anchor — the file was truncated or edited.
    await self._ctx.alert_manager.critical(
        source="orchestrator.boot",
        message=(
            f"audit log anchor mismatch: anchor="
            f"{anchor[:12]}…, file tail="
            f"{file_tail[:12]}… — possible tampering or truncation"
        ),
        component="safety.audit_log",
    )
async def run_entry(
    self, *, now: datetime | None = None
) -> EntryCycleResult:
    """Run one entry cycle with a correlation id bound to the logs.

    A fresh ``cycle_id`` and ``cycle="entry"`` are bound to structlog's
    contextvars for the duration of the cycle, so every structured log
    line it emits can be correlated; the bindings are always reset in
    the ``finally`` block, even when the cycle raises.
    """
    cycle_id = str(uuid4())
    # bind_contextvars returns a mapping of keys to reset tokens.
    tokens = structlog.contextvars.bind_contextvars(
        cycle="entry", cycle_id=cycle_id
    )
    try:
        return await run_entry_cycle(
            self._ctx, eur_to_usd_rate=self._eur_to_usd, now=now
        )
    finally:
        structlog.contextvars.reset_contextvars(**tokens)
async def run_monitor(
    self, *, now: datetime | None = None
) -> MonitorCycleResult:
    """Run one monitor cycle with a correlation id bound to the logs.

    Binds ``cycle="monitor"`` and a fresh ``cycle_id`` to structlog's
    contextvars for the duration of the cycle; the bindings are reset
    in ``finally`` even when the cycle raises.
    """
    cycle_id = str(uuid4())
    # bind_contextvars returns a mapping of keys to reset tokens.
    tokens = structlog.contextvars.bind_contextvars(
        cycle="monitor", cycle_id=cycle_id
    )
    try:
        return await run_monitor_cycle(self._ctx, now=now)
    finally:
        structlog.contextvars.reset_contextvars(**tokens)
async def run_health(
    self, *, now: datetime | None = None
) -> HealthCheckResult:
    """Run one health probe with a correlation id bound to the logs.

    Binds ``cycle="health"`` and a fresh ``cycle_id`` to structlog's
    contextvars for the duration of the probe; the bindings are reset
    in ``finally`` even when the probe raises.
    """
    cycle_id = str(uuid4())
    # bind_contextvars returns a mapping of keys to reset tokens.
    tokens = structlog.contextvars.bind_contextvars(
        cycle="health", cycle_id=cycle_id
    )
    try:
        return await self._health.run(now=now)
    finally:
        structlog.contextvars.reset_contextvars(**tokens)
# ------------------------------------------------------------------
# Scheduler lifecycle
@@ -191,8 +243,14 @@ class Orchestrator:
"""Boot, acquire the single-instance lock, install the scheduler.
``lock_path`` defaults to ``<db_path.parent>/.lockfile`` so two
containers cannot trade against the same SQLite file.
containers cannot trade against the same SQLite file. SIGTERM
and SIGINT are intercepted so Docker (or the operator) can
signal a clean shutdown — the scheduler is stopped, in-flight
cycles complete, the audit log fsyncs, and the HTTP client is
closed before the process exits.
"""
import signal # noqa: PLC0415 — only needed by run_forever
lock = EngineLock(
lock_path or self._ctx.db_path.parent / ".lockfile"
)
@@ -201,8 +259,28 @@ class Orchestrator:
await self.boot()
scheduler = self.install_scheduler()
scheduler.start()
stop_event = asyncio.Event()
def _on_signal(signame: str) -> None:
_log.info("received %s — initiating shutdown", signame)
stop_event.set()
loop = asyncio.get_running_loop()
for sig_name in ("SIGTERM", "SIGINT"):
sig = getattr(signal, sig_name, None)
if sig is None: # pragma: no cover — Windows fallback
continue
try:
loop.add_signal_handler(
sig, _on_signal, sig_name
)
except NotImplementedError: # pragma: no cover
# Some sandboxes (Windows asyncio) don't support
# add_signal_handler; fall back to no-op.
signal.signal(sig, lambda *_: stop_event.set())
try:
await asyncio.Event().wait()
await stop_event.wait()
finally:
scheduler.shutdown(wait=False)
finally: