Hardening round 2: healthcheck, audit anchor, return_4h, exec config, signals

Sei interventi a priorità MEDIA sul sistema. 323 test passano, mypy strict
pulito, ruff clean.

1. Docker HEALTHCHECK + cerbero-bite healthcheck:
   - nuovo subcommand che esce 0 se kill_switch=0 e last_health_check
     entro --max-staleness-s (default 600s);
   - HEALTHCHECK direttiva nel Dockerfile (60s interval, 5s timeout,
     start_period 120s, retries 3);
   - healthcheck definition nel docker-compose.yml.

2. Audit hash chain anti-truncation:
   - migration 0002: nuova colonna system_state.last_audit_hash;
   - AuditLog accetta una callback on_append; dependencies.py la collega
     a repository.set_last_audit_hash;
   - Orchestrator.boot verifica che il tail del file corrisponda
     all'anchor persistito; in caso di mismatch → kill switch CRITICAL.

3. return_4h bootstrap da deribit get_historical:
   - quando dvol_history è vuoto _fetch_return_4h ricade su
     deribit.historical_close (candela 1h di 4 ore fa);
   - alert LOW se anche il fallback fallisce.

4. execution.environment + execution.eur_to_usd in strategy.yaml:
   - ExecutionConfig promosso a typed schema con i due campi
     consumati al boot;
   - CLI start preferisce i valori da config; i flag CLI hanno la
     precedenza solo quando differiscono dai default.

5. Cycle correlation ID:
   - structlog.contextvars.bind_contextvars in run_entry/run_monitor/
     run_health propaga cycle_id e cycle nei log strutturati.

6. SIGTERM/SIGINT clean shutdown:
   - run_forever installa loop.add_signal_handler per SIGTERM e
     SIGINT; il segnale imposta (set()) un asyncio.Event che termina il
     blocco principale; scheduler.shutdown e ctx.aclose finalizzano.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-28 00:37:39 +02:00
parent 411b747e93
commit b5b96f959c
15 changed files with 477 additions and 24 deletions
+84 -6
View File
@@ -17,7 +17,9 @@ from datetime import UTC, datetime
from decimal import Decimal
from pathlib import Path
from typing import Literal
from uuid import uuid4
import structlog
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from cerbero_bite.config.mcp_endpoints import McpEndpoints
@@ -29,6 +31,7 @@ from cerbero_bite.runtime.lockfile import EngineLock
from cerbero_bite.runtime.monitor_cycle import MonitorCycleResult, run_monitor_cycle
from cerbero_bite.runtime.recovery import recover_state
from cerbero_bite.runtime.scheduler import JobSpec, build_scheduler
from cerbero_bite.state import connect as connect_state
__all__ = ["Orchestrator"]
@@ -82,6 +85,7 @@ class Orchestrator:
async def boot(self) -> _BootResult:
"""Reconcile state, verify environment, run a first health probe."""
when = self._ctx.clock()
await self._verify_audit_anchor(now=when)
await recover_state(self._ctx, now=when)
info = await self._ctx.deribit.environment_info()
@@ -111,22 +115,70 @@ class Orchestrator:
# Cycle invocations (used by scheduler jobs and CLI dry-run)
# ------------------------------------------------------------------
async def _verify_audit_anchor(self, *, now: datetime) -> None:  # noqa: ARG002
    """Check the audit file's tail hash against the persisted anchor.

    ``now`` is unused — it is kept only for signature symmetry with
    the other ``boot`` helpers. The comparison is purely between the
    in-memory tail hash of the audit log and the anchor value stored
    in ``system_state`` by the previous run. On mismatch a CRITICAL
    alert is raised; the very first boot (no anchor persisted yet)
    passes silently.
    """
    conn = connect_state(self._ctx.db_path)
    try:
        persisted = self._ctx.repository.get_system_state(conn)
    finally:
        conn.close()

    anchor = None if persisted is None else persisted.last_audit_hash
    if anchor is None:
        # First boot: no anchor was ever written, nothing to compare.
        return

    file_tail = self._ctx.audit_log.last_hash
    if file_tail == anchor:
        return

    # Tail drifted from the anchor — the file was truncated or edited.
    await self._ctx.alert_manager.critical(
        source="orchestrator.boot",
        message=(
            f"audit log anchor mismatch: anchor="
            f"{anchor[:12]}…, file tail="
            f"{file_tail[:12]}… — possible tampering or truncation"
        ),
        component="safety.audit_log",
    )
async def run_entry(
    self, *, now: datetime | None = None
) -> EntryCycleResult:
    """Run one entry cycle with a correlation id bound to the logs.

    A fresh ``cycle_id`` and ``cycle="entry"`` are bound to structlog's
    contextvars for the duration of the cycle, so every structured log
    line it emits can be correlated; the bindings are always reset in
    the ``finally`` block, even when the cycle raises.
    """
    cycle_id = str(uuid4())
    # bind_contextvars returns a mapping of keys to reset tokens.
    tokens = structlog.contextvars.bind_contextvars(
        cycle="entry", cycle_id=cycle_id
    )
    try:
        return await run_entry_cycle(
            self._ctx, eur_to_usd_rate=self._eur_to_usd, now=now
        )
    finally:
        structlog.contextvars.reset_contextvars(**tokens)
async def run_monitor(
    self, *, now: datetime | None = None
) -> MonitorCycleResult:
    """Run one monitor cycle with a correlation id bound to the logs.

    Binds ``cycle="monitor"`` and a fresh ``cycle_id`` to structlog's
    contextvars for the duration of the cycle; the bindings are reset
    in ``finally`` even when the cycle raises.
    """
    cycle_id = str(uuid4())
    # bind_contextvars returns a mapping of keys to reset tokens.
    tokens = structlog.contextvars.bind_contextvars(
        cycle="monitor", cycle_id=cycle_id
    )
    try:
        return await run_monitor_cycle(self._ctx, now=now)
    finally:
        structlog.contextvars.reset_contextvars(**tokens)
async def run_health(
    self, *, now: datetime | None = None
) -> HealthCheckResult:
    """Run one health probe with a correlation id bound to the logs.

    Binds ``cycle="health"`` and a fresh ``cycle_id`` to structlog's
    contextvars for the duration of the probe; the bindings are reset
    in ``finally`` even when the probe raises.
    """
    cycle_id = str(uuid4())
    # bind_contextvars returns a mapping of keys to reset tokens.
    tokens = structlog.contextvars.bind_contextvars(
        cycle="health", cycle_id=cycle_id
    )
    try:
        return await self._health.run(now=now)
    finally:
        structlog.contextvars.reset_contextvars(**tokens)
# ------------------------------------------------------------------
# Scheduler lifecycle
@@ -191,8 +243,14 @@ class Orchestrator:
"""Boot, acquire the single-instance lock, install the scheduler.
``lock_path`` defaults to ``<db_path.parent>/.lockfile`` so two
containers cannot trade against the same SQLite file.
containers cannot trade against the same SQLite file. SIGTERM
and SIGINT are intercepted so Docker (or the operator) can
signal a clean shutdown — the scheduler is stopped, in-flight
cycles complete, the audit log fsyncs, and the HTTP client is
closed before the process exits.
"""
import signal # noqa: PLC0415 — only needed by run_forever
lock = EngineLock(
lock_path or self._ctx.db_path.parent / ".lockfile"
)
@@ -201,8 +259,28 @@ class Orchestrator:
await self.boot()
scheduler = self.install_scheduler()
scheduler.start()
stop_event = asyncio.Event()
def _on_signal(signame: str) -> None:
_log.info("received %s — initiating shutdown", signame)
stop_event.set()
loop = asyncio.get_running_loop()
for sig_name in ("SIGTERM", "SIGINT"):
sig = getattr(signal, sig_name, None)
if sig is None: # pragma: no cover — Windows fallback
continue
try:
loop.add_signal_handler(
sig, _on_signal, sig_name
)
except NotImplementedError: # pragma: no cover
# Some sandboxes (Windows asyncio) don't support
# add_signal_handler; fall back to no-op.
signal.signal(sig, lambda *_: stop_event.set())
try:
await asyncio.Event().wait()
await stop_event.wait()
finally:
scheduler.shutdown(wait=False)
finally: