Hardening round 2: healthcheck, audit anchor, return_4h, exec config, signals

Sei interventi MEDIA priorità sul sistema. 323 test pass, mypy strict
pulito, ruff clean.

1. Docker HEALTHCHECK + cerbero-bite healthcheck:
   - nuovo subcommand che esce 0 se kill_switch=0 e last_health_check
     entro --max-staleness-s (default 600s);
   - HEALTHCHECK direttiva nel Dockerfile (60s interval, 5s timeout,
     start_period 120s, retries 3);
   - healthcheck definition nel docker-compose.yml.

2. Audit hash chain anti-truncation:
   - migration 0002: nuova colonna system_state.last_audit_hash;
   - AuditLog accetta callback on_append, dependencies.py la wire al
     repository.set_last_audit_hash;
   - Orchestrator.boot verifica che il tail del file corrisponda
     all'anchor persistito; mismatch → kill switch CRITICAL.

3. return_4h bootstrap da deribit get_historical:
   - quando dvol_history è vuoto _fetch_return_4h ricade su
     deribit.historical_close (candela 1h di 4h fa);
   - alert LOW se anche il fallback fallisce.

4. execution.environment + execution.eur_to_usd in strategy.yaml:
   - ExecutionConfig promosso a typed schema con i due campi
     consumati al boot;
   - CLI start preferisce i valori da config; i flag CLI fanno
     override solo quando differiscono dai default.

5. Cycle correlation ID:
   - structlog.contextvars.bind_contextvars in run_entry/run_monitor/
     run_health propaga cycle_id e cycle nei log strutturati.

6. SIGTERM/SIGINT clean shutdown:
   - run_forever installa loop.add_signal_handler per SIGTERM e
     SIGINT; il segnale imposta (set) un asyncio.Event che termina il
     blocco principale, scheduler.shutdown e ctx.aclose finalizzano.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-28 00:37:39 +02:00
parent 411b747e93
commit b5b96f959c
15 changed files with 477 additions and 24 deletions
+17 -1
View File
@@ -103,7 +103,23 @@ def build_runtime(
finally:
conn.close()
audit_log = AuditLog(audit_path)
def _persist_audit_anchor(line_hash: str) -> None:
    """Mirror the newest audit chain hash into ``system_state``.

    Persistence here is best-effort: if SQLite is momentarily locked
    by another writer, the audit file itself stays consistent and the
    anchor simply catches up on the next append.
    """
    conn = connect(db_path)
    try:
        try:
            with transaction(conn):
                repository.set_last_audit_hash(conn, hex_hash=line_hash)
        except Exception:  # pragma: no cover — durability is best-effort
            pass
    finally:
        conn.close()
audit_log = AuditLog(audit_path, on_append=_persist_audit_anchor)
kill_switch = KillSwitch(
connection_factory=lambda: connect(db_path),
repository=repository,
+34 -11
View File
@@ -154,13 +154,19 @@ def _option_type_from_name(name: str) -> PutOrCall:
async def _fetch_return_4h(ctx: RuntimeContext, *, now: datetime) -> Decimal:
"""Compute ETH 4h return from the locally stored dvol_history snapshots.
"""Compute ETH 4h return.
The orchestrator records a snapshot at the start of every monitor
cycle (see :func:`run_monitor_cycle`); this helper reads the most
recent snapshot at least 3.5h old and computes ``(now / past) - 1``.
Returns 0 if no historical sample is available — in that branch the
orchestrator emits a LOW alert about insufficient history.
Resolution order:
1. local ``dvol_history`` snapshot at least 3h30 old (recorded by
previous monitor cycles);
2. Deribit ``get_historical`` 1h candles 4h ago — bootstrap when
SQLite has no recent sample (first cycle after a fresh
container, or after long downtime).
Returns ``0`` only when both sources fail; in that case the
monitor cycle emits a LOW alert and exit_decision falls back to
HOLD on the adverse-move trigger.
"""
cutoff = now - timedelta(hours=3, minutes=30)
floor = now - timedelta(hours=8)
@@ -174,13 +180,30 @@ async def _fetch_return_4h(ctx: RuntimeContext, *, now: datetime) -> Decimal:
).fetchone()
finally:
conn.close()
if row is None:
return Decimal("0")
past_spot = Decimal(str(row[1]))
if past_spot == 0:
if row is not None:
past_spot = Decimal(str(row[1]))
if past_spot != 0:
spot_now = await ctx.deribit.index_price_eth()
return spot_now / past_spot - Decimal("1")
# Fallback: ask Deribit for the 4h candle close.
try:
past_close = await ctx.deribit.historical_close(
instrument="ETH-PERPETUAL",
start=now - timedelta(hours=5),
end=now - timedelta(hours=3, minutes=30),
resolution="1h",
)
except Exception: # pragma: no cover — defensive, surface as LOW alert
past_close = None
if past_close is None or past_close == 0:
await ctx.alert_manager.low(
source="monitor_cycle",
message="no return_4h sample available (history empty + bootstrap failed)",
)
return Decimal("0")
spot_now = await ctx.deribit.index_price_eth()
return spot_now / past_spot - Decimal("1")
return spot_now / past_close - Decimal("1")
# ---------------------------------------------------------------------------
+84 -6
View File
@@ -17,7 +17,9 @@ from datetime import UTC, datetime
from decimal import Decimal
from pathlib import Path
from typing import Literal
from uuid import uuid4
import structlog
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from cerbero_bite.config.mcp_endpoints import McpEndpoints
@@ -29,6 +31,7 @@ from cerbero_bite.runtime.lockfile import EngineLock
from cerbero_bite.runtime.monitor_cycle import MonitorCycleResult, run_monitor_cycle
from cerbero_bite.runtime.recovery import recover_state
from cerbero_bite.runtime.scheduler import JobSpec, build_scheduler
from cerbero_bite.state import connect as connect_state
__all__ = ["Orchestrator"]
@@ -82,6 +85,7 @@ class Orchestrator:
async def boot(self) -> _BootResult:
"""Reconcile state, verify environment, run a first health probe."""
when = self._ctx.clock()
await self._verify_audit_anchor(now=when)
await recover_state(self._ctx, now=when)
info = await self._ctx.deribit.environment_info()
@@ -111,22 +115,70 @@ class Orchestrator:
# Cycle invocations (used by scheduler jobs and CLI dry-run)
# ------------------------------------------------------------------
async def _verify_audit_anchor(self, *, now: datetime) -> None:  # noqa: ARG002
    """Check the on-disk audit log tail against the SQLite anchor.

    ``now`` is unused — it is kept only so the signature mirrors the
    other ``boot`` helpers.  The comparison is purely between the
    audit log's in-memory tail hash and the anchor persisted by the
    previous run; a missing anchor means first boot and is accepted
    silently, while a mismatch raises a CRITICAL alert.
    """
    ctx = self._ctx
    conn = connect_state(ctx.db_path)
    try:
        state = ctx.repository.get_system_state(conn)
    finally:
        conn.close()
    anchor = None if state is None else state.last_audit_hash
    if anchor is None:
        # First boot: nothing was persisted yet, nothing to compare.
        return
    tail = ctx.audit_log.last_hash
    if tail == anchor:
        return
    await ctx.alert_manager.critical(
        source="orchestrator.boot",
        message=(
            f"audit log anchor mismatch: anchor="
            f"{anchor[:12]}…, file tail="
            f"{tail[:12]}… — possible tampering or truncation"
        ),
        component="safety.audit_log",
    )
async def run_entry(
self, *, now: datetime | None = None
) -> EntryCycleResult:
return await run_entry_cycle(
self._ctx, eur_to_usd_rate=self._eur_to_usd, now=now
cycle_id = str(uuid4())
token = structlog.contextvars.bind_contextvars(
cycle="entry", cycle_id=cycle_id
)
try:
return await run_entry_cycle(
self._ctx, eur_to_usd_rate=self._eur_to_usd, now=now
)
finally:
structlog.contextvars.reset_contextvars(**token)
async def run_monitor(
self, *, now: datetime | None = None
) -> MonitorCycleResult:
return await run_monitor_cycle(self._ctx, now=now)
cycle_id = str(uuid4())
token = structlog.contextvars.bind_contextvars(
cycle="monitor", cycle_id=cycle_id
)
try:
return await run_monitor_cycle(self._ctx, now=now)
finally:
structlog.contextvars.reset_contextvars(**token)
async def run_health(
self, *, now: datetime | None = None
) -> HealthCheckResult:
return await self._health.run(now=now)
cycle_id = str(uuid4())
token = structlog.contextvars.bind_contextvars(
cycle="health", cycle_id=cycle_id
)
try:
return await self._health.run(now=now)
finally:
structlog.contextvars.reset_contextvars(**token)
# ------------------------------------------------------------------
# Scheduler lifecycle
@@ -191,8 +243,14 @@ class Orchestrator:
"""Boot, acquire the single-instance lock, install the scheduler.
``lock_path`` defaults to ``<db_path.parent>/.lockfile`` so two
containers cannot trade against the same SQLite file.
containers cannot trade against the same SQLite file. SIGTERM
and SIGINT are intercepted so Docker (or the operator) can
signal a clean shutdown — the scheduler is stopped, in-flight
cycles complete, the audit log fsyncs, and the HTTP client is
closed before the process exits.
"""
import signal # noqa: PLC0415 — only needed by run_forever
lock = EngineLock(
lock_path or self._ctx.db_path.parent / ".lockfile"
)
@@ -201,8 +259,28 @@ class Orchestrator:
await self.boot()
scheduler = self.install_scheduler()
scheduler.start()
stop_event = asyncio.Event()
def _on_signal(signame: str) -> None:
    # Signal callback: record which signal arrived, then flip the stop
    # event so the main wait in run_forever unblocks and shuts down.
    _log.info("received %s — initiating shutdown", signame)
    stop_event.set()
loop = asyncio.get_running_loop()
for sig_name in ("SIGTERM", "SIGINT"):
sig = getattr(signal, sig_name, None)
if sig is None: # pragma: no cover — Windows fallback
continue
try:
loop.add_signal_handler(
sig, _on_signal, sig_name
)
except NotImplementedError: # pragma: no cover
# Some sandboxes (Windows asyncio) don't support
# add_signal_handler; fall back to no-op.
signal.signal(sig, lambda *_: stop_event.set())
try:
await asyncio.Event().wait()
await stop_event.wait()
finally:
scheduler.shutdown(wait=False)
finally: