Hardening round 2: healthcheck, audit anchor, return_4h, exec config, signals
Six MEDIUM-priority hardening changes to the system. 323 tests pass, mypy
strict clean, ruff clean.
1. Docker HEALTHCHECK + cerbero-bite healthcheck:
   - new subcommand that exits 0 only if kill_switch=0 and last_health_check
     is within --max-staleness-s (default 600s); sketch below;
   - HEALTHCHECK directive in the Dockerfile (interval 60s, timeout 5s,
     start_period 120s, retries 3);
   - healthcheck definition in docker-compose.yml.
2. Audit hash chain anti-truncation:
   - migration 0002: new system_state.last_audit_hash column;
   - AuditLog accepts an on_append callback; dependencies.py wires it to
     repository.set_last_audit_hash (sketch below);
   - Orchestrator.boot verifies that the file tail matches the persisted
     anchor; a mismatch → CRITICAL kill switch.
3. return_4h bootstrap from deribit get_historical:
   - when dvol_history is empty, _fetch_return_4h falls back to
     deribit.historical_close (the 1h candle from 4h ago); sketch below;
   - LOW alert if the fallback fails as well.
4. execution.environment + execution.eur_to_usd in strategy.yaml:
   - ExecutionConfig promoted to a typed schema with the two fields
     consumed at boot;
   - the start CLI prefers the config values; CLI flags override them only
     when they differ from their defaults (sketch below).
5. Cycle correlation ID:
   - structlog.contextvars.bind_contextvars in run_entry/run_monitor/
     run_health propagates cycle_id and cycle into the structured logs.
6. SIGTERM/SIGINT clean shutdown:
   - run_forever installs loop.add_signal_handler for SIGTERM and SIGINT;
     the signal set()s an asyncio.Event that ends the main wait, then
     scheduler.shutdown and ctx.aclose finalize the teardown.
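
Illustrative sketch for item 1: the exit-code contract of the healthcheck
subcommand. Only the rule (exit 0 iff kill_switch=0 and the last probe is
younger than --max-staleness-s) comes from this commit; the function name and
the way state reaches it are placeholders, not the real CLI wiring.

    # Sketch only: check_health() is a hypothetical stand-in for however the
    # real subcommand reads kill_switch and last_health_check from SQLite.
    import sys
    from datetime import UTC, datetime

    def check_health(
        kill_switch: int,
        last_health_check: datetime | None,
        max_staleness_s: float = 600.0,
    ) -> int:
        """Return 0 (healthy) or 1 (unhealthy); Docker only distinguishes 0/1."""
        if kill_switch != 0 or last_health_check is None:
            return 1
        age_s = (datetime.now(UTC) - last_health_check).total_seconds()
        return 0 if age_s <= max_staleness_s else 1

    if __name__ == "__main__":
        sys.exit(check_health(kill_switch=0, last_health_check=datetime.now(UTC)))

In the Dockerfile this presumably sits behind something like
HEALTHCHECK --interval=60s --timeout=5s --start-period=120s --retries=3
CMD cerbero-bite healthcheck.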
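
Illustrative sketch for item 2: how an on_append callback keeps the SQLite
anchor in lock-step with the append-only audit file. The class, the hashing
detail, and the helper below are simplified stand-ins, not the real
cerbero_bite signatures.

    # Sketch only: a hash-chained append log that reports its tail to a
    # callback; the real dependencies.py wires that callback to
    # repository.set_last_audit_hash so the anchor survives restarts.
    import hashlib
    from collections.abc import Callable

    class ChainedLog:
        def __init__(self, on_append: Callable[[str], None] | None = None) -> None:
            self.last_hash = ""          # tail of the hash chain
            self._on_append = on_append  # persists the tail as the SQLite anchor

        def append(self, line: str) -> None:
            self.last_hash = hashlib.sha256(
                (self.last_hash + line).encode()
            ).hexdigest()
            if self._on_append is not None:
                self._on_append(self.last_hash)

    # Boot-time check (the diff's _verify_audit_anchor does the real version):
    # a persisted anchor that no longer matches the file tail means the file
    # was truncated or rewritten since the previous run.
    def anchor_matches(file_tail: str, persisted_anchor: str | None) -> bool:
        return persisted_anchor is None or file_tail == persisted_anchor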
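
Illustrative sketch for item 3: the fallback ordering only. The function and
its parameters are placeholders; the real _fetch_return_4h reads dvol_history
and calls the Deribit client itself.

    # Sketch only: prefer the dvol_history sample from ~4h ago, otherwise fall
    # back to a Deribit 1h candle close from 4h ago; return None (-> LOW alert)
    # when both sources are unavailable.
    from decimal import Decimal

    def return_4h(
        spot_now: Decimal,
        history_close_4h: Decimal | None,  # from dvol_history, empty on first boot
        candle_close_4h: Decimal | None,   # from deribit get_historical, may fail
    ) -> Decimal | None:
        reference = history_close_4h if history_close_4h is not None else candle_close_4h
        if reference is None or reference == 0:
            return None  # caller raises the LOW alert and skips the signal
        return spot_now / reference - 1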
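
Illustrative sketch for item 4: the two typed fields and the "CLI wins only
when it differs from its default" rule. The default values and the dataclass
shape are assumptions; only the field names come from this commit.

    # Sketch only: a typed execution block and the override rule used by the
    # start command. The defaults below are placeholders, not project values.
    from dataclasses import dataclass
    from decimal import Decimal

    @dataclass(frozen=True)
    class ExecutionConfig:
        environment: str = "testnet"          # execution.environment
        eur_to_usd: Decimal = Decimal("1.0")  # execution.eur_to_usd

    def resolve_environment(
        cfg: ExecutionConfig, cli_value: str, cli_default: str
    ) -> str:
        # Config wins unless the operator explicitly changed the CLI flag.
        return cli_value if cli_value != cli_default else cfg.environment
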
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
@@ -17,7 +17,9 @@ from datetime import UTC, datetime
 from decimal import Decimal
 from pathlib import Path
 from typing import Literal
+from uuid import uuid4

+import structlog
 from apscheduler.schedulers.asyncio import AsyncIOScheduler

 from cerbero_bite.config.mcp_endpoints import McpEndpoints
@@ -29,6 +31,7 @@ from cerbero_bite.runtime.lockfile import EngineLock
 from cerbero_bite.runtime.monitor_cycle import MonitorCycleResult, run_monitor_cycle
 from cerbero_bite.runtime.recovery import recover_state
 from cerbero_bite.runtime.scheduler import JobSpec, build_scheduler
+from cerbero_bite.state import connect as connect_state

 __all__ = ["Orchestrator"]

@@ -82,6 +85,7 @@ class Orchestrator:
     async def boot(self) -> _BootResult:
         """Reconcile state, verify environment, run a first health probe."""
         when = self._ctx.clock()
+        await self._verify_audit_anchor(now=when)
         await recover_state(self._ctx, now=when)

         info = await self._ctx.deribit.environment_info()
@@ -111,22 +115,70 @@ class Orchestrator:
     # Cycle invocations (used by scheduler jobs and CLI dry-run)
     # ------------------------------------------------------------------

+    async def _verify_audit_anchor(self, *, now: datetime) -> None:  # noqa: ARG002
+        """Compare the audit log tail with the SQLite anchor.
+
+        ``now`` is accepted for symmetry with the other ``boot``
+        helpers but unused: the comparison is purely between the
+        in-memory tail hash and the value persisted on the previous
+        run.
+        """
+        conn = connect_state(self._ctx.db_path)
+        try:
+            state = self._ctx.repository.get_system_state(conn)
+        finally:
+            conn.close()
+        if state is None or state.last_audit_hash is None:
+            return  # first boot, nothing to compare against
+        actual_tail = self._ctx.audit_log.last_hash
+        if actual_tail != state.last_audit_hash:
+            await self._ctx.alert_manager.critical(
+                source="orchestrator.boot",
+                message=(
+                    f"audit log anchor mismatch: anchor="
+                    f"{state.last_audit_hash[:12]}…, file tail="
+                    f"{actual_tail[:12]}… — possible tampering or truncation"
+                ),
+                component="safety.audit_log",
+            )
+
     async def run_entry(
         self, *, now: datetime | None = None
     ) -> EntryCycleResult:
-        return await run_entry_cycle(
-            self._ctx, eur_to_usd_rate=self._eur_to_usd, now=now
+        cycle_id = str(uuid4())
+        token = structlog.contextvars.bind_contextvars(
+            cycle="entry", cycle_id=cycle_id
         )
+        try:
+            return await run_entry_cycle(
+                self._ctx, eur_to_usd_rate=self._eur_to_usd, now=now
+            )
+        finally:
+            structlog.contextvars.reset_contextvars(**token)

     async def run_monitor(
         self, *, now: datetime | None = None
     ) -> MonitorCycleResult:
-        return await run_monitor_cycle(self._ctx, now=now)
+        cycle_id = str(uuid4())
+        token = structlog.contextvars.bind_contextvars(
+            cycle="monitor", cycle_id=cycle_id
+        )
+        try:
+            return await run_monitor_cycle(self._ctx, now=now)
+        finally:
+            structlog.contextvars.reset_contextvars(**token)

     async def run_health(
         self, *, now: datetime | None = None
     ) -> HealthCheckResult:
-        return await self._health.run(now=now)
+        cycle_id = str(uuid4())
+        token = structlog.contextvars.bind_contextvars(
+            cycle="health", cycle_id=cycle_id
+        )
+        try:
+            return await self._health.run(now=now)
+        finally:
+            structlog.contextvars.reset_contextvars(**token)

     # ------------------------------------------------------------------
     # Scheduler lifecycle
@@ -191,8 +243,14 @@ class Orchestrator:
         """Boot, acquire the single-instance lock, install the scheduler.

         ``lock_path`` defaults to ``<db_path.parent>/.lockfile`` so two
-        containers cannot trade against the same SQLite file.
+        containers cannot trade against the same SQLite file. SIGTERM
+        and SIGINT are intercepted so Docker (or the operator) can
+        signal a clean shutdown — the scheduler is stopped, in-flight
+        cycles complete, the audit log fsyncs, and the HTTP client is
+        closed before the process exits.
         """
+        import signal  # noqa: PLC0415 — only needed by run_forever
+
         lock = EngineLock(
             lock_path or self._ctx.db_path.parent / ".lockfile"
         )
@@ -201,8 +259,28 @@ class Orchestrator:
             await self.boot()
             scheduler = self.install_scheduler()
             scheduler.start()
+            stop_event = asyncio.Event()
+
+            def _on_signal(signame: str) -> None:
+                _log.info("received %s — initiating shutdown", signame)
+                stop_event.set()
+
+            loop = asyncio.get_running_loop()
+            for sig_name in ("SIGTERM", "SIGINT"):
+                sig = getattr(signal, sig_name, None)
+                if sig is None:  # pragma: no cover — Windows fallback
+                    continue
+                try:
+                    loop.add_signal_handler(
+                        sig, _on_signal, sig_name
+                    )
+                except NotImplementedError:  # pragma: no cover
+                    # Some sandboxes (Windows asyncio) don't support
+                    # add_signal_handler; fall back to no-op.
+                    signal.signal(sig, lambda *_: stop_event.set())
+
             try:
-                await asyncio.Event().wait()
+                await stop_event.wait()
             finally:
                 scheduler.shutdown(wait=False)
         finally: