Hardening round 2: healthcheck, audit anchor, return_4h, exec config, signals

Sei interventi a priorità MEDIA sul sistema. 323 test passano, mypy strict
pulito, ruff clean.

1. Docker HEALTHCHECK + cerbero-bite healthcheck:
   - nuovo subcommand che esce 0 se kill_switch=0 e last_health_check
     entro --max-staleness-s (default 600s);
   - direttiva HEALTHCHECK nel Dockerfile (60s interval, 5s timeout,
     start_period 120s, retries 3);
   - healthcheck definition nel docker-compose.yml.

2. Audit hash chain anti-truncation:
   - migration 0002: nuova colonna system_state.last_audit_hash;
   - AuditLog accetta una callback on_append, che dependencies.py collega
     a repository.set_last_audit_hash;
   - Orchestrator.boot verifica che la coda del file corrisponda all'anchor
     persistito; mismatch → kill switch CRITICAL.

3. return_4h bootstrap da deribit get_historical:
   - quando dvol_history è vuoto, _fetch_return_4h ripiega su
     deribit.historical_close (candela 1h di 4h fa);
   - alert LOW se anche il fallback fallisce.

4. execution.environment + execution.eur_to_usd in strategy.yaml:
   - ExecutionConfig promosso a typed schema con i due campi
     consumati al boot;
   - CLI start preferisce i valori da config; CLI flag overridano
     solo quando differenti dai default.

5. Cycle correlation ID:
   - structlog.contextvars.bind_contextvars in run_entry/run_monitor/
     run_health propaga cycle_id e cycle nei log strutturati.

6. SIGTERM/SIGINT clean shutdown:
   - run_forever installa loop.add_signal_handler per SIGTERM e
     SIGINT; il segnale imposta (set()) un asyncio.Event che termina il
     blocco principale, poi scheduler.shutdown e ctx.aclose finalizzano.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-28 00:37:39 +02:00
parent 411b747e93
commit b5b96f959c
15 changed files with 477 additions and 24 deletions
+125
View File
@@ -0,0 +1,125 @@
"""Tests for the audit chain anti-truncation anchor."""
from __future__ import annotations

from datetime import UTC, datetime
from decimal import Decimal
from pathlib import Path

import pytest
from pytest_httpx import HTTPXMock

from cerbero_bite.config import golden_config
from cerbero_bite.config.mcp_endpoints import load_endpoints
from cerbero_bite.runtime import build_runtime
from cerbero_bite.runtime.orchestrator import Orchestrator
from cerbero_bite.state import connect
pytestmark = pytest.mark.httpx_mock(assert_all_responses_were_requested=False)
def _now() -> datetime:
return datetime(2026, 4, 27, 14, 0, tzinfo=UTC)
def _build(tmp_path: Path) -> Orchestrator:
    """Build an ``Orchestrator`` whose runtime lives under *tmp_path*.

    The runtime is created with the golden config, endpoints loaded from an
    empty environment, the token ``"t"``, ``retry_max=1`` and the fixed
    ``_now`` clock. The SQLite state and the audit log are placed at
    ``tmp_path / "state.sqlite"`` and ``tmp_path / "audit.log"``.

    Args:
        tmp_path: Directory that holds the state database and audit log.

    Returns:
        An orchestrator expecting the ``"testnet"`` environment with an
        EUR→USD rate of ``Decimal("1.075")``.
    """
    ctx = build_runtime(
        cfg=golden_config(),
        endpoints=load_endpoints(env={}),
        token="t",
        db_path=tmp_path / "state.sqlite",
        audit_path=tmp_path / "audit.log",
        retry_max=1,
        clock=_now,
    )
    return Orchestrator(
        ctx,
        expected_environment="testnet",
        # Regular import of Decimal instead of an inline ``__import__`` call.
        eur_to_usd=Decimal("1.075"),
    )
def _wire_boot_dependencies(httpx_mock: HTTPXMock) -> None:
    """Register reusable canned responses for six MCP tool endpoints.

    Covers deribit ``environment_info`` and ``get_positions``, macro
    ``get_macro_calendar``, sentiment ``get_cross_exchange_funding``,
    hyperliquid ``get_funding_rate`` and portfolio
    ``get_total_portfolio_value``. Every response is registered with
    ``is_reusable=True``.

    Args:
        httpx_mock: The ``pytest_httpx`` fixture to register responses on.
    """
    # (url, JSON body) pairs, registered in this exact order.
    canned: tuple[tuple[str, object], ...] = (
        (
            "http://mcp-deribit:9011/tools/environment_info",
            {
                "exchange": "deribit",
                "environment": "testnet",
                "source": "env",
                "env_value": "true",
                "base_url": "https://test.deribit.com/api/v2",
                "max_leverage": 3,
            },
        ),
        ("http://mcp-deribit:9011/tools/get_positions", []),
        ("http://mcp-macro:9013/tools/get_macro_calendar", {"events": []}),
        (
            "http://mcp-sentiment:9014/tools/get_cross_exchange_funding",
            {"snapshot": {}},
        ),
        (
            "http://mcp-hyperliquid:9012/tools/get_funding_rate",
            {"asset": "ETH", "current_funding_rate": 0.0001},
        ),
        (
            "http://mcp-portfolio:9018/tools/get_total_portfolio_value",
            {"total_value_eur": 1000.0},
        ),
    )
    for endpoint, body in canned:
        httpx_mock.add_response(url=endpoint, json=body, is_reusable=True)
@pytest.mark.asyncio
async def test_audit_anchor_persisted_after_append(tmp_path: Path) -> None:
    """After one audit append, ``system_state`` holds the log's last hash."""
    orchestrator = _build(tmp_path)
    runtime = orchestrator.context
    runtime.audit_log.append(event="TEST", payload={"x": 1}, now=_now())

    # Read the persisted state back through a freshly opened connection.
    db = connect(tmp_path / "state.sqlite")
    try:
        persisted = runtime.repository.get_system_state(db)
    finally:
        db.close()

    assert persisted is not None
    assert persisted.last_audit_hash == runtime.audit_log.last_hash
@pytest.mark.asyncio
async def test_boot_detects_audit_truncation(
    tmp_path: Path, httpx_mock: HTTPXMock
) -> None:
    """Booting after the audit file was cut short must arm the kill switch."""
    orchestrator = _build(tmp_path)

    # Write three entries so the file has a tail that can be removed.
    for idx in range(3):
        orchestrator.context.audit_log.append(
            event=f"E{idx}",
            payload={"i": idx},
            now=_now(),
        )

    # Rewrite the audit file with only its first line.
    log_file = tmp_path / "audit.log"
    lines = log_file.read_text(encoding="utf-8").splitlines(keepends=True)
    log_file.write_text(lines[0], encoding="utf-8")

    # Build the orchestrator again (the AuditLog tail-reads the file again).
    orchestrator = _build(tmp_path)
    _wire_boot_dependencies(httpx_mock)
    # Canned reply for the telegram notify_system_error tool.
    httpx_mock.add_response(
        url="http://mcp-telegram:9017/tools/notify_system_error",
        json={"ok": True},
        is_reusable=True,
    )

    await orchestrator.boot()

    assert orchestrator.context.kill_switch.is_armed() is True
+11
View File
@@ -98,6 +98,7 @@ def _wire_market_data(
*,
spot: float = 3000.0,
dvol: float = 50.0,
historical_close: float | None = None,
) -> None:
httpx_mock.add_response(
url="http://mcp-deribit:9011/tools/get_ticker",
@@ -109,6 +110,16 @@ def _wire_market_data(
json={"currency": "ETH", "latest": dvol, "candles": []},
is_reusable=True,
)
# Bootstrap fallback for return_4h when dvol_history is empty.
httpx_mock.add_response(
url="http://mcp-deribit:9011/tools/get_historical",
json={
"candles": (
[{"close": historical_close}] if historical_close is not None else []
)
},
is_reusable=True,
)
def _wire_position_quotes(
+66
View File
@@ -0,0 +1,66 @@
"""Tests for the ``cerbero-bite healthcheck`` subcommand."""
from __future__ import annotations
from datetime import UTC, datetime, timedelta
from pathlib import Path
from click.testing import CliRunner
from cerbero_bite.cli import main as cli_main
from cerbero_bite.state import Repository, connect, run_migrations, transaction
def _seed_state(db: Path, *, last_check: datetime, kill_switch: bool = False) -> None:
    """Create the state database at *db* and seed it for a healthcheck test.

    Runs the migrations, initialises the system state with config version
    ``"1.0.0"`` and then either arms the kill switch (reason ``"test"``) or
    touches the health check. All writes use *last_check* as ``now``.

    Args:
        db: Path of the SQLite file to create or open.
        last_check: Timestamp passed as ``now`` to every repository call.
        kill_switch: When true, arm the kill switch instead of touching the
            health check.
    """
    connection = connect(db)
    try:
        run_migrations(connection)
        repository = Repository()
        with transaction(connection):
            repository.init_system_state(
                connection,
                config_version="1.0.0",
                now=last_check,
            )
            if not kill_switch:
                repository.touch_health_check(connection, now=last_check)
            else:
                repository.set_kill_switch(
                    connection,
                    armed=True,
                    reason="test",
                    now=last_check,
                )
    finally:
        connection.close()
def test_healthcheck_exits_one_when_db_missing(tmp_path: Path) -> None:
    """``healthcheck`` pointed at a file that was never created exits 1."""
    missing_db = tmp_path / "absent.sqlite"
    argv = ["healthcheck", "--db", str(missing_db)]

    outcome = CliRunner().invoke(cli_main, argv)

    assert outcome.exit_code == 1
    assert "unhealthy" in outcome.output
def test_healthcheck_exits_one_when_kill_switch_armed(tmp_path: Path) -> None:
    """``healthcheck`` exits 1 and mentions the kill switch when it is armed."""
    state_db = tmp_path / "state.sqlite"
    _seed_state(state_db, last_check=datetime.now(UTC), kill_switch=True)

    argv = ["healthcheck", "--db", str(state_db)]
    outcome = CliRunner().invoke(cli_main, argv)

    assert outcome.exit_code == 1
    assert "kill switch" in outcome.output
def test_healthcheck_exits_one_when_last_check_stale(tmp_path: Path) -> None:
    """A health check seeded one hour ago fails a 60-second staleness limit."""
    state_db = tmp_path / "state.sqlite"
    an_hour_ago = datetime.now(UTC) - timedelta(hours=1)
    _seed_state(state_db, last_check=an_hour_ago)

    argv = ["healthcheck", "--db", str(state_db), "--max-staleness-s", "60"]
    outcome = CliRunner().invoke(cli_main, argv)

    assert outcome.exit_code == 1
    assert "stale" in outcome.output
def test_healthcheck_exits_zero_on_recent_check(tmp_path: Path) -> None:
    """``healthcheck`` exits 0 and reports healthy for a just-seeded check."""
    db = tmp_path / "state.sqlite"
    _seed_state(db, last_check=datetime.now(UTC))

    result = CliRunner().invoke(cli_main, ["healthcheck", "--db", str(db)])

    assert result.exit_code == 0
    assert "healthy" in result.output
    # "healthy" is a substring of "unhealthy", so the check above alone would
    # also accept a failure message; rule that out explicitly.
    assert "unhealthy" not in result.output