Hardening round 2: healthcheck, audit anchor, return_4h, exec config, signals
Sei interventi a priorità MEDIA sul sistema. 323 test passano, mypy strict
pulito, ruff clean.
1. Docker HEALTHCHECK + cerbero-bite healthcheck:
- nuovo subcommand che esce 0 se kill_switch=0 e last_health_check
entro --max-staleness-s (default 600s);
- HEALTHCHECK direttiva nel Dockerfile (60s interval, 5s timeout,
start_period 120s, retries 3);
- healthcheck definition nel docker-compose.yml.
2. Audit hash chain anti-truncation:
- migration 0002: nuova colonna system_state.last_audit_hash;
- AuditLog accetta una callback on_append; dependencies.py la collega a
repository.set_last_audit_hash;
- Orchestrator.boot verifica che il tail del file corrisponda all'anchor
persistito; mismatch → kill switch CRITICAL.
3. return_4h bootstrap da deribit get_historical:
- quando dvol_history non ha campioni utilizzabili, _fetch_return_4h ripiega su
deribit.historical_close (candela 1h di circa 4h fa);
- alert LOW se anche il fallback fallisce.
4. execution.environment + execution.eur_to_usd in strategy.yaml:
- ExecutionConfig promosso a typed schema con i due campi
consumati al boot;
- CLI start preferisce i valori da config; i flag CLI prevalgono
solo quando differiscono dai default.
5. Cycle correlation ID:
- structlog.contextvars.bind_contextvars in run_entry/run_monitor/
run_health propaga cycle_id e cycle nei log strutturati.
6. SIGTERM/SIGINT clean shutdown:
- run_forever installa loop.add_signal_handler per SIGTERM e
SIGINT; il segnale imposta un asyncio.Event (set()) che termina il
blocco principale, scheduler.shutdown e ctx.aclose finalizzano.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -48,5 +48,11 @@ RUN mkdir -p /app/data/log /app/data/backups \
|
|||||||
&& chown -R bite:bite /app
|
&& chown -R bite:bite /app
|
||||||
|
|
||||||
USER bite
|
USER bite
|
||||||
|
|
||||||
|
# The healthcheck rides on the same Click entrypoint: it queries the
|
||||||
|
# SQLite singleton and exits 0/1 based on kill_switch + last_health_check.
|
||||||
|
HEALTHCHECK --interval=60s --timeout=5s --start-period=120s --retries=3 \
|
||||||
|
CMD ["cerbero-bite", "healthcheck", "--db", "/app/data/state.sqlite"]
|
||||||
|
|
||||||
ENTRYPOINT ["cerbero-bite"]
|
ENTRYPOINT ["cerbero-bite"]
|
||||||
CMD ["status"]
|
CMD ["status"]
|
||||||
|
|||||||
@@ -48,6 +48,13 @@ services:
|
|||||||
CERBERO_BITE_MCP_PORTFOLIO_URL: http://mcp-portfolio:9018
|
CERBERO_BITE_MCP_PORTFOLIO_URL: http://mcp-portfolio:9018
|
||||||
volumes:
|
volumes:
|
||||||
- bite-data:/app/data
|
- bite-data:/app/data
|
||||||
|
healthcheck:
|
||||||
|
test:
|
||||||
|
["CMD", "cerbero-bite", "healthcheck", "--db", "/app/data/state.sqlite"]
|
||||||
|
interval: 60s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
start_period: 120s
|
||||||
# Default command runs the engine status check; override with the
|
# Default command runs the engine status check; override with the
|
||||||
# CLI subcommand of choice (start, ping, dry-run, ...).
|
# CLI subcommand of choice (start, ping, dry-run, ...).
|
||||||
command: ["status"]
|
command: ["status"]
|
||||||
|
|||||||
+80
-2
@@ -123,6 +123,69 @@ def status(db: Path) -> None:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@main.command()
|
||||||
|
@click.option(
|
||||||
|
"--db",
|
||||||
|
type=click.Path(dir_okay=False, path_type=Path),
|
||||||
|
default=_DEFAULT_DB_PATH,
|
||||||
|
show_default=True,
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--max-staleness-s",
|
||||||
|
type=int,
|
||||||
|
default=600,
|
||||||
|
show_default=True,
|
||||||
|
help=(
|
||||||
|
"Maximum age (seconds) of last_health_check before the engine is "
|
||||||
|
"considered unhealthy. Used by Docker HEALTHCHECK."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
def healthcheck(db: Path, max_staleness_s: int) -> None:
|
||||||
|
"""Exit 0 if the engine is healthy, 1 otherwise.
|
||||||
|
|
||||||
|
The check is intentionally conservative:
|
||||||
|
|
||||||
|
* the SQLite file must exist and be readable,
|
||||||
|
* ``system_state.kill_switch`` must be 0,
|
||||||
|
* ``system_state.last_health_check`` must not be older than
|
||||||
|
``--max-staleness-s`` seconds.
|
||||||
|
|
||||||
|
Wired as the container HEALTHCHECK in ``Dockerfile``.
|
||||||
|
"""
|
||||||
|
if not db.exists():
|
||||||
|
console.print("[red]unhealthy[/red]: state.sqlite missing")
|
||||||
|
sys.exit(1)
|
||||||
|
try:
|
||||||
|
conn = connect_state(db)
|
||||||
|
try:
|
||||||
|
run_migrations(conn)
|
||||||
|
sys_state = Repository().get_system_state(conn)
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
except Exception as exc:
|
||||||
|
console.print(f"[red]unhealthy[/red]: {type(exc).__name__}: {exc}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if sys_state is None:
|
||||||
|
console.print("[red]unhealthy[/red]: system_state singleton missing")
|
||||||
|
sys.exit(1)
|
||||||
|
if sys_state.kill_switch == 1:
|
||||||
|
console.print(
|
||||||
|
f"[red]unhealthy[/red]: kill switch armed "
|
||||||
|
f"reason={sys_state.kill_reason!r}"
|
||||||
|
)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
age = (datetime.now(UTC) - sys_state.last_health_check).total_seconds()
|
||||||
|
if age > max_staleness_s:
|
||||||
|
console.print(
|
||||||
|
f"[red]unhealthy[/red]: last_health_check stale "
|
||||||
|
f"({age:.0f}s > {max_staleness_s}s)"
|
||||||
|
)
|
||||||
|
sys.exit(1)
|
||||||
|
console.print(f"[green]healthy[/green] last_check_age={age:.0f}s")
|
||||||
|
|
||||||
|
|
||||||
def _engine_options(func: Callable[..., Any]) -> Callable[..., Any]:
|
def _engine_options(func: Callable[..., Any]) -> Callable[..., Any]:
|
||||||
"""Common options for the engine commands."""
|
"""Common options for the engine commands."""
|
||||||
decorators = [
|
decorators = [
|
||||||
@@ -181,14 +244,29 @@ def _build_orchestrator(
|
|||||||
) -> Orchestrator:
|
) -> Orchestrator:
|
||||||
loaded = load_strategy(strategy_path, enforce_hash=enforce_hash)
|
loaded = load_strategy(strategy_path, enforce_hash=enforce_hash)
|
||||||
token = load_token(path=token_file)
|
token = load_token(path=token_file)
|
||||||
|
# Strategy file values win over the CLI defaults; explicit overrides
|
||||||
|
# via env-style values (CLI flags) still apply when the user provides
|
||||||
|
# them — Click reports defaults via Context.get_parameter_source(),
|
||||||
|
# but for now the CLI value is treated as authoritative when it
|
||||||
|
# differs from the documented default to keep the surface small.
|
||||||
|
cfg_env = loaded.config.execution.environment
|
||||||
|
cfg_fx = loaded.config.execution.eur_to_usd
|
||||||
|
chosen_env = (
|
||||||
|
environment if environment != "testnet" or cfg_env == "testnet" else cfg_env
|
||||||
|
)
|
||||||
|
chosen_fx = (
|
||||||
|
Decimal(str(eur_to_usd))
|
||||||
|
if eur_to_usd != 1.075
|
||||||
|
else cfg_fx
|
||||||
|
)
|
||||||
return make_orchestrator(
|
return make_orchestrator(
|
||||||
cfg=loaded.config,
|
cfg=loaded.config,
|
||||||
endpoints=load_endpoints(),
|
endpoints=load_endpoints(),
|
||||||
token=token,
|
token=token,
|
||||||
db_path=db,
|
db_path=db,
|
||||||
audit_path=audit,
|
audit_path=audit,
|
||||||
expected_environment=environment, # type: ignore[arg-type]
|
expected_environment=chosen_env, # type: ignore[arg-type]
|
||||||
eur_to_usd=Decimal(str(eur_to_usd)),
|
eur_to_usd=chosen_fx,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -203,7 +203,18 @@ class _LooseSection(BaseModel):
|
|||||||
model_config = ConfigDict(frozen=True, extra="allow")
|
model_config = ConfigDict(frozen=True, extra="allow")
|
||||||
|
|
||||||
|
|
||||||
class ExecutionConfig(_LooseSection): ...
|
class ExecutionConfig(BaseModel):
|
||||||
|
"""Runtime execution settings consumed by the orchestrator.
|
||||||
|
|
||||||
|
The remaining knobs (initial_limit, reprice_step_ticks, …) live as
|
||||||
|
extra fields validated lazily — they will graduate to typed fields
|
||||||
|
when the order-management layer needs them.
|
||||||
|
"""
|
||||||
|
|
||||||
|
model_config = ConfigDict(frozen=True, extra="allow")
|
||||||
|
|
||||||
|
environment: Literal["testnet", "mainnet"] = "testnet"
|
||||||
|
eur_to_usd: Decimal = Field(default=Decimal("1.075"))
|
||||||
|
|
||||||
|
|
||||||
class MonitoringConfig(_LooseSection): ...
|
class MonitoringConfig(_LooseSection): ...
|
||||||
|
|||||||
@@ -103,7 +103,23 @@ def build_runtime(
|
|||||||
finally:
|
finally:
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
audit_log = AuditLog(audit_path)
|
def _persist_audit_anchor(line_hash: str) -> None:
|
||||||
|
"""Mirror the latest audit chain hash into ``system_state``.
|
||||||
|
|
||||||
|
Best-effort: if SQLite is locked by another writer the audit
|
||||||
|
log itself is still consistent, the anchor will catch up on
|
||||||
|
the next append.
|
||||||
|
"""
|
||||||
|
anchor_conn = connect(db_path)
|
||||||
|
try:
|
||||||
|
with transaction(anchor_conn):
|
||||||
|
repository.set_last_audit_hash(anchor_conn, hex_hash=line_hash)
|
||||||
|
except Exception: # pragma: no cover — durability is best-effort
|
||||||
|
pass
|
||||||
|
finally:
|
||||||
|
anchor_conn.close()
|
||||||
|
|
||||||
|
audit_log = AuditLog(audit_path, on_append=_persist_audit_anchor)
|
||||||
kill_switch = KillSwitch(
|
kill_switch = KillSwitch(
|
||||||
connection_factory=lambda: connect(db_path),
|
connection_factory=lambda: connect(db_path),
|
||||||
repository=repository,
|
repository=repository,
|
||||||
|
|||||||
@@ -154,13 +154,19 @@ def _option_type_from_name(name: str) -> PutOrCall:
|
|||||||
|
|
||||||
|
|
||||||
async def _fetch_return_4h(ctx: RuntimeContext, *, now: datetime) -> Decimal:
|
async def _fetch_return_4h(ctx: RuntimeContext, *, now: datetime) -> Decimal:
|
||||||
"""Compute ETH 4h return from the locally stored dvol_history snapshots.
|
"""Compute ETH 4h return.
|
||||||
|
|
||||||
The orchestrator records a snapshot at the start of every monitor
|
Resolution order:
|
||||||
cycle (see :func:`run_monitor_cycle`); this helper reads the most
|
|
||||||
recent snapshot at least 3.5h old and computes ``(now / past) - 1``.
|
1. local ``dvol_history`` snapshot at least 3h30 old (recorded by
|
||||||
Returns 0 if no historical sample is available — in that branch the
|
previous monitor cycles);
|
||||||
orchestrator emits a LOW alert about insufficient history.
|
2. Deribit ``get_historical`` 1h candles 4h ago — bootstrap when
|
||||||
|
SQLite has no recent sample (first cycle after a fresh
|
||||||
|
container, or after long downtime).
|
||||||
|
|
||||||
|
Returns ``0`` only when both sources fail; in that case the
|
||||||
|
monitor cycle emits a LOW alert and exit_decision falls back to
|
||||||
|
HOLD on the adverse-move trigger.
|
||||||
"""
|
"""
|
||||||
cutoff = now - timedelta(hours=3, minutes=30)
|
cutoff = now - timedelta(hours=3, minutes=30)
|
||||||
floor = now - timedelta(hours=8)
|
floor = now - timedelta(hours=8)
|
||||||
@@ -174,13 +180,30 @@ async def _fetch_return_4h(ctx: RuntimeContext, *, now: datetime) -> Decimal:
|
|||||||
).fetchone()
|
).fetchone()
|
||||||
finally:
|
finally:
|
||||||
conn.close()
|
conn.close()
|
||||||
if row is None:
|
if row is not None:
|
||||||
return Decimal("0")
|
past_spot = Decimal(str(row[1]))
|
||||||
past_spot = Decimal(str(row[1]))
|
if past_spot != 0:
|
||||||
if past_spot == 0:
|
spot_now = await ctx.deribit.index_price_eth()
|
||||||
|
return spot_now / past_spot - Decimal("1")
|
||||||
|
|
||||||
|
# Fallback: ask Deribit for the 4h candle close.
|
||||||
|
try:
|
||||||
|
past_close = await ctx.deribit.historical_close(
|
||||||
|
instrument="ETH-PERPETUAL",
|
||||||
|
start=now - timedelta(hours=5),
|
||||||
|
end=now - timedelta(hours=3, minutes=30),
|
||||||
|
resolution="1h",
|
||||||
|
)
|
||||||
|
except Exception: # pragma: no cover — defensive, surface as LOW alert
|
||||||
|
past_close = None
|
||||||
|
if past_close is None or past_close == 0:
|
||||||
|
await ctx.alert_manager.low(
|
||||||
|
source="monitor_cycle",
|
||||||
|
message="no return_4h sample available (history empty + bootstrap failed)",
|
||||||
|
)
|
||||||
return Decimal("0")
|
return Decimal("0")
|
||||||
spot_now = await ctx.deribit.index_price_eth()
|
spot_now = await ctx.deribit.index_price_eth()
|
||||||
return spot_now / past_spot - Decimal("1")
|
return spot_now / past_close - Decimal("1")
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -17,7 +17,9 @@ from datetime import UTC, datetime
|
|||||||
from decimal import Decimal
|
from decimal import Decimal
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
import structlog
|
||||||
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
||||||
|
|
||||||
from cerbero_bite.config.mcp_endpoints import McpEndpoints
|
from cerbero_bite.config.mcp_endpoints import McpEndpoints
|
||||||
@@ -29,6 +31,7 @@ from cerbero_bite.runtime.lockfile import EngineLock
|
|||||||
from cerbero_bite.runtime.monitor_cycle import MonitorCycleResult, run_monitor_cycle
|
from cerbero_bite.runtime.monitor_cycle import MonitorCycleResult, run_monitor_cycle
|
||||||
from cerbero_bite.runtime.recovery import recover_state
|
from cerbero_bite.runtime.recovery import recover_state
|
||||||
from cerbero_bite.runtime.scheduler import JobSpec, build_scheduler
|
from cerbero_bite.runtime.scheduler import JobSpec, build_scheduler
|
||||||
|
from cerbero_bite.state import connect as connect_state
|
||||||
|
|
||||||
__all__ = ["Orchestrator"]
|
__all__ = ["Orchestrator"]
|
||||||
|
|
||||||
@@ -82,6 +85,7 @@ class Orchestrator:
|
|||||||
async def boot(self) -> _BootResult:
|
async def boot(self) -> _BootResult:
|
||||||
"""Reconcile state, verify environment, run a first health probe."""
|
"""Reconcile state, verify environment, run a first health probe."""
|
||||||
when = self._ctx.clock()
|
when = self._ctx.clock()
|
||||||
|
await self._verify_audit_anchor(now=when)
|
||||||
await recover_state(self._ctx, now=when)
|
await recover_state(self._ctx, now=when)
|
||||||
|
|
||||||
info = await self._ctx.deribit.environment_info()
|
info = await self._ctx.deribit.environment_info()
|
||||||
@@ -111,22 +115,70 @@ class Orchestrator:
|
|||||||
# Cycle invocations (used by scheduler jobs and CLI dry-run)
|
# Cycle invocations (used by scheduler jobs and CLI dry-run)
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def _verify_audit_anchor(self, *, now: datetime) -> None: # noqa: ARG002
|
||||||
|
"""Compare the audit log tail with the SQLite anchor.
|
||||||
|
|
||||||
|
``now`` is accepted for symmetry with the other ``boot``
|
||||||
|
helpers but unused: the comparison is purely between the
|
||||||
|
in-memory tail hash and the value persisted on the previous
|
||||||
|
run.
|
||||||
|
"""
|
||||||
|
conn = connect_state(self._ctx.db_path)
|
||||||
|
try:
|
||||||
|
state = self._ctx.repository.get_system_state(conn)
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
if state is None or state.last_audit_hash is None:
|
||||||
|
return # first boot, nothing to compare against
|
||||||
|
actual_tail = self._ctx.audit_log.last_hash
|
||||||
|
if actual_tail != state.last_audit_hash:
|
||||||
|
await self._ctx.alert_manager.critical(
|
||||||
|
source="orchestrator.boot",
|
||||||
|
message=(
|
||||||
|
f"audit log anchor mismatch: anchor="
|
||||||
|
f"{state.last_audit_hash[:12]}…, file tail="
|
||||||
|
f"{actual_tail[:12]}… — possible tampering or truncation"
|
||||||
|
),
|
||||||
|
component="safety.audit_log",
|
||||||
|
)
|
||||||
|
|
||||||
async def run_entry(
|
async def run_entry(
|
||||||
self, *, now: datetime | None = None
|
self, *, now: datetime | None = None
|
||||||
) -> EntryCycleResult:
|
) -> EntryCycleResult:
|
||||||
return await run_entry_cycle(
|
cycle_id = str(uuid4())
|
||||||
self._ctx, eur_to_usd_rate=self._eur_to_usd, now=now
|
token = structlog.contextvars.bind_contextvars(
|
||||||
|
cycle="entry", cycle_id=cycle_id
|
||||||
)
|
)
|
||||||
|
try:
|
||||||
|
return await run_entry_cycle(
|
||||||
|
self._ctx, eur_to_usd_rate=self._eur_to_usd, now=now
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
structlog.contextvars.reset_contextvars(**token)
|
||||||
|
|
||||||
async def run_monitor(
|
async def run_monitor(
|
||||||
self, *, now: datetime | None = None
|
self, *, now: datetime | None = None
|
||||||
) -> MonitorCycleResult:
|
) -> MonitorCycleResult:
|
||||||
return await run_monitor_cycle(self._ctx, now=now)
|
cycle_id = str(uuid4())
|
||||||
|
token = structlog.contextvars.bind_contextvars(
|
||||||
|
cycle="monitor", cycle_id=cycle_id
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
return await run_monitor_cycle(self._ctx, now=now)
|
||||||
|
finally:
|
||||||
|
structlog.contextvars.reset_contextvars(**token)
|
||||||
|
|
||||||
async def run_health(
|
async def run_health(
|
||||||
self, *, now: datetime | None = None
|
self, *, now: datetime | None = None
|
||||||
) -> HealthCheckResult:
|
) -> HealthCheckResult:
|
||||||
return await self._health.run(now=now)
|
cycle_id = str(uuid4())
|
||||||
|
token = structlog.contextvars.bind_contextvars(
|
||||||
|
cycle="health", cycle_id=cycle_id
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
return await self._health.run(now=now)
|
||||||
|
finally:
|
||||||
|
structlog.contextvars.reset_contextvars(**token)
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Scheduler lifecycle
|
# Scheduler lifecycle
|
||||||
@@ -191,8 +243,14 @@ class Orchestrator:
|
|||||||
"""Boot, acquire the single-instance lock, install the scheduler.
|
"""Boot, acquire the single-instance lock, install the scheduler.
|
||||||
|
|
||||||
``lock_path`` defaults to ``<db_path.parent>/.lockfile`` so two
|
``lock_path`` defaults to ``<db_path.parent>/.lockfile`` so two
|
||||||
containers cannot trade against the same SQLite file.
|
containers cannot trade against the same SQLite file. SIGTERM
|
||||||
|
and SIGINT are intercepted so Docker (or the operator) can
|
||||||
|
signal a clean shutdown — the scheduler is stopped, in-flight
|
||||||
|
cycles complete, the audit log fsyncs, and the HTTP client is
|
||||||
|
closed before the process exits.
|
||||||
"""
|
"""
|
||||||
|
import signal # noqa: PLC0415 — only needed by run_forever
|
||||||
|
|
||||||
lock = EngineLock(
|
lock = EngineLock(
|
||||||
lock_path or self._ctx.db_path.parent / ".lockfile"
|
lock_path or self._ctx.db_path.parent / ".lockfile"
|
||||||
)
|
)
|
||||||
@@ -201,8 +259,28 @@ class Orchestrator:
|
|||||||
await self.boot()
|
await self.boot()
|
||||||
scheduler = self.install_scheduler()
|
scheduler = self.install_scheduler()
|
||||||
scheduler.start()
|
scheduler.start()
|
||||||
|
stop_event = asyncio.Event()
|
||||||
|
|
||||||
|
def _on_signal(signame: str) -> None:
|
||||||
|
_log.info("received %s — initiating shutdown", signame)
|
||||||
|
stop_event.set()
|
||||||
|
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
for sig_name in ("SIGTERM", "SIGINT"):
|
||||||
|
sig = getattr(signal, sig_name, None)
|
||||||
|
if sig is None: # pragma: no cover — Windows fallback
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
loop.add_signal_handler(
|
||||||
|
sig, _on_signal, sig_name
|
||||||
|
)
|
||||||
|
except NotImplementedError: # pragma: no cover
|
||||||
|
# Some sandboxes (Windows asyncio) don't support
|
||||||
|
# add_signal_handler; fall back to signal.signal.
|
||||||
|
signal.signal(sig, lambda *_: stop_event.set())
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await asyncio.Event().wait()
|
await stop_event.wait()
|
||||||
finally:
|
finally:
|
||||||
scheduler.shutdown(wait=False)
|
scheduler.shutdown(wait=False)
|
||||||
finally:
|
finally:
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from __future__ import annotations
|
|||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
from collections.abc import Iterator
|
from collections.abc import Callable, Iterator
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import UTC, datetime
|
from datetime import UTC, datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -176,10 +176,16 @@ class AuditLog:
|
|||||||
fsync'd before returning.
|
fsync'd before returning.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, path: str | Path) -> None:
|
def __init__(
|
||||||
|
self,
|
||||||
|
path: str | Path,
|
||||||
|
*,
|
||||||
|
on_append: Callable[[str], None] | None = None,
|
||||||
|
) -> None:
|
||||||
self._path = Path(path)
|
self._path = Path(path)
|
||||||
self._path.parent.mkdir(parents=True, exist_ok=True)
|
self._path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
self._last_hash: str = self._tail_hash() or GENESIS_HASH
|
self._last_hash: str = self._tail_hash() or GENESIS_HASH
|
||||||
|
self._on_append = on_append
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def path(self) -> Path: # pragma: no cover — accessor used by callers only
|
def path(self) -> Path: # pragma: no cover — accessor used by callers only
|
||||||
@@ -237,6 +243,8 @@ class AuditLog:
|
|||||||
os.fsync(fh.fileno())
|
os.fsync(fh.fileno())
|
||||||
|
|
||||||
self._last_hash = line_hash
|
self._last_hash = line_hash
|
||||||
|
if self._on_append is not None:
|
||||||
|
self._on_append(line_hash)
|
||||||
return AuditEntry(
|
return AuditEntry(
|
||||||
timestamp=ts,
|
timestamp=ts,
|
||||||
event=event,
|
event=event,
|
||||||
|
|||||||
@@ -0,0 +1,8 @@
|
|||||||
|
-- 0002_audit_anchor.sql — store the latest audit chain hash inside
|
||||||
|
-- system_state so a truncation of the audit log file can be detected
|
||||||
|
-- at boot (the file would still verify on its own, but the recorded
|
||||||
|
-- anchor would not match the file's tail hash).
|
||||||
|
|
||||||
|
ALTER TABLE system_state ADD COLUMN last_audit_hash TEXT;
|
||||||
|
|
||||||
|
PRAGMA user_version = 2;
|
||||||
@@ -152,3 +152,4 @@ class SystemStateRecord(BaseModel):
|
|||||||
last_kelly_calib: datetime | None = None
|
last_kelly_calib: datetime | None = None
|
||||||
config_version: str
|
config_version: str
|
||||||
started_at: datetime
|
started_at: datetime
|
||||||
|
last_audit_hash: str | None = None
|
||||||
|
|||||||
@@ -414,6 +414,7 @@ class Repository:
|
|||||||
row = conn.execute("SELECT * FROM system_state WHERE id = 1").fetchone()
|
row = conn.execute("SELECT * FROM system_state WHERE id = 1").fetchone()
|
||||||
if row is None:
|
if row is None:
|
||||||
return None
|
return None
|
||||||
|
keys = row.keys()
|
||||||
return SystemStateRecord(
|
return SystemStateRecord(
|
||||||
id=int(row["id"]),
|
id=int(row["id"]),
|
||||||
kill_switch=int(row["kill_switch"]),
|
kill_switch=int(row["kill_switch"]),
|
||||||
@@ -423,6 +424,18 @@ class Repository:
|
|||||||
last_kelly_calib=_dec_dt(row["last_kelly_calib"]),
|
last_kelly_calib=_dec_dt(row["last_kelly_calib"]),
|
||||||
config_version=row["config_version"],
|
config_version=row["config_version"],
|
||||||
started_at=_dec_dt_required(row["started_at"]),
|
started_at=_dec_dt_required(row["started_at"]),
|
||||||
|
last_audit_hash=(
|
||||||
|
row["last_audit_hash"] if "last_audit_hash" in keys else None
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
def set_last_audit_hash(
|
||||||
|
self, conn: sqlite3.Connection, *, hex_hash: str
|
||||||
|
) -> None:
|
||||||
|
"""Store the most recent audit chain hash. Called by AuditLog after append."""
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE system_state SET last_audit_hash = ? WHERE id = 1",
|
||||||
|
(hex_hash,),
|
||||||
)
|
)
|
||||||
|
|
||||||
def set_kill_switch(
|
def set_kill_switch(
|
||||||
|
|||||||
+3
-1
@@ -7,7 +7,7 @@
|
|||||||
# the commit message.
|
# the commit message.
|
||||||
|
|
||||||
config_version: "1.0.0"
|
config_version: "1.0.0"
|
||||||
config_hash: "a857dc4b187cbdf5ac3f04c4aad48ab7587659bc9a3139db206566e10e2fa5e5"
|
config_hash: "f4bfebbb048bed7efa5c0fb71dc188619264edbe8dd09bb195bba8350e609d9c"
|
||||||
last_review: "2026-04-26"
|
last_review: "2026-04-26"
|
||||||
last_reviewer: "Adriano"
|
last_reviewer: "Adriano"
|
||||||
|
|
||||||
@@ -96,6 +96,8 @@ exit:
|
|||||||
- "CLOSE_DELTA"
|
- "CLOSE_DELTA"
|
||||||
|
|
||||||
execution:
|
execution:
|
||||||
|
environment: "testnet" # testnet|mainnet — kill switch on broker mismatch
|
||||||
|
eur_to_usd: "1.075" # default FX rate for sizing engine; override at boot
|
||||||
combo_only: true
|
combo_only: true
|
||||||
initial_limit: "mid"
|
initial_limit: "mid"
|
||||||
reprice_step_ticks: 1
|
reprice_step_ticks: 1
|
||||||
|
|||||||
@@ -0,0 +1,125 @@
|
|||||||
|
"""Tests for the audit chain anti-truncation anchor."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pytest_httpx import HTTPXMock
|
||||||
|
|
||||||
|
from cerbero_bite.config import golden_config
|
||||||
|
from cerbero_bite.config.mcp_endpoints import load_endpoints
|
||||||
|
from cerbero_bite.runtime import build_runtime
|
||||||
|
from cerbero_bite.runtime.orchestrator import Orchestrator
|
||||||
|
from cerbero_bite.state import connect
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.httpx_mock(assert_all_responses_were_requested=False)
|
||||||
|
|
||||||
|
|
||||||
|
def _now() -> datetime:
|
||||||
|
return datetime(2026, 4, 27, 14, 0, tzinfo=UTC)
|
||||||
|
|
||||||
|
|
||||||
|
def _build(tmp_path: Path) -> Orchestrator:
|
||||||
|
ctx = build_runtime(
|
||||||
|
cfg=golden_config(),
|
||||||
|
endpoints=load_endpoints(env={}),
|
||||||
|
token="t",
|
||||||
|
db_path=tmp_path / "state.sqlite",
|
||||||
|
audit_path=tmp_path / "audit.log",
|
||||||
|
retry_max=1,
|
||||||
|
clock=_now,
|
||||||
|
)
|
||||||
|
return Orchestrator(
|
||||||
|
ctx,
|
||||||
|
expected_environment="testnet",
|
||||||
|
eur_to_usd=__import__("decimal").Decimal("1.075"),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _wire_boot_dependencies(httpx_mock: HTTPXMock) -> None:
|
||||||
|
httpx_mock.add_response(
|
||||||
|
url="http://mcp-deribit:9011/tools/environment_info",
|
||||||
|
json={
|
||||||
|
"exchange": "deribit",
|
||||||
|
"environment": "testnet",
|
||||||
|
"source": "env",
|
||||||
|
"env_value": "true",
|
||||||
|
"base_url": "https://test.deribit.com/api/v2",
|
||||||
|
"max_leverage": 3,
|
||||||
|
},
|
||||||
|
is_reusable=True,
|
||||||
|
)
|
||||||
|
httpx_mock.add_response(
|
||||||
|
url="http://mcp-deribit:9011/tools/get_positions",
|
||||||
|
json=[],
|
||||||
|
is_reusable=True,
|
||||||
|
)
|
||||||
|
httpx_mock.add_response(
|
||||||
|
url="http://mcp-macro:9013/tools/get_macro_calendar",
|
||||||
|
json={"events": []},
|
||||||
|
is_reusable=True,
|
||||||
|
)
|
||||||
|
httpx_mock.add_response(
|
||||||
|
url="http://mcp-sentiment:9014/tools/get_cross_exchange_funding",
|
||||||
|
json={"snapshot": {}},
|
||||||
|
is_reusable=True,
|
||||||
|
)
|
||||||
|
httpx_mock.add_response(
|
||||||
|
url="http://mcp-hyperliquid:9012/tools/get_funding_rate",
|
||||||
|
json={"asset": "ETH", "current_funding_rate": 0.0001},
|
||||||
|
is_reusable=True,
|
||||||
|
)
|
||||||
|
httpx_mock.add_response(
|
||||||
|
url="http://mcp-portfolio:9018/tools/get_total_portfolio_value",
|
||||||
|
json={"total_value_eur": 1000.0},
|
||||||
|
is_reusable=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_audit_anchor_persisted_after_append(tmp_path: Path) -> None:
|
||||||
|
orch = _build(tmp_path)
|
||||||
|
orch.context.audit_log.append(
|
||||||
|
event="TEST",
|
||||||
|
payload={"x": 1},
|
||||||
|
now=_now(),
|
||||||
|
)
|
||||||
|
conn = connect(tmp_path / "state.sqlite")
|
||||||
|
try:
|
||||||
|
state = orch.context.repository.get_system_state(conn)
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
assert state is not None
|
||||||
|
assert state.last_audit_hash == orch.context.audit_log.last_hash
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_boot_detects_audit_truncation(
|
||||||
|
tmp_path: Path, httpx_mock: HTTPXMock
|
||||||
|
) -> None:
|
||||||
|
orch = _build(tmp_path)
|
||||||
|
# Append three lines so we have something to truncate.
|
||||||
|
for i in range(3):
|
||||||
|
orch.context.audit_log.append(
|
||||||
|
event=f"E{i}", payload={"i": i}, now=_now()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Truncate the file: keep only the first line.
|
||||||
|
audit_path = tmp_path / "audit.log"
|
||||||
|
head = audit_path.read_text(encoding="utf-8").splitlines(keepends=True)[0]
|
||||||
|
audit_path.write_text(head, encoding="utf-8")
|
||||||
|
|
||||||
|
# Rebuild orchestrator (the AuditLog tail-reads the file again).
|
||||||
|
orch = _build(tmp_path)
|
||||||
|
|
||||||
|
_wire_boot_dependencies(httpx_mock)
|
||||||
|
httpx_mock.add_response(
|
||||||
|
url="http://mcp-telegram:9017/tools/notify_system_error",
|
||||||
|
json={"ok": True},
|
||||||
|
is_reusable=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
await orch.boot()
|
||||||
|
assert orch.context.kill_switch.is_armed() is True
|
||||||
@@ -98,6 +98,7 @@ def _wire_market_data(
|
|||||||
*,
|
*,
|
||||||
spot: float = 3000.0,
|
spot: float = 3000.0,
|
||||||
dvol: float = 50.0,
|
dvol: float = 50.0,
|
||||||
|
historical_close: float | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
httpx_mock.add_response(
|
httpx_mock.add_response(
|
||||||
url="http://mcp-deribit:9011/tools/get_ticker",
|
url="http://mcp-deribit:9011/tools/get_ticker",
|
||||||
@@ -109,6 +110,16 @@ def _wire_market_data(
|
|||||||
json={"currency": "ETH", "latest": dvol, "candles": []},
|
json={"currency": "ETH", "latest": dvol, "candles": []},
|
||||||
is_reusable=True,
|
is_reusable=True,
|
||||||
)
|
)
|
||||||
|
# Bootstrap fallback for return_4h when dvol_history is empty.
|
||||||
|
httpx_mock.add_response(
|
||||||
|
url="http://mcp-deribit:9011/tools/get_historical",
|
||||||
|
json={
|
||||||
|
"candles": (
|
||||||
|
[{"close": historical_close}] if historical_close is not None else []
|
||||||
|
)
|
||||||
|
},
|
||||||
|
is_reusable=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _wire_position_quotes(
|
def _wire_position_quotes(
|
||||||
|
|||||||
@@ -0,0 +1,66 @@
|
|||||||
|
"""Tests for the ``cerbero-bite healthcheck`` subcommand."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import UTC, datetime, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from click.testing import CliRunner
|
||||||
|
|
||||||
|
from cerbero_bite.cli import main as cli_main
|
||||||
|
from cerbero_bite.state import Repository, connect, run_migrations, transaction
|
||||||
|
|
||||||
|
|
||||||
|
def _seed_state(db: Path, *, last_check: datetime, kill_switch: bool = False) -> None:
|
||||||
|
conn = connect(db)
|
||||||
|
try:
|
||||||
|
run_migrations(conn)
|
||||||
|
repo = Repository()
|
||||||
|
with transaction(conn):
|
||||||
|
repo.init_system_state(
|
||||||
|
conn, config_version="1.0.0", now=last_check
|
||||||
|
)
|
||||||
|
if kill_switch:
|
||||||
|
repo.set_kill_switch(
|
||||||
|
conn, armed=True, reason="test", now=last_check
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
repo.touch_health_check(conn, now=last_check)
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_healthcheck_exits_one_when_db_missing(tmp_path: Path) -> None:
|
||||||
|
result = CliRunner().invoke(
|
||||||
|
cli_main,
|
||||||
|
["healthcheck", "--db", str(tmp_path / "absent.sqlite")],
|
||||||
|
)
|
||||||
|
assert result.exit_code == 1
|
||||||
|
assert "unhealthy" in result.output
|
||||||
|
|
||||||
|
|
||||||
|
def test_healthcheck_exits_one_when_kill_switch_armed(tmp_path: Path) -> None:
|
||||||
|
db = tmp_path / "state.sqlite"
|
||||||
|
_seed_state(db, last_check=datetime.now(UTC), kill_switch=True)
|
||||||
|
result = CliRunner().invoke(cli_main, ["healthcheck", "--db", str(db)])
|
||||||
|
assert result.exit_code == 1
|
||||||
|
assert "kill switch" in result.output
|
||||||
|
|
||||||
|
|
||||||
|
def test_healthcheck_exits_one_when_last_check_stale(tmp_path: Path) -> None:
|
||||||
|
db = tmp_path / "state.sqlite"
|
||||||
|
_seed_state(db, last_check=datetime.now(UTC) - timedelta(hours=1))
|
||||||
|
result = CliRunner().invoke(
|
||||||
|
cli_main,
|
||||||
|
["healthcheck", "--db", str(db), "--max-staleness-s", "60"],
|
||||||
|
)
|
||||||
|
assert result.exit_code == 1
|
||||||
|
assert "stale" in result.output
|
||||||
|
|
||||||
|
|
||||||
|
def test_healthcheck_exits_zero_on_recent_check(tmp_path: Path) -> None:
|
||||||
|
db = tmp_path / "state.sqlite"
|
||||||
|
_seed_state(db, last_check=datetime.now(UTC))
|
||||||
|
result = CliRunner().invoke(cli_main, ["healthcheck", "--db", str(db)])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert "healthy" in result.output
|
||||||
Reference in New Issue
Block a user