Hardening round 2: healthcheck, audit anchor, return_4h, exec config, signals

Sei interventi a priorità MEDIA sul sistema. 323 test pass, mypy strict
pulito, ruff clean.

1. Docker HEALTHCHECK + cerbero-bite healthcheck:
   - nuovo subcommand che esce 0 se kill_switch=0 e last_health_check
     entro --max-staleness-s (default 600s);
   - HEALTHCHECK direttiva nel Dockerfile (60s interval, 5s timeout,
     start_period 120s, retries 3);
   - healthcheck definition nel docker-compose.yml.

2. Audit hash chain anti-truncation:
   - migration 0002: nuova colonna system_state.last_audit_hash;
   - AuditLog accetta callback on_append, dependencies.py la wire al
     repository.set_last_audit_hash;
   - Orchestrator.boot verifica che il tail file matcha l'anchor
     persistito; mismatch → kill switch CRITICAL.

3. return_4h bootstrap da deribit get_historical:
   - quando dvol_history è vuoto _fetch_return_4h cade su
     deribit.historical_close (1h candle 4h fa);
   - alert LOW se anche il fallback fallisce.

4. execution.environment + execution.eur_to_usd in strategy.yaml:
   - ExecutionConfig promosso a typed schema con i due campi
     consumati al boot;
   - CLI start preferisce i valori da config; CLI flag overridano
     solo quando differenti dai default.

5. Cycle correlation ID:
   - structlog.contextvars.bind_contextvars in run_entry/run_monitor/
     run_health propaga cycle_id e cycle nei log strutturati.

6. SIGTERM/SIGINT clean shutdown:
   - run_forever installa loop.add_signal_handler per SIGTERM e
     SIGINT; il segnale set()ta un asyncio.Event che termina il
     blocco principale, scheduler.shutdown e ctx.aclose finalizzano.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-28 00:37:39 +02:00
parent 411b747e93
commit b5b96f959c
15 changed files with 477 additions and 24 deletions
+6
View File
@@ -48,5 +48,11 @@ RUN mkdir -p /app/data/log /app/data/backups \
&& chown -R bite:bite /app && chown -R bite:bite /app
USER bite USER bite
# The healthcheck rides on the same Click entrypoint: it queries the
# SQLite singleton and exits 0/1 based on kill_switch + last_health_check.
HEALTHCHECK --interval=60s --timeout=5s --start-period=120s --retries=3 \
CMD ["cerbero-bite", "healthcheck", "--db", "/app/data/state.sqlite"]
ENTRYPOINT ["cerbero-bite"] ENTRYPOINT ["cerbero-bite"]
CMD ["status"] CMD ["status"]
+7
View File
@@ -48,6 +48,13 @@ services:
CERBERO_BITE_MCP_PORTFOLIO_URL: http://mcp-portfolio:9018 CERBERO_BITE_MCP_PORTFOLIO_URL: http://mcp-portfolio:9018
volumes: volumes:
- bite-data:/app/data - bite-data:/app/data
healthcheck:
test:
["CMD", "cerbero-bite", "healthcheck", "--db", "/app/data/state.sqlite"]
interval: 60s
timeout: 5s
retries: 3
start_period: 120s
# Default command runs the engine status check; override with the # Default command runs the engine status check; override with the
# CLI subcommand of choice (start, ping, dry-run, ...). # CLI subcommand of choice (start, ping, dry-run, ...).
command: ["status"] command: ["status"]
+80 -2
View File
@@ -123,6 +123,69 @@ def status(db: Path) -> None:
) )
@main.command()
@click.option(
    "--db",
    type=click.Path(dir_okay=False, path_type=Path),
    default=_DEFAULT_DB_PATH,
    show_default=True,
)
@click.option(
    "--max-staleness-s",
    type=int,
    default=600,
    show_default=True,
    help=(
        "Maximum age (seconds) of last_health_check before the engine is "
        "considered unhealthy. Used by Docker HEALTHCHECK."
    ),
)
def healthcheck(db: Path, max_staleness_s: int) -> None:
    """Exit 0 if the engine is healthy, 1 otherwise.

    The check is intentionally conservative:

    * the SQLite file must exist and be readable,
    * ``system_state.kill_switch`` must be 0,
    * ``system_state.last_health_check`` must not be older than
      ``--max-staleness-s`` seconds.

    Wired as the container HEALTHCHECK in ``Dockerfile``.
    """

    def _fail(message: str) -> None:
        # Single exit path shared by every unhealthy branch.
        console.print(f"[red]unhealthy[/red]: {message}")
        sys.exit(1)

    if not db.exists():
        _fail("state.sqlite missing")

    # Any DB-level failure (corrupt file, locked, migration error) is an
    # unhealthy verdict rather than a traceback: Docker only reads the
    # exit code, so the broad catch is deliberate here.
    try:
        conn = connect_state(db)
        try:
            run_migrations(conn)
            sys_state = Repository().get_system_state(conn)
        finally:
            conn.close()
    except Exception as exc:
        _fail(f"{type(exc).__name__}: {exc}")

    if sys_state is None:
        _fail("system_state singleton missing")
    if sys_state.kill_switch == 1:
        _fail(f"kill switch armed reason={sys_state.kill_reason!r}")

    # Staleness guard: a wedged scheduler stops refreshing the probe
    # timestamp, which this age check turns into an unhealthy container.
    age = (datetime.now(UTC) - sys_state.last_health_check).total_seconds()
    if age > max_staleness_s:
        _fail(f"last_health_check stale ({age:.0f}s > {max_staleness_s}s)")

    console.print(f"[green]healthy[/green] last_check_age={age:.0f}s")
def _engine_options(func: Callable[..., Any]) -> Callable[..., Any]: def _engine_options(func: Callable[..., Any]) -> Callable[..., Any]:
"""Common options for the engine commands.""" """Common options for the engine commands."""
decorators = [ decorators = [
@@ -181,14 +244,29 @@ def _build_orchestrator(
) -> Orchestrator: ) -> Orchestrator:
loaded = load_strategy(strategy_path, enforce_hash=enforce_hash) loaded = load_strategy(strategy_path, enforce_hash=enforce_hash)
token = load_token(path=token_file) token = load_token(path=token_file)
# Strategy file values win over the CLI defaults; explicit overrides
# via env-style values (CLI flags) still apply when the user provides
# them — Click signals "default" via Click's resilient_parsing flag,
# but for now the CLI value is treated as authoritative when it
# differs from the documented default to keep the surface small.
cfg_env = loaded.config.execution.environment
cfg_fx = loaded.config.execution.eur_to_usd
chosen_env = (
environment if environment != "testnet" or cfg_env == "testnet" else cfg_env
)
chosen_fx = (
Decimal(str(eur_to_usd))
if eur_to_usd != 1.075
else cfg_fx
)
return make_orchestrator( return make_orchestrator(
cfg=loaded.config, cfg=loaded.config,
endpoints=load_endpoints(), endpoints=load_endpoints(),
token=token, token=token,
db_path=db, db_path=db,
audit_path=audit, audit_path=audit,
expected_environment=environment, # type: ignore[arg-type] expected_environment=chosen_env, # type: ignore[arg-type]
eur_to_usd=Decimal(str(eur_to_usd)), eur_to_usd=chosen_fx,
) )
+12 -1
View File
@@ -203,7 +203,18 @@ class _LooseSection(BaseModel):
model_config = ConfigDict(frozen=True, extra="allow") model_config = ConfigDict(frozen=True, extra="allow")
class ExecutionConfig(_LooseSection): ... class ExecutionConfig(BaseModel):
"""Runtime execution settings consumed by the orchestrator.
The remaining knobs (initial_limit, reprice_step_ticks, …) live as
extra fields validated lazily — they will graduate to typed fields
when the order-management layer needs them.
"""
model_config = ConfigDict(frozen=True, extra="allow")
environment: Literal["testnet", "mainnet"] = "testnet"
eur_to_usd: Decimal = Field(default=Decimal("1.075"))
class MonitoringConfig(_LooseSection): ... class MonitoringConfig(_LooseSection): ...
+17 -1
View File
@@ -103,7 +103,23 @@ def build_runtime(
finally: finally:
conn.close() conn.close()
audit_log = AuditLog(audit_path) def _persist_audit_anchor(line_hash: str) -> None:
"""Mirror the latest audit chain hash into ``system_state``.
Best-effort: if SQLite is locked by another writer the audit
log itself is still consistent, the anchor will catch up on
the next append.
"""
anchor_conn = connect(db_path)
try:
with transaction(anchor_conn):
repository.set_last_audit_hash(anchor_conn, hex_hash=line_hash)
except Exception: # pragma: no cover — durability is best-effort
pass
finally:
anchor_conn.close()
audit_log = AuditLog(audit_path, on_append=_persist_audit_anchor)
kill_switch = KillSwitch( kill_switch = KillSwitch(
connection_factory=lambda: connect(db_path), connection_factory=lambda: connect(db_path),
repository=repository, repository=repository,
+33 -10
View File
@@ -154,13 +154,19 @@ def _option_type_from_name(name: str) -> PutOrCall:
async def _fetch_return_4h(ctx: RuntimeContext, *, now: datetime) -> Decimal: async def _fetch_return_4h(ctx: RuntimeContext, *, now: datetime) -> Decimal:
"""Compute ETH 4h return from the locally stored dvol_history snapshots. """Compute ETH 4h return.
The orchestrator records a snapshot at the start of every monitor Resolution order:
cycle (see :func:`run_monitor_cycle`); this helper reads the most
recent snapshot at least 3.5h old and computes ``(now / past) - 1``. 1. local ``dvol_history`` snapshot at least 3h30 old (recorded by
Returns 0 if no historical sample is available — in that branch the previous monitor cycles);
orchestrator emits a LOW alert about insufficient history. 2. Deribit ``get_historical`` 1h candles 4h ago — bootstrap when
SQLite has no recent sample (first cycle after a fresh
container, or after long downtime).
Returns ``0`` only when both sources fail; in that case the
monitor cycle emits a LOW alert and exit_decision falls back to
HOLD on the adverse-move trigger.
""" """
cutoff = now - timedelta(hours=3, minutes=30) cutoff = now - timedelta(hours=3, minutes=30)
floor = now - timedelta(hours=8) floor = now - timedelta(hours=8)
@@ -174,14 +180,31 @@ async def _fetch_return_4h(ctx: RuntimeContext, *, now: datetime) -> Decimal:
).fetchone() ).fetchone()
finally: finally:
conn.close() conn.close()
if row is None: if row is not None:
return Decimal("0")
past_spot = Decimal(str(row[1])) past_spot = Decimal(str(row[1]))
if past_spot == 0: if past_spot != 0:
return Decimal("0")
spot_now = await ctx.deribit.index_price_eth() spot_now = await ctx.deribit.index_price_eth()
return spot_now / past_spot - Decimal("1") return spot_now / past_spot - Decimal("1")
# Fallback: ask Deribit for the 4h candle close.
try:
past_close = await ctx.deribit.historical_close(
instrument="ETH-PERPETUAL",
start=now - timedelta(hours=5),
end=now - timedelta(hours=3, minutes=30),
resolution="1h",
)
except Exception: # pragma: no cover — defensive, surface as LOW alert
past_close = None
if past_close is None or past_close == 0:
await ctx.alert_manager.low(
source="monitor_cycle",
message="no return_4h sample available (history empty + bootstrap failed)",
)
return Decimal("0")
spot_now = await ctx.deribit.index_price_eth()
return spot_now / past_close - Decimal("1")
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Cycle entry point # Cycle entry point
+80 -2
View File
@@ -17,7 +17,9 @@ from datetime import UTC, datetime
from decimal import Decimal from decimal import Decimal
from pathlib import Path from pathlib import Path
from typing import Literal from typing import Literal
from uuid import uuid4
import structlog
from apscheduler.schedulers.asyncio import AsyncIOScheduler from apscheduler.schedulers.asyncio import AsyncIOScheduler
from cerbero_bite.config.mcp_endpoints import McpEndpoints from cerbero_bite.config.mcp_endpoints import McpEndpoints
@@ -29,6 +31,7 @@ from cerbero_bite.runtime.lockfile import EngineLock
from cerbero_bite.runtime.monitor_cycle import MonitorCycleResult, run_monitor_cycle from cerbero_bite.runtime.monitor_cycle import MonitorCycleResult, run_monitor_cycle
from cerbero_bite.runtime.recovery import recover_state from cerbero_bite.runtime.recovery import recover_state
from cerbero_bite.runtime.scheduler import JobSpec, build_scheduler from cerbero_bite.runtime.scheduler import JobSpec, build_scheduler
from cerbero_bite.state import connect as connect_state
__all__ = ["Orchestrator"] __all__ = ["Orchestrator"]
@@ -82,6 +85,7 @@ class Orchestrator:
async def boot(self) -> _BootResult: async def boot(self) -> _BootResult:
"""Reconcile state, verify environment, run a first health probe.""" """Reconcile state, verify environment, run a first health probe."""
when = self._ctx.clock() when = self._ctx.clock()
await self._verify_audit_anchor(now=when)
await recover_state(self._ctx, now=when) await recover_state(self._ctx, now=when)
info = await self._ctx.deribit.environment_info() info = await self._ctx.deribit.environment_info()
@@ -111,22 +115,70 @@ class Orchestrator:
# Cycle invocations (used by scheduler jobs and CLI dry-run) # Cycle invocations (used by scheduler jobs and CLI dry-run)
# ------------------------------------------------------------------ # ------------------------------------------------------------------
async def _verify_audit_anchor(self, *, now: datetime) -> None: # noqa: ARG002
"""Compare the audit log tail with the SQLite anchor.
``now`` is accepted for symmetry with the other ``boot``
helpers but unused: the comparison is purely between the
in-memory tail hash and the value persisted on the previous
run.
"""
conn = connect_state(self._ctx.db_path)
try:
state = self._ctx.repository.get_system_state(conn)
finally:
conn.close()
if state is None or state.last_audit_hash is None:
return # first boot, nothing to compare against
actual_tail = self._ctx.audit_log.last_hash
if actual_tail != state.last_audit_hash:
await self._ctx.alert_manager.critical(
source="orchestrator.boot",
message=(
f"audit log anchor mismatch: anchor="
f"{state.last_audit_hash[:12]}…, file tail="
f"{actual_tail[:12]}… — possible tampering or truncation"
),
component="safety.audit_log",
)
async def run_entry( async def run_entry(
self, *, now: datetime | None = None self, *, now: datetime | None = None
) -> EntryCycleResult: ) -> EntryCycleResult:
cycle_id = str(uuid4())
token = structlog.contextvars.bind_contextvars(
cycle="entry", cycle_id=cycle_id
)
try:
return await run_entry_cycle( return await run_entry_cycle(
self._ctx, eur_to_usd_rate=self._eur_to_usd, now=now self._ctx, eur_to_usd_rate=self._eur_to_usd, now=now
) )
finally:
structlog.contextvars.reset_contextvars(**token)
async def run_monitor( async def run_monitor(
self, *, now: datetime | None = None self, *, now: datetime | None = None
) -> MonitorCycleResult: ) -> MonitorCycleResult:
cycle_id = str(uuid4())
token = structlog.contextvars.bind_contextvars(
cycle="monitor", cycle_id=cycle_id
)
try:
return await run_monitor_cycle(self._ctx, now=now) return await run_monitor_cycle(self._ctx, now=now)
finally:
structlog.contextvars.reset_contextvars(**token)
async def run_health( async def run_health(
self, *, now: datetime | None = None self, *, now: datetime | None = None
) -> HealthCheckResult: ) -> HealthCheckResult:
cycle_id = str(uuid4())
token = structlog.contextvars.bind_contextvars(
cycle="health", cycle_id=cycle_id
)
try:
return await self._health.run(now=now) return await self._health.run(now=now)
finally:
structlog.contextvars.reset_contextvars(**token)
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Scheduler lifecycle # Scheduler lifecycle
@@ -191,8 +243,14 @@ class Orchestrator:
"""Boot, acquire the single-instance lock, install the scheduler. """Boot, acquire the single-instance lock, install the scheduler.
``lock_path`` defaults to ``<db_path.parent>/.lockfile`` so two ``lock_path`` defaults to ``<db_path.parent>/.lockfile`` so two
containers cannot trade against the same SQLite file. containers cannot trade against the same SQLite file. SIGTERM
and SIGINT are intercepted so Docker (or the operator) can
signal a clean shutdown — the scheduler is stopped, in-flight
cycles complete, the audit log fsyncs, and the HTTP client is
closed before the process exits.
""" """
import signal # noqa: PLC0415 — only needed by run_forever
lock = EngineLock( lock = EngineLock(
lock_path or self._ctx.db_path.parent / ".lockfile" lock_path or self._ctx.db_path.parent / ".lockfile"
) )
@@ -201,8 +259,28 @@ class Orchestrator:
await self.boot() await self.boot()
scheduler = self.install_scheduler() scheduler = self.install_scheduler()
scheduler.start() scheduler.start()
stop_event = asyncio.Event()
def _on_signal(signame: str) -> None:
_log.info("received %s — initiating shutdown", signame)
stop_event.set()
loop = asyncio.get_running_loop()
for sig_name in ("SIGTERM", "SIGINT"):
sig = getattr(signal, sig_name, None)
if sig is None: # pragma: no cover — Windows fallback
continue
try: try:
await asyncio.Event().wait() loop.add_signal_handler(
sig, _on_signal, sig_name
)
except NotImplementedError: # pragma: no cover
# Some sandboxes (Windows asyncio) don't support
# add_signal_handler; fall back to no-op.
signal.signal(sig, lambda *_: stop_event.set())
try:
await stop_event.wait()
finally: finally:
scheduler.shutdown(wait=False) scheduler.shutdown(wait=False)
finally: finally:
+10 -2
View File
@@ -17,7 +17,7 @@ from __future__ import annotations
import hashlib import hashlib
import json import json
import os import os
from collections.abc import Iterator from collections.abc import Callable, Iterator
from dataclasses import dataclass from dataclasses import dataclass
from datetime import UTC, datetime from datetime import UTC, datetime
from pathlib import Path from pathlib import Path
@@ -176,10 +176,16 @@ class AuditLog:
fsync'd before returning. fsync'd before returning.
""" """
def __init__(self, path: str | Path) -> None: def __init__(
self,
path: str | Path,
*,
on_append: Callable[[str], None] | None = None,
) -> None:
self._path = Path(path) self._path = Path(path)
self._path.parent.mkdir(parents=True, exist_ok=True) self._path.parent.mkdir(parents=True, exist_ok=True)
self._last_hash: str = self._tail_hash() or GENESIS_HASH self._last_hash: str = self._tail_hash() or GENESIS_HASH
self._on_append = on_append
@property @property
def path(self) -> Path: # pragma: no cover — accessor used by callers only def path(self) -> Path: # pragma: no cover — accessor used by callers only
@@ -237,6 +243,8 @@ class AuditLog:
os.fsync(fh.fileno()) os.fsync(fh.fileno())
self._last_hash = line_hash self._last_hash = line_hash
if self._on_append is not None:
self._on_append(line_hash)
return AuditEntry( return AuditEntry(
timestamp=ts, timestamp=ts,
event=event, event=event,
@@ -0,0 +1,8 @@
-- 0002_audit_anchor.sql — store the latest audit chain hash inside
-- system_state so a truncation of the audit log file can be detected
-- at boot (the file would still verify on its own, but the recorded
-- anchor would not match the file's tail hash).
-- The column is nullable on purpose: it stays NULL until the first
-- audit append after this migration, which boot treats as "nothing to
-- compare against".
ALTER TABLE system_state ADD COLUMN last_audit_hash TEXT;
-- Bump the schema version so the migration runner skips this file on
-- subsequent starts.
PRAGMA user_version = 2;
+1
View File
@@ -152,3 +152,4 @@ class SystemStateRecord(BaseModel):
last_kelly_calib: datetime | None = None last_kelly_calib: datetime | None = None
config_version: str config_version: str
started_at: datetime started_at: datetime
last_audit_hash: str | None = None
+13
View File
@@ -414,6 +414,7 @@ class Repository:
row = conn.execute("SELECT * FROM system_state WHERE id = 1").fetchone() row = conn.execute("SELECT * FROM system_state WHERE id = 1").fetchone()
if row is None: if row is None:
return None return None
keys = row.keys()
return SystemStateRecord( return SystemStateRecord(
id=int(row["id"]), id=int(row["id"]),
kill_switch=int(row["kill_switch"]), kill_switch=int(row["kill_switch"]),
@@ -423,6 +424,18 @@ class Repository:
last_kelly_calib=_dec_dt(row["last_kelly_calib"]), last_kelly_calib=_dec_dt(row["last_kelly_calib"]),
config_version=row["config_version"], config_version=row["config_version"],
started_at=_dec_dt_required(row["started_at"]), started_at=_dec_dt_required(row["started_at"]),
last_audit_hash=(
row["last_audit_hash"] if "last_audit_hash" in keys else None
),
)
def set_last_audit_hash(
self, conn: sqlite3.Connection, *, hex_hash: str
) -> None:
"""Store the most recent audit chain hash. Called by AuditLog after append."""
conn.execute(
"UPDATE system_state SET last_audit_hash = ? WHERE id = 1",
(hex_hash,),
) )
def set_kill_switch( def set_kill_switch(
+3 -1
View File
@@ -7,7 +7,7 @@
# the commit message. # the commit message.
config_version: "1.0.0" config_version: "1.0.0"
config_hash: "a857dc4b187cbdf5ac3f04c4aad48ab7587659bc9a3139db206566e10e2fa5e5" config_hash: "f4bfebbb048bed7efa5c0fb71dc188619264edbe8dd09bb195bba8350e609d9c"
last_review: "2026-04-26" last_review: "2026-04-26"
last_reviewer: "Adriano" last_reviewer: "Adriano"
@@ -96,6 +96,8 @@ exit:
- "CLOSE_DELTA" - "CLOSE_DELTA"
execution: execution:
environment: "testnet" # testnet|mainnet — kill switch on broker mismatch
eur_to_usd: "1.075" # default FX rate for sizing engine; override at boot
combo_only: true combo_only: true
initial_limit: "mid" initial_limit: "mid"
reprice_step_ticks: 1 reprice_step_ticks: 1
+125
View File
@@ -0,0 +1,125 @@
"""Tests for the audit chain anti-truncation anchor."""
from __future__ import annotations
from datetime import UTC, datetime
from pathlib import Path
import pytest
from pytest_httpx import HTTPXMock
from cerbero_bite.config import golden_config
from cerbero_bite.config.mcp_endpoints import load_endpoints
from cerbero_bite.runtime import build_runtime
from cerbero_bite.runtime.orchestrator import Orchestrator
from cerbero_bite.state import connect
pytestmark = pytest.mark.httpx_mock(assert_all_responses_were_requested=False)
def _now() -> datetime:
return datetime(2026, 4, 27, 14, 0, tzinfo=UTC)
def _build(tmp_path: Path) -> Orchestrator:
    """Build an Orchestrator wired against throwaway SQLite/audit files.

    Uses the golden strategy config, empty-env endpoints and the frozen
    ``_now`` clock so every test run is deterministic and isolated in
    ``tmp_path``.
    """
    # Function-scope import instead of the original ``__import__("decimal")``
    # hack — same behavior, readable and mypy/ruff friendly, and it keeps
    # the module's import header untouched.
    from decimal import Decimal

    ctx = build_runtime(
        cfg=golden_config(),
        endpoints=load_endpoints(env={}),
        token="t",
        db_path=tmp_path / "state.sqlite",
        audit_path=tmp_path / "audit.log",
        retry_max=1,
        clock=_now,
    )
    return Orchestrator(
        ctx,
        expected_environment="testnet",
        eur_to_usd=Decimal("1.075"),
    )
def _wire_boot_dependencies(httpx_mock: HTTPXMock) -> None:
    """Stub every MCP endpoint touched during ``Orchestrator.boot``.

    Each response is registered with ``is_reusable=True`` so a second
    call to the same tool does not exhaust the mock; the module-level
    ``pytestmark`` already disables the "all responses requested"
    assertion, so unused stubs are harmless.
    """
    # Deribit reports a matching testnet environment → no kill switch
    # from the environment check itself.
    httpx_mock.add_response(
        url="http://mcp-deribit:9011/tools/environment_info",
        json={
            "exchange": "deribit",
            "environment": "testnet",
            "source": "env",
            "env_value": "true",
            "base_url": "https://test.deribit.com/api/v2",
            "max_leverage": 3,
        },
        is_reusable=True,
    )
    # No open positions: recovery has nothing to reconcile.
    httpx_mock.add_response(
        url="http://mcp-deribit:9011/tools/get_positions",
        json=[],
        is_reusable=True,
    )
    httpx_mock.add_response(
        url="http://mcp-macro:9013/tools/get_macro_calendar",
        json={"events": []},
        is_reusable=True,
    )
    httpx_mock.add_response(
        url="http://mcp-sentiment:9014/tools/get_cross_exchange_funding",
        json={"snapshot": {}},
        is_reusable=True,
    )
    httpx_mock.add_response(
        url="http://mcp-hyperliquid:9012/tools/get_funding_rate",
        json={"asset": "ETH", "current_funding_rate": 0.0001},
        is_reusable=True,
    )
    httpx_mock.add_response(
        url="http://mcp-portfolio:9018/tools/get_total_portfolio_value",
        json={"total_value_eur": 1000.0},
        is_reusable=True,
    )
@pytest.mark.asyncio
async def test_audit_anchor_persisted_after_append(tmp_path: Path) -> None:
    """Appending to the audit log mirrors its tail hash into system_state."""
    orch = _build(tmp_path)
    # One append is enough: the on_append callback wired by build_runtime
    # should persist the resulting chain hash as the SQLite anchor.
    orch.context.audit_log.append(
        event="TEST",
        payload={"x": 1},
        now=_now(),
    )
    conn = connect(tmp_path / "state.sqlite")
    try:
        state = orch.context.repository.get_system_state(conn)
    finally:
        conn.close()
    assert state is not None
    # Persisted anchor must equal the log's in-memory tail hash.
    assert state.last_audit_hash == orch.context.audit_log.last_hash
@pytest.mark.asyncio
async def test_boot_detects_audit_truncation(
    tmp_path: Path, httpx_mock: HTTPXMock
) -> None:
    """Truncating the audit file between runs arms the kill switch at boot."""
    orch = _build(tmp_path)
    # Append three lines so we have something to truncate.
    for i in range(3):
        orch.context.audit_log.append(
            event=f"E{i}", payload={"i": i}, now=_now()
        )
    # Truncate the file: keep only the first line.
    audit_path = tmp_path / "audit.log"
    head = audit_path.read_text(encoding="utf-8").splitlines(keepends=True)[0]
    audit_path.write_text(head, encoding="utf-8")
    # Rebuild orchestrator (the AuditLog tail-reads the file again).
    orch = _build(tmp_path)
    _wire_boot_dependencies(httpx_mock)
    # NOTE(review): the anchor mismatch presumably fires a CRITICAL alert
    # through the Telegram MCP — stub that endpoint so boot completes.
    httpx_mock.add_response(
        url="http://mcp-telegram:9017/tools/notify_system_error",
        json={"ok": True},
        is_reusable=True,
    )
    await orch.boot()
    # Persisted anchor (run 1) no longer matches the file tail → armed.
    assert orch.context.kill_switch.is_armed() is True
+11
View File
@@ -98,6 +98,7 @@ def _wire_market_data(
*, *,
spot: float = 3000.0, spot: float = 3000.0,
dvol: float = 50.0, dvol: float = 50.0,
historical_close: float | None = None,
) -> None: ) -> None:
httpx_mock.add_response( httpx_mock.add_response(
url="http://mcp-deribit:9011/tools/get_ticker", url="http://mcp-deribit:9011/tools/get_ticker",
@@ -109,6 +110,16 @@ def _wire_market_data(
json={"currency": "ETH", "latest": dvol, "candles": []}, json={"currency": "ETH", "latest": dvol, "candles": []},
is_reusable=True, is_reusable=True,
) )
# Bootstrap fallback for return_4h when dvol_history is empty.
httpx_mock.add_response(
url="http://mcp-deribit:9011/tools/get_historical",
json={
"candles": (
[{"close": historical_close}] if historical_close is not None else []
)
},
is_reusable=True,
)
def _wire_position_quotes( def _wire_position_quotes(
+66
View File
@@ -0,0 +1,66 @@
"""Tests for the ``cerbero-bite healthcheck`` subcommand."""
from __future__ import annotations
from datetime import UTC, datetime, timedelta
from pathlib import Path
from click.testing import CliRunner
from cerbero_bite.cli import main as cli_main
from cerbero_bite.state import Repository, connect, run_migrations, transaction
def _seed_state(db: Path, *, last_check: datetime, kill_switch: bool = False) -> None:
    """Create a migrated state DB whose singleton row carries *last_check*.

    With ``kill_switch=True`` the switch is armed instead of touching the
    health-check timestamp, so both unhealthy branches can be exercised.
    """
    connection = connect(db)
    try:
        run_migrations(connection)
        repository = Repository()
        with transaction(connection):
            repository.init_system_state(
                connection, config_version="1.0.0", now=last_check
            )
            if not kill_switch:
                repository.touch_health_check(connection, now=last_check)
            else:
                repository.set_kill_switch(
                    connection, armed=True, reason="test", now=last_check
                )
    finally:
        connection.close()
def test_healthcheck_exits_one_when_db_missing(tmp_path: Path) -> None:
    """A missing SQLite file is reported as unhealthy (exit code 1)."""
    missing_db = tmp_path / "absent.sqlite"
    runner = CliRunner()
    result = runner.invoke(cli_main, ["healthcheck", "--db", str(missing_db)])
    assert result.exit_code == 1
    assert "unhealthy" in result.output
def test_healthcheck_exits_one_when_kill_switch_armed(tmp_path: Path) -> None:
    """An armed kill switch makes the probe fail even with a fresh check."""
    db = tmp_path / "state.sqlite"
    _seed_state(db, last_check=datetime.now(UTC), kill_switch=True)
    outcome = CliRunner().invoke(cli_main, ["healthcheck", "--db", str(db)])
    assert outcome.exit_code == 1
    assert "kill switch" in outcome.output
def test_healthcheck_exits_one_when_last_check_stale(tmp_path: Path) -> None:
    """A last_health_check older than --max-staleness-s fails the probe."""
    db = tmp_path / "state.sqlite"
    one_hour_ago = datetime.now(UTC) - timedelta(hours=1)
    _seed_state(db, last_check=one_hour_ago)
    # 1h-old timestamp against a 60s budget → stale.
    args = ["healthcheck", "--db", str(db), "--max-staleness-s", "60"]
    result = CliRunner().invoke(cli_main, args)
    assert result.exit_code == 1
    assert "stale" in result.output
def test_healthcheck_exits_zero_on_recent_check(tmp_path: Path) -> None:
    """Happy path: a just-touched health check yields exit code 0."""
    db = tmp_path / "state.sqlite"
    _seed_state(db, last_check=datetime.now(UTC))
    command = ["healthcheck", "--db", str(db)]
    result = CliRunner().invoke(cli_main, command)
    assert result.exit_code == 0
    assert "healthy" in result.output