"""Periodic health probe across MCP services + SQLite + environment. The probe is fail-soft: every check is wrapped in a try/except so a single misbehaving service does not abort the others. The orchestrator keeps a counter of consecutive failures: at the third failure the kill switch arms (HIGH severity); any time the probe succeeds the counter resets and a fresh ``HEALTH_OK`` line is appended to the audit log so the dead-man watcher stays quiet. """ from __future__ import annotations import asyncio import logging from dataclasses import dataclass from datetime import UTC, datetime from typing import Literal from cerbero_bite.runtime.alert_manager import Severity from cerbero_bite.runtime.dependencies import RuntimeContext from cerbero_bite.state import connect __all__ = ["HealthCheck", "HealthCheckResult", "HealthState"] _log = logging.getLogger("cerbero_bite.runtime.health") HealthState = Literal["ok", "degraded"] @dataclass(frozen=True) class HealthCheckResult: state: HealthState failures: list[tuple[str, str]] # [(service, reason), ...] consecutive_failures: int class HealthCheck: """Stateful health probe; remembers consecutive failures across calls.""" def __init__( self, ctx: RuntimeContext, *, expected_environment: Literal["testnet", "mainnet"], kill_after: int = 3, ) -> None: self._ctx = ctx self._expected = expected_environment self._kill_after = kill_after self._consecutive = 0 async def run(self, *, now: datetime | None = None) -> HealthCheckResult: when = (now or self._ctx.clock()).astimezone(UTC) failures: list[tuple[str, str]] = [] async def _probe(service: str, coro: object) -> None: try: await coro # type: ignore[misc] except Exception as exc: # surface every error to the operator failures.append((service, f"{type(exc).__name__}: {exc}")) await asyncio.gather( _probe("deribit", self._probe_deribit()), _probe("macro", self._ctx.macro.get_calendar(days=1)), _probe("sentiment", self._probe_sentiment()), _probe("hyperliquid", self._ctx.hyperliquid.funding_rate_annualized("ETH")), ) # SQLite health: lightweight transaction. try: conn = connect(self._ctx.db_path) try: self._ctx.repository.touch_health_check(conn, now=when) finally: conn.close() except Exception as exc: # pragma: no cover — sqlite errors are rare failures.append(("sqlite", f"{type(exc).__name__}: {exc}")) if failures: self._consecutive += 1 state: HealthState = "degraded" self._ctx.audit_log.append( event="HEALTH_DEGRADED", payload={ "failures": failures, "consecutive": self._consecutive, }, now=when, ) if self._consecutive >= self._kill_after: await self._ctx.alert_manager.emit( Severity.HIGH, source="health_check", message=( f"{self._consecutive} consecutive health-check failures " f"(latest: {failures})" ), ) else: self._consecutive = 0 state = "ok" self._ctx.audit_log.append( event="HEALTH_OK", payload={}, now=when ) return HealthCheckResult( state=state, failures=failures, consecutive_failures=self._consecutive, ) async def _probe_deribit(self) -> None: info = await self._ctx.deribit.environment_info() if info.environment != self._expected: raise RuntimeError( f"deribit environment mismatch: expected {self._expected}, " f"got {info.environment}" ) async def _probe_sentiment(self) -> None: # Avoid funding_cross which would raise on empty snapshot during # the health probe; we only need a successful HTTP round-trip. await self._ctx.sentiment._http.call( "get_cross_exchange_funding", {"assets": ["ETH"]} )