Files
Cerbero-Bite/src/cerbero_bite/runtime/health_check.py
T
Adriano abf5a140e2 refactor: telegram + portfolio in-process (drop shared MCP)
Each bot now manages its own notification + portfolio aggregation:

* TelegramClient calls the public Bot API directly via httpx, reading
  CERBERO_BITE_TELEGRAM_BOT_TOKEN / CERBERO_BITE_TELEGRAM_CHAT_ID from
  env. No credentials → silent disabled mode.
* PortfolioClient composes DeribitClient + HyperliquidClient + the new
  MacroClient.get_asset_price/eur_usd_rate to expose equity (EUR) and
  per-asset exposure as the bot's own slice (no cross-bot view).
* mcp-telegram and mcp-portfolio removed from MCP_SERVICES / McpEndpoints
  and the cerbero-bite ping CLI; health_check no longer probes portfolio.

Docs (02/04/06/07) and docker-compose updated to reflect the new
architecture.

353/353 tests pass; ruff clean; mypy src clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 00:31:20 +02:00

128 lines
4.3 KiB
Python

"""Periodic health probe across MCP services + SQLite + environment.
The probe is fail-soft: every check is wrapped in a try/except so a
single misbehaving service does not abort the others. ``HealthCheck``
keeps a counter of consecutive failed runs: once it reaches
``kill_after`` (3 by default) the kill switch arms (HIGH severity);
any time the probe succeeds the counter resets and a fresh
``HEALTH_OK`` line is appended to the audit log so the dead-man
watcher stays quiet.
"""
from __future__ import annotations
import asyncio
import logging
from dataclasses import dataclass
from datetime import UTC, datetime
from typing import Literal
from cerbero_bite.runtime.alert_manager import Severity
from cerbero_bite.runtime.dependencies import RuntimeContext
from cerbero_bite.state import connect
# Names this module exports as its public API.
__all__ = ["HealthCheck", "HealthCheckResult", "HealthState"]
# Module-level logger registered under a fixed dotted name.
_log = logging.getLogger("cerbero_bite.runtime.health")
# Verdict of one probe run: "ok" when no check failed, "degraded" otherwise.
HealthState = Literal["ok", "degraded"]
@dataclass(frozen=True)
class HealthCheckResult:
    """Immutable summary of a single health-probe run."""

    # Overall verdict of the run.
    state: HealthState
    # One ``(service, reason)`` pair per failed check; empty when healthy.
    failures: list[tuple[str, str]]
    # Length of the current streak of degraded runs (0 after a healthy run).
    consecutive_failures: int
class HealthCheck:
"""Stateful health probe; remembers consecutive failures across calls."""
def __init__(
self,
ctx: RuntimeContext,
*,
expected_environment: Literal["testnet", "mainnet"],
kill_after: int = 3,
) -> None:
self._ctx = ctx
self._expected = expected_environment
self._kill_after = kill_after
self._consecutive = 0
async def run(self, *, now: datetime | None = None) -> HealthCheckResult:
when = (now or self._ctx.clock()).astimezone(UTC)
failures: list[tuple[str, str]] = []
async def _probe(service: str, coro: object) -> None:
try:
await coro # type: ignore[misc]
except Exception as exc: # surface every error to the operator
failures.append((service, f"{type(exc).__name__}: {exc}"))
await asyncio.gather(
_probe("deribit", self._probe_deribit()),
_probe("macro", self._ctx.macro.get_calendar(days=1)),
_probe("sentiment", self._probe_sentiment()),
_probe("hyperliquid", self._ctx.hyperliquid.funding_rate_annualized("ETH")),
)
# SQLite health: lightweight transaction.
try:
conn = connect(self._ctx.db_path)
try:
self._ctx.repository.touch_health_check(conn, now=when)
finally:
conn.close()
except Exception as exc: # pragma: no cover — sqlite errors are rare
failures.append(("sqlite", f"{type(exc).__name__}: {exc}"))
if failures:
self._consecutive += 1
state: HealthState = "degraded"
self._ctx.audit_log.append(
event="HEALTH_DEGRADED",
payload={
"failures": failures,
"consecutive": self._consecutive,
},
now=when,
)
if self._consecutive >= self._kill_after:
await self._ctx.alert_manager.emit(
Severity.HIGH,
source="health_check",
message=(
f"{self._consecutive} consecutive health-check failures "
f"(latest: {failures})"
),
)
else:
self._consecutive = 0
state = "ok"
self._ctx.audit_log.append(
event="HEALTH_OK", payload={}, now=when
)
return HealthCheckResult(
state=state,
failures=failures,
consecutive_failures=self._consecutive,
)
async def _probe_deribit(self) -> None:
info = await self._ctx.deribit.environment_info()
if info.environment != self._expected:
raise RuntimeError(
f"deribit environment mismatch: expected {self._expected}, "
f"got {info.environment}"
)
async def _probe_sentiment(self) -> None:
# Avoid funding_cross which would raise on empty snapshot during
# the health probe; we only need a successful HTTP round-trip.
await self._ctx.sentiment._http.call(
"get_cross_exchange_funding", {"assets": ["ETH"]}
)