abf5a140e2
Each bot now manages its own notification + portfolio aggregation: * TelegramClient calls the public Bot API directly via httpx, reading CERBERO_BITE_TELEGRAM_BOT_TOKEN / CERBERO_BITE_TELEGRAM_CHAT_ID from env. No credentials → silent disabled mode. * PortfolioClient composes DeribitClient + HyperliquidClient + the new MacroClient.get_asset_price/eur_usd_rate to expose equity (EUR) and per-asset exposure as the bot's own slice (no cross-bot view). * mcp-telegram and mcp-portfolio removed from MCP_SERVICES / McpEndpoints and the cerbero-bite ping CLI; health_check no longer probes portfolio. Docs (02/04/06/07) and docker-compose updated to reflect the new architecture. 353/353 tests pass; ruff clean; mypy src clean. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
128 lines
4.3 KiB
Python
128 lines
4.3 KiB
Python
"""Periodic health probe across MCP services + SQLite + environment.
|
|
|
|
The probe is fail-soft: every check is wrapped in a try/except so a
|
|
single misbehaving service does not abort the others. The orchestrator
|
|
keeps a counter of consecutive failures: at the third failure the
|
|
kill switch arms (HIGH severity); any time the probe succeeds the
|
|
counter resets and a fresh ``HEALTH_OK`` line is appended to the
|
|
audit log so the dead-man watcher stays quiet.
|
|
"""
|
|
|
|
from __future__ import annotations

import asyncio
import logging
from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from datetime import UTC, datetime
from typing import Literal

from cerbero_bite.runtime.alert_manager import Severity
from cerbero_bite.runtime.dependencies import RuntimeContext
from cerbero_bite.state import connect
|
|
|
|
__all__ = ["HealthCheck", "HealthCheckResult", "HealthState"]
|
|
|
|
|
|
# Module-level logger for the health probe.
_log = logging.getLogger("cerbero_bite.runtime.health")


# Verdict of one probe run: "ok" when no check failed, "degraded" otherwise.
HealthState = Literal["ok", "degraded"]
|
|
|
|
|
|
@dataclass(frozen=True)
class HealthCheckResult:
    """Immutable outcome of a single ``HealthCheck.run`` invocation."""

    # "ok" when no check failed during the run, "degraded" otherwise.
    state: HealthState
    # One ``(service, reason)`` pair per failed check; empty on an "ok" run.
    failures: list[tuple[str, str]]
    # Length of the current failure streak, this run included (0 after success).
    consecutive_failures: int
|
|
|
|
|
|
class HealthCheck:
|
|
"""Stateful health probe; remembers consecutive failures across calls."""
|
|
|
|
def __init__(
|
|
self,
|
|
ctx: RuntimeContext,
|
|
*,
|
|
expected_environment: Literal["testnet", "mainnet"],
|
|
kill_after: int = 3,
|
|
) -> None:
|
|
self._ctx = ctx
|
|
self._expected = expected_environment
|
|
self._kill_after = kill_after
|
|
self._consecutive = 0
|
|
|
|
async def run(self, *, now: datetime | None = None) -> HealthCheckResult:
|
|
when = (now or self._ctx.clock()).astimezone(UTC)
|
|
failures: list[tuple[str, str]] = []
|
|
|
|
async def _probe(service: str, coro: object) -> None:
|
|
try:
|
|
await coro # type: ignore[misc]
|
|
except Exception as exc: # surface every error to the operator
|
|
failures.append((service, f"{type(exc).__name__}: {exc}"))
|
|
|
|
await asyncio.gather(
|
|
_probe("deribit", self._probe_deribit()),
|
|
_probe("macro", self._ctx.macro.get_calendar(days=1)),
|
|
_probe("sentiment", self._probe_sentiment()),
|
|
_probe("hyperliquid", self._ctx.hyperliquid.funding_rate_annualized("ETH")),
|
|
)
|
|
|
|
# SQLite health: lightweight transaction.
|
|
try:
|
|
conn = connect(self._ctx.db_path)
|
|
try:
|
|
self._ctx.repository.touch_health_check(conn, now=when)
|
|
finally:
|
|
conn.close()
|
|
except Exception as exc: # pragma: no cover — sqlite errors are rare
|
|
failures.append(("sqlite", f"{type(exc).__name__}: {exc}"))
|
|
|
|
if failures:
|
|
self._consecutive += 1
|
|
state: HealthState = "degraded"
|
|
self._ctx.audit_log.append(
|
|
event="HEALTH_DEGRADED",
|
|
payload={
|
|
"failures": failures,
|
|
"consecutive": self._consecutive,
|
|
},
|
|
now=when,
|
|
)
|
|
if self._consecutive >= self._kill_after:
|
|
await self._ctx.alert_manager.emit(
|
|
Severity.HIGH,
|
|
source="health_check",
|
|
message=(
|
|
f"{self._consecutive} consecutive health-check failures "
|
|
f"(latest: {failures})"
|
|
),
|
|
)
|
|
else:
|
|
self._consecutive = 0
|
|
state = "ok"
|
|
self._ctx.audit_log.append(
|
|
event="HEALTH_OK", payload={}, now=when
|
|
)
|
|
|
|
return HealthCheckResult(
|
|
state=state,
|
|
failures=failures,
|
|
consecutive_failures=self._consecutive,
|
|
)
|
|
|
|
async def _probe_deribit(self) -> None:
|
|
info = await self._ctx.deribit.environment_info()
|
|
if info.environment != self._expected:
|
|
raise RuntimeError(
|
|
f"deribit environment mismatch: expected {self._expected}, "
|
|
f"got {info.environment}"
|
|
)
|
|
|
|
async def _probe_sentiment(self) -> None:
|
|
# Avoid funding_cross which would raise on empty snapshot during
|
|
# the health probe; we only need a successful HTTP round-trip.
|
|
await self._ctx.sentiment._http.call(
|
|
"get_cross_exchange_funding", {"assets": ["ETH"]}
|
|
)
|