feat(V2): /health/ready con ping client + middleware request log strutturato + request_id correlation

- /health/ready: ping di tutti i client (exchange, env) cached con
  timeout 2s, status ready|degraded|not_ready, opt-in 503 via
  READY_FAILS_ON_DEGRADED.
- Middleware mcp.request: 1 riga JSON per HTTP request con request_id,
  method, path, status_code, duration_ms, actor, bot_tag, exchange,
  tool, client_ip, user_agent.
- request_id propagato in request.state, audit log e error envelope per
  correlazione cross-cutting.
- Aggiunto async health() come probe minimo a bybit/alpaca/macro/
  sentiment/deribit (hyperliquid lo aveva già).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
AdrianoDev
2026-05-01 09:03:28 +02:00
parent 9afd087152
commit 8ecc1a24a9
13 changed files with 509 additions and 2 deletions
+40 -1
View File
@@ -80,7 +80,8 @@ non è richiesto sugli endpoint pubblici (`/health`, `/apidocs`,
| Path | Descrizione |
|---|---|
| `GET /health` | Healthcheck (no auth) |
| `GET /health` | Liveness check (no auth) |
| `GET /health/ready` | Readiness check con ping client exchange (no auth) |
| `GET /apidocs` | Swagger UI (no auth) |
| `GET /openapi.json` | Schema OpenAPI 3.1 (no auth) |
| `POST /mcp-deribit/tools/{tool}` | Tool exchange Deribit |
@@ -91,6 +92,44 @@ non è richiesto sugli endpoint pubblici (`/health`, `/apidocs`,
| `POST /mcp-sentiment/tools/{tool}` | Tool sentiment/news |
| `GET /admin/audit` | Query dell'audit log JSONL (bearer richiesto, no X-Bot-Tag) |
## Observability
### Health check
L'applicazione espone due endpoint distinti per il monitoring:
- `GET /health` — liveness check semplice. Non richiede autenticazione e
ritorna sempre HTTP 200 finché il processo è vivo. Ideale per la
liveness probe di Kubernetes o per il pinger di Traefik.
- `GET /health/ready` — readiness check evoluto. Itera tutti i client
exchange presenti nel registry e per ciascuno tenta una probe leggera
(`health()` se disponibile, fallback su `is_testnet()`), con timeout
di 2 secondi per client. La risposta contiene il campo `status` con
uno dei valori `ready` (tutti i client rispondono), `degraded` (almeno
uno fallisce) o `not_ready` (registry vuoto) ed un array `clients` con
un record per ogni coppia `(exchange, env)` cached. Per default
l'endpoint risponde sempre con HTTP 200; impostando la variabile
d'ambiente `READY_FAILS_ON_DEGRADED=true` si forza HTTP 503 quando lo
stato non è `ready`, comportamento utile per la readiness probe di
Kubernetes.
### Request log
Ogni richiesta HTTP attraversa un middleware che emette una riga JSON
sul logger `mcp.request` con i seguenti campi: `request_id`, `method`,
`path`, `status_code`, `duration_ms`, `actor` (`testnet` o `mainnet`,
solo se autenticato), `bot_tag` (header `X-Bot-Tag` se presente),
`exchange` (estratto dal path `/mcp-{exchange}/...`), `tool` (nome del
tool quando il path è `/mcp-X/tools/Y`), `client_ip`, `user_agent`. Lo
stesso `request_id` viene incluso anche nei record dell'audit log
`mcp.audit` e nell'envelope di errore restituito al client, in modo da
poter correlare le tre tracce a parità di richiesta.
### Audit log
Vedi la sezione "Audit query" qui sotto per la consultazione del log
strutturato delle operazioni di scrittura.
## Audit query
`GET /admin/audit` legge il file JSONL puntato da `AUDIT_LOG_FILE` e
+8 -1
View File
@@ -15,7 +15,14 @@ Environment = Literal["testnet", "mainnet"]
# Path che bypassano sia bearer auth sia bot_tag check.
PATH_WHITELIST_FULL = frozenset(
{"/health", "/apidocs", "/openapi.json", "/docs", "/redoc"}
{
"/health",
"/health/ready",
"/apidocs",
"/openapi.json",
"/docs",
"/redoc",
}
)
# Path che richiedono bearer ma NON il bot_tag (admin endpoint).
PATH_WHITELIST_BOT_TAG_ONLY = frozenset({"/admin/audit"})
+5
View File
@@ -74,6 +74,7 @@ def audit_write_op(
payload: dict[str, Any] | None = None,
result: dict[str, Any] | None = None,
error: str | None = None,
request_id: str | None = None,
) -> None:
"""Emit a structured audit log record per write operation.
@@ -86,6 +87,8 @@ def audit_write_op(
payload: input non-sensibile (qty, side, leverage, ecc.).
result: output del client (order_id, status, ecc.).
error: stringa errore se l'operazione ha fallito.
request_id: id propagato dal middleware request log per correlazione
tra audit log e request log.
"""
_configure_audit_sink()
record: dict[str, Any] = {
@@ -97,6 +100,8 @@ def audit_write_op(
"target": target,
"payload": payload or {},
}
if request_id is not None:
record["request_id"] = request_id
if result is not None:
record["result"] = _summarize_result(result)
if error is not None:
+3
View File
@@ -58,6 +58,7 @@ async def audit_call(
"""Esegue tool_fn e logga audit (success o error). Riraisola eccezioni."""
actor = getattr(request.state, "environment", None)
bot_tag = getattr(request.state, "bot_tag", None)
request_id = getattr(request.state, "request_id", None)
target = _extract_target(params, target_field)
payload = _safe_dump(params)
@@ -72,6 +73,7 @@ async def audit_call(
target=target,
payload=payload,
error=f"{type(e).__name__}: {e}",
request_id=request_id,
)
raise
@@ -93,5 +95,6 @@ async def audit_call(
target=target,
payload=payload,
result=audit_result,
request_id=request_id,
)
return result
+104
View File
@@ -0,0 +1,104 @@
"""Middleware: structured JSON request log per ogni HTTP request.
Emette una riga JSON sul logger ``mcp.request`` con campi correlabili
all'audit log via ``request_id``. Espone anche ``request_id`` su
``request.state`` così che handler/exception handler downstream possano
includerlo nei propri payload.
"""
from __future__ import annotations
import logging
import time
import uuid
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime
from typing import Any
from fastapi import FastAPI, Request
from starlette.responses import Response
from cerbero_mcp.common.logging import get_json_logger
_logger = get_json_logger("mcp.request", level=logging.INFO)
def _extract_exchange(path: str) -> str | None:
"""Estrae il nome dell'exchange dal path se è un ``/mcp-{exchange}/...``."""
if not path.startswith("/mcp-"):
return None
rest = path[len("/mcp-"):]
end = rest.find("/")
if end < 0:
return rest or None
return rest[:end] or None
def _extract_tool(path: str) -> str | None:
"""Estrae nome tool dal path ``/mcp-X/tools/Y``."""
parts = path.split("/")
# ["", "mcp-deribit", "tools", "place_order"]
if len(parts) >= 4 and parts[2] == "tools":
return parts[3] or None
return None
def install_request_log_middleware(app: FastAPI) -> None:
"""Aggiunge un middleware HTTP che logga JSON per ogni request."""
@app.middleware("http")
async def request_log(
request: Request,
call_next: Callable[[Request], Awaitable[Response]],
) -> Response:
request_id = uuid.uuid4().hex
# Espone request_id per uso downstream (audit, error envelope)
request.state.request_id = request_id
t0 = time.perf_counter()
status_code = 500
error: str | None = None
response: Response | None = None
try:
response = await call_next(request)
status_code = response.status_code
except Exception as e:
error = f"{type(e).__name__}: {str(e)[:200]}"
raise
finally:
dur_ms = (time.perf_counter() - t0) * 1000
path = request.url.path
payload: dict[str, Any] = {
"event": "request",
"request_id": request_id,
"method": request.method,
"path": path,
"status_code": status_code,
"duration_ms": round(dur_ms, 2),
"timestamp": datetime.now(UTC).isoformat(),
}
ua = request.headers.get("user-agent")
if ua:
payload["user_agent"] = ua[:200]
client = request.client
if client is not None:
payload["client_ip"] = client.host
actor = getattr(request.state, "environment", None)
if actor:
payload["actor"] = actor
bot_tag = getattr(request.state, "bot_tag", None)
if bot_tag:
payload["bot_tag"] = bot_tag
exchange = _extract_exchange(path)
if exchange:
payload["exchange"] = exchange
tool = _extract_tool(path)
if tool:
payload["tool"] = tool
if error:
payload["error"] = error
_logger.error("request", extra=payload)
else:
_logger.info("request", extra=payload)
# response è settato se non c'è stata eccezione (altrimenti
# l'eccezione è stata già rilanciata dal blocco except).
assert response is not None
return response
@@ -99,6 +99,10 @@ class AlpacaClient:
if not self._http.is_closed:
await self._http.aclose()
async def health(self) -> dict[str, Any]:
"""Probe minimo per /health/ready: nessuna chiamata di rete."""
return {"status": "ok", "paper": self.paper}
# ── Helpers ──────────────────────────────────────────────────
@property
@@ -75,6 +75,10 @@ class BybitClient:
if self._owns_http:
await self._http.aclose()
async def health(self) -> dict[str, Any]:
"""Probe minimo per /health/ready: nessuna chiamata di rete."""
return {"status": "ok", "testnet": self.testnet}
# ── auth helpers ───────────────────────────────────────────
def _timestamp_ms(self) -> str:
@@ -97,6 +97,10 @@ class DeribitClient:
def is_testnet(self) -> dict:
return {"testnet": self.testnet, "base_url": self.base_url}
async def health(self) -> dict:
"""Probe minimo per /health/ready: nessuna chiamata di rete."""
return {"status": "ok", "testnet": self.testnet}
async def get_ticker(self, instrument_name: str) -> dict:
import datetime as _dt
raw = await self._request("public/ticker", {"instrument_name": instrument_name})
@@ -8,6 +8,8 @@ istanziato dal `ClientRegistry`.
"""
from __future__ import annotations
from typing import Any
class MacroClient:
"""Wrapper credenziali FRED/Finnhub. Stateless, no HTTP session."""
@@ -18,3 +20,7 @@ class MacroClient:
async def aclose(self) -> None: # pragma: no cover - no-op, no resources
return None
async def health(self) -> dict[str, Any]:
"""Probe minimo per /health/ready: nessuna chiamata di rete."""
return {"status": "ok"}
@@ -9,6 +9,8 @@ e per essere istanziato dal `ClientRegistry`.
"""
from __future__ import annotations
from typing import Any
class SentimentClient:
"""Wrapper credenziali CryptoPanic/LunarCrush. Stateless, no HTTP session."""
@@ -19,3 +21,7 @@ class SentimentClient:
async def aclose(self) -> None: # pragma: no cover - no-op, no resources
return None
async def health(self) -> dict[str, Any]:
"""Probe minimo per /health/ready: nessuna chiamata di rete."""
return {"status": "ok"}
+84
View File
@@ -1,7 +1,9 @@
"""Factory FastAPI app con middleware, swagger, exception handlers."""
from __future__ import annotations
import asyncio
import json
import os
import time
from datetime import UTC, datetime
from typing import Any
@@ -18,6 +20,7 @@ from cerbero_mcp.common.errors import (
RETRYABLE_STATUSES,
error_envelope,
)
from cerbero_mcp.common.request_log import install_request_log_middleware
class _TimestampInjectorMiddleware(BaseHTTPMiddleware):
@@ -99,6 +102,11 @@ def build_app(
app, testnet_token=testnet_token, mainnet_token=mainnet_token
)
# Request log middleware: registrato DOPO auth → starlette esegue
# i middleware in ordine inverso (LIFO) → request_log è outermost,
# auth è interno e popola request.state.* prima del ritorno.
install_request_log_middleware(app)
app.add_middleware(_TimestampInjectorMiddleware)
@app.middleware("http")
@@ -128,6 +136,7 @@ def build_app(
content=error_envelope(
type_="http_error", code=code, message=message,
retryable=retryable, details=details,
request_id=getattr(request.state, "request_id", None),
),
)
@@ -155,6 +164,7 @@ def build_app(
message=f"request body validation failed on {first_loc}",
retryable=False, suggested_fix=suggestion,
details={"errors": safe_errs},
request_id=getattr(request.state, "request_id", None),
),
)
@@ -166,6 +176,7 @@ def build_app(
type_="internal_error", code="UNHANDLED_EXCEPTION",
message=f"{type(exc).__name__}: {str(exc)[:300]}",
retryable=True,
request_id=getattr(request.state, "request_id", None),
),
)
@@ -179,6 +190,79 @@ def build_app(
"data_timestamp": datetime.now(UTC).isoformat(),
}
@app.get("/health/ready", tags=["system"])
async def health_ready():
"""Readiness check: ping ogni client exchange cached.
- Itera ``app.state.registry._clients`` (se presente).
- Per ogni client prova ``health()`` (preferito) o ``is_testnet()``.
In assenza di metodo, marca con ``note: no probe method``.
- Timeout di 2s per client tramite ``asyncio.wait_for``.
- Stato globale: ``ready`` se tutti ok, ``degraded`` se almeno
uno fallisce, ``not_ready`` se registry vuoto.
- HTTP 200 di default; con ``READY_FAILS_ON_DEGRADED=true`` ritorna
503 quando lo stato non è ``ready`` (utile per probe k8s).
"""
registry = getattr(app.state, "registry", None)
clients_status: list[dict[str, Any]] = []
if registry is not None:
for (exchange, env), client in registry._clients.items():
t0 = time.perf_counter()
ping = (
getattr(client, "health", None)
or getattr(client, "is_testnet", None)
)
if ping is None:
clients_status.append({
"exchange": exchange,
"env": env,
"healthy": True,
"note": "no probe method",
})
continue
try:
res = ping()
if asyncio.iscoroutine(res):
await asyncio.wait_for(res, timeout=2.0)
dur = (time.perf_counter() - t0) * 1000
clients_status.append({
"exchange": exchange,
"env": env,
"healthy": True,
"duration_ms": round(dur, 2),
})
except Exception as e:
clients_status.append({
"exchange": exchange,
"env": env,
"healthy": False,
"error": f"{type(e).__name__}: {str(e)[:200]}",
})
if not clients_status:
status_label = "not_ready"
elif all(c["healthy"] for c in clients_status):
status_label = "ready"
else:
status_label = "degraded"
fail_on_degraded = os.environ.get(
"READY_FAILS_ON_DEGRADED", "false"
).lower() in ("1", "true", "yes")
http_code = 200
if fail_on_degraded and status_label != "ready":
http_code = 503
body = {
"status": status_label,
"name": title,
"version": version,
"uptime_seconds": int(time.time() - app.state.boot_at),
"data_timestamp": datetime.now(UTC).isoformat(),
"clients": clients_status,
}
return JSONResponse(status_code=http_code, content=body)
def _custom_openapi() -> dict[str, Any]:
if app.openapi_schema:
return app.openapi_schema
+121
View File
@@ -0,0 +1,121 @@
from __future__ import annotations
import logging
import pytest
from fastapi.testclient import TestClient
from starlette.requests import Request as StarletteRequest
@pytest.fixture
def app(monkeypatch):
from tests.unit.test_settings import _minimal_env
for k, v in _minimal_env().items():
monkeypatch.setenv(k, v)
from cerbero_mcp.__main__ import _make_app
from cerbero_mcp.settings import Settings
return _make_app(Settings())
@pytest.fixture
def request_log_caplog(caplog):
"""Caplog per logger mcp.request: aggiunge l'handler caplog direttamente
al logger, bypassando ``propagate=False`` settato da common/logging.py.
"""
lg = logging.getLogger("mcp.request")
lg.addHandler(caplog.handler)
lg.setLevel(logging.INFO)
caplog.set_level(logging.INFO, logger="mcp.request")
try:
yield caplog
finally:
lg.removeHandler(caplog.handler)
def _request_records(caplog):
return [rec for rec in caplog.records if rec.name == "mcp.request"]
def test_request_log_emits_for_health(app, request_log_caplog):
c = TestClient(app)
r = c.get("/health")
assert r.status_code == 200
records = _request_records(request_log_caplog)
assert any(getattr(rec, "path", None) == "/health" for rec in records)
def test_request_log_includes_request_id(app, request_log_caplog):
c = TestClient(app)
c.get("/health")
records = _request_records(request_log_caplog)
assert records, "expected at least one mcp.request record"
for rec in records:
rid = getattr(rec, "request_id", None)
assert rid and isinstance(rid, str) and len(rid) >= 16
def test_request_log_includes_method_status_duration(app, request_log_caplog):
c = TestClient(app)
c.get("/health")
records = _request_records(request_log_caplog)
rec = next(rec for rec in records if getattr(rec, "path", None) == "/health")
assert getattr(rec, "method", None) == "GET"
assert getattr(rec, "status_code", None) == 200
assert isinstance(getattr(rec, "duration_ms", None), int | float)
def test_request_log_includes_actor_and_bot_tag_on_protected(
app, request_log_caplog
):
"""Su path autenticato actor/bot_tag/exchange/tool sono propagati."""
c = TestClient(app)
c.post(
"/mcp-deribit/tools/is_testnet",
headers={
"Authorization": "Bearer t_test_123",
"X-Bot-Tag": "scanner-x",
},
json={},
)
records = _request_records(request_log_caplog)
rec = next(
rec
for rec in records
if getattr(rec, "path", None) == "/mcp-deribit/tools/is_testnet"
)
assert getattr(rec, "actor", None) == "testnet"
assert getattr(rec, "bot_tag", None) == "scanner-x"
assert getattr(rec, "exchange", None) == "deribit"
assert getattr(rec, "tool", None) == "is_testnet"
def test_request_log_unauthorized_does_not_have_actor(
app, request_log_caplog
):
"""Senza bearer, request log emette comunque ma senza actor/bot_tag."""
c = TestClient(app)
c.post("/mcp-deribit/tools/is_testnet", json={})
records = _request_records(request_log_caplog)
rec = next(
rec
for rec in records
if getattr(rec, "path", None) == "/mcp-deribit/tools/is_testnet"
)
assert getattr(rec, "status_code", None) == 401
assert getattr(rec, "actor", None) is None
assert getattr(rec, "exchange", None) == "deribit"
def test_request_id_in_state_for_handlers(app):
"""Verifica che request.state.request_id sia disponibile a handler."""
@app.get("/__test_state")
def _state_handler(request: StarletteRequest) -> dict:
return {"rid": request.state.request_id}
c = TestClient(app)
r = c.get(
"/__test_state",
headers={"Authorization": "Bearer t_test_123", "X-Bot-Tag": "x"},
)
assert r.status_code == 200, f"got {r.status_code}: {r.text[:500]}"
assert r.json()["rid"]
+120
View File
@@ -60,3 +60,123 @@ def test_x_duration_ms_header(app):
c = TestClient(app)
r = c.get("/health")
assert "X-Duration-Ms" in r.headers
def test_health_ready_empty_registry(app):
"""Senza registry il readiness ritorna not_ready ma HTTP 200."""
c = TestClient(app)
r = c.get("/health/ready")
assert r.status_code == 200
j = r.json()
assert j["status"] == "not_ready"
assert j["clients"] == []
assert j["version"] == "2.0.0"
def test_health_ready_all_healthy(app):
"""Registry con stub client healthy → status=ready."""
from cerbero_mcp.client_registry import ClientRegistry
class _StubOk:
async def health(self):
return {"status": "ok"}
async def _builder(exchange, env): # pragma: no cover - non chiamato
return _StubOk()
reg = ClientRegistry(builder=_builder)
reg._clients[("deribit", "testnet")] = _StubOk()
reg._clients[("bybit", "mainnet")] = _StubOk()
app.state.registry = reg
c = TestClient(app)
r = c.get("/health/ready")
assert r.status_code == 200
j = r.json()
assert j["status"] == "ready"
assert len(j["clients"]) == 2
for entry in j["clients"]:
assert entry["healthy"] is True
assert "duration_ms" in entry
def test_health_ready_degraded_on_error(app):
"""Registry con almeno un client che fa raise → status=degraded."""
from cerbero_mcp.client_registry import ClientRegistry
class _StubOk:
async def health(self):
return {"status": "ok"}
class _StubFail:
async def health(self):
raise RuntimeError("boom")
async def _builder(exchange, env): # pragma: no cover - non chiamato
return _StubOk()
reg = ClientRegistry(builder=_builder)
reg._clients[("deribit", "testnet")] = _StubOk()
reg._clients[("bybit", "mainnet")] = _StubFail()
app.state.registry = reg
c = TestClient(app)
r = c.get("/health/ready")
assert r.status_code == 200
j = r.json()
assert j["status"] == "degraded"
fail = next(c for c in j["clients"] if c["exchange"] == "bybit")
assert fail["healthy"] is False
assert "RuntimeError" in fail["error"]
def test_health_ready_503_when_fail_on_degraded(app, monkeypatch):
"""READY_FAILS_ON_DEGRADED=true → HTTP 503 quando degraded."""
from cerbero_mcp.client_registry import ClientRegistry
class _StubFail:
async def health(self):
raise RuntimeError("boom")
async def _builder(exchange, env): # pragma: no cover - non chiamato
return _StubFail()
reg = ClientRegistry(builder=_builder)
reg._clients[("deribit", "testnet")] = _StubFail()
app.state.registry = reg
monkeypatch.setenv("READY_FAILS_ON_DEGRADED", "true")
c = TestClient(app)
r = c.get("/health/ready")
assert r.status_code == 503
assert r.json()["status"] == "degraded"
def test_health_ready_no_probe_method(app):
"""Client senza health/is_testnet → marcato healthy con note."""
from cerbero_mcp.client_registry import ClientRegistry
class _StubBare:
pass
async def _builder(exchange, env): # pragma: no cover - non chiamato
return _StubBare()
reg = ClientRegistry(builder=_builder)
reg._clients[("foo", "testnet")] = _StubBare()
app.state.registry = reg
c = TestClient(app)
r = c.get("/health/ready")
assert r.status_code == 200
j = r.json()
assert j["status"] == "ready"
assert j["clients"][0]["note"] == "no probe method"
def test_health_ready_in_whitelist_no_auth(app):
"""/health/ready non richiede bearer."""
c = TestClient(app)
# Nessun Authorization header → 200 (whitelist)
r = c.get("/health/ready")
assert r.status_code == 200