feat(V2): /health/ready con ping client + middleware request log strutturato + request_id correlation

- /health/ready: ping di tutti i client (exchange, env) cached con
  timeout 2s, status ready|degraded|not_ready, opt-in 503 via
  READY_FAILS_ON_DEGRADED.
- Middleware mcp.request: 1 riga JSON per HTTP request con request_id,
  method, path, status_code, duration_ms, actor, bot_tag, exchange,
  tool, client_ip, user_agent.
- request_id propagato in request.state, audit log e error envelope per
  correlazione cross-cutting.
- Aggiunto async health() come probe minimo a bybit/alpaca/macro/
  sentiment/deribit (hyperliquid lo aveva già).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
AdrianoDev
2026-05-01 09:03:28 +02:00
parent 9afd087152
commit 8ecc1a24a9
13 changed files with 509 additions and 2 deletions
+8 -1
View File
@@ -15,7 +15,14 @@ Environment = Literal["testnet", "mainnet"]
# Path che bypassano sia bearer auth sia bot_tag check.
PATH_WHITELIST_FULL = frozenset(
{"/health", "/apidocs", "/openapi.json", "/docs", "/redoc"}
{
"/health",
"/health/ready",
"/apidocs",
"/openapi.json",
"/docs",
"/redoc",
}
)
# Path che richiedono bearer ma NON il bot_tag (admin endpoint).
PATH_WHITELIST_BOT_TAG_ONLY = frozenset({"/admin/audit"})
+5
View File
@@ -74,6 +74,7 @@ def audit_write_op(
payload: dict[str, Any] | None = None,
result: dict[str, Any] | None = None,
error: str | None = None,
request_id: str | None = None,
) -> None:
"""Emit a structured audit log record per write operation.
@@ -86,6 +87,8 @@ def audit_write_op(
payload: input non-sensibile (qty, side, leverage, ecc.).
result: output del client (order_id, status, ecc.).
error: stringa errore se l'operazione ha fallito.
request_id: id propagato dal middleware request log per correlazione
tra audit log e request log.
"""
_configure_audit_sink()
record: dict[str, Any] = {
@@ -97,6 +100,8 @@ def audit_write_op(
"target": target,
"payload": payload or {},
}
if request_id is not None:
record["request_id"] = request_id
if result is not None:
record["result"] = _summarize_result(result)
if error is not None:
+3
View File
@@ -58,6 +58,7 @@ async def audit_call(
"""Esegue tool_fn e logga audit (success o error). Riraisola eccezioni."""
actor = getattr(request.state, "environment", None)
bot_tag = getattr(request.state, "bot_tag", None)
request_id = getattr(request.state, "request_id", None)
target = _extract_target(params, target_field)
payload = _safe_dump(params)
@@ -72,6 +73,7 @@ async def audit_call(
target=target,
payload=payload,
error=f"{type(e).__name__}: {e}",
request_id=request_id,
)
raise
@@ -93,5 +95,6 @@ async def audit_call(
target=target,
payload=payload,
result=audit_result,
request_id=request_id,
)
return result
+104
View File
@@ -0,0 +1,104 @@
"""Middleware: structured JSON request log per ogni HTTP request.
Emette una riga JSON sul logger ``mcp.request`` con campi correlabili
all'audit log via ``request_id``. Espone anche ``request_id`` su
``request.state`` così che handler/exception handler downstream possano
includerlo nei propri payload.
"""
from __future__ import annotations
import logging
import time
import uuid
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime
from typing import Any
from fastapi import FastAPI, Request
from starlette.responses import Response
from cerbero_mcp.common.logging import get_json_logger
_logger = get_json_logger("mcp.request", level=logging.INFO)
def _extract_exchange(path: str) -> str | None:
"""Estrae il nome dell'exchange dal path se è un ``/mcp-{exchange}/...``."""
if not path.startswith("/mcp-"):
return None
rest = path[len("/mcp-"):]
end = rest.find("/")
if end < 0:
return rest or None
return rest[:end] or None
def _extract_tool(path: str) -> str | None:
"""Estrae nome tool dal path ``/mcp-X/tools/Y``."""
parts = path.split("/")
# ["", "mcp-deribit", "tools", "place_order"]
if len(parts) >= 4 and parts[2] == "tools":
return parts[3] or None
return None
def install_request_log_middleware(app: FastAPI) -> None:
"""Aggiunge un middleware HTTP che logga JSON per ogni request."""
@app.middleware("http")
async def request_log(
request: Request,
call_next: Callable[[Request], Awaitable[Response]],
) -> Response:
request_id = uuid.uuid4().hex
# Espone request_id per uso downstream (audit, error envelope)
request.state.request_id = request_id
t0 = time.perf_counter()
status_code = 500
error: str | None = None
response: Response | None = None
try:
response = await call_next(request)
status_code = response.status_code
except Exception as e:
error = f"{type(e).__name__}: {str(e)[:200]}"
raise
finally:
dur_ms = (time.perf_counter() - t0) * 1000
path = request.url.path
payload: dict[str, Any] = {
"event": "request",
"request_id": request_id,
"method": request.method,
"path": path,
"status_code": status_code,
"duration_ms": round(dur_ms, 2),
"timestamp": datetime.now(UTC).isoformat(),
}
ua = request.headers.get("user-agent")
if ua:
payload["user_agent"] = ua[:200]
client = request.client
if client is not None:
payload["client_ip"] = client.host
actor = getattr(request.state, "environment", None)
if actor:
payload["actor"] = actor
bot_tag = getattr(request.state, "bot_tag", None)
if bot_tag:
payload["bot_tag"] = bot_tag
exchange = _extract_exchange(path)
if exchange:
payload["exchange"] = exchange
tool = _extract_tool(path)
if tool:
payload["tool"] = tool
if error:
payload["error"] = error
_logger.error("request", extra=payload)
else:
_logger.info("request", extra=payload)
# response è settato se non c'è stata eccezione (altrimenti
# l'eccezione è stata già rilanciata dal blocco except).
assert response is not None
return response
@@ -99,6 +99,10 @@ class AlpacaClient:
if not self._http.is_closed:
await self._http.aclose()
async def health(self) -> dict[str, Any]:
"""Probe minimo per /health/ready: nessuna chiamata di rete."""
return {"status": "ok", "paper": self.paper}
# ── Helpers ──────────────────────────────────────────────────
@property
@@ -75,6 +75,10 @@ class BybitClient:
if self._owns_http:
await self._http.aclose()
async def health(self) -> dict[str, Any]:
"""Probe minimo per /health/ready: nessuna chiamata di rete."""
return {"status": "ok", "testnet": self.testnet}
# ── auth helpers ───────────────────────────────────────────
def _timestamp_ms(self) -> str:
@@ -97,6 +97,10 @@ class DeribitClient:
def is_testnet(self) -> dict:
return {"testnet": self.testnet, "base_url": self.base_url}
async def health(self) -> dict:
"""Probe minimo per /health/ready: nessuna chiamata di rete."""
return {"status": "ok", "testnet": self.testnet}
async def get_ticker(self, instrument_name: str) -> dict:
import datetime as _dt
raw = await self._request("public/ticker", {"instrument_name": instrument_name})
@@ -8,6 +8,8 @@ istanziato dal `ClientRegistry`.
"""
from __future__ import annotations
from typing import Any
class MacroClient:
"""Wrapper credenziali FRED/Finnhub. Stateless, no HTTP session."""
@@ -18,3 +20,7 @@ class MacroClient:
async def aclose(self) -> None: # pragma: no cover - no-op, no resources
return None
async def health(self) -> dict[str, Any]:
"""Probe minimo per /health/ready: nessuna chiamata di rete."""
return {"status": "ok"}
@@ -9,6 +9,8 @@ e per essere istanziato dal `ClientRegistry`.
"""
from __future__ import annotations
from typing import Any
class SentimentClient:
"""Wrapper credenziali CryptoPanic/LunarCrush. Stateless, no HTTP session."""
@@ -19,3 +21,7 @@ class SentimentClient:
async def aclose(self) -> None: # pragma: no cover - no-op, no resources
return None
async def health(self) -> dict[str, Any]:
"""Probe minimo per /health/ready: nessuna chiamata di rete."""
return {"status": "ok"}
+84
View File
@@ -1,7 +1,9 @@
"""Factory FastAPI app con middleware, swagger, exception handlers."""
from __future__ import annotations
import asyncio
import json
import os
import time
from datetime import UTC, datetime
from typing import Any
@@ -18,6 +20,7 @@ from cerbero_mcp.common.errors import (
RETRYABLE_STATUSES,
error_envelope,
)
from cerbero_mcp.common.request_log import install_request_log_middleware
class _TimestampInjectorMiddleware(BaseHTTPMiddleware):
@@ -99,6 +102,11 @@ def build_app(
app, testnet_token=testnet_token, mainnet_token=mainnet_token
)
# Request log middleware: registrato DOPO auth → starlette esegue
# i middleware in ordine inverso (LIFO) → request_log è outermost,
# auth è interno e popola request.state.* prima del ritorno.
install_request_log_middleware(app)
app.add_middleware(_TimestampInjectorMiddleware)
@app.middleware("http")
@@ -128,6 +136,7 @@ def build_app(
content=error_envelope(
type_="http_error", code=code, message=message,
retryable=retryable, details=details,
request_id=getattr(request.state, "request_id", None),
),
)
@@ -155,6 +164,7 @@ def build_app(
message=f"request body validation failed on {first_loc}",
retryable=False, suggested_fix=suggestion,
details={"errors": safe_errs},
request_id=getattr(request.state, "request_id", None),
),
)
@@ -166,6 +176,7 @@ def build_app(
type_="internal_error", code="UNHANDLED_EXCEPTION",
message=f"{type(exc).__name__}: {str(exc)[:300]}",
retryable=True,
request_id=getattr(request.state, "request_id", None),
),
)
@@ -179,6 +190,79 @@ def build_app(
"data_timestamp": datetime.now(UTC).isoformat(),
}
@app.get("/health/ready", tags=["system"])
async def health_ready():
"""Readiness check: ping ogni client exchange cached.
- Itera ``app.state.registry._clients`` (se presente).
- Per ogni client prova ``health()`` (preferito) o ``is_testnet()``.
In assenza di metodo, marca con ``note: no probe method``.
- Timeout di 2s per client tramite ``asyncio.wait_for``.
- Stato globale: ``ready`` se tutti ok, ``degraded`` se almeno
uno fallisce, ``not_ready`` se registry vuoto.
- HTTP 200 di default; con ``READY_FAILS_ON_DEGRADED=true`` ritorna
503 quando lo stato non è ``ready`` (utile per probe k8s).
"""
registry = getattr(app.state, "registry", None)
clients_status: list[dict[str, Any]] = []
if registry is not None:
for (exchange, env), client in registry._clients.items():
t0 = time.perf_counter()
ping = (
getattr(client, "health", None)
or getattr(client, "is_testnet", None)
)
if ping is None:
clients_status.append({
"exchange": exchange,
"env": env,
"healthy": True,
"note": "no probe method",
})
continue
try:
res = ping()
if asyncio.iscoroutine(res):
await asyncio.wait_for(res, timeout=2.0)
dur = (time.perf_counter() - t0) * 1000
clients_status.append({
"exchange": exchange,
"env": env,
"healthy": True,
"duration_ms": round(dur, 2),
})
except Exception as e:
clients_status.append({
"exchange": exchange,
"env": env,
"healthy": False,
"error": f"{type(e).__name__}: {str(e)[:200]}",
})
if not clients_status:
status_label = "not_ready"
elif all(c["healthy"] for c in clients_status):
status_label = "ready"
else:
status_label = "degraded"
fail_on_degraded = os.environ.get(
"READY_FAILS_ON_DEGRADED", "false"
).lower() in ("1", "true", "yes")
http_code = 200
if fail_on_degraded and status_label != "ready":
http_code = 503
body = {
"status": status_label,
"name": title,
"version": version,
"uptime_seconds": int(time.time() - app.state.boot_at),
"data_timestamp": datetime.now(UTC).isoformat(),
"clients": clients_status,
}
return JSONResponse(status_code=http_code, content=body)
def _custom_openapi() -> dict[str, Any]:
if app.openapi_schema:
return app.openapi_schema