Phase 4 hardening: status CLI, lock file, backup job, hash enforce, pooling, real bias

Six targeted fixes for the operational risks flagged in the post-Phase 4
audit. 317 tests pass, mypy strict clean, ruff clean.

1. status CLI: reads the real SQLite state and shows kill_switch, open
   positions, environment, config_version, last_health_check and
   started_at. Replaces the "phase 0 skeleton" placeholder (see the
   first sketch after this list).

2. Single-instance lock file: runtime/lockfile.py acquires
   data/.lockfile via fcntl.flock when run_forever boots; a second
   container fails immediately with LockError.

3. Hourly backup in the scheduler: a new APScheduler job on 0 * * * *
   calls scripts.backup.backup_database + prune_backups.

4. config_hash enforcement on start: the start CLI verifies the file's
   integrity (enforce_hash=True); a mismatch exits 1 before touching
   any state. dry-run keeps enforce_hash=False for debugging (see the
   second sketch after this list).

5. MCP connection pooling: RuntimeContext exposes a long-lived
   httpx.AsyncClient shared by every wrapper (limits: 20 connections,
   10 keepalive). aclose() is called in run_forever's outer finally
   block.

6. Real directional bias: deribit.historical_close + deribit.adx_14
   populate TrendContext with the actual 30-day spot and ADX(14),
   unblocking bull_put and bear_call. When the historical data is
   missing, the engine emits a MEDIUM alert and falls back to no_entry
   deterministically.
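
The status read path itself is not part of the diff below. A minimal
sketch of the idea, assuming an illustrative schema (a single-row
engine_state table plus a positions table; the real names may differ):

    import sqlite3
    from pathlib import Path

    def show_status(db_path: Path) -> None:
        # Read-only status dump; table and column names are assumptions
        # for illustration, not the actual schema.
        con = sqlite3.connect(db_path)
        con.row_factory = sqlite3.Row
        try:
            state = con.execute(
                "SELECT kill_switch, environment, config_version,"
                " last_health_check, started_at FROM engine_state LIMIT 1"
            ).fetchone()
            open_count = con.execute(
                "SELECT COUNT(*) FROM positions WHERE status = 'open'"
            ).fetchone()[0]
        finally:
            con.close()
        if state is None:
            print("no engine_state row: engine never started?")
            return
        for key in state.keys():
            print(f"{key}: {state[key]}")
        print(f"open_positions: {open_count}")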
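
The enforce_hash path is likewise only summarised here. A minimal
sketch of the mechanism, with hypothetical names (enforce_config_hash
and the source of expected_sha256 are not the real API):

    import hashlib
    import sys
    from pathlib import Path

    def enforce_config_hash(config_path: Path, expected_sha256: str) -> None:
        # Compare the on-disk config against the recorded hash and refuse
        # to start on a mismatch, before any state is touched.
        actual = hashlib.sha256(config_path.read_bytes()).hexdigest()
        if actual != expected_sha256:
            print(
                f"config hash mismatch: expected {expected_sha256}, got {actual}",
                file=sys.stderr,
            )
            sys.exit(1)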

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 00:15:28 +02:00
parent ca1e6379df
commit 411b747e93
11 changed files with 439 additions and 36 deletions
+18
@@ -14,6 +14,8 @@ from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path
import httpx
from cerbero_bite.clients._base import HttpToolClient
from cerbero_bite.clients.deribit import DeribitClient
from cerbero_bite.clients.hyperliquid import HyperliquidClient
@@ -56,8 +58,14 @@ class RuntimeContext:
portfolio: PortfolioClient
telegram: TelegramClient
http_client: httpx.AsyncClient
clock: Callable[[], datetime]
async def aclose(self) -> None:
"""Close the shared HTTP client. Idempotent."""
await self.http_client.aclose()
def _utc_now() -> datetime:
return datetime.now(UTC)
@@ -103,6 +111,14 @@ def build_runtime(
clock=clk,
)
# Single long-lived AsyncClient shared by every wrapper. httpx pools
# connections per host so the snapshot stage of the entry cycle
# avoids paying TLS/TCP handshakes on each call.
http_client = httpx.AsyncClient(
timeout=httpx.Timeout(timeout_s),
limits=httpx.Limits(max_connections=20, max_keepalive_connections=10),
)
def _client(service: str) -> HttpToolClient:
return HttpToolClient(
service=service,
@@ -110,6 +126,7 @@ def build_runtime(
token=token,
timeout_s=timeout_s,
retry_max=retry_max,
client=http_client,
)
telegram = TelegramClient(_client("telegram"))
@@ -131,5 +148,6 @@ def build_runtime(
hyperliquid=HyperliquidClient(_client("hyperliquid")),
portfolio=PortfolioClient(_client("portfolio")),
telegram=telegram,
http_client=http_client,
clock=clk,
)
+47 -7
@@ -84,6 +84,8 @@ class EntryCycleResult:
@dataclass(frozen=True)
class _MarketSnapshot:
spot_eth_usd: Decimal
spot_eth_30d_ago: Decimal | None
adx_14: Decimal | None
dvol: Decimal
funding_perp: Decimal
funding_cross: Decimal
@@ -102,7 +104,28 @@ async def _gather_snapshot(
cfg: StrategyConfig,
now: datetime,
) -> _MarketSnapshot:
window_days = cfg.entry.trend_window_days
historical_start = now - timedelta(days=window_days + 1)
historical_end = now - timedelta(days=window_days - 1)
adx_start = now - timedelta(days=10)
spot_t: asyncio.Task[Decimal] = asyncio.create_task(deribit.index_price_eth())
spot_past_t: asyncio.Task[Decimal | None] = asyncio.create_task(
deribit.historical_close(
instrument="ETH-PERPETUAL",
start=historical_start,
end=historical_end,
resolution="1D",
)
)
adx_t: asyncio.Task[Decimal | None] = asyncio.create_task(
deribit.adx_14(
instrument="ETH-PERPETUAL",
start=adx_start,
end=now,
resolution="1h",
)
)
dvol_t: asyncio.Task[Decimal] = asyncio.create_task(
deribit.latest_dvol(currency="ETH", now=now)
)
@@ -128,6 +151,8 @@ async def _gather_snapshot(
await asyncio.gather(
spot_t,
spot_past_t,
adx_t,
dvol_t,
funding_perp_t,
funding_cross_t,
@@ -137,6 +162,8 @@ async def _gather_snapshot(
)
return _MarketSnapshot(
spot_eth_usd=spot_t.result(),
spot_eth_30d_ago=spot_past_t.result(),
adx_14=adx_t.result(),
dvol=dvol_t.result(),
funding_perp=funding_perp_t.result(),
funding_cross=funding_cross_t.result(),
@@ -299,6 +326,10 @@ async def run_entry_cycle(
inputs = {
"snapshot": {
"spot_eth_usd": str(snap.spot_eth_usd),
"spot_eth_30d_ago": (
str(snap.spot_eth_30d_ago) if snap.spot_eth_30d_ago else None
),
"adx_14": str(snap.adx_14) if snap.adx_14 is not None else None,
"dvol": str(snap.dvol),
"funding_perp": str(snap.funding_perp),
"funding_cross": str(snap.funding_cross),
@@ -326,17 +357,26 @@ async def run_entry_cycle(
status=_STATUS_NO_ENTRY, reason=";".join(decision.reasons)
)
# 3. Bias (need a 30-day prior spot — orchestrator passes it in)
# We approximate by reusing the current spot until the historical
# snapshot store ships in Phase 5; for now no historical → bias
# cannot fire bull/bear, only iron_condor when DVOL/ADX align. The
# caller is responsible for plugging in real data via overrides.
# 3. Bias — eth_30d_ago and adx_14 come from the historical snapshot
# collected during the parallel snapshot stage. When either signal
# is missing the bias function falls back to "no entry" (defensive
# behaviour: never trade without confirmed regime data).
if snap.spot_eth_30d_ago is None:
await alert.medium(
source="entry_cycle",
message="historical spot unavailable — bias falls back to neutral",
)
if snap.adx_14 is None:
await alert.medium(
source="entry_cycle",
message="ADX unavailable — bias may reject iron_condor",
)
trend_ctx = TrendContext(
eth_now=snap.spot_eth_usd,
eth_30d_ago=snap.spot_eth_usd,
eth_30d_ago=snap.spot_eth_30d_ago or snap.spot_eth_usd,
funding_cross_annualized=snap.funding_cross,
dvol_now=snap.dvol,
adx_14=Decimal("25"), # placeholder until ADX lands in market data
adx_14=snap.adx_14 if snap.adx_14 is not None else Decimal("25"),
)
bias = compute_bias(trend_ctx, cfg)
if bias is None:
+91
@@ -0,0 +1,91 @@
"""Single-instance file lock for the engine (``docs/02-architecture.md``).
Acquires an exclusive ``fcntl.flock`` on a sentinel file at boot. A
second container/process trying to start while another holds the lock
gets :class:`LockError` immediately and exits before doing any I/O.
The lock is released either on context exit or when the process dies
(the OS releases ``flock`` automatically), so a hard crash cannot
permanently wedge the system.
"""
from __future__ import annotations
import fcntl
import logging
import os
from pathlib import Path
from types import TracebackType
__all__ = ["EngineLock", "LockError"]
_log = logging.getLogger("cerbero_bite.runtime.lockfile")
class LockError(RuntimeError):
"""Raised when another instance already holds the engine lock."""
class EngineLock:
"""Context manager around a per-instance file lock.
Usage::
with EngineLock(Path("data/.lockfile")) as lock:
...
On enter, writes the current PID to the lock file so an operator
can identify the running instance. On exit, releases the lock.
"""
def __init__(self, path: Path | str) -> None:
self._path = Path(path)
self._fh: object | None = None # actual type: io.TextIOWrapper
@property
def path(self) -> Path:
return self._path
def acquire(self) -> None:
self._path.parent.mkdir(parents=True, exist_ok=True)
# ``r+`` would fail if the file does not exist yet; use ``a+``
# then seek to overwrite the contents on each acquire.
fh = self._path.open("a+", encoding="utf-8")
try:
fcntl.flock(fh.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
except BlockingIOError as exc:
fh.close()
raise LockError(
f"another Cerbero Bite instance holds {self._path}; "
f"check the running container or remove the file if stale"
) from exc
fh.seek(0)
fh.truncate()
fh.write(f"{os.getpid()}\n")
fh.flush()
self._fh = fh
_log.info("engine lock acquired: pid=%d path=%s", os.getpid(), self._path)
def release(self) -> None:
if self._fh is None:
return
try:
fcntl.flock(self._fh.fileno(), fcntl.LOCK_UN) # type: ignore[attr-defined]
finally:
self._fh.close() # type: ignore[attr-defined]
self._fh = None
_log.info("engine lock released: %s", self._path)
def __enter__(self) -> EngineLock:
self.acquire()
return self
def __exit__(
self,
exc_type: type[BaseException] | None,
exc: BaseException | None,
tb: TracebackType | None,
) -> None:
self.release()
+68 -9
@@ -25,6 +25,7 @@ from cerbero_bite.config.schema import StrategyConfig
from cerbero_bite.runtime.dependencies import RuntimeContext, build_runtime
from cerbero_bite.runtime.entry_cycle import EntryCycleResult, run_entry_cycle
from cerbero_bite.runtime.health_check import HealthCheck, HealthCheckResult
from cerbero_bite.runtime.lockfile import EngineLock
from cerbero_bite.runtime.monitor_cycle import MonitorCycleResult, run_monitor_cycle
from cerbero_bite.runtime.recovery import recover_state
from cerbero_bite.runtime.scheduler import JobSpec, build_scheduler
@@ -40,6 +41,8 @@ Environment = Literal["testnet", "mainnet"]
_CRON_ENTRY = "0 14 * * MON"
_CRON_MONITOR = "0 2,14 * * *"
_CRON_HEALTH = "*/5 * * * *"
_CRON_BACKUP = "0 * * * *"
_BACKUP_RETENTION_DAYS = 30
@dataclass(frozen=True)
@@ -135,6 +138,9 @@ class Orchestrator:
entry_cron: str = _CRON_ENTRY,
monitor_cron: str = _CRON_MONITOR,
health_cron: str = _CRON_HEALTH,
backup_cron: str = _CRON_BACKUP,
backup_dir: Path | None = None,
backup_retention_days: int = _BACKUP_RETENTION_DAYS,
) -> AsyncIOScheduler:
"""Build the scheduler with the canonical job set, ready to start."""
@@ -158,24 +164,77 @@ class Orchestrator:
async def _health() -> None:
await _safe("health", self.run_health)
backups_target = backup_dir or self._ctx.db_path.parent / "backups"
async def _backup() -> None:
async def _do() -> None:
await asyncio.to_thread(
_run_backup,
db_path=self._ctx.db_path,
backup_dir=backups_target,
retention_days=backup_retention_days,
)
await _safe("backup", _do)
self._scheduler = build_scheduler(
[
JobSpec(name="entry", cron=entry_cron, coro_factory=_entry),
JobSpec(name="monitor", cron=monitor_cron, coro_factory=_monitor),
JobSpec(name="health", cron=health_cron, coro_factory=_health),
JobSpec(name="backup", cron=backup_cron, coro_factory=_backup),
]
)
return self._scheduler
async def run_forever(self) -> None:
"""Boot, install the scheduler, and block forever (until cancelled)."""
await self.boot()
scheduler = self.install_scheduler()
scheduler.start()
try:
await asyncio.Event().wait()
finally:
scheduler.shutdown(wait=False)
async def run_forever(self, *, lock_path: Path | None = None) -> None:
"""Boot, acquire the single-instance lock, install the scheduler.
``lock_path`` defaults to ``<db_path.parent>/.lockfile`` so two
containers cannot trade against the same SQLite file.
"""
lock = EngineLock(
lock_path or self._ctx.db_path.parent / ".lockfile"
)
with lock:
try:
await self.boot()
scheduler = self.install_scheduler()
scheduler.start()
try:
await asyncio.Event().wait()
finally:
scheduler.shutdown(wait=False)
finally:
await self._ctx.aclose()
def _run_backup(
*, db_path: Path, backup_dir: Path, retention_days: int
) -> None:
"""Synchronous helper invoked from the scheduler via ``asyncio.to_thread``.
Keeps the import of ``scripts.backup`` lazy: importing the module
eagerly at orchestrator load time would mean the scheduler depends
on a script that lives outside the ``cerbero_bite`` package, which
breaks ``importlib.util.spec_from_file_location`` if the cwd shifts
at runtime.
"""
import sys # noqa: PLC0415 — kept lazy to keep module load cheap
from importlib.util import ( # noqa: PLC0415
module_from_spec,
spec_from_file_location,
)
backup_py = Path(__file__).resolve().parents[3] / "scripts" / "backup.py"
spec = spec_from_file_location("_cerbero_bite_backup", backup_py)
if spec is None or spec.loader is None: # pragma: no cover — only on broken installs
raise RuntimeError(f"cannot load scripts/backup.py from {backup_py}")
module = module_from_spec(spec)
sys.modules.setdefault(spec.name, module)
spec.loader.exec_module(module)
module.backup_database(db_path=db_path, backup_dir=backup_dir)
module.prune_backups(backup_dir, retention_days=retention_days)
# ---------------------------------------------------------------------------