feat(data): OHLCV loader via ccxt with parquet cache
Adjust .gitignore so src/multi_swarm/data/ stays tracked; only the top-level /data/ cache directory remains ignored.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
.gitignore
@@ -30,7 +30,7 @@ runs.db-shm
 series/
 *.parquet
 *.feather
-data/
+/data/
 checkpoints/
 logs/
 *.log
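Note on the pattern change: without a leading slash, data/ matches a directory named data at any depth, including src/multi_swarm/data/; the leading slash in /data/ anchors the pattern to the repository root, so only the top-level cache directory is ignored.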
src/multi_swarm/data/ohlcv_loader.py
@@ -0,0 +1,73 @@
from __future__ import annotations

import hashlib
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path

import ccxt  # type: ignore[import-untyped]
import pandas as pd  # type: ignore[import-untyped]


@dataclass(frozen=True)
class OHLCVRequest:
    symbol: str
    timeframe: str
    start: datetime
    end: datetime

    def cache_key(self) -> str:
        s = f"{self.symbol}|{self.timeframe}|{self.start.isoformat()}|{self.end.isoformat()}"
        return hashlib.sha1(s.encode()).hexdigest()[:16]


class OHLCVLoader:
    """Load OHLCV candles via ccxt (Binance) and cache them as parquet."""

    def __init__(self, cache_dir: Path, exchange_name: str = "binance"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.exchange_name = exchange_name

    def load(self, req: OHLCVRequest) -> pd.DataFrame:
        cache_file = self.cache_dir / f"{req.cache_key()}.parquet"
        if cache_file.exists():
            return pd.read_parquet(cache_file)

        df = self._fetch_paginated(req)
        df.to_parquet(cache_file)
        return df

    @staticmethod
    def _timeframe_to_ms(timeframe: str) -> int:
        units = {"m": 60, "h": 3600, "d": 86400, "w": 604800}
        unit = timeframe[-1]
        if unit not in units:
            raise ValueError(f"Unsupported timeframe: {timeframe}")
        return int(timeframe[:-1]) * units[unit] * 1000

    def _fetch_paginated(self, req: OHLCVRequest) -> pd.DataFrame:
        exchange = getattr(ccxt, self.exchange_name)({"enableRateLimit": True})
        timeframe_ms = self._timeframe_to_ms(req.timeframe)
        since = int(req.start.timestamp() * 1000)
        end_ms = int(req.end.timestamp() * 1000)
        all_rows: list[list[float]] = []
        limit = 1000

        while since <= end_ms:
            rows = exchange.fetch_ohlcv(req.symbol, req.timeframe, since=since, limit=limit)
            if not rows:
                break
            all_rows.extend(rows)
            last_ts = rows[-1][0]
            new_since = last_ts + timeframe_ms
            if new_since <= since:  # no forward progress: avoid an infinite loop
                break
            since = new_since

        df = pd.DataFrame(all_rows, columns=["ts", "open", "high", "low", "close", "volume"])
        df = df.drop_duplicates(subset=["ts"]).sort_values("ts")
        df["ts"] = pd.to_datetime(df["ts"], unit="ms", utc=True)
        df = df.set_index("ts")
        df = df[(df.index >= req.start) & (df.index < req.end)]
        return df[["open", "high", "low", "close", "volume"]].astype("float64")
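For orientation, a minimal usage sketch of the new loader. The cache path, symbol, and date range below are illustrative, not part of the commit, and a live call needs network access to the exchange:

    from datetime import UTC, datetime
    from pathlib import Path

    from multi_swarm.data.ohlcv_loader import OHLCVLoader, OHLCVRequest

    # Illustrative cache location under the ignored top-level /data/ directory.
    loader = OHLCVLoader(cache_dir=Path("data/ohlcv"))

    req = OHLCVRequest(
        symbol="BTC/USDT",
        timeframe="1h",  # _timeframe_to_ms("1h") -> 3_600_000 ms per candle
        start=datetime(2024, 1, 1, tzinfo=UTC),
        end=datetime(2024, 1, 7, tzinfo=UTC),
    )

    df = loader.load(req)  # first call paginates fetch_ohlcv and writes <cache_key>.parquet
    df = loader.load(req)  # an identical request is then served from the parquet cache

Note that the index filter in _fetch_paginated compares against req.start and req.end, so requests should use timezone-aware datetimes (as the tests below do) to match the UTC index.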
@@ -0,0 +1,64 @@
from datetime import UTC, datetime
from pathlib import Path

import pandas as pd
import pytest

from multi_swarm.data.ohlcv_loader import OHLCVLoader, OHLCVRequest


@pytest.fixture
def sample_ohlcv_rows():
    base_ts = int(datetime(2024, 1, 1, tzinfo=UTC).timestamp() * 1000)
    rows = []
    for i in range(48):
        rows.append(
            [base_ts + i * 3600 * 1000, 40000 + i, 40100 + i, 39900 + i, 40050 + i, 100.0 + i]
        )
    return rows


def test_loader_fetches_and_caches(tmp_path: Path, mocker, sample_ohlcv_rows):
    fake_exchange = mocker.MagicMock()
    fake_exchange.fetch_ohlcv.return_value = sample_ohlcv_rows
    mocker.patch("multi_swarm.data.ohlcv_loader.ccxt.binance", return_value=fake_exchange)

    loader = OHLCVLoader(cache_dir=tmp_path)
    req = OHLCVRequest(
        symbol="BTC/USDT",
        timeframe="1h",
        start=datetime(2024, 1, 1, tzinfo=UTC),
        end=datetime(2024, 1, 3, tzinfo=UTC),
    )
    df = loader.load(req)

    assert isinstance(df, pd.DataFrame)
    assert list(df.columns) == ["open", "high", "low", "close", "volume"]
    assert len(df) == 48
    assert df.index.is_monotonic_increasing
    cache_files = list(tmp_path.glob("*.parquet"))
    assert len(cache_files) == 1


def test_loader_uses_cache_on_second_call(tmp_path: Path, mocker, sample_ohlcv_rows):
    fake_exchange = mocker.MagicMock()
    fake_exchange.fetch_ohlcv.return_value = sample_ohlcv_rows
    mocker.patch("multi_swarm.data.ohlcv_loader.ccxt.binance", return_value=fake_exchange)

    loader = OHLCVLoader(cache_dir=tmp_path)
    req = OHLCVRequest(
        symbol="BTC/USDT",
        timeframe="1h",
        start=datetime(2024, 1, 1, tzinfo=UTC),
        end=datetime(2024, 1, 3, tzinfo=UTC),
    )
    df1 = loader.load(req)
    df2 = loader.load(req)

    assert fake_exchange.fetch_ohlcv.call_count == 2  # internal pagination, not caching
    pd.testing.assert_frame_equal(df1, df2)
    # The second load() reads from the cache and never touches the exchange.
    fake_exchange.fetch_ohlcv.reset_mock()
    df3 = loader.load(req)
    assert fake_exchange.fetch_ohlcv.call_count == 0
    pd.testing.assert_frame_equal(df1, df3)
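The tests rely on the mocker fixture from the pytest-mock plugin; tmp_path is pytest's built-in temporary-directory fixture. The call_count == 2 assertion pins down a pagination detail rather than a caching one: the mocked exchange returns the same 48 candles on every call, so the loader's second fetch makes no forward progress and the new_since <= since guard terminates the loop after exactly two calls, while drop_duplicates collapses the repeated rows back to 48.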