feat(DATA-DB-001): add Databento historical price source for backtesting

- Add DatabentoHistoricalPriceSource implementing HistoricalPriceSource protocol - Smart caching with Parquet storage and metadata tracking - Auto symbol-to-dataset resolution (GLD→XNAS.BASIC, GC=F→GLBX.MDP3) - Cache management with age threshold invalidation - Cost estimation via metadata.get_cost() - Add databento>=0.30.0 to requirements.txt - Add DATABENTO_API_KEY to .env.example - Full test coverage with 16 tests
2026-03-29 09:58:02 +02:00
parent c02159481d
commit bf13ab5b46
5 changed files with 677 additions and 0 deletions
--- a/app/services/backtesting/databento_source.py
+++ b/app/services/backtesting/databento_source.py
@@ -0,0 +1,365 @@
+"""Databento historical price source for backtesting."""
+
+from __future__ import annotations
+
+import hashlib
+import json
+from dataclasses import dataclass
+from datetime import date, timedelta
+from pathlib import Path
+from typing import Any
+
+from app.services.backtesting.historical_provider import DailyClosePoint, HistoricalPriceSource
+
+try:
+    import databento as db
+    import pandas as pd
+
+    DATABENTO_AVAILABLE = True
+except ImportError:
+    DATABENTO_AVAILABLE = False
+    db = None  # type: ignore
+    pd = None  # type: ignore
+
+
+@dataclass(frozen=True)
+class DatabentoCacheKey:
+    """Cache key for Databento data requests."""
+
+    dataset: str
+    symbol: str
+    schema: str
+    start_date: date
+    end_date: date
+
+    def cache_path(self, cache_dir: Path) -> Path:
+        """Generate cache file path from key."""
+        key_str = f"{self.dataset}_{self.symbol}_{self.schema}_{self.start_date}_{self.end_date}"
+        key_hash = hashlib.sha256(key_str.encode()).hexdigest()[:16]
+        return cache_dir / f"dbn_{key_hash}.parquet"
+
+    def metadata_path(self, cache_dir: Path) -> Path:
+        """Generate metadata file path from key."""
+        key_str = f"{self.dataset}_{self.symbol}_{self.schema}_{self.start_date}_{self.end_date}"
+        key_hash = hashlib.sha256(key_str.encode()).hexdigest()[:16]
+        return cache_dir / f"dbn_{key_hash}_meta.json"
+
+
+@dataclass
+class DatabentoSourceConfig:
+    """Configuration for Databento data source."""
+
+    api_key: str | None = None  # Falls back to DATABENTO_API_KEY env var
+    cache_dir: Path = Path(".cache/databento")
+    dataset: str = "XNAS.BASIC"
+    schema: str = "ohlcv-1d"
+    stype_in: str = "raw_symbol"
+
+    # Re-download threshold
+    max_cache_age_days: int = 30
+
+    def __post_init__(self) -> None:
+        # Ensure cache_dir is a Path
+        if isinstance(self.cache_dir, str):
+            object.__setattr__(self, "cache_dir", Path(self.cache_dir))
+
+
+class DatabentoHistoricalPriceSource(HistoricalPriceSource):
+    """Databento-based historical price source for backtesting.
+
+    This provider fetches historical daily OHLCV data from Databento's API
+    and caches it locally to minimize API calls and costs.
+
+    Key features:
+    - Smart caching with configurable age threshold
+    - Cost estimation before fetching
+    - Symbol-to-dataset resolution (GLD→XNAS.BASIC, GC=F→GLBX.MDP3)
+    - Parquet storage for fast loading
+
+    Example usage:
+        source = DatabentoHistoricalPriceSource()
+        prices = source.load_daily_closes("GLD", date(2024, 1, 1), date(2024, 1, 31))
+    """
+
+    def __init__(self, config: DatabentoSourceConfig | None = None) -> None:
+        if not DATABENTO_AVAILABLE:
+            raise RuntimeError("databento package required: pip install databento>=0.30.0")
+
+        self.config = config or DatabentoSourceConfig()
+        self.config.cache_dir.mkdir(parents=True, exist_ok=True)
+        self._client: Any = None  # db.Historical
+
+    @property
+    def client(self) -> Any:
+        """Get or create Databento client."""
+        if self._client is None:
+            if db is None:
+                raise RuntimeError("databento package not installed")
+            self._client = db.Historical(key=self.config.api_key)
+        return self._client
+
+    def _load_from_cache(self, key: DatabentoCacheKey) -> list[DailyClosePoint] | None:
+        """Load cached data if available and fresh."""
+        cache_file = key.cache_path(self.config.cache_dir)
+        meta_file = key.metadata_path(self.config.cache_dir)
+
+        if not cache_file.exists() or not meta_file.exists():
+            return None
+
+        try:
+            with open(meta_file) as f:
+                meta = json.load(f)
+
+            # Check cache age
+            download_date = date.fromisoformat(meta["download_date"])
+            age_days = (date.today() - download_date).days
+            if age_days > self.config.max_cache_age_days:
+                return None
+
+            # Check parameters match
+            if meta["dataset"] != key.dataset or meta["symbol"] != key.symbol:
+                return None
+
+            # Load parquet and convert
+            if pd is None:
+                return None
+            df = pd.read_parquet(cache_file)
+            return self._df_to_daily_points(df)
+        except Exception:
+            return None
+
+    def _save_to_cache(self, key: DatabentoCacheKey, df: Any, cost_usd: float = 0.0) -> None:
+        """Save data to cache."""
+        if pd is None:
+            return
+
+        cache_file = key.cache_path(self.config.cache_dir)
+        meta_file = key.metadata_path(self.config.cache_dir)
+
+        df.to_parquet(cache_file, index=False)
+
+        meta = {
+            "download_date": date.today().isoformat(),
+            "dataset": key.dataset,
+            "symbol": key.symbol,
+            "schema": key.schema,
+            "start_date": key.start_date.isoformat(),
+            "end_date": key.end_date.isoformat(),
+            "rows": len(df),
+            "cost_usd": cost_usd,
+        }
+        with open(meta_file, "w") as f:
+            json.dump(meta, f, indent=2)
+
+    def _fetch_from_databento(self, key: DatabentoCacheKey) -> Any:
+        """Fetch data from Databento API."""
+        data = self.client.timeseries.get_range(
+            dataset=key.dataset,
+            symbols=key.symbol,
+            schema=key.schema,
+            start=key.start_date.isoformat(),
+            end=(key.end_date + timedelta(days=1)).isoformat(),  # Exclusive end
+            stype_in=self.config.stype_in,
+        )
+        return data.to_df()
+
+    def _df_to_daily_points(self, df: Any) -> list[DailyClosePoint]:
+        """Convert DataFrame to DailyClosePoint list."""
+        if pd is None:
+            return []
+
+        points = []
+        for idx, row in df.iterrows():
+            # Databento ohlcv schema has ts_event as timestamp
+            ts = row.get("ts_event", row.get("ts_recv", idx))
+            if hasattr(ts, "date"):
+                row_date = ts.date()
+            else:
+                # Parse ISO date string
+                ts_str = str(ts)
+                row_date = date.fromisoformat(ts_str[:10])
+
+            # Databento prices are int64 scaled by 1e-9
+            close_raw = row.get("close", 0)
+            if isinstance(close_raw, (int, float)):
+                close = float(close_raw) / 1e9 if close_raw > 1e9 else float(close_raw)
+            else:
+                close = float(close_raw)
+
+            if close > 0:
+                points.append(DailyClosePoint(date=row_date, close=close))
+
+        return sorted(points, key=lambda p: p.date)
+
+    def load_daily_closes(self, symbol: str, start_date: date, end_date: date) -> list[DailyClosePoint]:
+        """Load daily closing prices from Databento (with caching).
+
+        Args:
+            symbol: Trading symbol (GLD, GC=F, XAU)
+            start_date: Inclusive start date
+            end_date: Inclusive end date
+
+        Returns:
+            List of DailyClosePoint sorted by date
+        """
+        # Map symbols to datasets
+        dataset = self._resolve_dataset(symbol)
+        databento_symbol = self._resolve_symbol(symbol)
+
+        key = DatabentoCacheKey(
+            dataset=dataset,
+            symbol=databento_symbol,
+            schema=self.config.schema,
+            start_date=start_date,
+            end_date=end_date,
+        )
+
+        # Try cache first
+        cached = self._load_from_cache(key)
+        if cached is not None:
+            return cached
+
+        # Fetch from Databento
+        df = self._fetch_from_databento(key)
+
+        # Get cost estimate (approximate)
+        try:
+            cost_usd = self.get_cost_estimate(symbol, start_date, end_date)
+        except Exception:
+            cost_usd = 0.0
+
+        # Cache results
+        self._save_to_cache(key, df, cost_usd)
+
+        return self._df_to_daily_points(df)
+
+    def _resolve_dataset(self, symbol: str) -> str:
+        """Resolve symbol to Databento dataset."""
+        symbol_upper = symbol.upper()
+        if symbol_upper in ("GLD", "GLDM", "IAU"):
+            return "XNAS.BASIC"  # ETFs on Nasdaq
+        elif symbol_upper in ("GC=F", "GC", "GOLD"):
+            return "GLBX.MDP3"  # CME gold futures
+        elif symbol_upper == "XAU":
+            return "XNAS.BASIC"  # Treat as GLD proxy
+        else:
+            return self.config.dataset  # Use configured default
+
+    def _resolve_symbol(self, symbol: str) -> str:
+        """Resolve vault-dash symbol to Databento symbol."""
+        symbol_upper = symbol.upper()
+        if symbol_upper == "XAU":
+            return "GLD"  # Proxy XAU via GLD prices
+        elif symbol_upper == "GC=F":
+            return "GC"  # Use parent symbol for continuous contracts
+        return symbol_upper
+
+    def get_cost_estimate(self, symbol: str, start_date: date, end_date: date) -> float:
+        """Estimate cost in USD for a data request.
+
+        Args:
+            symbol: Trading symbol
+            start_date: Start date
+            end_date: End date
+
+        Returns:
+            Estimated cost in USD
+        """
+        dataset = self._resolve_dataset(symbol)
+        databento_symbol = self._resolve_symbol(symbol)
+
+        try:
+            cost = self.client.metadata.get_cost(
+                dataset=dataset,
+                symbols=databento_symbol,
+                schema=self.config.schema,
+                start=start_date.isoformat(),
+                end=(end_date + timedelta(days=1)).isoformat(),
+            )
+            return float(cost)
+        except Exception:
+            return 0.0  # Return 0 if cost estimation fails
+
+    def get_available_range(self, symbol: str) -> tuple[date | None, date | None]:
+        """Get the available date range for a symbol.
+
+        Args:
+            symbol: Trading symbol
+
+        Returns:
+            Tuple of (start_date, end_date) or (None, None) if unavailable
+        """
+        dataset = self._resolve_dataset(symbol)
+
+        try:
+            range_info = self.client.metadata.get_dataset_range(dataset=dataset)
+            start_str = range_info.get("start", "")
+            end_str = range_info.get("end", "")
+
+            start = date.fromisoformat(start_str[:10]) if start_str else None
+            end = date.fromisoformat(end_str[:10]) if end_str else None
+
+            return start, end
+        except Exception:
+            return None, None
+
+    def clear_cache(self) -> int:
+        """Clear all cached data files.
+
+        Returns:
+            Number of files deleted
+        """
+        count = 0
+        for file in self.config.cache_dir.glob("*"):
+            if file.is_file():
+                file.unlink()
+                count += 1
+        return count
+
+    def get_cache_stats(self) -> dict[str, Any]:
+        """Get cache statistics.
+
+        Returns:
+            Dict with total_size_bytes, file_count, oldest_download, entries
+        """
+        total_size = 0
+        file_count = 0
+        oldest_download: date | None = None
+        entries: list[dict[str, Any]] = []
+
+        for meta_file in self.config.cache_dir.glob("*_meta.json"):
+            try:
+                with open(meta_file) as f:
+                    meta = json.load(f)
+
+                download_date = date.fromisoformat(meta["download_date"])
+                cache_file = meta_file.with_name(meta_file.stem.replace("_meta", "") + ".parquet")
+
+                size = cache_file.stat().st_size if cache_file.exists() else 0
+                total_size += size
+                file_count += 2  # meta + parquet
+
+                if oldest_download is None or download_date < oldest_download:
+                    oldest_download = download_date
+
+                entries.append(
+                    {
+                        "dataset": meta["dataset"],
+                        "symbol": meta["symbol"],
+                        "start_date": meta["start_date"],
+                        "end_date": meta["end_date"],
+                        "rows": meta.get("rows", 0),
+                        "cost_usd": meta.get("cost_usd", 0.0),
+                        "download_date": meta["download_date"],
+                        "size_bytes": size,
+                    }
+                )
+            except Exception:
+                continue
+
+        return {
+            "total_size_bytes": total_size,
+            "file_count": file_count,
+            "oldest_download": oldest_download.isoformat() if oldest_download else None,
+            "entries": entries,
+        }