feat(DATA-DB-001): add Databento historical price source for backtesting

- Add DatabentoHistoricalPriceSource implementing HistoricalPriceSource protocol
- Smart caching with Parquet storage and metadata tracking
- Auto symbol-to-dataset resolution (GLD→XNAS.BASIC, GC=F→GLBX.MDP3)
- Cache management with age threshold invalidation
- Cost estimation via metadata.get_cost()
- Add databento>=0.30.0 to requirements.txt
- Add DATABENTO_API_KEY to .env.example
- Full test coverage with 16 tests
This commit is contained in:
Bu5hm4nn
2026-03-29 09:58:02 +02:00
parent c02159481d
commit bf13ab5b46
5 changed files with 677 additions and 0 deletions

View File

@@ -0,0 +1,310 @@
"""Tests for Databento historical price source."""
from __future__ import annotations

import importlib.util
import json
import tempfile
from datetime import date, timedelta
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from app.services.backtesting.databento_source import (
    DatabentoCacheKey,
    DatabentoHistoricalPriceSource,
    DatabentoSourceConfig,
)
@pytest.fixture
def temp_cache_dir():
    """Yield a scratch directory as a Path; removed automatically after the test."""
    with tempfile.TemporaryDirectory() as scratch:
        yield Path(scratch)
@pytest.fixture
def mock_databento_client():
    """Provide a MagicMock standing in for the Databento historical client."""
    return MagicMock()
@pytest.fixture
def sample_ohlcv_df():
    """Four daily OHLCV bars with int64-scaled close prices (1e9 per unit)."""
    import pandas as pd

    # Mapping preserves insertion order, so row order matches the dates below.
    closes_by_date = {
        "2024-01-02": 185000000000,  # 185.0
        "2024-01-03": 186500000000,  # 186.5
        "2024-01-04": 184000000000,  # 184.0
        "2024-01-05": 187000000000,  # 187.0
    }
    rows = [{"ts_event": ts, "close": px} for ts, px in closes_by_date.items()]
    return pd.DataFrame(rows)
class TestDatabentoCacheKey:
    """Tests for DatabentoCacheKey."""

    @staticmethod
    def _january_gld_key() -> DatabentoCacheKey:
        """Build the canonical GLD daily-bar key for January 2024 used by these tests."""
        return DatabentoCacheKey(
            dataset="XNAS.BASIC",
            symbol="GLD",
            schema="ohlcv-1d",
            start_date=date(2024, 1, 1),
            end_date=date(2024, 1, 31),
        )

    def test_cache_path_generation(self, temp_cache_dir: Path) -> None:
        """Cache path is deterministic for same parameters."""
        key = self._january_gld_key()
        first = key.cache_path(temp_cache_dir)
        second = key.cache_path(temp_cache_dir)
        # Same key must always map to the same parquet file.
        assert first == second
        assert first.suffix == ".parquet"
        assert first.name.startswith("dbn_")

    def test_metadata_path_generation(self, temp_cache_dir: Path) -> None:
        """Metadata path matches cache path."""
        key = self._january_gld_key()
        data_path = key.cache_path(temp_cache_dir)
        meta_path = key.metadata_path(temp_cache_dir)
        # Metadata sidecar shares the stem of the parquet file, with a _meta suffix.
        assert meta_path.stem == data_path.stem + "_meta"
        assert meta_path.suffix == ".json"
class TestDatabentoSourceConfig:
    """Tests for DatabentoSourceConfig."""

    def test_default_config(self) -> None:
        """Default config uses XNAS.BASIC and daily bars."""
        cfg = DatabentoSourceConfig()
        assert cfg.dataset == "XNAS.BASIC"
        assert cfg.schema == "ohlcv-1d"
        assert cfg.max_cache_age_days == 30
        # No API key unless explicitly provided.
        assert cfg.api_key is None

    def test_custom_config(self) -> None:
        """Custom config overrides defaults."""
        overrides = {
            "api_key": "test-key",
            "dataset": "GLBX.MDP3",
            "schema": "ohlcv-1h",
            "max_cache_age_days": 7,
        }
        cfg = DatabentoSourceConfig(**overrides)
        # Every supplied value must round-trip onto the config object.
        for field_name, expected in overrides.items():
            assert getattr(cfg, field_name) == expected
class TestDatabentoHistoricalPriceSource:
    """Tests for DatabentoHistoricalPriceSource.

    Unit tests construct the source via ``__new__`` to skip ``__init__`` (which
    would require the databento package / an API key) and attach only the
    ``config`` attribute that the methods under test read.
    """

    @staticmethod
    def _bare_source(config: DatabentoSourceConfig | None = None) -> DatabentoHistoricalPriceSource:
        """Build a source without running __init__; attach the given (or default) config."""
        source = DatabentoHistoricalPriceSource.__new__(DatabentoHistoricalPriceSource)
        source.config = config if config is not None else DatabentoSourceConfig()
        return source

    @staticmethod
    def _jan_2024_key(dataset: str = "XNAS.BASIC", symbol: str = "GLD") -> DatabentoCacheKey:
        """Daily-bar cache key covering January 2024 for the given dataset/symbol."""
        return DatabentoCacheKey(
            dataset=dataset,
            symbol=symbol,
            schema="ohlcv-1d",
            start_date=date(2024, 1, 1),
            end_date=date(2024, 1, 31),
        )

    def test_resolve_dataset_gld(self) -> None:
        """GLD resolves to XNAS.BASIC."""
        source = self._bare_source()
        assert source._resolve_dataset("GLD") == "XNAS.BASIC"
        assert source._resolve_dataset("gld") == "XNAS.BASIC"  # case-insensitive
        assert source._resolve_dataset("GLDM") == "XNAS.BASIC"

    def test_resolve_dataset_gc_f(self) -> None:
        """GC=F resolves to GLBX.MDP3."""
        source = self._bare_source()
        assert source._resolve_dataset("GC=F") == "GLBX.MDP3"
        assert source._resolve_dataset("GC") == "GLBX.MDP3"

    def test_resolve_dataset_xau(self) -> None:
        """XAU resolves to XNAS.BASIC (GLD proxy)."""
        source = self._bare_source()
        assert source._resolve_dataset("XAU") == "XNAS.BASIC"

    def test_resolve_symbol_xau(self) -> None:
        """XAU resolves to GLD symbol."""
        source = self._bare_source()
        assert source._resolve_symbol("XAU") == "GLD"

    def test_resolve_symbol_gc_f(self) -> None:
        """GC=F resolves to GC parent symbol."""
        source = self._bare_source()
        assert source._resolve_symbol("GC=F") == "GC"

    def test_df_to_daily_points_converts_prices(self) -> None:
        """DataFrame prices are converted from int64 scaled format."""
        import pandas as pd

        source = self._bare_source()
        df = pd.DataFrame(
            [
                {"ts_event": "2024-01-02", "close": 185000000000},  # 185.0
                {"ts_event": "2024-01-03", "close": 186500000000},  # 186.5
            ]
        )
        points = source._df_to_daily_points(df)
        assert len(points) == 2
        assert points[0].date == date(2024, 1, 2)
        assert points[0].close == 185.0
        assert points[1].close == 186.5

    def test_load_from_cache_returns_none_if_missing(self, temp_cache_dir: Path) -> None:
        """Returns None if cache files don't exist."""
        source = self._bare_source(DatabentoSourceConfig(cache_dir=temp_cache_dir))
        result = source._load_from_cache(self._jan_2024_key())
        assert result is None

    def test_load_from_cache_returns_data_if_fresh(self, temp_cache_dir: Path, sample_ohlcv_df) -> None:
        """Returns cached data if within age threshold."""
        source = self._bare_source(DatabentoSourceConfig(cache_dir=temp_cache_dir))
        key = self._jan_2024_key()
        # Save then immediately reload: the entry is fresh by construction.
        source._save_to_cache(key, sample_ohlcv_df)
        result = source._load_from_cache(key)
        assert result is not None
        assert len(result) == 4
        assert result[0].close == 185.0

    def test_load_from_cache_returns_none_if_stale(
        self, temp_cache_dir: Path, sample_ohlcv_df
    ) -> None:
        """Returns None if cache exceeds age threshold."""
        source = self._bare_source(
            DatabentoSourceConfig(
                cache_dir=temp_cache_dir,
                max_cache_age_days=0,  # Always stale
            )
        )
        key = self._jan_2024_key()
        source._save_to_cache(key, sample_ohlcv_df)
        # Manually age the cache by rewriting download_date to yesterday.
        meta_file = key.metadata_path(temp_cache_dir)
        with open(meta_file) as f:
            meta = json.load(f)
        meta["download_date"] = (date.today() - timedelta(days=1)).isoformat()
        with open(meta_file, "w") as f:
            json.dump(meta, f)
        # Load from cache (should fail due to age)
        result = source._load_from_cache(key)
        assert result is None

    @patch("app.services.backtesting.databento_source.DATABENTO_AVAILABLE", False)
    def test_raises_if_databento_not_installed(self) -> None:
        """Raises error if databento package not installed."""
        with pytest.raises(RuntimeError, match="databento package required"):
            DatabentoHistoricalPriceSource()

    def test_clear_cache(self, temp_cache_dir: Path, sample_ohlcv_df) -> None:
        """Clears all cache files."""
        source = self._bare_source(DatabentoSourceConfig(cache_dir=temp_cache_dir))
        # Two distinct keys produce two parquet + two metadata files.
        source._save_to_cache(self._jan_2024_key(), sample_ohlcv_df)
        source._save_to_cache(self._jan_2024_key(dataset="GLBX.MDP3", symbol="GC"), sample_ohlcv_df)
        count = source.clear_cache()
        assert count == 4  # 2 parquet + 2 json
class TestDatabentoHistoricalPriceSourceIntegration:
    """Integration tests (require databento package)."""

    # BUG FIX: the previous guard was `not DatabentoHistoricalPriceSource.__module__`,
    # which is always False (`__module__` is a non-empty string), so the test could
    # never be skipped. Probe for the actual package instead.
    @pytest.mark.skipif(
        importlib.util.find_spec("databento") is None,
        reason="databento not installed",
    )
    def test_get_cache_stats(self, temp_cache_dir: Path, sample_ohlcv_df) -> None:
        """Returns cache statistics."""
        source = DatabentoHistoricalPriceSource.__new__(DatabentoHistoricalPriceSource)
        source.config = DatabentoSourceConfig(cache_dir=temp_cache_dir)
        key = DatabentoCacheKey(
            dataset="XNAS.BASIC",
            symbol="GLD",
            schema="ohlcv-1d",
            start_date=date(2024, 1, 1),
            end_date=date(2024, 1, 31),
        )
        # One cached entry -> one parquet file + one metadata JSON.
        source._save_to_cache(key, sample_ohlcv_df)
        stats = source.get_cache_stats()
        assert stats["file_count"] == 2
        assert stats["total_size_bytes"] > 0
        assert len(stats["entries"]) == 1
        assert stats["entries"][0]["symbol"] == "GLD"