feat: add zstandard compression support

thaaddeus · thaaddeus · commit 166b2b7721af · 2026-03-29T13:26:51.000+02:00
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -30,6 +30,7 @@ python = ">=3.9,<4.0"
 aiohttp = ">=3.10.0"
 python-dateutil = "^2.8.2"
 aiofiles = "^24.1.0"
+zstandard = ">=0.19.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.3.5"
diff --git a/tardis_dev/_http.py b/tardis_dev/_http.py
@@ -14,11 +14,11 @@
 logger = logging.getLogger(__name__)
 
 
-async def create_session(api_key: str, timeout: int) -> aiohttp.ClientSession:
+async def create_session(api_key: str, timeout: int, accept_encoding: str = "gzip") -> aiohttp.ClientSession:
     from tardis_dev import __version__
 
     headers = {
-        "Accept-Encoding": "gzip",
+        "Accept-Encoding": accept_encoding,
         "User-Agent": f"tardis-dev/{__version__} (+https://github.com/tardis-dev/tardis-python)",
     }
     if api_key:
@@ -38,14 +38,20 @@ async def reliable_download(
     dest_path: str,
     http_proxy: Optional[str] = None,
     max_attempts: int = 30,
-) -> None:
+    append_content_encoding_extension: bool = False,
+) -> str:
     attempts = 0
 
     while True:
         attempts += 1
         try:
-            await _download(session, _get_retry_url(url, attempts), dest_path, http_proxy)
-            return
+            return await _download(
+                session,
+                _get_retry_url(url, attempts),
+                dest_path,
+                http_proxy,
+                append_content_encoding_extension=append_content_encoding_extension,
+            )
         except asyncio.CancelledError:
             raise
         except Exception as exc:
@@ -99,12 +105,30 @@ async def _download(
     url: str,
     dest_path: str,
     http_proxy: Optional[str],
-) -> None:
+    *,
+    append_content_encoding_extension: bool,
+) -> str:
     async with session.get(url, proxy=http_proxy) as response:
         if response.status != 200:
             error_text = await response.text()
             raise urllib.error.HTTPError(url, code=response.status, msg=error_text, hdrs=None, fp=None)
 
+        final_path = dest_path
+        if append_content_encoding_extension:
+            content_encoding = response.headers.get("Content-Encoding")
+            if content_encoding == "zstd":
+                final_path = f"{dest_path}.zst"
+            elif content_encoding == "gzip":
+                final_path = f"{dest_path}.gz"
+            else:
+                raise urllib.error.HTTPError(
+                    url,
+                    code=400,
+                    msg=f"Unsupported data feed content encoding: {content_encoding}",
+                    hdrs=None,
+                    fp=None,
+                )
+
         pathlib.Path(dest_path).parent.mkdir(parents=True, exist_ok=True)
         temp_path = f"{dest_path}{secrets.token_hex(8)}.unconfirmed"
 
@@ -113,7 +137,8 @@ async def _download(
                 async for chunk in response.content.iter_any():
                     await temp_file.write(chunk)
 
-            os.replace(temp_path, dest_path)
+            os.replace(temp_path, final_path)
+            return final_path
         finally:
             if os.path.exists(temp_path):
                 os.remove(temp_path)
diff --git a/tardis_dev/replay.py b/tardis_dev/replay.py
@@ -1,6 +1,7 @@
 import asyncio
 import gzip
 import hashlib
+import io
 import json as json_module
 import logging
 import os
@@ -11,6 +12,7 @@
 from typing import Any, AsyncIterator, Dict, List, NamedTuple, Optional, Sequence, Union
 
 import dateutil.parser
+import zstandard
 
 from tardis_dev._http import create_session, reliable_download
 from tardis_dev._options import DEFAULT_CACHE_DIR, DEFAULT_ENDPOINT
@@ -84,7 +86,15 @@ async def replay(
 
             while current_slice_path is None:
                 await asyncio.sleep(0)
-                path_to_check = _get_slice_cache_path(
+                zstd_path = _get_slice_cache_path(
+                    cache_dir,
+                    exchange,
+                    current_slice_date,
+                    normalized_filters,
+                    filters_hash=filters_hash,
+                    content_encoding="zstd",
+                )
+                gzip_path = _get_slice_cache_path(
                     cache_dir,
                     exchange,
                     current_slice_date,
@@ -95,26 +105,27 @@ async def replay(
                 if fetch_data_task.done() and fetch_data_task.exception():
                     raise fetch_data_task.exception()
 
-                if os.path.isfile(path_to_check):
-                    current_slice_path = path_to_check
+                if os.path.isfile(zstd_path):
+                    current_slice_path = zstd_path
+                elif os.path.isfile(gzip_path):
+                    current_slice_path = gzip_path
                 else:
                     await asyncio.sleep(0.1)
 
-            with gzip.open(current_slice_path, "rb") as file:
-                for line in file:
-                    if len(line) <= 1:
-                        if with_disconnects and not last_message_was_disconnect:
-                            last_message_was_disconnect = True
-                            yield None
-                        continue
+            for line in _iterate_slice_lines(current_slice_path):
+                if len(line) <= 1:
+                    if with_disconnects and not last_message_was_disconnect:
+                        last_message_was_disconnect = True
+                        yield None
+                    continue
 
-                    last_message_was_disconnect = False
+                last_message_was_disconnect = False
 
-                    if decode_response:
-                        timestamp = datetime.fromisoformat(line[0 : DATE_MESSAGE_SPLIT_INDEX - 2].decode("utf-8"))
-                        yield Response(timestamp, json.loads(line[DATE_MESSAGE_SPLIT_INDEX + 1 :]))
-                    else:
-                        yield Response(line[0:DATE_MESSAGE_SPLIT_INDEX], line[DATE_MESSAGE_SPLIT_INDEX + 1 :])
+                if decode_response:
+                    timestamp = datetime.fromisoformat(line[0 : DATE_MESSAGE_SPLIT_INDEX - 2].decode("utf-8"))
+                    yield Response(timestamp, json.loads(line[DATE_MESSAGE_SPLIT_INDEX + 1 :]))
+                else:
+                    yield Response(line[0:DATE_MESSAGE_SPLIT_INDEX], line[DATE_MESSAGE_SPLIT_INDEX + 1 :])
 
             if auto_cleanup:
                 _remove_processed_slice(current_slice_path)
@@ -167,7 +178,7 @@ async def _fetch_data_to_replay(
     if minutes_diff <= 0:
         return
 
-    async with await create_session(api_key, timeout) as session:
+    async with await create_session(api_key, timeout, "zstd, gzip") as session:
         fetch_data_tasks = set()
         try:
             prefetch_offsets = [minutes_diff - 1]
@@ -231,9 +242,23 @@ async def _fetch_slice_if_not_cached(
     filters_hash: str,
 ) -> None:
     slice_date = from_date + timedelta(minutes=offset)
-    cache_path = _get_slice_cache_path(cache_dir, exchange, slice_date, filters, filters_hash=filters_hash)
+    cache_zstd_path = _get_slice_cache_path(
+        cache_dir,
+        exchange,
+        slice_date,
+        filters,
+        filters_hash=filters_hash,
+        content_encoding="zstd",
+    )
+    cache_gzip_path = _get_slice_cache_path(
+        cache_dir,
+        exchange,
+        slice_date,
+        filters,
+        filters_hash=filters_hash,
+    )
 
-    if os.path.isfile(cache_path):
+    if os.path.isfile(cache_zstd_path) or os.path.isfile(cache_gzip_path):
         return
 
     fetch_url = f"{endpoint}/data-feeds/{exchange}?from={_format_replay_query_date(from_date)}&offset={offset}"
@@ -242,11 +267,14 @@ async def _fetch_slice_if_not_cached(
         filters_url_encoded = urllib.parse.quote(filters_serialized, safe="~()*!.'")
         fetch_url += f"&filters={filters_url_encoded}"
 
+    cache_base_path = cache_gzip_path.removesuffix(".gz")
+
     await reliable_download(
         session=session,
         url=fetch_url,
-        dest_path=cache_path,
+        dest_path=cache_base_path,
         http_proxy=http_proxy,
+        append_content_encoding_extension=True,
     )
 
 
@@ -341,16 +369,14 @@ def _get_slice_cache_path(
     filters: Optional[Sequence[Channel]],
     *,
     filters_hash: Optional[str] = None,
+    content_encoding: Optional[str] = None,
 ) -> str:
-    return (
-        os.path.join(
-            cache_dir,
-            "feeds",
-            exchange,
-            filters_hash if filters_hash is not None else _get_filters_hash(filters),
-            _format_date_to_path(date),
-        )
-        + ".json.gz"
+    return os.path.join(
+        cache_dir,
+        "feeds",
+        exchange,
+        filters_hash if filters_hash is not None else _get_filters_hash(filters),
+        f"{_format_date_to_path(date)}.json{'.zst' if content_encoding == 'zstd' else '.gz'}",
     )
 
 
@@ -405,6 +431,18 @@ def _remove_processed_slice(path: str) -> None:
         os.remove(path)
 
 
+def _iterate_slice_lines(path: str):
+    """Yield the raw byte lines of a cached slice; ``.zst`` paths are read as zstd, all others as gzip."""
+    if not path.endswith(".zst"):
+        with gzip.open(path, "rb") as gzip_file:
+            yield from gzip_file
+        return
+    # read_across_frames=True keeps decoding after the first zstd frame instead of silently stopping there.
+    with open(path, "rb") as raw, zstandard.ZstdDecompressor().stream_reader(raw, read_across_frames=True) as reader:
+        with io.BufferedReader(reader) as line_reader:
+            yield from line_reader
+
+
 def _clear_replay_cache_range(
     *,
     cache_dir: str,
diff --git a/tests/test_http.py b/tests/test_http.py
@@ -8,9 +8,26 @@
 
 
 @pytest.mark.asyncio
-async def test_create_session_omits_authorization_header_when_api_key_missing():
-    async with await create_session("", 5) as session:
+async def test_create_session_uses_requested_accept_encoding_and_omits_authorization_header_when_api_key_missing():
+    async with await create_session("", 5, "zstd, gzip") as session:
         assert "Authorization" not in session.headers
+        assert session.headers["Accept-Encoding"] == "zstd, gzip"
+
+
+@pytest.mark.asyncio
+async def test_reliable_download_appends_zstd_extension_for_replay_cache(tmp_path: Path):
+    """A response served with ``Content-Encoding: zstd`` is saved under the base path plus ``.zst``."""
+    target = tmp_path / "slice.json"
+    link = "https://example.com/data"
+
+    with aioresponses() as http_mock:
+        http_mock.get(link, headers={"Content-Encoding": "zstd"}, body=b"payload")
+        async with await create_session("", 5) as session:
+            saved = await reliable_download(session, link, str(target), append_content_encoding_extension=True)
+
+    assert not target.exists()  # nothing may be left at the un-suffixed base path
+    assert saved.endswith(".zst")
+    assert Path(saved).read_bytes() == b"payload"
 
 
 @pytest.mark.asyncio
diff --git a/tests/test_replay.py b/tests/test_replay.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 
 import pytest
+import zstandard
 
 from tardis_dev import Channel, replay
 from tardis_dev.replay import (
@@ -113,6 +114,48 @@ def test_replay_cache_path_uses_normalized_filter_hash():
     ]
 
 
+@pytest.mark.asyncio
+async def test_replay_prefers_zstd_cache_path_when_available(monkeypatch, tmp_path: Path):
+    """With both a gzip and a zstd slice cached for the same minute, replay yields the zstd one."""
+    filters = _live_replay_filters()
+    slice_date = datetime(2019, 5, 1, 0, 0)
+    cache_root = str(tmp_path)
+    gzip_path = Path(_get_slice_cache_path(cache_root, LIVE_REPLAY_EXCHANGE, slice_date, filters))
+    zstd_path = Path(
+        _get_slice_cache_path(cache_root, LIVE_REPLAY_EXCHANGE, slice_date, filters, content_encoding="zstd")
+    )
+
+    # Each file tags its single message with the codec it was stored with.
+    gzip_line = b'2019-05-01T00:00:00.0000000Z {"table":"trade","source":"gzip"}\n'
+    zstd_line = b'2019-05-01T00:00:00.0000000Z {"table":"trade","source":"zstd"}\n'
+    gzip_path.parent.mkdir(parents=True, exist_ok=True)
+    gzip_path.write_bytes(gzip.compress(gzip_line))
+    zstd_path.write_bytes(zstandard.ZstdCompressor().compress(zstd_line))
+
+    async def fake_fetch_data_to_replay(**kwargs):
+        # No-op stand-in: the test relies solely on the two files written above.
+        return None
+
+    monkeypatch.setattr(replay_module, "_fetch_data_to_replay", fake_fetch_data_to_replay)
+
+    # Drain the replay generator.
+    results = [
+        item
+        async for item in replay(
+            exchange=LIVE_REPLAY_EXCHANGE,
+            from_date=LIVE_REPLAY_FROM,
+            to_date=LIVE_REPLAY_TO,
+            filters=filters,
+            cache_dir=cache_root,
+        )
+    ]
+
+    # Exactly one message is expected, and it must be the one from the .zst file.
+    assert len(results) == 1
+    assert results[0] is not None
+    assert results[0].message["source"] == "zstd"
+
+
 def test_replay_rejects_invalid_filter_items():
     async def collect():
         async for _ in replay(exchange="bitmex", from_date="2019-06-01", to_date="2019-06-02", filters=["bad"]):
@@ -245,11 +288,51 @@ async def fake_fetch_data_to_replay(**kwargs):
     assert results[0].message == b'{"table":"trade","action":"partial","data":[{"symbol":"BTCUSD"}]}\n'
 
 
+@pytest.mark.asyncio
+async def test_replay_reads_zstd_cached_slice(monkeypatch, tmp_path: Path):
+    """A slice cached only as a ``.zst`` file is decompressed and yielded by replay."""
+    filters = _live_replay_filters()
+    cache_home = tmp_path / "cache"
+    cache_root = str(cache_home)
+    slice_date = datetime(2019, 5, 1, 0, 0)
+    zstd_slice = Path(
+        _get_slice_cache_path(cache_root, LIVE_REPLAY_EXCHANGE, slice_date, filters, content_encoding="zstd")
+    )
+
+    # One timestamped trade message, stored as a single zstd-compressed line.
+    raw_line = b'2019-05-01T00:00:00.0000000Z {"table":"trade","action":"partial","data":[{"symbol":"BTCUSD"}]}\n'
+    zstd_slice.parent.mkdir(parents=True, exist_ok=True)
+    zstd_slice.write_bytes(zstandard.ZstdCompressor().compress(raw_line))
+
+    async def fake_fetch_data_to_replay(**kwargs):
+        # No-op stand-in: the test relies solely on the file written above.
+        return None
+
+    monkeypatch.setattr(replay_module, "_fetch_data_to_replay", fake_fetch_data_to_replay)
+
+    # Drain the replay generator.
+    results = [
+        item
+        async for item in replay(
+            exchange=LIVE_REPLAY_EXCHANGE,
+            from_date=LIVE_REPLAY_FROM,
+            to_date=LIVE_REPLAY_TO,
+            filters=filters,
+            cache_dir=cache_root,
+        )
+    ]
+
+    # Exactly one decoded message is expected.
+    assert len(results) == 1
+    assert results[0] is not None
+    assert results[0].message["table"] == "trade"
+
+
 @pytest.mark.asyncio
 async def test_fetch_data_to_replay_prefetches_last_then_first(monkeypatch):
     offsets = []
 
-    async def fake_create_session(api_key: str, timeout: int):
+    async def fake_create_session(api_key: str, timeout: int, accept_encoding: str):
         return _FakeSession()
 
     async def fake_fetch_slice_if_not_cached(**kwargs):