11import asyncio
22import gzip
33import hashlib
4+ import io
45import json as json_module
56import logging
67import os
1112from typing import Any , AsyncIterator , Dict , List , NamedTuple , Optional , Sequence , Union
1213
1314import dateutil .parser
15+ import zstandard
1416
1517from tardis_dev ._http import create_session , reliable_download
1618from tardis_dev ._options import DEFAULT_CACHE_DIR , DEFAULT_ENDPOINT
@@ -84,7 +86,15 @@ async def replay(
8486
8587 while current_slice_path is None :
8688 await asyncio .sleep (0 )
87- path_to_check = _get_slice_cache_path (
89+ zstd_path = _get_slice_cache_path (
90+ cache_dir ,
91+ exchange ,
92+ current_slice_date ,
93+ normalized_filters ,
94+ filters_hash = filters_hash ,
95+ content_encoding = "zstd" ,
96+ )
97+ gzip_path = _get_slice_cache_path (
8898 cache_dir ,
8999 exchange ,
90100 current_slice_date ,
@@ -95,26 +105,27 @@ async def replay(
95105 if fetch_data_task .done () and fetch_data_task .exception ():
96106 raise fetch_data_task .exception ()
97107
98- if os .path .isfile (path_to_check ):
99- current_slice_path = path_to_check
108+ if os .path .isfile (zstd_path ):
109+ current_slice_path = zstd_path
110+ elif os .path .isfile (gzip_path ):
111+ current_slice_path = gzip_path
100112 else :
101113 await asyncio .sleep (0.1 )
102114
103- with gzip .open (current_slice_path , "rb" ) as file :
104- for line in file :
105- if len (line ) <= 1 :
106- if with_disconnects and not last_message_was_disconnect :
107- last_message_was_disconnect = True
108- yield None
109- continue
115+ for line in _iterate_slice_lines (current_slice_path ):
116+ if len (line ) <= 1 :
117+ if with_disconnects and not last_message_was_disconnect :
118+ last_message_was_disconnect = True
119+ yield None
120+ continue
110121
111- last_message_was_disconnect = False
122+ last_message_was_disconnect = False
112123
113- if decode_response :
114- timestamp = datetime .fromisoformat (line [0 : DATE_MESSAGE_SPLIT_INDEX - 2 ].decode ("utf-8" ))
115- yield Response (timestamp , json .loads (line [DATE_MESSAGE_SPLIT_INDEX + 1 :]))
116- else :
117- yield Response (line [0 :DATE_MESSAGE_SPLIT_INDEX ], line [DATE_MESSAGE_SPLIT_INDEX + 1 :])
124+ if decode_response :
125+ timestamp = datetime .fromisoformat (line [0 : DATE_MESSAGE_SPLIT_INDEX - 2 ].decode ("utf-8" ))
126+ yield Response (timestamp , json .loads (line [DATE_MESSAGE_SPLIT_INDEX + 1 :]))
127+ else :
128+ yield Response (line [0 :DATE_MESSAGE_SPLIT_INDEX ], line [DATE_MESSAGE_SPLIT_INDEX + 1 :])
118129
119130 if auto_cleanup :
120131 _remove_processed_slice (current_slice_path )
@@ -167,7 +178,7 @@ async def _fetch_data_to_replay(
167178 if minutes_diff <= 0 :
168179 return
169180
170- async with await create_session (api_key , timeout ) as session :
181+ async with await create_session (api_key , timeout , "zstd, gzip" ) as session :
171182 fetch_data_tasks = set ()
172183 try :
173184 prefetch_offsets = [minutes_diff - 1 ]
@@ -231,9 +242,23 @@ async def _fetch_slice_if_not_cached(
231242 filters_hash : str ,
232243) -> None :
233244 slice_date = from_date + timedelta (minutes = offset )
234- cache_path = _get_slice_cache_path (cache_dir , exchange , slice_date , filters , filters_hash = filters_hash )
245+ cache_zstd_path = _get_slice_cache_path (
246+ cache_dir ,
247+ exchange ,
248+ slice_date ,
249+ filters ,
250+ filters_hash = filters_hash ,
251+ content_encoding = "zstd" ,
252+ )
253+ cache_gzip_path = _get_slice_cache_path (
254+ cache_dir ,
255+ exchange ,
256+ slice_date ,
257+ filters ,
258+ filters_hash = filters_hash ,
259+ )
235260
236- if os .path .isfile (cache_path ):
261+ if os .path .isfile (cache_zstd_path ) or os . path . isfile ( cache_gzip_path ):
237262 return
238263
239264 fetch_url = f"{ endpoint } /data-feeds/{ exchange } ?from={ _format_replay_query_date (from_date )} &offset={ offset } "
@@ -242,11 +267,14 @@ async def _fetch_slice_if_not_cached(
242267 filters_url_encoded = urllib .parse .quote (filters_serialized , safe = "~()*!.'" )
243268 fetch_url += f"&filters={ filters_url_encoded } "
244269
270+ cache_base_path = cache_gzip_path .removesuffix (".gz" )
271+
245272 await reliable_download (
246273 session = session ,
247274 url = fetch_url ,
248- dest_path = cache_path ,
275+ dest_path = cache_base_path ,
249276 http_proxy = http_proxy ,
277+ append_content_encoding_extension = True ,
250278 )
251279
252280
@@ -341,16 +369,14 @@ def _get_slice_cache_path(
341369 filters : Optional [Sequence [Channel ]],
342370 * ,
343371 filters_hash : Optional [str ] = None ,
372+ content_encoding : Optional [str ] = None ,
344373) -> str :
345- return (
346- os .path .join (
347- cache_dir ,
348- "feeds" ,
349- exchange ,
350- filters_hash if filters_hash is not None else _get_filters_hash (filters ),
351- _format_date_to_path (date ),
352- )
353- + ".json.gz"
374+ return os .path .join (
375+ cache_dir ,
376+ "feeds" ,
377+ exchange ,
378+ filters_hash if filters_hash is not None else _get_filters_hash (filters ),
379+ f"{ _format_date_to_path (date )} .json{ '.zst' if content_encoding == 'zstd' else '.gz' } " ,
354380 )
355381
356382
@@ -405,6 +431,18 @@ def _remove_processed_slice(path: str) -> None:
405431 os .remove (path )
406432
407433
def _iterate_slice_lines(path: str):
    """Yield the lines of a cached slice file as ``bytes``, decompressing on the fly.

    A path ending in ``.zst`` is decoded with ``zstandard``; any other path is
    opened with ``gzip``. Lines are yielded exactly as stored, including their
    trailing newline. Every file handle is opened in a ``with`` block, so it is
    released when the generator is exhausted or closed.

    Args:
        path: Location of the compressed slice file on disk.

    Yields:
        One ``bytes`` object per line of the decompressed content.
    """
    if path.endswith(".zst"):
        decompressor = zstandard.ZstdDecompressor()
        with open(path, "rb") as compressed_file:
            # A valid .zst file may hold several concatenated frames. The
            # stream reader stops after the first frame by default, which would
            # silently drop the remaining data; gzip.open() below already reads
            # every member, so ask zstandard to do the same.
            with decompressor.stream_reader(compressed_file, read_across_frames=True) as file:
                # Wrap the decompressing reader for buffered line iteration.
                with io.BufferedReader(file) as buffered_file:
                    yield from buffered_file
        return

    with gzip.open(path, "rb") as file:
        yield from file
444+
445+
408446def _clear_replay_cache_range (
409447 * ,
410448 cache_dir : str ,
0 commit comments