diff --git a/waterbutler/core/cache.py b/waterbutler/core/cache.py new file mode 100644 index 000000000..59e71cf87 --- /dev/null +++ b/waterbutler/core/cache.py @@ -0,0 +1,43 @@ +import asyncio + + +class CacheableMetadataProviderProxy: + """Wraps a provider so that folder ``metadata`` requests are cached and the + metadata of any child folders is prefetched concurrently. + + When a folder is listed, a metadata task is scheduled for each of its child + folders without awaiting them. Those tasks run on the event loop while files + are being downloaded, so the generator can issue all of its metadata requests + up front instead of fetching folder listings strictly one-at-a-time. + + Any attribute that is not overridden here (``download``, ``path_from_metadata``, + etc.) is delegated to the wrapped provider. + """ + + def __init__(self, provider): + self.provider = provider + self.__cache = dict[str, asyncio.Task]() + + def metadata_task(self, path, **kwargs): + key = path.identifier or str(path) + if key not in self.__cache: + self.__cache[key] = asyncio.create_task( + self.provider.metadata(path, **kwargs) + ) + return self.__cache[key] + + async def metadata(self, path, **kwargs): + if path.is_file: + return await self.provider.metadata(path, **kwargs) + + items = await self.metadata_task(path, **kwargs) + + for item in items: + if item.is_folder: + child = self.provider.path_from_metadata(path, item) + self.metadata_task(child, **kwargs) + + return items + + def __getattr__(self, name): + return getattr(self.provider, name) diff --git a/waterbutler/core/provider.py b/waterbutler/core/provider.py index 9e5101063..70cdf9185 100644 --- a/waterbutler/core/provider.py +++ b/waterbutler/core/provider.py @@ -18,6 +18,7 @@ from waterbutler import settings as wb_settings from waterbutler.core.metrics import MetricsRecord from waterbutler.core import metadata as wb_metadata +from waterbutler.core.cache import CacheableMetadataProviderProxy from waterbutler.core.utils import ZipStreamGenerator from waterbutler.core.utils import RequestHandlerContext @@ -678,18 +679,17 @@ async def revalidate_path(self, """ return base.child(path, folder=folder) - async def zip(self, path: wb_path.WaterButlerPath, **kwargs) -> asyncio.StreamReader: + async def zip(self, path: wb_path.WaterButlerPath, use_cache: bool = True, **kwargs) -> asyncio.StreamReader: """Streams a Zip archive of the given folder :param path: ( :class:`.WaterButlerPath` ) The folder to compress + :param use_cache: ( `bool` ) Whether to prefetch metadata requests for nested folders during zip """ + provider = CacheableMetadataProviderProxy(self) if use_cache else self - meta_data = await self.metadata(path, **kwargs) # type: ignore - if path.is_file: - meta_data = [meta_data] # type: ignore - path = path.parent - - return streams.ZipStreamReader(ZipStreamGenerator(self, path, *meta_data, **kwargs)) # type: ignore + return streams.ZipStreamReader( + await ZipStreamGenerator.create(provider, path, **kwargs) + ) # type: ignore def shares_storage_root(self, other: 'BaseProvider') -> bool: """Returns True if ``self`` and ``other`` both point to the same storage root. Used to diff --git a/waterbutler/core/utils.py b/waterbutler/core/utils.py index 6c28abf75..512e95e2c 100644 --- a/waterbutler/core/utils.py +++ b/waterbutler/core/utils.py @@ -193,6 +193,7 @@ def make_disposition(filename): class ZipStreamGenerator: + def __init__(self, provider, parent_path, *metadata_objs, **kwargs): self.provider = provider self.parent_path = parent_path @@ -202,6 +203,14 @@ def __init__(self, provider, parent_path, *metadata_objs, **kwargs): ] self.kwargs = kwargs + @classmethod + async def create(cls, provider, path, **kwargs): + meta_data = await provider.metadata(path, **kwargs) + if path.is_file: + meta_data = [meta_data] + path = path.parent + return cls(provider, path, *meta_data, **kwargs) + async def __aiter__(self): return self