|
| 1 | +import io |
| 2 | +from pathlib import Path |
| 3 | +from typing import List, Union |
| 4 | + |
| 5 | +from .file import File |
| 6 | +from .transfer_manager import transfer_manager |
| 7 | + |
| 8 | + |
| 9 | +class Fetcher: |
| 10 | + def __init__( |
| 11 | + self, |
| 12 | + paths: List[Union[str, Path]], |
| 13 | + endpoint_url: str, |
| 14 | + aws_access_key_id: str, |
| 15 | + aws_secret_access_key: str, |
| 16 | + region_name: str, |
| 17 | + bucket_name: str, |
| 18 | + ordered=True, |
| 19 | + buffer_size=1024, |
| 20 | + n_workers=32, |
| 21 | + ): |
| 22 | + self.paths = paths |
| 23 | + self.ordered = ordered |
| 24 | + self.buffer_size = buffer_size |
| 25 | + self.transfer_manager = transfer_manager( |
| 26 | + endpoint_url=endpoint_url, |
| 27 | + aws_access_key_id=aws_access_key_id, |
| 28 | + aws_secret_access_key=aws_secret_access_key, |
| 29 | + region_name=region_name, |
| 30 | + n_workers=n_workers, |
| 31 | + ) |
| 32 | + self.bucket_name = bucket_name |
| 33 | + self.files: List[File] = [] |
| 34 | + self.current_path_index = 0 |
| 35 | + |
| 36 | + def __len__(self): |
| 37 | + return len(self.paths) |
| 38 | + |
| 39 | + def __iter__(self): |
| 40 | + for _ in range(self.buffer_size): |
| 41 | + self.queue_download_() |
| 42 | + |
| 43 | + if self.ordered: |
| 44 | + for _ in range(len(self)): |
| 45 | + file = self.files.pop(0) |
| 46 | + file.future.result() |
| 47 | + yield file |
| 48 | + self.queue_download_() |
| 49 | + else: |
| 50 | + for _ in range(len(self)): |
| 51 | + for index, file in enumerate(self.files): |
| 52 | + if file.future.done(): |
| 53 | + break |
| 54 | + else: |
| 55 | + index = 0 |
| 56 | + file = self.files.pop(index) |
| 57 | + file.future.result() |
| 58 | + yield file |
| 59 | + self.queue_download_() |
| 60 | + |
| 61 | + def queue_download_(self): |
| 62 | + if self.current_path_index < len(self): |
| 63 | + buffer = io.BytesIO() |
| 64 | + path = self.paths[self.current_path_index] |
| 65 | + self.files.append( |
| 66 | + File( |
| 67 | + buffer=buffer, |
| 68 | + future=self.transfer_manager.download( |
| 69 | + fileobj=buffer, |
| 70 | + bucket=self.bucket_name, |
| 71 | + key=str(path), |
| 72 | + ), |
| 73 | + path=path, |
| 74 | + ) |
| 75 | + ) |
| 76 | + self.current_path_index += 1 |
| 77 | + |
| 78 | + def close(self): |
| 79 | + self.transfer_manager.shutdown() |
0 commit comments