diff --git a/pridepy/commands/by_url.py b/pridepy/commands/by_url.py index 91d6fec..375f990 100644 --- a/pridepy/commands/by_url.py +++ b/pridepy/commands/by_url.py @@ -15,6 +15,9 @@ from tqdm import tqdm +from pridepy.providers import transport +from pridepy.providers import util as _provider_util +from pridepy.providers.pride import PrideProvider from pridepy.util.api_handling import Util @@ -38,8 +41,6 @@ def _validate_urls_checksums(urls: List[str], output_folder: str) -> None: :raises RuntimeError: if one or more files fail validation """ - from pridepy.files.files import Files - accession_urls: Dict[str, List[str]] = {} for url in urls: acc = _extract_pride_accession(url) @@ -52,8 +53,8 @@ def _validate_urls_checksums(urls: List[str], output_folder: str) -> None: validation_failures: List[str] = [] for acc, acc_urls in accession_urls.items(): - checksum_file_path = Files.save_checksum_file(acc, output_folder) - checksum_map = Files.read_checksum_file(checksum_file_path) + checksum_file_path = PrideProvider.save_checksum_file(acc, output_folder) + checksum_map = _provider_util.read_checksum_file(checksum_file_path) logging.info( "Loaded checksums for %d files (project %s)", len(checksum_map), acc, @@ -63,7 +64,7 @@ def _validate_urls_checksums(urls: List[str], output_folder: str) -> None: target = os.path.join(output_folder, file_name) expected = checksum_map.get(file_name) logging.info("Validating %s", file_name) - valid, reason = Files.validate_download(target, expected) + valid, reason = _provider_util.validate_download(target, expected) if not valid: logging.error("Validation failed for %s: %s", file_name, reason) validation_failures.append(f"{file_name} ({reason})") @@ -129,19 +130,17 @@ def _dispatch_url_scheme(parsed, target: str, protocol: str = "ftp", position: i """Route a parsed URL to its protocol-specific downloader. ``protocol='globus'`` swaps the http/https single-connection streamer - for :func:`pridepy.files.files.Files._parallel_download` (single-connection with progress bar). - ftp:// URLs are unaffected. + for :func:`pridepy.providers.transport._parallel_download` (single-connection + with progress bar). ftp:// URLs are unaffected. """ - from pridepy.files.files import Files - scheme = (parsed.scheme or "").lower() if scheme in ("http", "https"): if protocol == "globus": - Files._parallel_download(parsed.geturl(), target, position=position) + transport._parallel_download(parsed.geturl(), target, position=position) else: - Files._http_download_url(parsed.geturl(), target) + _http_download_url(parsed.geturl(), target) elif scheme == "ftp": - Files._ftp_download_url(parsed, target) + _ftp_download_url(parsed, target) else: raise ValueError(f"Unsupported URL scheme: {scheme}") @@ -154,8 +153,6 @@ def _download_single_url( position: int = 0, ) -> str: """Download one URL, dispatched by scheme; return the local file path.""" - from pridepy.files.files import Files - parsed = urlparse(url) if not (parsed.scheme or "").lower(): raise ValueError(f"URL missing scheme: {url}") @@ -169,11 +166,11 @@ def _download_single_url( logging.info("Skipping %s: already downloaded", file_name) return target - Files._dispatch_url_scheme(parsed, target, protocol, position=position) + _dispatch_url_scheme(parsed, target, protocol, position=position) - ok, reason = Files.validate_download(target) + ok, reason = _provider_util.validate_download(target) if not ok: - Files._remove_if_exists(target) + _provider_util._remove_if_exists(target) raise RuntimeError(f"Download invalid: {reason} ({target})") return target @@ -211,12 +208,11 @@ def download_files_by_url( parallel_files = min(parallel_files, 3, len(urls)) failures: List[Tuple[str, str]] = [] - from pridepy.files.files import Files if parallel_files < 2: for url in urls: try: - Files._download_single_url( + _download_single_url( url, output_folder, skip_if_downloaded_already, protocol, ) except Exception as exc: # pylint: disable=broad-except @@ -230,7 +226,7 @@ def download_files_by_url( with ThreadPoolExecutor(max_workers=parallel_files) as executor: futures = { executor.submit( - Files._download_single_url, + _download_single_url, url, output_folder, skip_if_downloaded_already, protocol, position=idx, ): url diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 3567e78..7764c81 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -1,21 +1,19 @@ #!/usr/bin/env python -import importlib.resources +"""Public Files facade — thin compatibility surface over the modular +provider architecture in :mod:`pridepy.providers`. + +The provider classes own all transport/listing logic; this module exposes +a small set of high-level operations (CLI entry points + a handful of +one-line shims for downstream Python users). +""" import logging import os -import subprocess -import urllib -import urllib.request -from ftplib import FTP from typing import Dict, List, Optional, Tuple import requests # noqa: F401 — kept as a patch target for tests from pridepy.util.api_handling import Util -# Module-level imports of the modular architecture. Providers and commands -# do not import Files at module level (only lazily inside method bodies), -# so hoisting these to the top is safe and avoids cluttering every shim -# method body with a local import. from pridepy.providers import registry, transport from pridepy.providers import util as _provider_util from pridepy.providers.iprox import IproxProvider @@ -31,10 +29,7 @@ class Files: - """ - This class handles PRIDE API files endpoint, and dispatches to the - per-repository provider classes in :mod:`pridepy.providers`. - """ + """High-level facade over the per-repository providers.""" # PRIDE class-attribute re-exports (kept here for back-compat). V3_API_BASE_URL = PrideProvider.V3_API_BASE_URL @@ -50,8 +45,6 @@ class Files: # MassIVE class-attribute re-exports. MASSIVE_ARCHIVE_FTP = MassiveProvider.ARCHIVE_FTP MASSIVE_ARCHIVE_FTP_URL_PREFIX = MassiveProvider.ARCHIVE_FTP_URL_PREFIX - # Note: MASSIVE_CATEGORY_MAP is the module-level constant in providers/massive.py, - # re-exported on Files as a class attribute via the module-level import above. # JPOST class-attribute re-exports. JPOST_ARCHIVE_FTP = JpostProvider.ARCHIVE_FTP @@ -64,8 +57,7 @@ class Files: IPROX_PX_XML_URL_TEMPLATE = IproxProvider.PX_XML_URL_TEMPLATE IPROX_PX_CATEGORY_MAP = IproxProvider.PX_CATEGORY_MAP - # MassIVE category map re-exported. Class attribute shadowing the module-level - # constant of the same name happens cleanly in class scope. + # MassIVE category map re-exported. MASSIVE_CATEGORY_MAP = MASSIVE_CATEGORY_MAP logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") @@ -73,20 +65,7 @@ class Files: def __init__(self): pass - @staticmethod - def _find_tsv_columns(header: str) -> Optional[Tuple[int, int]]: - """Shim — see :func:`pridepy.providers.util._find_tsv_columns`.""" - return _provider_util._find_tsv_columns(header) - - @staticmethod - def _is_md5_checksum(value: str) -> bool: - """Shim — see :func:`pridepy.providers.util._is_md5_checksum`.""" - return _provider_util._is_md5_checksum(value) - - @staticmethod - def read_checksum_file(checksum_file_path: str) -> Dict[str, str]: - """Shim — see :func:`pridepy.providers.util.read_checksum_file`.""" - return _provider_util.read_checksum_file(checksum_file_path) + # Pure delegating shims kept for backward compatibility. @staticmethod def compute_md5(file_path: str, chunk_size: int = 4 * 1024 * 1024) -> str: @@ -99,201 +78,116 @@ def validate_download(file_path: str, expected_checksum: Optional[str] = None) - return _provider_util.validate_download(file_path, expected_checksum) @staticmethod - def _remove_if_exists(file_path: str) -> None: - """Shim — see :func:`pridepy.providers.util._remove_if_exists`.""" - return _provider_util._remove_if_exists(file_path) + def read_checksum_file(checksum_file_path: str) -> Dict[str, str]: + """Shim — see :func:`pridepy.providers.util.read_checksum_file`.""" + return _provider_util.read_checksum_file(checksum_file_path) @staticmethod - def _get_download_url(file_record: Dict, protocol: str) -> str: - """Shim — see :func:`pridepy.providers.util._get_download_url`.""" - return _provider_util._get_download_url(file_record, protocol) + def download_ftp_urls( + ftp_urls: List[str], + output_folder: str, + skip_if_downloaded_already: bool, + max_connection_retries: int = 3, + max_download_retries: int = 3, + use_tls: bool = False, + parallel_files: int = 1, + ) -> None: + """Shim — see :func:`pridepy.providers.transport.download_ftp_urls`.""" + return transport.download_ftp_urls( + ftp_urls=ftp_urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + max_connection_retries=max_connection_retries, + max_download_retries=max_download_retries, + use_tls=use_tls, + parallel_files=parallel_files, + ) @staticmethod - def _resolve_local_path(file_record: Dict, output_folder: str) -> str: - """Shim — see :func:`pridepy.providers.util._resolve_local_path`.""" - return _provider_util._resolve_local_path(file_record, output_folder) + def download_http_urls( + http_urls: List[str], + output_folder: str, + skip_if_downloaded_already: bool, + parallel_files: int = 1, + max_retries: int = 3, + ) -> None: + """Shim — see :func:`pridepy.providers.transport.download_http_urls`.""" + return transport.download_http_urls( + http_urls=http_urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + parallel_files=parallel_files, + max_retries=max_retries, + ) - @staticmethod - def _protocol_sequence(protocol: str) -> List[str]: - """Shim — see :meth:`pridepy.providers.pride.PrideProvider._protocol_sequence`.""" - return PrideProvider._protocol_sequence(protocol) + # Accession-matcher convenience helpers (useful public API). @staticmethod def is_massive_accession(accession: str) -> bool: - """Shim — see :meth:`pridepy.providers.massive.MassiveProvider.matches`.""" return MassiveProvider.matches(accession) - @staticmethod - def _get_massive_public_root(accession: str) -> str: - return MassiveProvider._get_public_root(accession) - - @staticmethod - def _get_massive_public_ftp_url(accession: str, remote_path: str) -> str: - return MassiveProvider._get_public_ftp_url(accession, remote_path) - - @staticmethod - def _map_massive_collection_to_category(collection: str) -> str: - return MassiveProvider._map_collection_to_category(collection) - - @staticmethod - def _build_massive_file_record(accession: str, ftp_url: str) -> Dict: - return MassiveProvider._build_file_record(accession, ftp_url) - @staticmethod def is_jpost_accession(accession: str) -> bool: - """Shim — see :meth:`pridepy.providers.jpost.JpostProvider.matches`.""" return JpostProvider.matches(accession) @staticmethod - def _get_jpost_public_root(accession: str) -> str: - return JpostProvider._get_public_root(accession) - - @staticmethod - def _get_jpost_public_ftp_url(accession: str, remote_path: str) -> str: - return JpostProvider._get_public_ftp_url(accession, remote_path) - - @staticmethod - def _build_jpost_file_record(accession, ftp_url, category_from_proxi=None): - return JpostProvider._build_file_record(accession, ftp_url, category_from_proxi) - - @staticmethod - def _build_iprox_file_record(accession, https_url, category_from_px=None): - """Shim — see :meth:`pridepy.providers.iprox.IproxProvider._build_file_record`.""" - return IproxProvider._build_file_record(accession, https_url, category_from_px) - - @staticmethod - def _get_iprox_public_root(accession: str) -> str: - return IproxProvider._get_public_root(accession) - - @staticmethod - def _get_iprox_public_ftp_url(accession: str, remote_path: str) -> str: - return IproxProvider._get_public_ftp_url(accession, remote_path) + def is_iprox_accession(accession: str) -> bool: + return IproxProvider.matches(accession) @staticmethod def is_direct_download_accession(accession: str) -> bool: - """Shim — True for MassIVE/JPOST/iProX (explicitly excludes PRIDE). - - PRIDE is also a registered provider but PRIDE downloads go through - the multi-protocol orchestrator (FTP/Aspera/S3/Globus with checksum - validation and fallback), not the direct-download partitioned-by-URL- - scheme path. So we filter PRIDE out here. - """ + """True for MassIVE / JPOST / iProX (explicitly excludes PRIDE).""" try: provider = registry.resolve(accession) except ValueError: return False return provider.name != "pride" - @staticmethod - def is_iprox_accession(accession: str) -> bool: - """Shim — see :meth:`pridepy.providers.iprox.IproxProvider.matches`.""" - return IproxProvider.matches(accession) - @staticmethod def _repo_uses_tls(accession: str) -> bool: - """Shim — returns the resolved provider's use_tls flag (False if unknown).""" + """Return the resolved provider's ``use_tls`` flag (False if unknown).""" try: provider = registry.resolve(accession) except ValueError: return False return getattr(provider, "use_tls", False) - @staticmethod - def _walk_ftp_tree(ftp: FTP, remote_dir: str) -> List[str]: - """Shim — see :func:`pridepy.providers.transport._walk_ftp_tree`.""" - return transport._walk_ftp_tree(ftp=ftp, remote_dir=remote_dir) - - @staticmethod - def _open_ftp_connection(host: str, use_tls: bool, timeout: int = 30) -> FTP: - """Shim — see :func:`pridepy.providers.transport._open_ftp_connection`.""" - return transport._open_ftp_connection(host=host, use_tls=use_tls, timeout=timeout) - - @staticmethod - def _list_ftp_repo_files(host, remote_root, error_label, use_tls=False): - """Shim — see :func:`pridepy.providers.transport._list_ftp_repo_files`.""" - return transport._list_ftp_repo_files(host=host, remote_root=remote_root, error_label=error_label, use_tls=use_tls) - - def _list_massive_public_files(self, accession: str) -> List[Dict]: - """Shim — see :meth:`pridepy.providers.massive.MassiveProvider.list_files`.""" - return MassiveProvider().list_files(accession) - - def _download_massive_file_records( - self, - accession: str, - file_records: List[Dict], - output_folder: str, - skip_if_downloaded_already: bool, - protocol: str, - parallel_files: int = 1, - ) -> None: - """ - Download public MassIVE files via anonymous FTP (now FTPS). - Backward-compat shim — dispatches via the provider registry. - """ - registry.resolve(accession).download_files( - accession=accession, - records=file_records, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - protocol=protocol, - parallel_files=parallel_files, - ) - - def _list_jpost_public_files(self, accession: str) -> List[Dict]: - """ - Discover all public files for a JPOST dataset. - - Delegates to JpostProvider but routes via the shim methods so that - test patches on ``_list_jpost_public_files_via_proxi`` and - ``_list_ftp_repo_files`` continue to intercept. - """ - normalized_accession = accession.upper() - try: - return self._list_jpost_public_files_via_proxi(normalized_accession) - except Exception as proxi_error: - logging.warning( - f"JPOST PROXI listing failed for {normalized_accession} " - f"({proxi_error}); falling back to FTP tree walk." - ) - remote_root = JpostProvider._get_public_root(normalized_accession) - remote_files = self._list_ftp_repo_files( - host=JpostProvider.ARCHIVE_FTP, - remote_root=remote_root, - error_label=f"JPOST dataset {normalized_accession}", - ) - return [ - self._build_jpost_file_record( - normalized_accession, - JpostProvider._get_public_ftp_url(normalized_accession, remote_file), - ) - for remote_file in remote_files - ] - - def _list_jpost_public_files_via_proxi(self, accession: str) -> List[Dict]: - """Shim — see :meth:`pridepy.providers.jpost.JpostProvider._list_via_proxi`.""" - return JpostProvider()._list_via_proxi(accession) - - def _list_iprox_public_files(self, accession: str) -> List[Dict]: - """Shim — see :meth:`pridepy.providers.iprox.IproxProvider.list_files`.""" - return IproxProvider().list_files(accession) + # Listing / metadata. async def stream_all_files_metadata(self, output_file, accession=None): - """Shim — see :meth:`pridepy.providers.pride.PrideProvider.stream_all_files_metadata`.""" + """Shim — see :meth:`PrideProvider.stream_all_files_metadata`.""" return await PrideProvider().stream_all_files_metadata(output_file, accession) - def stream_all_files_by_project(self, accession) -> List[Dict]: - """Shim — see :meth:`pridepy.providers.pride.PrideProvider.stream_all_files_by_project`.""" - return PrideProvider().stream_all_files_by_project(accession) - def get_all_raw_file_list(self, project_accession): - """Get raw file list for any registered provider. - - Returns the dataset's file records filtered to fileCategory == "RAW". - """ + """Get raw file list for any registered provider (records with fileCategory == "RAW").""" provider = registry.resolve(project_accession) records = provider.list_files(project_accession) return [r for r in records if r["fileCategory"]["value"] == "RAW"] + def get_all_category_file_list( + self, accession: str, categories: "str | List[str]" + ) -> List[Dict]: + """Retrieve project files belonging to the given categories.""" + if isinstance(categories, str): + categories = [categories] + category_set = {c.upper() for c in categories} + records = registry.resolve(accession).list_files(accession) + return [r for r in records if r["fileCategory"]["value"] in category_set] + + def get_submitted_file_path_prefix(self, accession): + """Shim — see :meth:`PrideProvider.get_submitted_file_path_prefix`.""" + return PrideProvider().get_submitted_file_path_prefix(accession) + + def get_file_from_api(self, accession, file_name) -> List[Dict]: + """Return records matching ``file_name`` from the provider's listing.""" + try: + records = registry.resolve(accession).list_files(accession) + return [r for r in records if r["fileName"] == file_name] + except Exception as e: + raise Exception("File not found " + str(e)) + + # Download entry points. + def download_all_raw_files( self, accession, @@ -320,124 +214,34 @@ def download_all_raw_files( aspera_maximum_bandwidth=aspera_maximum_bandwidth, ) - @staticmethod - def download_files_from_ftp( - file_list_json, - output_folder, - skip_if_downloaded_already, - max_connection_retries=3, - max_download_retries=3, - ): - """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files_from_ftp`.""" - return PrideProvider.download_files_from_ftp( - file_list_json, - output_folder, - skip_if_downloaded_already, - max_connection_retries=max_connection_retries, - max_download_retries=max_download_retries, - ) - - @staticmethod - def get_output_file_name(download_url, file, output_folder): - public_filepath_part = download_url.rsplit("/", 1) - accession = file.get("accession", "unknown-accession") - logging.debug(accession + " -> " + public_filepath_part[1]) - new_file_path = os.path.join(output_folder, f"{public_filepath_part[1]}") - return new_file_path - - @staticmethod - def download_files_from_aspera( - file_list_json: List[Dict], + def download_all_category_files( + self, + accession: str, output_folder: str, - skip_if_downloaded_already, - maximum_bandwidth: str = "100M", - ): - """ - Download files using aspera transfer url - :param file_list_json: file list in json format - :param output_folder: folder to download the files - :param maximum_bandwidth: parameter in Aspera sets the maximum bandwidth for the transfer. - :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. - """ - ascp_path = Files.get_ascp_binary() - key_full_path = importlib.resources.files("pridepy").joinpath( - "aspera/key/asperaweb_id_dsa.openssh" - ) - key_path = os.path.abspath(key_full_path) - for file in file_list_json: - if file["publicFileLocations"][0]["name"] == "Aspera Protocol": - download_url = file["publicFileLocations"][0]["value"] - else: - download_url = file["publicFileLocations"][1]["value"] - - # Create a clean filename to save the downloaded file - logging.debug(f"Downloading via Aspera: {download_url}") - new_file_path = Files.get_output_file_name(download_url, file, output_folder) - - if skip_if_downloaded_already == True and os.path.exists(new_file_path): - logging.info("Skipping download as file already exists") - continue - - try: - # Execute the ascp command using subprocess - subprocess.run( - [ - ascp_path, - "-QT", - "-P", - "33001", - "-l", - maximum_bandwidth, # Options for Aspera: adjust as necessary - "-i", - key_path, - download_url, - new_file_path, # Source and destination - ], - check=True, - ) - logging.info(f"Successfully downloaded {new_file_path} via Aspera") - except subprocess.CalledProcessError as e: - logging.error(f"Aspera download failed for {new_file_path}: {str(e)}") - - @staticmethod - def _parallel_download(url, file_path, position=0): - """Shim — see :func:`pridepy.providers.transport._parallel_download`.""" - return transport._parallel_download(url=url, file_path=file_path, position=position) - - @staticmethod - def _globus_download_one(file, output_folder, skip_if_downloaded_already, max_retries=6, position=0): - """Shim — see :meth:`pridepy.providers.pride.PrideProvider._globus_download_one`.""" - return PrideProvider._globus_download_one( - file, output_folder, skip_if_downloaded_already, - max_retries=max_retries, position=position, - ) - - @staticmethod - def download_files_from_globus( - file_list_json: List[Dict], output_folder, skip_if_downloaded_already, + skip_if_downloaded_already: bool, + protocol: str, + aspera_maximum_bandwidth: str, + checksum_check: bool, + categories: List[str] = None, + category: str = None, parallel_files: int = 1, - checksum_map: Optional[Dict[str, str]] = None, ): - """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files_from_globus`.""" - return PrideProvider.download_files_from_globus( - file_list_json, output_folder, skip_if_downloaded_already, + """Download all files of the given categories from a project.""" + if categories is None: + categories = [category] if category else ["RAW"] + records = self.get_all_category_file_list(accession, categories) + provider = registry.resolve(accession) + provider.download_files( + accession=accession, + records=records, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, parallel_files=parallel_files, - checksum_map=checksum_map, - ) - - @staticmethod - def download_files_from_s3( - file_list_json: List[Dict], output_folder: str, skip_if_downloaded_already - ): - """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files_from_s3`.""" - return PrideProvider.download_files_from_s3( - file_list_json, output_folder, skip_if_downloaded_already, + checksum_check=checksum_check, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, ) - def get_submitted_file_path_prefix(self, accession): - """Shim — see :meth:`pridepy.providers.pride.PrideProvider.get_submitted_file_path_prefix`.""" - return PrideProvider().get_submitted_file_path_prefix(accession) - def download_file_by_name( self, accession, @@ -450,25 +254,17 @@ def download_file_by_name( aspera_maximum_bandwidth, checksum_check, ): - """ - Download files from url - :param accession: PRIDE accession - :param file_name: file name to download - :param output_folder: folder to download the files - :param protocol: ftp, aspera, globus - :param username: Username for private datasets - :param password: Password for private datasets - :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. - :param aspera_maximum_bandwidth: Aspera maximum bandwidth - :param checksum_check: Download checksum for a given project. - """ + """Download a single file by name. + PRIDE supports public / private modes via the V2 private API. Other + providers (MassIVE / JPOST / iProX) only support public downloads. + """ if not os.path.isdir(output_folder): os.mkdir(output_folder) provider = registry.resolve(accession) - ## Check type of project + # Direct-download providers always use the public path. if provider.name in ("massive", "jpost", "iprox"): logging.info( "Downloading file from public direct-download dataset {}".format(accession) @@ -487,6 +283,7 @@ def download_file_by_name( ) return + # PRIDE has a public/private split that needs status interrogation. public_project = False project_status = Util.get_api_call(self.API_BASE_URL + "/status/{}".format(accession)) @@ -501,18 +298,18 @@ def download_file_by_name( if public_project: logging.info("Downloading file from public dataset {}".format(accession)) response = self.get_file_from_api(accession, file_name) - self.download_files( - response, - accession, - output_folder, - skip_if_downloaded_already, - protocol, + PrideProvider._download_files_batch( + file_list_json=response, + accession=accession, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, aspera_maximum_bandwidth=aspera_maximum_bandwidth, checksum_check=checksum_check, ) elif not public_project and (username is not None and password is not None): logging.info("Downloading file from private dataset {}".format(accession)) - self.download_private_file_name( + PrideProvider().download_private_file_name( accession=accession, file_name=file_name, output_folder=output_folder, @@ -531,105 +328,6 @@ def download_file_by_name( ) ) - def get_file_from_api(self, accession, file_name) -> List[Dict]: - """ - Fetches file from API - :param accession: PRIDE accession - :param file_name: file name - :return: file in json format - """ - try: - records = registry.resolve(accession).list_files(accession) - return [r for r in records if r["fileName"] == file_name] - except Exception as e: - raise Exception("File not found " + str(e)) - - def download_private_file_name(self, accession, file_name, output_folder, username, password): - """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_private_file_name`.""" - return PrideProvider().download_private_file_name( - accession, file_name, output_folder, username, password, - ) - - @staticmethod - def get_ascp_binary(): - """Shim — see :meth:`pridepy.providers.pride.PrideProvider.get_ascp_binary`.""" - return PrideProvider.get_ascp_binary() - - @staticmethod - def save_checksum_file(accession, output_folder): - """Shim — see :meth:`pridepy.providers.pride.PrideProvider.save_checksum_file`.""" - return PrideProvider.save_checksum_file(accession, output_folder) - - @staticmethod - def _batch_download_by_protocol( - file_list: List[Dict], - output_folder: str, - protocol: str, - skip_if_downloaded_already: bool, - aspera_maximum_bandwidth: str, - parallel_files: int = 1, - checksum_map: Optional[Dict[str, str]] = None, - ) -> None: - """Shim — see :meth:`pridepy.providers.pride.PrideProvider._batch_download_by_protocol`. - - Tests patch this method via ``patch.object(Files, "_batch_download_by_protocol")``; - :class:`PrideProvider` calls back through ``Files.X`` so those patches - keep intercepting. - """ - return PrideProvider._batch_download_by_protocol( - file_list, - output_folder, - protocol, - skip_if_downloaded_already, - aspera_maximum_bandwidth, - parallel_files=parallel_files, - checksum_map=checksum_map, - ) - - @staticmethod - def _download_with_fallback( - file_record: Dict, - output_folder: str, - protocol_sequence: List[str], - expected_checksum: Optional[str], - aspera_maximum_bandwidth: str, - max_protocol_retries: int = 2, - parallel_files: int = 1, - ) -> bool: - """Shim — see :meth:`pridepy.providers.pride.PrideProvider._download_with_fallback`.""" - return PrideProvider._download_with_fallback( - file_record, - output_folder, - protocol_sequence, - expected_checksum, - aspera_maximum_bandwidth, - max_protocol_retries=max_protocol_retries, - parallel_files=parallel_files, - ) - - @staticmethod - def download_files( - file_list_json: List[Dict], - accession, - output_folder: str, - skip_if_downloaded_already, - protocol: str = "ftp", - aspera_maximum_bandwidth: str = "100M", # Aspera maximum bandwidth - checksum_check=False, - parallel_files: int = 1, - ): - """Shim — see :meth:`pridepy.providers.pride.PrideProvider._download_files_batch`.""" - return PrideProvider._download_files_batch( - file_list_json, - accession, - output_folder, - skip_if_downloaded_already, - protocol=protocol, - aspera_maximum_bandwidth=aspera_maximum_bandwidth, - checksum_check=checksum_check, - parallel_files=parallel_files, - ) - def download_files_by_list( self, accession: str, @@ -641,7 +339,7 @@ def download_files_by_list( checksum_check: bool = False, parallel_files: int = 1, ) -> None: - """Shim — see :func:`pridepy.commands.by_list.download_files_by_list`.""" + """Delegate to :func:`pridepy.commands.by_list.download_files_by_list`.""" return by_list.download_files_by_list( accession=accession, file_names=file_names, @@ -653,11 +351,6 @@ def download_files_by_list( parallel_files=parallel_files, ) - @staticmethod - def _extract_pride_accession(url: str) -> Optional[str]: - """Shim — see :func:`pridepy.commands.by_url._extract_pride_accession`.""" - return by_url._extract_pride_accession(url) - @staticmethod def download_files_by_url( urls: List[str], @@ -667,7 +360,7 @@ def download_files_by_url( parallel_files: int = 1, checksum_check: bool = False, ) -> None: - """Shim — see :func:`pridepy.commands.by_url.download_files_by_url`.""" + """Delegate to :func:`pridepy.commands.by_url.download_files_by_url`.""" return by_url.download_files_by_url( urls=urls, output_folder=output_folder, @@ -677,236 +370,13 @@ def download_files_by_url( checksum_check=checksum_check, ) - @staticmethod - def _validate_urls_checksums(urls: List[str], output_folder: str) -> None: - """Shim — see :func:`pridepy.commands.by_url._validate_urls_checksums`.""" - return by_url._validate_urls_checksums(urls, output_folder) - - @staticmethod - def _download_single_url( - url: str, - output_folder: str, - skip_if_exists: bool = False, - protocol: str = "ftp", - position: int = 0, - ) -> str: - """Shim — see :func:`pridepy.commands.by_url._download_single_url`.""" - return by_url._download_single_url(url, output_folder, skip_if_exists, protocol, position) - - @staticmethod - def _dispatch_url_scheme(parsed, target: str, protocol: str = "ftp", position: int = 0) -> None: - """Shim — see :func:`pridepy.commands.by_url._dispatch_url_scheme`.""" - return by_url._dispatch_url_scheme(parsed, target, protocol=protocol, position=position) - - @staticmethod - def _http_download_url(url: str, target: str) -> None: - """Shim — see :func:`pridepy.commands.by_url._http_download_url`.""" - return by_url._http_download_url(url, target) - - @staticmethod - def _ftp_download_url(parsed, target: str) -> None: - """Shim — see :func:`pridepy.commands.by_url._ftp_download_url`.""" - return by_url._ftp_download_url(parsed, target) - - def download_all_category_files( - self, - accession: str, - output_folder: str, - skip_if_downloaded_already: bool, - protocol: str, - aspera_maximum_bandwidth: str, - checksum_check: bool, - categories: List[str] = None, - category: str = None, - parallel_files: int = 1, - ): - """ - Download all files of specified categories from a PRIDE project. - - :param accession: The PRIDE project accession identifier. - :param output_folder: The directory where the files will be downloaded. - :param skip_if_downloaded_already: If True, skips downloading files that already exist. - :param protocol: The transfer protocol to use (e.g., ftp, aspera, globus, s3). - :param aspera_maximum_bandwidth: Maximum bandwidth for Aspera transfers. - :param checksum_check: If True, downloads the checksum file for the project. - :param categories: List of file categories to download. - :param category: Single file category (deprecated, use categories instead). - """ - if categories is None: - categories = [category] if category else ["RAW"] - records = self.get_all_category_file_list(accession, categories) - provider = registry.resolve(accession) - provider.download_files( - accession=accession, - records=records, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - protocol=protocol, - parallel_files=parallel_files, - checksum_check=checksum_check, - aspera_maximum_bandwidth=aspera_maximum_bandwidth, - ) - - def get_all_category_file_list( - self, accession: str, categories: "str | List[str]" - ) -> List[Dict]: - """ - Retrieve a list of files from a specific project that belong to given categories. - - :param accession: The PRIDE project accession identifier. - :param categories: A single category string or list of categories to filter by. - :return: A list of files matching the specified categories. - """ - if isinstance(categories, str): - categories = [categories] - category_set = {c.upper() for c in categories} - records = registry.resolve(accession).list_files(accession) - return [r for r in records if r["fileCategory"]["value"] in category_set] - - # ------------------------------- - # ProteomeXchange support - # ------------------------------- - - @staticmethod - def _normalize_px_xml_url(px_id_or_url: str) -> str: - """Shim — see :meth:`pridepy.providers.proteomexchange.ProteomeXchangeProvider._normalize_px_xml_url`.""" - return ProteomeXchangeProvider._normalize_px_xml_url(px_id_or_url) - - @staticmethod - def _parse_px_xml_for_raw_file_urls(px_xml_url: str): - """Shim — see :meth:`pridepy.providers.proteomexchange.ProteomeXchangeProvider._parse_px_xml_for_raw_file_urls`.""" - return ProteomeXchangeProvider._parse_px_xml_for_raw_file_urls(px_xml_url) - def download_px_raw_files( self, px_id_or_url: str, output_folder: str, skip_if_downloaded_already: bool = True, ) -> None: - """Shim — see :meth:`pridepy.providers.proteomexchange.ProteomeXchangeProvider.download_from_accession_or_url`.""" + """Delegate to :meth:`ProteomeXchangeProvider.download_from_accession_or_url`.""" return ProteomeXchangeProvider().download_from_accession_or_url( px_id_or_url, output_folder, skip_if_downloaded_already ) - - @staticmethod - def _local_path_for_url(download_url: str, output_folder: str) -> str: - """Shim — see :func:`pridepy.providers.transport._local_path_for_url`.""" - return transport._local_path_for_url(download_url=download_url, output_folder=output_folder) - - @staticmethod - def _download_one_ftp_path( - ftp: FTP, - ftp_path: str, - local_path: str, - skip_if_downloaded_already: bool, - max_download_retries: int, - position: int = 0, - ) -> None: - """Shim — see :func:`pridepy.providers.transport._download_one_ftp_path`.""" - return transport._download_one_ftp_path( - ftp=ftp, - ftp_path=ftp_path, - local_path=local_path, - skip_if_downloaded_already=skip_if_downloaded_already, - max_download_retries=max_download_retries, - position=position, - ) - - @staticmethod - def _download_ftp_paths_serial( - host: str, - paths: List[str], - output_folder: str, - skip_if_downloaded_already: bool, - use_tls: bool, - max_connection_retries: int, - max_download_retries: int, - ) -> None: - """Shim — see :func:`pridepy.providers.transport._download_ftp_paths_serial`.""" - return transport._download_ftp_paths_serial( - host=host, - paths=paths, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - use_tls=use_tls, - max_connection_retries=max_connection_retries, - max_download_retries=max_download_retries, - ) - - @staticmethod - def _download_ftp_paths_parallel( - host: str, - paths: List[str], - output_folder: str, - skip_if_downloaded_already: bool, - use_tls: bool, - max_connection_retries: int, - max_download_retries: int, - parallel_files: int, - ) -> None: - """Shim — see :func:`pridepy.providers.transport._download_ftp_paths_parallel`.""" - return transport._download_ftp_paths_parallel( - host=host, - paths=paths, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - use_tls=use_tls, - max_connection_retries=max_connection_retries, - max_download_retries=max_download_retries, - parallel_files=parallel_files, - ) - - @staticmethod - def download_ftp_urls( - ftp_urls: List[str], - output_folder: str, - skip_if_downloaded_already: bool, - max_connection_retries: int = 3, - max_download_retries: int = 3, - use_tls: bool = False, - parallel_files: int = 1, - ) -> None: - """Shim — see :func:`pridepy.providers.transport.download_ftp_urls`.""" - return transport.download_ftp_urls( - ftp_urls=ftp_urls, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - max_connection_retries=max_connection_retries, - max_download_retries=max_download_retries, - use_tls=use_tls, - parallel_files=parallel_files, - ) - - @staticmethod - def _http_download_one( - url: str, - output_folder: str, - skip_if_downloaded_already: bool, - max_retries: int = 3, - position: int = 0, - ) -> None: - """Shim — see :func:`pridepy.providers.transport._http_download_one`.""" - return transport._http_download_one( - url=url, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - max_retries=max_retries, - position=position, - ) - - @staticmethod - def download_http_urls( - http_urls: List[str], - output_folder: str, - skip_if_downloaded_already: bool, - parallel_files: int = 1, - max_retries: int = 3, - ) -> None: - """Shim — see :func:`pridepy.providers.transport.download_http_urls`.""" - return transport.download_http_urls( - http_urls=http_urls, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - parallel_files=parallel_files, - max_retries=max_retries, - ) diff --git a/pridepy/providers/base.py b/pridepy/providers/base.py index f9fa8bc..cbd1830 100644 --- a/pridepy/providers/base.py +++ b/pridepy/providers/base.py @@ -1,7 +1,11 @@ """Abstract base classes for pridepy providers.""" +import logging from abc import ABC, abstractmethod from typing import ClassVar, Dict, List, Optional +from pridepy.providers import transport +from pridepy.providers import util as _provider_util + class Provider(ABC): """Abstract base for every repository pridepy can list and download from.""" @@ -46,10 +50,8 @@ class BaseDirectDownloadProvider(Provider): Subclasses set the ``use_tls`` class var (True for MassIVE FTPS, False for JPOST plain FTP) and override :meth:`list_files`. The shared ``download_files`` implementation partitions record URLs by scheme: - ``ftp://`` URLs are handed to :meth:`Files.download_ftp_urls`; ``http(s)://`` - URLs go to :meth:`Files.download_http_urls`. It calls **back** into - ``Files`` so that test patches on ``Files.download_ftp_urls`` / - ``Files.download_http_urls`` continue to intercept the calls. + ``ftp://`` URLs are handed to :func:`transport.download_ftp_urls`; + ``http(s)://`` URLs go to :func:`transport.download_http_urls`. """ use_tls: ClassVar[bool] = False @@ -67,31 +69,25 @@ def download_files( username: Optional[str] = None, password: Optional[str] = None, ) -> None: - # Lazy import: providers know about Files (the facade) only via the - # public attributes that tests may patch; avoid module-load cycle. - from pridepy.files.files import Files - if protocol not in ("ftp", "https", "http"): - import logging logging.warning( "Direct downloads currently use ftp / https only. " f"Ignoring requested protocol '{protocol}' for {accession}." ) - all_urls = [Files._get_download_url(record, "ftp") for record in records] + all_urls = [_provider_util._get_download_url(record, "ftp") for record in records] ftp_urls = [u for u in all_urls if u.lower().startswith("ftp://")] http_urls = [ u for u in all_urls if u.lower().startswith(("http://", "https://")) ] if not ftp_urls and not http_urls: - import logging logging.info( f"No files matched for direct-download dataset {accession}" ) return if ftp_urls: - Files.download_ftp_urls( + transport.download_ftp_urls( ftp_urls=ftp_urls, output_folder=output_folder, skip_if_downloaded_already=skip_if_downloaded_already, @@ -99,7 +95,7 @@ def download_files( parallel_files=parallel_files, ) if http_urls: - Files.download_http_urls( + transport.download_http_urls( http_urls=http_urls, output_folder=output_folder, skip_if_downloaded_already=skip_if_downloaded_already, diff --git a/pridepy/providers/pride.py b/pridepy/providers/pride.py index c8c40ca..5456544 100644 --- a/pridepy/providers/pride.py +++ b/pridepy/providers/pride.py @@ -3,14 +3,14 @@ PRIDE has the richest behaviour of all providers: multi-protocol batch download with aspera/s3/ftp/globus fallback, private-dataset path with username/password auth, checksum TSV validation, and submitter-path -helpers. This module hosts all of those; the :class:`Files` facade -delegates via lightweight shim methods. - -Implementation note: PRIDE-specific helpers that the existing test suite -patches via ``patch.object(Files, "X")`` are called from inside this -provider via ``Files.X(...)`` (lazy import) — never ``self.X`` — so the -patches keep intercepting. This is a deliberate backward-compat choice -documented in the refactor plan (Task 8). +helpers. This module owns all of that logic; the :class:`Files` facade +exposes a thin public surface for downstream callers. + +Implementation note: PRIDE provider methods route through other +PrideProvider methods (``PrideProvider.X(...)``) or directly through the +shared ``transport`` / ``util`` helpers — they do NOT call back into the +``Files`` facade. Tests patch the canonical locations +(``PrideProvider.X``, ``transport.X``, ``util.X``) directly. """ import ftplib import importlib.resources @@ -35,7 +35,8 @@ from tqdm import tqdm from pridepy.authentication.authentication import Authentication -from pridepy.providers import registry +from pridepy.providers import registry, transport +from pridepy.providers import util as _provider_util from pridepy.providers.base import Provider from pridepy.providers.util import Progress from pridepy.util.api_handling import Util @@ -110,10 +111,9 @@ def get_submitted_file_path_prefix(self, accession): :param accession: PRIDE accession :return: path fragment (eg: 2018/10/PXD008644) """ - # Use Files facade so test patches on get_all_raw_file_list keep working. - from pridepy.files.files import Files - results = Files().get_all_raw_file_list(accession) - first_file = results[0]["publicFileLocations"][0]["value"] + records = self.list_files(accession) + raw_files = [r for r in records if r["fileCategory"]["value"] == "RAW"] + first_file = raw_files[0]["publicFileLocations"][0]["value"] path_fragment = re.search(r"\d{4}/\d{2}/PXD\d*", first_file).group() return path_fragment @@ -157,6 +157,15 @@ def get_ascp_binary(): else: raise OSError(f"Unsupported OS or architecture: {os_type}, {arch}") + @staticmethod + def get_output_file_name(download_url, file, output_folder): + """Build the local output path for ``download_url`` inside ``output_folder``.""" + public_filepath_part = download_url.rsplit("/", 1) + accession = file.get("accession", "unknown-accession") + logging.debug(accession + " -> " + public_filepath_part[1]) + new_file_path = os.path.join(output_folder, f"{public_filepath_part[1]}") + return new_file_path + @staticmethod def save_checksum_file(accession, output_folder): """ @@ -182,11 +191,8 @@ def save_checksum_file(accession, output_folder): @staticmethod def _globus_download_one(file, output_folder, skip_if_downloaded_already, max_retries=6, position=0): """Download a single file via globus; used as a worker target.""" - # Use Files facade so test patches on Files helpers keep working. - from pridepy.files.files import Files - - download_url = Files._get_download_url(file, "globus") - new_file_path = Files.get_output_file_name(download_url, file, output_folder) + download_url = _provider_util._get_download_url(file, "globus") + new_file_path = PrideProvider.get_output_file_name(download_url, file, output_folder) if skip_if_downloaded_already and os.path.exists(new_file_path): logging.info(f"Skipping download as file already exists: {new_file_path}") @@ -194,7 +200,7 @@ def _globus_download_one(file, output_folder, skip_if_downloaded_already, max_re for attempt in range(1, max_retries + 1): try: - Files._parallel_download(download_url, new_file_path, position=position) + transport._parallel_download(download_url, new_file_path, position=position) return except Exception as e: logging.warning(f"Attempt {attempt}/{max_retries} failed for {file.get('fileName', '?')}: {e}") @@ -221,8 +227,6 @@ def download_files_from_ftp( :param max_connection_retries: Number of attempts to reconnect to the FTP server if the connection is lost. :param max_download_retries: Number of attempts to retry the download of a file in case of failure. """ - from pridepy.files.files import Files - if not os.path.isdir(output_folder): os.makedirs(output_folder) @@ -249,7 +253,7 @@ def connect_ftp(): logging.debug("ftp_filepath:" + download_url) # Get output file path - new_file_path = Files.get_output_file_name( + new_file_path = PrideProvider.get_output_file_name( download_url, file, output_folder ) @@ -327,6 +331,60 @@ def callback(data): ) break + @staticmethod + def download_files_from_aspera( + file_list_json: List[Dict], + output_folder: str, + skip_if_downloaded_already, + maximum_bandwidth: str = "100M", + ): + """ + Download files using aspera transfer url + :param file_list_json: file list in json format + :param output_folder: folder to download the files + :param maximum_bandwidth: parameter in Aspera sets the maximum bandwidth for the transfer. + :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. + """ + ascp_path = PrideProvider.get_ascp_binary() + key_full_path = importlib.resources.files("pridepy").joinpath( + "aspera/key/asperaweb_id_dsa.openssh" + ) + key_path = os.path.abspath(key_full_path) + for file in file_list_json: + if file["publicFileLocations"][0]["name"] == "Aspera Protocol": + download_url = file["publicFileLocations"][0]["value"] + else: + download_url = file["publicFileLocations"][1]["value"] + + # Create a clean filename to save the downloaded file + logging.debug(f"Downloading via Aspera: {download_url}") + new_file_path = PrideProvider.get_output_file_name(download_url, file, output_folder) + + if skip_if_downloaded_already and os.path.exists(new_file_path): + logging.info("Skipping download as file already exists") + continue + + try: + # Execute the ascp command using subprocess + subprocess.run( + [ + ascp_path, + "-QT", + "-P", + "33001", + "-l", + maximum_bandwidth, # Options for Aspera: adjust as necessary + "-i", + key_path, + download_url, + new_file_path, # Source and destination + ], + check=True, + ) + logging.info(f"Successfully downloaded {new_file_path} via Aspera") + except subprocess.CalledProcessError as e: + logging.error(f"Aspera download failed for {new_file_path}: {str(e)}") + @staticmethod def download_files_from_globus( file_list_json: List[Dict], output_folder, skip_if_downloaded_already, @@ -346,9 +404,6 @@ def download_files_from_globus( :param parallel_files: number of files to download simultaneously :param checksum_map: mapping of file name to expected MD5 checksum """ - # Use Files facade so test patches on Files._globus_download_one etc. keep working. - from pridepy.files.files import Files - if checksum_map is None: checksum_map = {} @@ -358,12 +413,12 @@ def download_files_from_globus( # --- Phase 0: pre-filter files that need downloading ----------------- files_to_download: List[Dict] = [] for file in file_list_json: - download_url = Files._get_download_url(file, "globus") - new_file_path = Files.get_output_file_name(download_url, file, output_folder) + download_url = _provider_util._get_download_url(file, "globus") + new_file_path = PrideProvider.get_output_file_name(download_url, file, output_folder) if skip_if_downloaded_already and os.path.exists(new_file_path): expected_cs = checksum_map.get(file.get("fileName", "")) if expected_cs: - valid, reason = Files.validate_download(new_file_path, expected_cs) + valid, reason = _provider_util.validate_download(new_file_path, expected_cs) if not valid: logging.warning(f"Corrupted file detected ({reason}), will re-download: {new_file_path}") files_to_download.append(file) @@ -386,11 +441,11 @@ def download_files_from_globus( if parallel_files < 2: for file in files_to_download: try: - Files._globus_download_one( + PrideProvider._globus_download_one( file, output_folder, False ) - new_file_path = Files.get_output_file_name( - Files._get_download_url(file, "globus"), file, output_folder + new_file_path = PrideProvider.get_output_file_name( + _provider_util._get_download_url(file, "globus"), file, output_folder ) logging.info(f"Successfully downloaded {new_file_path}") except Exception as e: @@ -400,7 +455,7 @@ def download_files_from_globus( with ThreadPoolExecutor(max_workers=parallel_files) as executor: futures = { executor.submit( - Files._globus_download_one, + PrideProvider._globus_download_one, file, output_folder, False, position=idx, ): file @@ -422,8 +477,6 @@ def download_files_from_s3( :param output_folder: folder to download the files :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. """ - from pridepy.files.files import Files - if not os.path.isdir(output_folder): os.makedirs(output_folder, exist_ok=True) @@ -453,7 +506,7 @@ def download_files_from_s3( ftp_base_url = "ftp://ftp.pride.ebi.ac.uk/pride/data/archive/" s3_path = download_url.replace(ftp_base_url, "") - new_file_path = Files.get_output_file_name(download_url, file, output_folder) + new_file_path = PrideProvider.get_output_file_name(download_url, file, output_folder) if skip_if_downloaded_already == True and os.path.exists(new_file_path): logging.info("Skipping download as file already exists") @@ -587,20 +640,17 @@ def _batch_download_by_protocol( Transfer a batch of files with one protocol, reusing a single connection where the underlying helper supports it (FTP, S3). """ - # Use Files facade so test patches on each per-protocol helper keep working. - from pridepy.files.files import Files - if not file_list: return if protocol == "ftp": - Files.download_files_from_ftp( + PrideProvider.download_files_from_ftp( file_list, output_folder, skip_if_downloaded_already=skip_if_downloaded_already, ) return if protocol == "aspera": - Files.download_files_from_aspera( + PrideProvider.download_files_from_aspera( file_list, output_folder, skip_if_downloaded_already=skip_if_downloaded_already, @@ -608,7 +658,7 @@ def _batch_download_by_protocol( ) return if protocol == "globus": - Files.download_files_from_globus( + PrideProvider.download_files_from_globus( file_list, output_folder, skip_if_downloaded_already=skip_if_downloaded_already, @@ -617,7 +667,7 @@ def _batch_download_by_protocol( ) return if protocol == "s3": - Files.download_files_from_s3( + PrideProvider.download_files_from_s3( file_list, output_folder, skip_if_downloaded_already=skip_if_downloaded_already, @@ -640,10 +690,7 @@ def _download_with_fallback( after every attempt. Intended as the per-file fallback path; batch download of the primary protocol is handled separately. """ - # Patch-sensitive: call through Files so test patches intercept. - from pridepy.files.files import Files - - local_path = Files._resolve_local_path(file_record, output_folder) + local_path = _provider_util._resolve_local_path(file_record, output_folder) for protocol in protocol_sequence: for attempt in range(1, max_protocol_retries + 1): @@ -652,8 +699,8 @@ def _download_with_fallback( f"(attempt {attempt}/{max_protocol_retries})" ) try: - Files._remove_if_exists(local_path) - Files._batch_download_by_protocol( + _provider_util._remove_if_exists(local_path) + PrideProvider._batch_download_by_protocol( [file_record], output_folder, protocol, @@ -666,7 +713,7 @@ def _download_with_fallback( f"Protocol {protocol} failed for {file_record['fileName']}: {error}" ) - valid, reason = Files.validate_download(local_path, expected_checksum) + valid, reason = _provider_util.validate_download(local_path, expected_checksum) if valid: logging.info( f"File {file_record['fileName']} downloaded successfully via {protocol}" @@ -676,7 +723,7 @@ def _download_with_fallback( logging.warning( f"Validation failed for {file_record['fileName']} via {protocol}: {reason}" ) - Files._remove_if_exists(local_path) + _provider_util._remove_if_exists(local_path) logging.warning( f"Protocol {protocol} exhausted for {file_record['fileName']}, switching protocol." @@ -730,10 +777,6 @@ def _download_files_batch( :param aspera_maximum_bandwidth: parameter in Aspera sets the maximum bandwidth for the transfer. :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. """ - # Patch-sensitive: call _batch_download_by_protocol and - # _download_with_fallback through Files so test patches intercept. - from pridepy.files.files import Files - protocols_supported = ["ftp", "aspera", "globus", "s3"] if protocol not in protocols_supported: logging.error("Protocol should be one of ftp, aspera, globus, s3") @@ -743,14 +786,14 @@ def _download_files_batch( checksum_map: Dict[str, str] = {} if checksum_check: - checksum_file_path = Files.save_checksum_file(accession, output_folder) - checksum_map = Files.read_checksum_file(checksum_file_path) + checksum_file_path = PrideProvider.save_checksum_file(accession, output_folder) + checksum_map = _provider_util.read_checksum_file(checksum_file_path) logging.info(f"Loaded checksums for {len(checksum_map)} files") if not file_list_json: return - protocol_sequence = Files._protocol_sequence(protocol) + protocol_sequence = PrideProvider._protocol_sequence(protocol) primary_protocol = protocol_sequence[0] # Retry with the primary protocol first, then fall back to others fallback_sequence = protocol_sequence @@ -762,7 +805,7 @@ def _download_files_batch( f"Downloading {len(file_list_json)} file(s) via {primary_protocol} (batch)" ) try: - Files._batch_download_by_protocol( + PrideProvider._batch_download_by_protocol( file_list_json, output_folder, primary_protocol, @@ -782,9 +825,9 @@ def _download_files_batch( failed_files: List[str] = [] for i, file_record in enumerate(file_list_json, 1): expected_checksum = checksum_map.get(file_record["fileName"]) - local_path = Files._resolve_local_path(file_record, output_folder) + local_path = _provider_util._resolve_local_path(file_record, output_folder) logging.info("Validating [%d/%d] %s", i, len(file_list_json), file_record["fileName"]) - valid, reason = Files.validate_download(local_path, expected_checksum) + valid, reason = _provider_util.validate_download(local_path, expected_checksum) if valid: continue @@ -792,13 +835,13 @@ def _download_files_batch( f"{file_record['fileName']} invalid after {primary_protocol} ({reason})" ) if "checksum mismatch" in reason: - Files._remove_if_exists(local_path) + _provider_util._remove_if_exists(local_path) if not fallback_sequence: failed_files.append(file_record.get("fileName", "")) continue - success = Files._download_with_fallback( + success = PrideProvider._download_with_fallback( file_record=file_record, output_folder=output_folder, protocol_sequence=fallback_sequence, diff --git a/pridepy/providers/proteomexchange.py b/pridepy/providers/proteomexchange.py index cef0524..f42419b 100644 --- a/pridepy/providers/proteomexchange.py +++ b/pridepy/providers/proteomexchange.py @@ -28,6 +28,7 @@ from typing import ClassVar, Dict, List, Optional from urllib.parse import urlparse +from pridepy.providers import transport from pridepy.providers.base import Provider from pridepy.util.api_handling import Util @@ -139,13 +140,9 @@ def download_files( ) -> None: """Partition record URLs by scheme and route to the matching transport. - Routes ftp:// records to :meth:`Files.download_ftp_urls` and - http(s):// records to :meth:`Files.download_http_urls`, going - through the Files facade so test patches like - ``patch.object(Files, "download_ftp_urls")`` continue to intercept. + Routes ftp:// records to :func:`transport.download_ftp_urls` and + http(s):// records to :func:`transport.download_http_urls`. """ - from pridepy.files.files import Files # lazy: avoid module-load cycle - if not os.path.isdir(output_folder): os.makedirs(output_folder, exist_ok=True) @@ -158,11 +155,11 @@ def download_files( http_urls = [u for u in urls if u.lower().startswith(("http://", "https://"))] if ftp_urls: - Files.download_ftp_urls( + transport.download_ftp_urls( ftp_urls, output_folder, skip_if_downloaded_already ) if http_urls: - Files.download_http_urls( + transport.download_http_urls( http_urls, output_folder, skip_if_downloaded_already ) diff --git a/pridepy/providers/util.py b/pridepy/providers/util.py index 0fc5791..0a4f354 100644 --- a/pridepy/providers/util.py +++ b/pridepy/providers/util.py @@ -133,7 +133,9 @@ def _get_download_url(file_record: Dict, protocol: str) -> str: arbitrary non-Aspera location would produce a URL the caller cannot actually transfer with). """ - from pridepy.files.files import Files + # Lazy import to avoid module-load cycle with PrideProvider (which lives + # in the providers package and imports back into util via _resolve_local_path). + from pridepy.providers.pride import PrideProvider locations = file_record.get("publicFileLocations", []) if not locations: @@ -159,8 +161,8 @@ def _get_download_url(file_record: Dict, protocol: str) -> str: return ftp_url if protocol == "globus": return ftp_url.replace( - Files.PRIDE_ARCHIVE_FTP_URL_PREFIX, - Files.PRIDE_ARCHIVE_HTTPS_URL_PREFIX, + PrideProvider.ARCHIVE_FTP_URL_PREFIX, + PrideProvider.ARCHIVE_HTTPS_URL_PREFIX, 1, ) if protocol == "s3": @@ -172,12 +174,13 @@ def _resolve_local_path(file_record: Dict, output_folder: str) -> str: """ Compute the canonical local path for a file regardless of transfer protocol. """ - from pridepy.files.files import Files + # Lazy import to avoid module-load cycle with PrideProvider. + from pridepy.providers.pride import PrideProvider try: canonical_url = _get_download_url(file_record, "ftp") except ValueError: canonical_url = "" if canonical_url: - return Files.get_output_file_name(canonical_url, file_record, output_folder) + return PrideProvider.get_output_file_name(canonical_url, file_record, output_folder) return os.path.join(output_folder, file_record["fileName"]) diff --git a/pridepy/tests/test_download_by_url.py b/pridepy/tests/test_download_by_url.py index fb34491..aa195dc 100644 --- a/pridepy/tests/test_download_by_url.py +++ b/pridepy/tests/test_download_by_url.py @@ -12,6 +12,7 @@ import click import pytest +from pridepy.commands import by_url from pridepy.files.files import Files from pridepy.pridepy import _read_url_arguments @@ -38,7 +39,7 @@ def fake_http(_url, target_path): _touch_valid(target_path) with patch.object( - Files, "_http_download_url", side_effect=fake_http + by_url, "_http_download_url", side_effect=fake_http ) as mock_http: Files.download_files_by_url( urls=["https://example.org/sample.raw"], @@ -56,7 +57,7 @@ def fake_ftp(_parsed, target_path): _touch_valid(target_path) with patch.object( - Files, "_ftp_download_url", side_effect=fake_ftp + by_url, "_ftp_download_url", side_effect=fake_ftp ) as mock_ftp: Files.download_files_by_url( urls=["ftp://ftp.pride.ebi.ac.uk/path/sample.raw"], @@ -86,7 +87,7 @@ def test_skip_if_exists_short_circuits(self): with tempfile.TemporaryDirectory() as tmp_dir: target = os.path.join(tmp_dir, "existing.raw") _touch_valid(target) - with patch.object(Files, "_http_download_url") as mock_http: + with patch.object(by_url, "_http_download_url") as mock_http: Files.download_files_by_url( urls=["https://example.org/existing.raw"], output_folder=tmp_dir, diff --git a/pridepy/tests/test_download_resilience.py b/pridepy/tests/test_download_resilience.py index 0f86013..29b115f 100644 --- a/pridepy/tests/test_download_resilience.py +++ b/pridepy/tests/test_download_resilience.py @@ -4,7 +4,13 @@ from unittest import TestCase from unittest.mock import Mock, patch +from pridepy.commands import by_url from pridepy.files.files import Files +from pridepy.providers import transport +from pridepy.providers import util as provider_util +from pridepy.providers.massive import MassiveProvider +from pridepy.providers.pride import PrideProvider +from pridepy.providers import registry class TestDownloadResilience(TestCase): @@ -40,7 +46,7 @@ def test_get_download_url_maps_globus_to_pride_archive_https(self): ] } - download_url = Files._get_download_url(file_record, "globus") + download_url = provider_util._get_download_url(file_record, "globus") assert download_url == "https://ftp.pride.ebi.ac.uk/path/file.raw" @@ -61,10 +67,10 @@ def test_parallel_download_streams_full_file(self): session.get.return_value = stream_response with patch( - "pridepy.files.files.Util.create_session_with_retries", + "pridepy.providers.transport.Util.create_session_with_retries", return_value=session, ): - Files._parallel_download( + transport._parallel_download( "https://example.org/file.raw", output_file, ) @@ -86,10 +92,10 @@ def test_parallel_download_falls_back_when_head_fails(self): session.get.return_value = fallback_response with patch( - "pridepy.files.files.Util.create_session_with_retries", + "pridepy.providers.transport.Util.create_session_with_retries", return_value=session, ): - Files._parallel_download( + transport._parallel_download( "https://example.org/file.raw", output_file, ) @@ -114,10 +120,10 @@ def test_parallel_download_falls_back_without_accept_ranges(self): session.get.return_value = fallback_response with patch( - "pridepy.files.files.Util.create_session_with_retries", + "pridepy.providers.transport.Util.create_session_with_retries", return_value=session, ): - Files._parallel_download( + transport._parallel_download( "https://example.org/file.raw", output_file, ) @@ -142,8 +148,8 @@ def test_validate_download_rejects_empty_and_bad_checksum(self): assert "checksum mismatch" in reason def test_protocol_sequence_prefers_requested_then_fallback(self): - assert Files._protocol_sequence("ftp") == ["ftp", "aspera", "s3", "globus"] - assert Files._protocol_sequence("aspera") == ["aspera", "s3", "ftp", "globus"] + assert PrideProvider._protocol_sequence("ftp") == ["ftp", "aspera", "s3", "globus"] + assert PrideProvider._protocol_sequence("aspera") == ["aspera", "s3", "ftp", "globus"] def test_download_with_fallback_switches_protocol_after_invalid_file(self): file_record = { @@ -168,8 +174,8 @@ def fake_batch(file_list, output_folder, protocol, skip_if_downloaded_already, with open(local_path, "wb") as handle: handle.write(b"abc") - with patch.object(Files, "_batch_download_by_protocol", side_effect=fake_batch): - success = Files._download_with_fallback( + with patch.object(PrideProvider, "_batch_download_by_protocol", side_effect=fake_batch): + success = PrideProvider._download_with_fallback( file_record=file_record, output_folder=tmp_dir, protocol_sequence=["aspera", "s3"], @@ -201,9 +207,9 @@ def fake_batch(file_list, output_folder, protocol, skip_if_downloaded_already, with open(local_path, "wb") as handle: handle.write(b"data") - with patch.object(Files, "_batch_download_by_protocol", side_effect=fake_batch) as batch_mock, \ - patch.object(Files, "_download_with_fallback") as fallback_mock: - Files.download_files( + with patch.object(PrideProvider, "_batch_download_by_protocol", side_effect=fake_batch) as batch_mock, \ + patch.object(PrideProvider, "_download_with_fallback") as fallback_mock: + PrideProvider._download_files_batch( file_list_json=[file_record], accession="PXD000000", output_folder=tmp_dir, @@ -229,8 +235,8 @@ def test_globus_parallel_workers_capped_to_file_count(self): ] with tempfile.TemporaryDirectory() as tmp_dir: - with patch.object(Files, "_globus_download_one") as mock_one: - Files.download_files_from_globus( + with patch.object(PrideProvider, "_globus_download_one") as mock_one: + PrideProvider.download_files_from_globus( file_list_json=file_records, output_folder=tmp_dir, skip_if_downloaded_already=False, @@ -243,7 +249,7 @@ def test_globus_parallel_workers_capped_to_file_count(self): def test_url_parallel_workers_capped_to_url_count(self): """download_files_by_url must cap workers to len(urls).""" with tempfile.TemporaryDirectory() as tmp_dir: - with patch.object(Files, "_download_single_url") as mock_single: + with patch.object(by_url, "_download_single_url") as mock_single: Files.download_files_by_url( urls=["https://example.org/a.raw"], output_folder=tmp_dir, @@ -258,10 +264,10 @@ def test_download_files_raises_when_any_file_fails(self): with tempfile.TemporaryDirectory() as tmp_dir: file_list = [{"fileName": "missing.raw"}] - with patch.object(Files, "_batch_download_by_protocol"), \ - patch.object(Files, "_download_with_fallback", return_value=False): + with patch.object(PrideProvider, "_batch_download_by_protocol"), \ + patch.object(PrideProvider, "_download_with_fallback", return_value=False): with self.assertRaisesRegex(RuntimeError, "missing.raw"): - Files.download_files( + PrideProvider._download_files_batch( file_list_json=file_list, accession="PXD000000", output_folder=tmp_dir, @@ -274,12 +280,10 @@ def test_facade_dispatches_pride_through_registry_to_fallback(self): Files facade -> Registry.resolve -> PrideProvider.download_files -> _batch_download_by_protocol (mocked). - Patching Files._batch_download_by_protocol proves the patch intercepts - (i.e. PrideProvider calls *back* through Files, preserving the test - contract for the multi-protocol orchestrator). + Patching PrideProvider._batch_download_by_protocol proves the patch + intercepts (i.e. PrideProvider owns the multi-protocol orchestrator + and no longer routes through Files). """ - from pridepy.providers.pride import PrideProvider - fake_records = [ { "accession": "PXD000001", @@ -293,9 +297,9 @@ def test_facade_dispatches_pride_through_registry_to_fallback(self): with tempfile.TemporaryDirectory() as tmp: with patch.object(PrideProvider, "list_files", return_value=fake_records), \ - patch.object(Files, "_batch_download_by_protocol", return_value=[]) as batch_mock, \ - patch.object(Files, "validate_download", return_value=(True, "ok")), \ - patch.object(Files, "_download_with_fallback") as fallback_mock: + patch.object(PrideProvider, "_batch_download_by_protocol", return_value=[]) as batch_mock, \ + patch.object(provider_util, "validate_download", return_value=(True, "ok")), \ + patch.object(PrideProvider, "_download_with_fallback") as fallback_mock: Files().download_all_raw_files( accession="PXD000001", output_folder=tmp, diff --git a/pridepy/tests/test_ftp_download_validation.py b/pridepy/tests/test_ftp_download_validation.py index 10bbfb5..ae80ad6 100644 --- a/pridepy/tests/test_ftp_download_validation.py +++ b/pridepy/tests/test_ftp_download_validation.py @@ -12,7 +12,7 @@ import pytest -from pridepy.files.files import Files +from pridepy.providers import transport def _make_fake_ftp(expected_size, write_bytes_per_call): @@ -43,7 +43,7 @@ def test_size_mismatch_is_retried_then_succeeds(self): local_path = os.path.join(tmp, "f.bin") ftp = _make_fake_ftp(expected_size=100, write_bytes_per_call=[50, 50]) - Files._download_one_ftp_path( + transport._download_one_ftp_path( ftp=ftp, ftp_path="/JPST000001/f.bin", local_path=local_path, @@ -64,7 +64,7 @@ def test_size_mismatch_after_retries_raises(self): ftp = _make_fake_ftp(expected_size=100, write_bytes_per_call=[10, 10, 10]) with pytest.raises(RuntimeError, match="Giving up"): - Files._download_one_ftp_path( + transport._download_one_ftp_path( ftp=ftp, ftp_path="/JPST000001/f.bin", local_path=local_path, @@ -79,7 +79,7 @@ def test_correct_size_returns_without_retry(self): local_path = os.path.join(tmp, "f.bin") ftp = _make_fake_ftp(expected_size=50, write_bytes_per_call=[50]) - Files._download_one_ftp_path( + transport._download_one_ftp_path( ftp=ftp, ftp_path="/JPST000001/f.bin", local_path=local_path, diff --git a/pridepy/tests/test_iprox_files.py b/pridepy/tests/test_iprox_files.py index dfdcd21..83c6d2d 100644 --- a/pridepy/tests/test_iprox_files.py +++ b/pridepy/tests/test_iprox_files.py @@ -14,6 +14,9 @@ from unittest.mock import MagicMock, patch from pridepy.files.files import Files +from pridepy.providers import transport +from pridepy.providers.iprox import IproxProvider +from pridepy.providers.pride import PrideProvider IPROX_XML_FIXTURE = """ @@ -61,7 +64,7 @@ def test_iprox_is_a_direct_download_accession(self): assert Files.is_direct_download_accession("IPX0017413000") def test_build_iprox_file_record_maps_px_cv_to_category(self): - record = Files._build_iprox_file_record( + record = IproxProvider._build_file_record( "IPX0017413000", "http://download.iprox.org/IPX0017413000/IPX0017413001/sample.raw", category_from_px="Associated raw file URI", @@ -74,14 +77,13 @@ def test_build_iprox_file_record_maps_px_cv_to_category(self): assert record["publicFileLocations"][0]["value"].startswith("http://") def test_list_iprox_public_files_parses_px_xml(self): - files = Files() fake_response = MagicMock() fake_response.content = IPROX_XML_FIXTURE fake_response.raise_for_status = MagicMock() with patch( - "pridepy.files.files.requests.get", return_value=fake_response + "pridepy.providers.iprox.requests.get", return_value=fake_response ) as req_mock: - records = files._list_iprox_public_files("IPX0017413000") + records = IproxProvider().list_files("IPX0017413000") # The fetch hits the deterministic PX XML URL. req_mock.assert_called_once() @@ -108,8 +110,8 @@ def test_get_all_raw_file_list_filters_iprox_records(self): fake_response.content = IPROX_XML_FIXTURE fake_response.raise_for_status = MagicMock() with patch( - "pridepy.files.files.requests.get", return_value=fake_response - ), patch.object(Files, "stream_all_files_by_project") as pride_mock: + "pridepy.providers.iprox.requests.get", return_value=fake_response + ), patch.object(PrideProvider, "stream_all_files_by_project") as pride_mock: raw_files = files.get_all_raw_file_list("IPX0017413000") pride_mock.assert_not_called() @@ -121,9 +123,9 @@ def test_download_file_by_name_routes_iprox_to_http_urls(self): fake_response.content = IPROX_XML_FIXTURE fake_response.raise_for_status = MagicMock() with tempfile.TemporaryDirectory() as tmp_dir, patch( - "pridepy.files.files.requests.get", return_value=fake_response - ), patch.object(Files, "download_http_urls") as http_mock, patch.object( - Files, "download_ftp_urls" + "pridepy.providers.iprox.requests.get", return_value=fake_response + ), patch.object(transport, "download_http_urls") as http_mock, patch.object( + transport, "download_ftp_urls" ) as ftp_mock: files.download_file_by_name( accession="IPX0017413000", diff --git a/pridepy/tests/test_jpost_files.py b/pridepy/tests/test_jpost_files.py index 1e4c652..53e1a8e 100644 --- a/pridepy/tests/test_jpost_files.py +++ b/pridepy/tests/test_jpost_files.py @@ -4,6 +4,7 @@ from unittest.mock import MagicMock, patch from pridepy.files.files import Files +from pridepy.providers import transport from pridepy.providers.jpost import JpostProvider @@ -19,7 +20,7 @@ def test_is_direct_download_accession_includes_jpost(self): assert Files.is_direct_download_accession("JPST000001") def test_build_jpost_file_record_maps_collection_to_category(self): - record = Files._build_jpost_file_record( + record = JpostProvider._build_file_record( "JPST000001", "ftp://ftp.jpostdb.org/JPST000001/peak/sample.mzML", ) @@ -30,7 +31,7 @@ def test_build_jpost_file_record_maps_collection_to_category(self): assert record["source"] == "JPOST" def test_build_jpost_file_record_marks_raw_collection_as_raw(self): - record = Files._build_jpost_file_record( + record = JpostProvider._build_file_record( "JPST000001", "ftp://ftp.jpostdb.org/JPST000001/raw/run01.raw", ) @@ -41,11 +42,11 @@ def test_build_jpost_file_record_marks_raw_collection_as_raw(self): def test_get_all_raw_file_list_filters_jpost_records(self): files = Files() jpost_records = [ - Files._build_jpost_file_record( + JpostProvider._build_file_record( "JPST000001", "ftp://ftp.jpostdb.org/JPST000001/raw/run1.raw", ), - Files._build_jpost_file_record( + JpostProvider._build_file_record( "JPST000001", "ftp://ftp.jpostdb.org/JPST000001/result/results.tsv", ), @@ -59,7 +60,7 @@ def test_get_all_raw_file_list_filters_jpost_records(self): def test_download_file_by_name_uses_jpost_ftp_listing(self): files = Files() - file_record = Files._build_jpost_file_record( + file_record = JpostProvider._build_file_record( "JPST000001", "ftp://ftp.jpostdb.org/JPST000001/raw/folder/sample.raw", ) @@ -67,7 +68,7 @@ def test_download_file_by_name_uses_jpost_ftp_listing(self): with tempfile.TemporaryDirectory() as tmp_dir: with patch.object( JpostProvider, "list_files", return_value=[file_record] - ), patch.object(Files, "download_ftp_urls") as download_mock: + ), patch.object(transport, "download_ftp_urls") as download_mock: files.download_file_by_name( accession="JPST000001", file_name="sample.raw", @@ -89,7 +90,6 @@ def test_download_file_by_name_uses_jpost_ftp_listing(self): ) def test_proxi_listing_maps_cv_name_to_category(self): - files = Files() proxi_response = { "datasetFiles": [ { @@ -117,8 +117,8 @@ def test_proxi_listing_maps_cv_name_to_category(self): fake_response = MagicMock() fake_response.content = json.dumps(proxi_response).encode("utf-8") fake_response.raise_for_status = MagicMock() - with patch("pridepy.files.files.requests.get", return_value=fake_response) as req_mock: - records = files._list_jpost_public_files_via_proxi("JPST002311") + with patch("pridepy.providers.jpost.requests.get", return_value=fake_response) as req_mock: + records = JpostProvider()._list_via_proxi("JPST002311") req_mock.assert_called_once() call_url = req_mock.call_args[0][0] @@ -132,18 +132,14 @@ def test_proxi_listing_maps_cv_name_to_category(self): assert cats["sample01.txt"] == "OTHER" def test_proxi_falls_back_to_ftp_walk_on_error(self): - files = Files() - ftp_record = Files._build_jpost_file_record( - "JPST000001", "ftp://ftp.jpostdb.org/JPST000001/raw/x.raw" - ) with patch.object( - Files, - "_list_jpost_public_files_via_proxi", + JpostProvider, + "_list_via_proxi", side_effect=RuntimeError("proxi down"), ), patch.object( - Files, "_list_ftp_repo_files", return_value=["/JPST000001/raw/x.raw"] + transport, "_list_ftp_repo_files", return_value=["/JPST000001/raw/x.raw"] ) as ftp_mock: - result = files._list_jpost_public_files("JPST000001") + result = JpostProvider().list_files("JPST000001") ftp_mock.assert_called_once() assert len(result) == 1 diff --git a/pridepy/tests/test_massive_files.py b/pridepy/tests/test_massive_files.py index a4e9278..290aea3 100644 --- a/pridepy/tests/test_massive_files.py +++ b/pridepy/tests/test_massive_files.py @@ -3,6 +3,7 @@ from unittest.mock import patch from pridepy.files.files import Files +from pridepy.providers import transport from pridepy.providers.massive import MassiveProvider @@ -14,7 +15,7 @@ def test_is_massive_accession(self): assert not Files.is_massive_accession("MSV123") def test_build_massive_file_record_maps_collection_to_category(self): - record = Files._build_massive_file_record( + record = MassiveProvider._build_file_record( "MSV000012345", "ftp://massive-ftp.ucsd.edu/v01/MSV000012345/ccms_peak/converted/sample.mzML", ) @@ -24,7 +25,7 @@ def test_build_massive_file_record_maps_collection_to_category(self): assert record["fileCategory"]["value"] == "PEAK" def test_build_massive_file_record_marks_raw_collection_as_raw(self): - record = Files._build_massive_file_record( + record = MassiveProvider._build_file_record( "MSV000012345", "ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/run01.raw", ) @@ -33,7 +34,7 @@ def test_build_massive_file_record_marks_raw_collection_as_raw(self): assert record["fileCategory"]["value"] == "RAW" def test_build_massive_file_record_keeps_non_raw_collection_even_for_raw_like_file_names(self): - record = Files._build_massive_file_record( + record = MassiveProvider._build_file_record( "MSV000012345", "ftp://massive-ftp.ucsd.edu/v01/MSV000012345/uploads/run01.raw", ) @@ -42,7 +43,7 @@ def test_build_massive_file_record_keeps_non_raw_collection_even_for_raw_like_fi assert record["fileCategory"]["value"] == "OTHER" def test_build_massive_file_record_marks_ab_sciex_scan_sidecar_as_raw_when_under_raw(self): - record = Files._build_massive_file_record( + record = MassiveProvider._build_file_record( "MSV000012345", "ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/sample.wiff.scan", ) @@ -53,15 +54,15 @@ def test_build_massive_file_record_marks_ab_sciex_scan_sidecar_as_raw_when_under def test_get_all_raw_file_list_filters_massive_records(self): files = Files() massive_records = [ - Files._build_massive_file_record( + MassiveProvider._build_file_record( "MSV000012345", "ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/run1.raw", ), - Files._build_massive_file_record( + MassiveProvider._build_file_record( "MSV000012345", "ftp://massive-ftp.ucsd.edu/v01/MSV000012345/quant/results.tsv", ), - Files._build_massive_file_record( + MassiveProvider._build_file_record( "MSV000012345", "ftp://massive-ftp.ucsd.edu/v01/MSV000012345/uploads/run2.mzML", ), @@ -75,14 +76,14 @@ def test_get_all_raw_file_list_filters_massive_records(self): def test_download_file_by_name_uses_massive_ftp_listing(self): files = Files() - file_record = Files._build_massive_file_record( + file_record = MassiveProvider._build_file_record( "MSV000012345", "ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/folder/sample.raw", ) with tempfile.TemporaryDirectory() as tmp_dir: with patch.object(MassiveProvider, "list_files", return_value=[file_record]), patch.object( - Files, "download_ftp_urls" + transport, "download_ftp_urls" ) as download_mock: files.download_file_by_name( accession="MSV000012345", @@ -112,7 +113,7 @@ def test_repo_uses_tls_true_for_massive_false_for_jpost(self): def test_download_all_raw_files_threads_parallel_files_for_massive(self): files = Files() massive_records = [ - Files._build_massive_file_record( + MassiveProvider._build_file_record( "MSV000012345", f"ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/run{i}.raw", ) @@ -122,7 +123,7 @@ def test_download_all_raw_files_threads_parallel_files_for_massive(self): with tempfile.TemporaryDirectory() as tmp_dir: with patch.object( MassiveProvider, "list_files", return_value=massive_records - ), patch.object(Files, "download_ftp_urls") as download_mock: + ), patch.object(transport, "download_ftp_urls") as download_mock: files.download_all_raw_files( accession="MSV000012345", output_folder=tmp_dir, @@ -143,7 +144,7 @@ def test_base_direct_download_provider_partitions_urls_by_scheme(self): provider = MassiveProvider() records = [ - Files._build_massive_file_record( + MassiveProvider._build_file_record( "MSV000012345", "ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/a.raw", ), @@ -157,8 +158,8 @@ def test_base_direct_download_provider_partitions_urls_by_scheme(self): ], }, ] - with patch.object(Files, "download_ftp_urls") as ftp_mock, \ - patch.object(Files, "download_http_urls") as http_mock: + with patch.object(transport, "download_ftp_urls") as ftp_mock, \ + patch.object(transport, "download_http_urls") as http_mock: provider.download_files( accession="MSV000012345", records=records, diff --git a/pyproject.toml b/pyproject.toml index f5b74ee..bc347c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pridepy" -version = "0.0.17" +version = "0.0.18" description = "Python Client library for PRIDE Rest API" readme = "README.md" requires-python = ">=3.9"