diff --git a/README.md b/README.md index 0715bc6..27bab43 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ You can: - download public and private PRIDE files -- download public MassIVE datasets directly from `MSV...` accessions +- download public MassIVE (`MSV...`), JPOST (`JPST...`), and iProX (`IPX...`) datasets directly. MassIVE goes through FTPS at `massive-ftp.ucsd.edu`; JPOST uses the JSON PROXI endpoint at `repository.jpostdb.org` for listings and `ftp.jpostdb.org` for transfers; iProX fetches the dataset's ProteomeXchange XML from `download.iprox.org` and downloads files over anonymous HTTPS - download by category (`RAW`, `SEARCH`, `RESULT`, etc.) - stream project and file metadata - search projects by keyword and filters @@ -80,15 +80,31 @@ pridepy download-all-public-raw-files \ --checksum-check ``` -### 3) Download a public MassIVE dataset directly +### 3) Download a public MassIVE, JPOST, or iProX dataset directly ```bash +# MassIVE pridepy download-all-public-raw-files \ -a MSV000082297 \ -o ./downloads/MSV000082297 + +# JPOST +pridepy download-all-public-raw-files \ + -a JPST002311 \ + -o ./downloads/JPST002311 + +# iProX +pridepy download-all-public-raw-files \ + -a IPX0017413000 \ + -o ./downloads/IPX0017413000 ``` -For direct `MSV...` downloads, `pridepy` enumerates the dataset from MassIVE's public FTP tree. Raw downloads follow MassIVE's own collection layout, so `download-all-public-raw-files` downloads the files stored under the dataset's `raw/` collection. +For these direct downloads, `pridepy` enumerates the dataset from the repository: +- **MassIVE** lists files by walking the FTPS tree at `massive-ftp.ucsd.edu` (TLS is required by the server). +- **JPOST** lists files through the JSON PROXI endpoint at `https://repository.jpostdb.org/proxi/datasets/` and downloads them from `ftp.jpostdb.org` over plain FTP. The PROXI listing avoids the source-IP connection limit JPOST enforces on FTP. 
+- **iProX** fetches the dataset's ProteomeXchange XML from `http://download.iprox.org/<accession>/PX_<accession>.xml`, then downloads each referenced file from the same host over anonymous HTTPS. iProX exposes Aspera (`faspe://`) with username/password for very large bulk transfers; `pridepy` uses the public HTTPS endpoint instead so no iProX credentials are required. + +Raw downloads follow each repository's own collection layout, so `download-all-public-raw-files` downloads the files stored under the dataset's `raw/` collection. Direct downloads support resume (REST for FTP, byte-Range for HTTPS), per-file retries, parallel workers (`-w N` up to 3), and post-transfer size verification against the server-reported size. ### 4) Download only selected categories @@ -99,7 +115,7 @@ pridepy download-all-public-category-files \ -c RAW,SEARCH ``` -You can also request a specific MassIVE collection through the same category interface: +You can also request a specific MassIVE / JPOST / iProX collection through the same category interface: ```bash pridepy download-all-public-category-files \ @@ -244,14 +260,15 @@ print(f"RAW files: {len(raw_files)}") print(raw_files[0]["fileName"]) ``` -For MassIVE accessions, the same method returns the files found under the dataset's `raw/` collection: +For MassIVE / JPOST / iProX accessions, the same method returns the files found under the dataset's `raw/` collection: ```python from pridepy.files.files import Files files = Files() -raw_files = files.get_all_raw_file_list("MSV000082297") -print(f"MassIVE raw files: {len(raw_files)}") +for accession in ("MSV000082297", "JPST002311", "IPX0017413000"): + raw_files = files.get_all_raw_file_list(accession) + print(f"{accession} raw files: {len(raw_files)}") ``` ### Example: search projects diff --git a/pridepy/commands/__init__.py b/pridepy/commands/__init__.py new file mode 100644 index 0000000..f1312b8 --- /dev/null +++ b/pridepy/commands/__init__.py @@ -0,0 +1,20 @@ +"""Cross-cutting download commands. 
+ +Each module under this package owns one user-facing command that doesn't +fit any single provider: + +- ``by_url``: download a list of explicit URLs (ftp/http/https) +- ``by_list``: download a subset of a project's files by filename + +ProteomeXchange used to live here too but moved to +:class:`pridepy.providers.proteomexchange.ProteomeXchangeProvider` because +it conforms to the ``Provider`` interface (takes an accession or URL and +returns file records). It is deliberately not auto-registered with the +provider registry — PXD/PRD accessions continue to route through +:class:`pridepy.providers.pride.PrideProvider`; ProteomeXchangeProvider is +the explicit gateway for the cross-repository XML view, invoked via the +``download-px-raw-files`` CLI command and ``Files.download_px_raw_files``. + +The ``pridepy.files.files.Files`` facade keeps shim methods that +delegate here, so existing test patches on ``Files.X`` keep working. +""" diff --git a/pridepy/commands/by_list.py b/pridepy/commands/by_list.py new file mode 100644 index 0000000..d2d0244 --- /dev/null +++ b/pridepy/commands/by_list.py @@ -0,0 +1,59 @@ +"""Download a subset of project files identified by a filename list.""" +import logging +from typing import List, Optional + +from pridepy.providers import registry + + +def download_files_by_list( + accession: str, + file_names: List[str], + output_folder: str, + skip_if_downloaded_already: bool, + protocol: str = "ftp", + aspera_maximum_bandwidth: str = "100M", + checksum_check: bool = False, + parallel_files: int = 1, +) -> None: + """Download a subset of project files identified by a filename list. + + Resolves each requested filename via the project metadata API and + delegates to the provider's ``download_files`` so the existing batch + + protocol fallback engine is reused. 
+ + :param accession: PRIDE or MassIVE project accession (public) + :param file_names: filenames to download + :param output_folder: directory to write downloaded files into + :param skip_if_downloaded_already: skip files already present locally + :param protocol: preferred protocol; falls back across others on failure + :param aspera_maximum_bandwidth: aspera ascp bandwidth cap + :param checksum_check: download project checksums and validate + :param parallel_files: number of files to download simultaneously for globus + :raises ValueError: if ``file_names`` is empty or none match the project + """ + if not file_names: + raise ValueError("file_names must contain at least one filename") + + provider = registry.resolve(accession) + all_files = provider.list_files(accession) + + requested = set(file_names) + matched = [f for f in all_files if f.get("fileName") in requested] + missing = sorted(requested - {f.get("fileName") for f in matched}) + if missing: + logging.warning("Files not found in project %s: %s", accession, missing) + if not matched: + raise ValueError( + f"No matching files in project {accession} for: {sorted(requested)}" + ) + + provider.download_files( + accession=accession, + records=matched, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, + parallel_files=parallel_files, + checksum_check=checksum_check, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, + ) diff --git a/pridepy/commands/by_url.py b/pridepy/commands/by_url.py new file mode 100644 index 0000000..91d6fec --- /dev/null +++ b/pridepy/commands/by_url.py @@ -0,0 +1,254 @@ +"""Download a list of explicit URLs (ftp/http/https). + +Each URL is dispatched to the matching transport based on its scheme. +PRIDE checksum validation is supported when the accession can be +inferred from the URL path. 
+""" +import ftplib +import logging +import os +import re +from concurrent.futures import ThreadPoolExecutor, as_completed +from ftplib import FTP +from typing import Dict, List, Optional, Tuple +from urllib.parse import urlparse + +from tqdm import tqdm + +from pridepy.util.api_handling import Util + + +def _extract_pride_accession(url: str) -> Optional[str]: + """Extract a PRIDE accession (PXD/PRD followed by digits) from a URL path. + + PRIDE archive URLs follow the pattern + ``…/pride/data/archive/YYYY/MM/<accession>/filename``. + Returns ``None`` when no accession can be identified. + """ + match = re.search(r"((?:PXD|PRD)\d{4,})", url) + return match.group(1) if match else None + + +def _validate_urls_checksums(urls: List[str], output_folder: str) -> None: + """Validate downloaded files against PRIDE checksum API. + + Accessions are inferred from URL paths via + :func:`_extract_pride_accession`. URLs that do not contain a + recognisable PRIDE accession are skipped with a warning. + + :raises RuntimeError: if one or more files fail validation + """ + from pridepy.files.files import Files + + accession_urls: Dict[str, List[str]] = {} + for url in urls: + acc = _extract_pride_accession(url) + if acc: + accession_urls.setdefault(acc, []).append(url) + else: + logging.warning( + "Cannot infer PRIDE accession from URL, skipping checksum: %s", url + ) + + validation_failures: List[str] = [] + for acc, acc_urls in accession_urls.items(): + checksum_file_path = Files.save_checksum_file(acc, output_folder) + checksum_map = Files.read_checksum_file(checksum_file_path) + logging.info( + "Loaded checksums for %d files (project %s)", + len(checksum_map), acc, + ) + for url in acc_urls: + file_name = os.path.basename(urlparse(url).path) + target = os.path.join(output_folder, file_name) + expected = checksum_map.get(file_name) + logging.info("Validating %s", file_name) + valid, reason = Files.validate_download(target, expected) + if not valid: + logging.error("Validation failed for %s: 
%s", file_name, reason) + validation_failures.append(f"{file_name} ({reason})") + else: + logging.info("Checksum OK: %s", file_name) + + if validation_failures: + raise RuntimeError( + f"Checksum validation failed for {len(validation_failures)} file(s): " + + ", ".join(validation_failures) + ) + + +def _http_download_url(url: str, target: str) -> None: + """Stream an http/https URL into ``target`` with a progress bar.""" + session = Util.create_session_with_retries() + with session.get(url, stream=True, timeout=60) as response: + response.raise_for_status() + total = int(response.headers.get("Content-Length", 0)) + with open(target, "wb") as out, tqdm( + total=total, + unit="B", + unit_scale=True, + desc=os.path.basename(target), + ) as pbar: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + out.write(chunk) + pbar.update(len(chunk)) + + +def _ftp_download_url(parsed, target: str) -> None: + """Download a single file from an ftp:// URL with a progress bar.""" + host = parsed.hostname + if not host: + raise ValueError(f"FTP URL missing host: {parsed.geturl()}") + port = parsed.port or 21 + user = parsed.username or "anonymous" + pwd = parsed.password or "anonymous@" + remote_path = parsed.path + with FTP() as ftp: + ftp.connect(host, port, timeout=60) + ftp.login(user, pwd) + try: + total = ftp.size(remote_path) or 0 + except ftplib.error_perm: + total = 0 + with open(target, "wb") as out, tqdm( + total=total, + unit="B", + unit_scale=True, + desc=os.path.basename(target), + ) as pbar: + + def _callback(data: bytes) -> None: + out.write(data) + pbar.update(len(data)) + + ftp.retrbinary(f"RETR {remote_path}", _callback) + + +def _dispatch_url_scheme(parsed, target: str, protocol: str = "ftp", position: int = 0) -> None: + """Route a parsed URL to its protocol-specific downloader. + + ``protocol='globus'`` swaps the http/https single-connection streamer + for :func:`pridepy.files.files.Files._parallel_download` (single-connection with progress bar). 
+ ftp:// URLs are unaffected. + """ + from pridepy.files.files import Files + + scheme = (parsed.scheme or "").lower() + if scheme in ("http", "https"): + if protocol == "globus": + Files._parallel_download(parsed.geturl(), target, position=position) + else: + Files._http_download_url(parsed.geturl(), target) + elif scheme == "ftp": + Files._ftp_download_url(parsed, target) + else: + raise ValueError(f"Unsupported URL scheme: {scheme}") + + +def _download_single_url( + url: str, + output_folder: str, + skip_if_exists: bool = False, + protocol: str = "ftp", + position: int = 0, +) -> str: + """Download one URL, dispatched by scheme; return the local file path.""" + from pridepy.files.files import Files + + parsed = urlparse(url) + if not (parsed.scheme or "").lower(): + raise ValueError(f"URL missing scheme: {url}") + + file_name = os.path.basename(parsed.path) + if not file_name: + raise ValueError(f"Cannot derive filename from URL: {url}") + + target = os.path.join(output_folder, file_name) + if skip_if_exists and os.path.isfile(target) and os.path.getsize(target) > 0: + logging.info("Skipping %s: already downloaded", file_name) + return target + + Files._dispatch_url_scheme(parsed, target, protocol, position=position) + + ok, reason = Files.validate_download(target) + if not ok: + Files._remove_if_exists(target) + raise RuntimeError(f"Download invalid: {reason} ({target})") + return target + + +def download_files_by_url( + urls: List[str], + output_folder: str, + skip_if_downloaded_already: bool = False, + protocol: str = "ftp", + parallel_files: int = 1, + checksum_check: bool = False, +) -> None: + """Download files from a list of raw URLs, dispatched by URL scheme. + + Supported schemes: ``http``, ``https``, ``ftp``. Each URL is downloaded + independently; per-URL errors are logged, then aggregated and re-raised + as a single :class:`RuntimeError` so callers see a complete failure + summary. 
+ + :param urls: fully-qualified URLs (each contains its scheme) + :param output_folder: directory to write downloaded files into + :param skip_if_downloaded_already: skip URLs whose target file exists + :param protocol: ``ftp`` (default) for single-connection per URL scheme; + ``globus`` for resume-capable http/https downloads (single-connection stream) + (no effect on ftp:// URLs which always use single-connection FTP) + :param checksum_check: validate downloads against PRIDE checksum API; + accessions are inferred from URL paths (only PRIDE URLs supported) + :raises ValueError: if ``urls`` is empty + :raises RuntimeError: if one or more URLs failed + """ + if not urls: + raise ValueError("urls must contain at least one URL") + + os.makedirs(output_folder, exist_ok=True) + + parallel_files = min(parallel_files, 3, len(urls)) + failures: List[Tuple[str, str]] = [] + from pridepy.files.files import Files + + if parallel_files < 2: + for url in urls: + try: + Files._download_single_url( + url, output_folder, skip_if_downloaded_already, protocol, + ) + except Exception as exc: # pylint: disable=broad-except + logging.error("Failed to download %s: %s", url, exc) + failures.append((url, str(exc))) + else: + logging.info( + "Downloading %d URL(s) with %d parallel workers", + len(urls), parallel_files, + ) + with ThreadPoolExecutor(max_workers=parallel_files) as executor: + futures = { + executor.submit( + Files._download_single_url, + url, output_folder, skip_if_downloaded_already, protocol, + position=idx, + ): url + for idx, url in enumerate(urls) + } + for future in as_completed(futures): + url = futures[future] + try: + future.result() + except Exception as exc: # pylint: disable=broad-except + logging.error("Failed to download %s: %s", url, exc) + failures.append((url, str(exc))) + + if failures: + summary = ", ".join(f"{u} ({e})" for u, e in failures) + raise RuntimeError( + f"Failed to download {len(failures)} URL(s): {summary}" + ) + + if checksum_check: + 
_validate_urls_checksums(urls, output_folder) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index fbcf2f9..3567e78 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -1,83 +1,73 @@ #!/usr/bin/env python -import ftplib -import hashlib import importlib.resources import logging import os -import platform -import posixpath -import re import subprocess import urllib import urllib.request -import time -from concurrent.futures import ThreadPoolExecutor, as_completed from ftplib import FTP from typing import Dict, List, Optional, Tuple -import socket -from urllib.parse import urlparse -import xml.etree.ElementTree as ET -import boto3 -import botocore -import requests -from botocore.config import Config -from tqdm import tqdm +import requests # noqa: F401 — kept as a patch target for tests -from pridepy.authentication.authentication import Authentication from pridepy.util.api_handling import Util +# Module-level imports of the modular architecture. Providers and commands +# do not import Files at module level (only lazily inside method bodies), +# so hoisting these to the top is safe and avoids cluttering every shim +# method body with a local import. 
+from pridepy.providers import registry, transport +from pridepy.providers import util as _provider_util +from pridepy.providers.iprox import IproxProvider +from pridepy.providers.jpost import JpostProvider +from pridepy.providers.massive import MASSIVE_CATEGORY_MAP, MassiveProvider +from pridepy.providers.pride import PrideProvider +from pridepy.providers.proteomexchange import ProteomeXchangeProvider +from pridepy.commands import by_list, by_url -class Progress: - def __init__(self, total_size, file_name): - self.pbar = tqdm( - total=total_size, - unit="B", - unit_scale=True, - desc="Downloading {}".format(file_name), - ) - - def __call__(self, bytes_amount): - self.pbar.update(bytes_amount) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.pbar.close() - - def close(self): - self.pbar.close() +# Re-export Progress so external `from pridepy.files.files import Progress` +# still works. +from pridepy.providers.util import Progress # noqa: F401 class Files: """ - This class handles PRIDE API files endpoint. + This class handles PRIDE API files endpoint, and dispatches to the + per-repository provider classes in :mod:`pridepy.providers`. 
""" - V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3" - API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3" - API_PRIVATE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2" - PRIDE_ARCHIVE_FTP = "ftp.pride.ebi.ac.uk" - PRIDE_ARCHIVE_FTP_URL_PREFIX = "ftp://ftp.pride.ebi.ac.uk/" - PRIDE_ARCHIVE_HTTPS_URL_PREFIX = "https://ftp.pride.ebi.ac.uk/" - MASSIVE_ARCHIVE_FTP = "massive-ftp.ucsd.edu" - MASSIVE_ARCHIVE_FTP_URL_PREFIX = "ftp://massive-ftp.ucsd.edu/v01/" - S3_URL = "https://hh.fire.sdo.ebi.ac.uk" - S3_BUCKET = "pride-public" - PROTOCOL_ORDER = ["aspera", "s3", "ftp", "globus"] - MASSIVE_CATEGORY_MAP = { - "raw": "RAW", - "peak": "PEAK", - "ccms_peak": "PEAK", - "search": "SEARCH", - "result": "RESULT", - "ccms_result": "RESULT", - "quant": "RESULT", - "fasta": "FASTA", - "spectrum_library": "SPECTRUM_LIBRARY", - "library": "SPECTRUM_LIBRARY", - } + # PRIDE class-attribute re-exports (kept here for back-compat). + V3_API_BASE_URL = PrideProvider.V3_API_BASE_URL + API_BASE_URL = PrideProvider.API_BASE_URL + API_PRIVATE_URL = PrideProvider.API_PRIVATE_URL + PRIDE_ARCHIVE_FTP = PrideProvider.ARCHIVE_FTP + PRIDE_ARCHIVE_FTP_URL_PREFIX = PrideProvider.ARCHIVE_FTP_URL_PREFIX + PRIDE_ARCHIVE_HTTPS_URL_PREFIX = PrideProvider.ARCHIVE_HTTPS_URL_PREFIX + S3_URL = PrideProvider.S3_URL + S3_BUCKET = PrideProvider.S3_BUCKET + PROTOCOL_ORDER = PrideProvider.PROTOCOL_ORDER + + # MassIVE class-attribute re-exports. + MASSIVE_ARCHIVE_FTP = MassiveProvider.ARCHIVE_FTP + MASSIVE_ARCHIVE_FTP_URL_PREFIX = MassiveProvider.ARCHIVE_FTP_URL_PREFIX + # Note: MASSIVE_CATEGORY_MAP is the module-level constant in providers/massive.py, + # re-exported on Files as a class attribute via the module-level import above. + + # JPOST class-attribute re-exports. 
+ JPOST_ARCHIVE_FTP = JpostProvider.ARCHIVE_FTP + JPOST_ARCHIVE_FTP_URL_PREFIX = JpostProvider.ARCHIVE_FTP_URL_PREFIX + JPOST_PROXI_BASE_URL = JpostProvider.PROXI_BASE_URL + JPOST_PROXI_CATEGORY_MAP = JpostProvider.PROXI_CATEGORY_MAP + + # iProX class-attribute re-exports. + IPROX_DOWNLOAD_BASE_URL = IproxProvider.DOWNLOAD_BASE_URL + IPROX_PX_XML_URL_TEMPLATE = IproxProvider.PX_XML_URL_TEMPLATE + IPROX_PX_CATEGORY_MAP = IproxProvider.PX_CATEGORY_MAP + + # MassIVE category map re-exported. Class attribute shadowing the module-level + # constant of the same name happens cleanly in class scope. + MASSIVE_CATEGORY_MAP = MASSIVE_CATEGORY_MAP + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") def __init__(self): @@ -85,271 +75,147 @@ def __init__(self): @staticmethod def _find_tsv_columns(header: str) -> Optional[Tuple[int, int]]: - """Return (name_idx, checksum_idx) from a TSV header, or None.""" - cols = [col.strip().lower() for col in header.split("\t")] - required_cols = {"file-name", "file-md5checksum", "file-size"} - if not required_cols.issubset(set(cols)): - return None - return cols.index("file-name"), cols.index("file-md5checksum") + """Shim — see :func:`pridepy.providers.util._find_tsv_columns`.""" + return _provider_util._find_tsv_columns(header) @staticmethod def _is_md5_checksum(value: str) -> bool: - return len(value) == 32 and all(char in "0123456789abcdef" for char in value) + """Shim — see :func:`pridepy.providers.util._is_md5_checksum`.""" + return _provider_util._is_md5_checksum(value) @staticmethod def read_checksum_file(checksum_file_path: str) -> Dict[str, str]: - """ - Read PRIDE API checksum TSV and build {file_name: md5} map. 
- Expected format: File-Name\tFile-MD5Checksum\tFile-Size - """ - checksums: Dict[str, str] = {} - if not checksum_file_path or not os.path.exists(checksum_file_path): - return checksums - - with open(checksum_file_path, "r", encoding="utf-8") as f: - header = f.readline().strip() - if not header: - return checksums - - col_indices = Files._find_tsv_columns(header) - if col_indices is None: - logging.warning(f"Unrecognized checksum file format: {header}") - return checksums - - name_idx, checksum_idx = col_indices - min_cols = max(name_idx, checksum_idx) + 1 - for line in f: - parts = line.strip().split("\t") - if len(parts) >= min_cols: - fn = os.path.basename(parts[name_idx].strip()) - cs = parts[checksum_idx].strip().lower() - if fn and Files._is_md5_checksum(cs): - checksums[fn] = cs - - return checksums + """Shim — see :func:`pridepy.providers.util.read_checksum_file`.""" + return _provider_util.read_checksum_file(checksum_file_path) @staticmethod def compute_md5(file_path: str, chunk_size: int = 4 * 1024 * 1024) -> str: - """ - Compute an MD5 checksum for integrity validation, not for security use. - """ - try: - md5 = hashlib.md5(usedforsecurity=False) - except TypeError: - md5 = hashlib.md5() - with open(file_path, "rb") as file_handle: - while True: - chunk = file_handle.read(chunk_size) - if not chunk: - break - md5.update(chunk) - return md5.hexdigest() + """Shim — see :func:`pridepy.providers.util.compute_md5`.""" + return _provider_util.compute_md5(file_path, chunk_size) @staticmethod def validate_download(file_path: str, expected_checksum: Optional[str] = None) -> Tuple[bool, str]: - """ - Validate a local file exists, is non-empty, and checksum matches when provided. 
- """ - if not os.path.exists(file_path): - return False, "file does not exist" - if os.path.getsize(file_path) == 0: - return False, "file is empty" - if expected_checksum: - actual_checksum = Files.compute_md5(file_path) - if actual_checksum.lower() != expected_checksum.lower(): - return False, ( - f"checksum mismatch (expected={expected_checksum.lower()}, actual={actual_checksum.lower()})" - ) - return True, "ok" + """Shim — see :func:`pridepy.providers.util.validate_download`.""" + return _provider_util.validate_download(file_path, expected_checksum) @staticmethod def _remove_if_exists(file_path: str) -> None: - """ - Remove a file if it already exists locally. - """ - if os.path.exists(file_path): - os.remove(file_path) + """Shim — see :func:`pridepy.providers.util._remove_if_exists`.""" + return _provider_util._remove_if_exists(file_path) @staticmethod def _get_download_url(file_record: Dict, protocol: str) -> str: - """ - Resolve the public download URL for a file and protocol. - - Raises ValueError when the requested protocol has no suitable location. - Aspera requires a dedicated "Aspera Protocol" entry; ftp/s3/globus - derive their URL from the "FTP Protocol" entry (falling back to an - arbitrary non-Aspera location would produce a URL the caller cannot - actually transfer with). 
- """ - locations = file_record.get("publicFileLocations", []) - if not locations: - raise ValueError("No public file locations present") - - aspera_url = None - ftp_url = None - for location in locations: - name = location.get("name") - if name == "Aspera Protocol": - aspera_url = location.get("value") - elif name == "FTP Protocol": - ftp_url = location.get("value") - - if protocol == "aspera": - if not aspera_url: - raise ValueError("Aspera URL not available") - return aspera_url - - if not ftp_url: - raise ValueError("FTP URL not available") - if protocol == "ftp": - return ftp_url - if protocol == "globus": - return ftp_url.replace( - Files.PRIDE_ARCHIVE_FTP_URL_PREFIX, - Files.PRIDE_ARCHIVE_HTTPS_URL_PREFIX, - 1, - ) - if protocol == "s3": - return ftp_url - raise ValueError(f"Unsupported protocol: {protocol}") + """Shim — see :func:`pridepy.providers.util._get_download_url`.""" + return _provider_util._get_download_url(file_record, protocol) @staticmethod def _resolve_local_path(file_record: Dict, output_folder: str) -> str: - """ - Compute the canonical local path for a file regardless of transfer protocol. - """ - try: - canonical_url = Files._get_download_url(file_record, "ftp") - except ValueError: - canonical_url = "" - if canonical_url: - return Files.get_output_file_name(canonical_url, file_record, output_folder) - return os.path.join(output_folder, file_record["fileName"]) + """Shim — see :func:`pridepy.providers.util._resolve_local_path`.""" + return _provider_util._resolve_local_path(file_record, output_folder) @staticmethod def _protocol_sequence(protocol: str) -> List[str]: - """ - Build the ordered list of protocols to try for a requested download mode. 
- """ - if protocol not in Files.PROTOCOL_ORDER: - return [] - return [protocol] + [p for p in Files.PROTOCOL_ORDER if p != protocol] + """Shim — see :meth:`pridepy.providers.pride.PrideProvider._protocol_sequence`.""" + return PrideProvider._protocol_sequence(protocol) @staticmethod def is_massive_accession(accession: str) -> bool: - """ - Return True when the accession looks like a MassIVE dataset accession. - """ - if not accession: - return False - return bool(re.fullmatch(r"R?MSV\d{9}", accession.upper())) + """Shim — see :meth:`pridepy.providers.massive.MassiveProvider.matches`.""" + return MassiveProvider.matches(accession) @staticmethod def _get_massive_public_root(accession: str) -> str: - normalized_accession = accession.upper() - return f"/v01/{normalized_accession}" + return MassiveProvider._get_public_root(accession) @staticmethod def _get_massive_public_ftp_url(accession: str, remote_path: str) -> str: - root_path = Files._get_massive_public_root(accession).rstrip("/") - relative_path = remote_path - if remote_path.startswith(root_path): - relative_path = remote_path[len(root_path) :].lstrip("/") - return f"{Files.MASSIVE_ARCHIVE_FTP_URL_PREFIX}{accession.upper()}/{relative_path}" + return MassiveProvider._get_public_ftp_url(accession, remote_path) @staticmethod def _map_massive_collection_to_category(collection: str) -> str: - return Files.MASSIVE_CATEGORY_MAP.get(collection.lower(), "OTHER") + return MassiveProvider._map_collection_to_category(collection) @staticmethod def _build_massive_file_record(accession: str, ftp_url: str) -> Dict: - parsed = urlparse(ftp_url) - root_prefix = f"/v01/{accession.upper()}/" - relative_path = parsed.path - if relative_path.startswith(root_prefix): - relative_path = relative_path[len(root_prefix) :] - relative_path = relative_path.lstrip("/") - collection = relative_path.split("/", 1)[0] if relative_path else "" - return { - "accession": accession.upper(), - "fileName": os.path.basename(parsed.path), - 
"fileCategory": {"value": Files._map_massive_collection_to_category(collection)}, - "publicFileLocations": [{"name": "FTP Protocol", "value": ftp_url}], - "relativePath": relative_path, - "collection": collection, - "source": "MassIVE", - } + return MassiveProvider._build_file_record(accession, ftp_url) @staticmethod - def _walk_ftp_tree(ftp: FTP, remote_dir: str) -> List[str]: - """ - Recursively list files under a remote FTP directory. + def is_jpost_accession(accession: str) -> bool: + """Shim — see :meth:`pridepy.providers.jpost.JpostProvider.matches`.""" + return JpostProvider.matches(accession) + + @staticmethod + def _get_jpost_public_root(accession: str) -> str: + return JpostProvider._get_public_root(accession) + + @staticmethod + def _get_jpost_public_ftp_url(accession: str, remote_path: str) -> str: + return JpostProvider._get_public_ftp_url(accession, remote_path) + + @staticmethod + def _build_jpost_file_record(accession, ftp_url, category_from_proxi=None): + return JpostProvider._build_file_record(accession, ftp_url, category_from_proxi) + + @staticmethod + def _build_iprox_file_record(accession, https_url, category_from_px=None): + """Shim — see :meth:`pridepy.providers.iprox.IproxProvider._build_file_record`.""" + return IproxProvider._build_file_record(accession, https_url, category_from_px) + + @staticmethod + def _get_iprox_public_root(accession: str) -> str: + return IproxProvider._get_public_root(accession) + + @staticmethod + def _get_iprox_public_ftp_url(accession: str, remote_path: str) -> str: + return IproxProvider._get_public_ftp_url(accession, remote_path) + + @staticmethod + def is_direct_download_accession(accession: str) -> bool: + """Shim — True for MassIVE/JPOST/iProX (explicitly excludes PRIDE). + + PRIDE is also a registered provider but PRIDE downloads go through + the multi-protocol orchestrator (FTP/Aspera/S3/Globus with checksum + validation and fallback), not the direct-download partitioned-by-URL- + scheme path. 
So we filter PRIDE out here. """ - file_paths: List[str] = [] try: - entries = list(ftp.mlsd(remote_dir)) - for name, facts in entries: - if name in {".", ".."}: - continue - child_path = posixpath.join(remote_dir.rstrip("/"), name) - if facts.get("type") == "dir": - file_paths.extend(Files._walk_ftp_tree(ftp, child_path)) - elif facts.get("type") == "file": - file_paths.append(child_path) - return file_paths - except (AttributeError, ftplib.error_perm): - pass - - current_dir = ftp.pwd() - listing: List[str] = [] + provider = registry.resolve(accession) + except ValueError: + return False + return provider.name != "pride" + + @staticmethod + def is_iprox_accession(accession: str) -> bool: + """Shim — see :meth:`pridepy.providers.iprox.IproxProvider.matches`.""" + return IproxProvider.matches(accession) + + @staticmethod + def _repo_uses_tls(accession: str) -> bool: + """Shim — returns the resolved provider's use_tls flag (False if unknown).""" try: - ftp.cwd(remote_dir) - ftp.retrlines("LIST", listing.append) - for entry in listing: - parts = entry.split(maxsplit=8) - if len(parts) < 9: - continue - name = parts[8] - if name in {".", ".."}: - continue - child_path = posixpath.join(remote_dir.rstrip("/"), name) - if entry.startswith("d"): - file_paths.extend(Files._walk_ftp_tree(ftp, child_path)) - else: - file_paths.append(child_path) - finally: - ftp.cwd(current_dir) - return file_paths + provider = registry.resolve(accession) + except ValueError: + return False + return getattr(provider, "use_tls", False) + + @staticmethod + def _walk_ftp_tree(ftp: FTP, remote_dir: str) -> List[str]: + """Shim — see :func:`pridepy.providers.transport._walk_ftp_tree`.""" + return transport._walk_ftp_tree(ftp=ftp, remote_dir=remote_dir) + + @staticmethod + def _open_ftp_connection(host: str, use_tls: bool, timeout: int = 30) -> FTP: + """Shim — see :func:`pridepy.providers.transport._open_ftp_connection`.""" + return transport._open_ftp_connection(host=host, use_tls=use_tls, 
timeout=timeout) + + @staticmethod + def _list_ftp_repo_files(host, remote_root, error_label, use_tls=False): + """Shim — see :func:`pridepy.providers.transport._list_ftp_repo_files`.""" + return transport._list_ftp_repo_files(host=host, remote_root=remote_root, error_label=error_label, use_tls=use_tls) def _list_massive_public_files(self, accession: str) -> List[Dict]: - """ - Discover all public files for a MassIVE dataset from its anonymous FTP tree. - """ - normalized_accession = accession.upper() - remote_root = self._get_massive_public_root(normalized_accession) - ftp = FTP(self.MASSIVE_ARCHIVE_FTP, timeout=30) - try: - ftp.login() - ftp.set_pasv(True) - logging.info(f"Connected to FTP host: {self.MASSIVE_ARCHIVE_FTP}") - remote_files = self._walk_ftp_tree(ftp, remote_root) - except Exception as error: - raise RuntimeError( - f"Unable to list public files for MassIVE dataset {normalized_accession}: {error}" - ) from error - finally: - try: - ftp.quit() - except Exception: - ftp.close() - - return [ - self._build_massive_file_record( - normalized_accession, - self._get_massive_public_ftp_url(normalized_accession, remote_file), - ) - for remote_file in remote_files - ] + """Shim — see :meth:`pridepy.providers.massive.MassiveProvider.list_files`.""" + return MassiveProvider().list_files(accession) def _download_massive_file_records( self, @@ -358,72 +224,75 @@ def _download_massive_file_records( output_folder: str, skip_if_downloaded_already: bool, protocol: str, + parallel_files: int = 1, ) -> None: """ - Download public MassIVE files via anonymous FTP. + Download public MassIVE files via anonymous FTP (now FTPS). + Backward-compat shim — dispatches via the provider registry. """ - if protocol != "ftp": - logging.warning( - "MassIVE direct downloads currently use ftp only. " - f"Ignoring requested protocol '{protocol}' for {accession}." 
- ) - - ftp_urls = [self._get_download_url(file_record, "ftp") for file_record in file_records] - if not ftp_urls: - logging.info(f"No files matched for MassIVE dataset {accession}") - return - - self.download_ftp_urls( - ftp_urls=ftp_urls, + registry.resolve(accession).download_files( + accession=accession, + records=file_records, output_folder=output_folder, skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, + parallel_files=parallel_files, ) - async def stream_all_files_metadata(self, output_file, accession=None): - """ - get stream all project files from PRIDE API in JSON format - """ - if accession is None: - request_url = f"{self.V3_API_BASE_URL}/files/all" - count_request_url = f"{self.V3_API_BASE_URL}/files/count" - else: - request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/all" - count_request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/count" - headers = {"Accept": "application/JSON"} - response = Util.get_api_call(count_request_url, headers) - total_records = response.json() - - regex_search_pattern = '"fileName"' - await Util.stream_response_to_file( - output_file, total_records, regex_search_pattern, request_url, headers - ) - - def stream_all_files_by_project(self, accession) -> List[Dict]: - """ - get stream all project files from PRIDE API in JSON format + def _list_jpost_public_files(self, accession: str) -> List[Dict]: """ - request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/all" - headers = {"Accept": "application/JSON"} - record_files = Util.read_json_stream(api_url=request_url, headers=headers) - return record_files + Discover all public files for a JPOST dataset. 
- def get_all_raw_file_list(self, project_accession): - """ - Get all raw file lists from PRIDE API for a given project_accession - :param project_accession: PRIDE accession - :return: raw file list in JSON format + Delegates to JpostProvider but routes via the shim methods so that + test patches on ``_list_jpost_public_files_via_proxi`` and + ``_list_ftp_repo_files`` continue to intercept. """ - if self.is_massive_accession(project_accession): - record_files = self._list_massive_public_files(project_accession) + normalized_accession = accession.upper() + try: + return self._list_jpost_public_files_via_proxi(normalized_accession) + except Exception as proxi_error: + logging.warning( + f"JPOST PROXI listing failed for {normalized_accession} " + f"({proxi_error}); falling back to FTP tree walk." + ) + remote_root = JpostProvider._get_public_root(normalized_accession) + remote_files = self._list_ftp_repo_files( + host=JpostProvider.ARCHIVE_FTP, + remote_root=remote_root, + error_label=f"JPOST dataset {normalized_accession}", + ) return [ - file for file in record_files if file["fileCategory"]["value"] == "RAW" + self._build_jpost_file_record( + normalized_accession, + JpostProvider._get_public_ftp_url(normalized_accession, remote_file), + ) + for remote_file in remote_files ] - record_files = self.stream_all_files_by_project(project_accession) + def _list_jpost_public_files_via_proxi(self, accession: str) -> List[Dict]: + """Shim — see :meth:`pridepy.providers.jpost.JpostProvider._list_via_proxi`.""" + return JpostProvider()._list_via_proxi(accession) + + def _list_iprox_public_files(self, accession: str) -> List[Dict]: + """Shim — see :meth:`pridepy.providers.iprox.IproxProvider.list_files`.""" + return IproxProvider().list_files(accession) + + async def stream_all_files_metadata(self, output_file, accession=None): + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.stream_all_files_metadata`.""" + return await 
PrideProvider().stream_all_files_metadata(output_file, accession) + + def stream_all_files_by_project(self, accession) -> List[Dict]: + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.stream_all_files_by_project`.""" + return PrideProvider().stream_all_files_by_project(accession) + + def get_all_raw_file_list(self, project_accession): + """Get raw file list for any registered provider. - # Filter projects by fileCategory = RAW - raw_files = [file for file in record_files if file["fileCategory"]["value"] == "RAW"] - return raw_files + Returns the dataset's file records filtered to fileCategory == "RAW". + """ + provider = registry.resolve(project_accession) + records = provider.list_files(project_accession) + return [r for r in records if r["fileCategory"]["value"] == "RAW"] def download_all_raw_files( self, @@ -435,41 +304,20 @@ def download_all_raw_files( checksum_check: bool = False, parallel_files: int = 1, ): - """ - This method will download all the raw files from PRIDE PROJECT - :param output_folder: output directory where raw files will get saved - :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. - :param accession: PRIDE accession - :param protocol: ftp, aspera, globus - :param aspera_maximum_bandwidth: Aspera maximum bandwidth - :param checksum_check: Download checksum for a given project. 
- :return: None - """ - - if not (os.path.isdir(output_folder)): + """Download all RAW files for any registered provider.""" + if not os.path.isdir(output_folder): os.mkdir(output_folder) - - raw_files = self.get_all_raw_file_list(accession) - - if self.is_massive_accession(accession): - self._download_massive_file_records( - accession=accession, - file_records=raw_files, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - protocol=protocol, - ) - return - - self.download_files( - raw_files, - accession, - output_folder, - skip_if_downloaded_already, - protocol, - aspera_maximum_bandwidth=aspera_maximum_bandwidth, - checksum_check=checksum_check, + provider = registry.resolve(accession) + records = self.get_all_raw_file_list(accession) + provider.download_files( + accession=accession, + records=records, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, parallel_files=parallel_files, + checksum_check=checksum_check, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, ) @staticmethod @@ -480,118 +328,14 @@ def download_files_from_ftp( max_connection_retries=3, max_download_retries=3, ): - """ - Download files using a single FTP connection with a retry mechanism and a progress bar for each file. - :param file_list_json: file list in JSON format - :param output_folder: folder to download the files - :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. - :param max_connection_retries: Number of attempts to reconnect to the FTP server if the connection is lost. - :param max_download_retries: Number of attempts to retry the download of a file in case of failure. 
- """ - - if not os.path.isdir(output_folder): - os.makedirs(output_folder) - - def connect_ftp(): - """Helper function to establish FTP connection.""" - ftp = FTP(Files.PRIDE_ARCHIVE_FTP, timeout=30) - ftp.login() # Anonymous login - ftp.set_pasv(True) # Enable passive mode - logging.info(f"Connected to FTP host: {Files.PRIDE_ARCHIVE_FTP}") - return ftp - - connection_attempt = 0 - while connection_attempt < max_connection_retries: - try: - ftp = connect_ftp() - for file in file_list_json: - try: - # Get FTP download URL - if file["publicFileLocations"][0]["name"] == "FTP Protocol": - download_url = file["publicFileLocations"][0]["value"] - else: - download_url = file["publicFileLocations"][1]["value"] - - logging.debug("ftp_filepath:" + download_url) - - # Get output file path - new_file_path = Files.get_output_file_name( - download_url, file, output_folder - ) - - if skip_if_downloaded_already and os.path.exists(new_file_path): - logging.info("Skipping download as file already exists") - continue - - # Extract file path from the download URL - parsed_url = urlparse(download_url) - ftp_file_path = urllib.parse.unquote(parsed_url.path.lstrip("/")) - - logging.info(f"Starting FTP download: {ftp_file_path}") - - # Retry download in case of failure - download_attempt = 0 - while download_attempt < max_download_retries: - try: - # Get file size for progress tracking - total_size = ftp.size(ftp_file_path) - logging.info(f"File size: {total_size} bytes") - - # Initialize progress bar - with open(new_file_path, "wb") as f: - with tqdm( - total=total_size, - unit="B", - unit_scale=True, - desc=new_file_path, - ) as pbar: - - def callback(data): - f.write(data) - pbar.update(len(data)) - - # Retrieve the file with progress callback - ftp.retrbinary(f"RETR {ftp_file_path}", callback) - - logging.info(f"Successfully downloaded {new_file_path}") - break # Exit download retry loop if successful - except ( - socket.timeout, - ftplib.error_temp, - ftplib.error_perm, - ) as e: - 
download_attempt += 1 - logging.error( - f"Download failed for {new_file_path} (attempt {download_attempt}): {str(e)}" - ) - if download_attempt >= max_download_retries: - logging.error( - f"Giving up on {new_file_path} after {max_download_retries} attempts." - ) - break # Give up on this file after max retries - except (KeyError, IndexError) as e: - logging.error(f"Failed to process file due to missing data: {str(e)}") - except Exception as e: - logging.error(f"Unexpected error while processing file: {str(e)}") - ftp.quit() # Close FTP connection after all files are downloaded - logging.info(f"Disconnected from FTP host: {Files.PRIDE_ARCHIVE_FTP}") - break # Exit connection retry loop if everything was successful - except ( - socket.timeout, - ftplib.error_temp, - ftplib.error_perm, - socket.error, - ) as e: - connection_attempt += 1 - logging.error(f"FTP connection failed (attempt {connection_attempt}): {str(e)}") - if connection_attempt < max_connection_retries: - logging.info("Retrying connection...") - time.sleep(5) # Optional delay before retrying - else: - logging.error( - f"Giving up after {max_connection_retries} failed connection attempts." 
- ) - break + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files_from_ftp`.""" + return PrideProvider.download_files_from_ftp( + file_list_json, + output_folder, + skip_if_downloaded_already, + max_connection_retries=max_connection_retries, + max_download_retries=max_download_retries, + ) @staticmethod def get_output_file_name(download_url, file, output_folder): @@ -655,93 +399,18 @@ def download_files_from_aspera( except subprocess.CalledProcessError as e: logging.error(f"Aspera download failed for {new_file_path}: {str(e)}") - @staticmethod - def _download_range(url, file_path, start, end, pbar, max_retries=3): - """Download a byte range directly into the target file using seek.""" - for attempt in range(1, max_retries + 1): - try: - session = Util.create_session_with_retries() - headers = {"Range": f"bytes={start}-{end}"} - with session.get(url, headers=headers, stream=True, timeout=(15, 15)) as r: - r.raise_for_status() - if r.status_code != 206: - raise RuntimeError(f"Server did not honor Range request: {r.status_code}") - content_range = r.headers.get("Content-Range", "") - if not content_range.lower().startswith(f"bytes {start}-{end}/"): - raise RuntimeError(f"Unexpected Content-Range header: {content_range}") - with open(file_path, "r+b") as f: - f.seek(start) - for chunk in r.iter_content(chunk_size=8 * 1024 * 1024): - if chunk: - f.write(chunk) - pbar.update(len(chunk)) - return - except (requests.RequestException, RuntimeError, OSError) as exc: - logging.warning( - f"Range {start}-{end} attempt {attempt}/{max_retries} failed: {exc}" - ) - if attempt >= max_retries: - raise - time.sleep(2 * attempt) - @staticmethod def _parallel_download(url, file_path, position=0): - """Download a file via a single-connection HTTP stream with optional resume. 
- If a partial file exists and the server supports Range requests, resumes - from where it left off; otherwise restarts from scratch.""" - session = Util.create_session_with_retries() - try: - head = session.head(url, timeout=(30, 30)) - head.raise_for_status() - total_size = int(head.headers.get("content-length", 0)) - accept_ranges = head.headers.get("accept-ranges", "none").strip().lower() - except (requests.RequestException, ValueError) as exc: - logging.info(f"HEAD request failed, falling back to single connection: {exc}") - total_size = 0 - accept_ranges = "none" - - resume_size = 0 - if os.path.exists(file_path) and accept_ranges == "bytes" and total_size > 0: - resume_size = os.path.getsize(file_path) - if resume_size >= total_size: - logging.info(f"File already complete: {file_path}") - return - if resume_size > 0: - logging.info(f"Resuming download from {resume_size} bytes: {file_path}") - - headers = {"Range": f"bytes={resume_size}-"} if resume_size > 0 else {} - with session.get(url, headers=headers, stream=True, timeout=(30, 60)) as r: - r.raise_for_status() - if resume_size > 0 and r.status_code != 206: - logging.warning("Server did not honor Range request (status %s), restarting download", r.status_code) - resume_size = 0 - with tqdm(total=total_size, unit="B", unit_scale=True, desc=file_path, - initial=resume_size, position=position, leave=True) as pbar: - mode = "ab" if resume_size > 0 else "wb" - with open(file_path, mode, buffering=8 * 1024 * 1024) as f: - for chunk in r.iter_content(chunk_size=8 * 1024 * 1024): - if chunk: - f.write(chunk) - pbar.update(len(chunk)) + """Shim — see :func:`pridepy.providers.transport._parallel_download`.""" + return transport._parallel_download(url=url, file_path=file_path, position=position) @staticmethod def _globus_download_one(file, output_folder, skip_if_downloaded_already, max_retries=6, position=0): - """Download a single file via globus; used as a worker target.""" - download_url = 
Files._get_download_url(file, "globus") - new_file_path = Files.get_output_file_name(download_url, file, output_folder) - - if skip_if_downloaded_already and os.path.exists(new_file_path): - logging.info(f"Skipping download as file already exists: {new_file_path}") - return - - for attempt in range(1, max_retries + 1): - try: - Files._parallel_download(download_url, new_file_path, position=position) - return - except Exception as e: - logging.warning(f"Attempt {attempt}/{max_retries} failed for {file.get('fileName', '?')}: {e}") - if attempt == max_retries: - raise + """Shim — see :meth:`pridepy.providers.pride.PrideProvider._globus_download_one`.""" + return PrideProvider._globus_download_one( + file, output_folder, skip_if_downloaded_already, + max_retries=max_retries, position=position, + ) @staticmethod def download_files_from_globus( @@ -749,172 +418,25 @@ def download_files_from_globus( parallel_files: int = 1, checksum_map: Optional[Dict[str, str]] = None, ): - """ - Download files using globus transfer url with progress bar for each file. - When skip_if_downloaded_already is True, files are pre-filtered so that - only missing or incomplete files are submitted to the worker pool, - ensuring the -w parallel_files parameter is fully utilised. - When checksum_map is provided, existing files are validated against - their expected checksum; corrupted files are re-downloaded. - :param file_list_json: file list in json format - :param output_folder: folder to download the files - :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. 
- :param parallel_files: number of files to download simultaneously - :param checksum_map: mapping of file name to expected MD5 checksum - """ - if checksum_map is None: - checksum_map = {} - - if not (os.path.isdir(output_folder)): - os.makedirs(output_folder, exist_ok=True) - - # --- Phase 0: pre-filter files that need downloading ----------------- - files_to_download: List[Dict] = [] - for file in file_list_json: - download_url = Files._get_download_url(file, "globus") - new_file_path = Files.get_output_file_name(download_url, file, output_folder) - if skip_if_downloaded_already and os.path.exists(new_file_path): - expected_cs = checksum_map.get(file.get("fileName", "")) - if expected_cs: - valid, reason = Files.validate_download(new_file_path, expected_cs) - if not valid: - logging.warning(f"Corrupted file detected ({reason}), will re-download: {new_file_path}") - files_to_download.append(file) - continue - logging.info(f"Skipping download as file already exists: {new_file_path}") - continue - files_to_download.append(file) - - if not files_to_download: - logging.info("All files already downloaded, nothing to do.") - return - - logging.info( - f"{len(file_list_json) - len(files_to_download)} file(s) skipped, " - f"{len(files_to_download)} file(s) to download" + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files_from_globus`.""" + return PrideProvider.download_files_from_globus( + file_list_json, output_folder, skip_if_downloaded_already, + parallel_files=parallel_files, + checksum_map=checksum_map, ) - # --- Phase 1: download (skip check already done, pass False) --------- - parallel_files = min(parallel_files, 3, len(files_to_download)) - if parallel_files < 2: - for file in files_to_download: - try: - Files._globus_download_one( - file, output_folder, False - ) - new_file_path = Files.get_output_file_name( - Files._get_download_url(file, "globus"), file, output_folder - ) - logging.info(f"Successfully downloaded {new_file_path}") - 
except Exception as e: - logging.error(f"Download from Globus failed: {str(e)}") - else: - logging.info(f"Downloading {len(files_to_download)} file(s) with {parallel_files} parallel workers") - with ThreadPoolExecutor(max_workers=parallel_files) as executor: - futures = { - executor.submit( - Files._globus_download_one, - file, output_folder, False, - position=idx, - ): file - for idx, file in enumerate(files_to_download) - } - for future in as_completed(futures): - try: - future.result() - except Exception as e: - logging.error(f"Download from Globus failed: {str(e)}") - @staticmethod def download_files_from_s3( file_list_json: List[Dict], output_folder: str, skip_if_downloaded_already ): - """ - Download files using S3 transfer URL with a progress bar and retry logic. - :param file_list_json: file list in JSON format - :param output_folder: folder to download the files - :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. - """ - - if not os.path.isdir(output_folder): - os.makedirs(output_folder, exist_ok=True) - - # Retry and timeout config - retry_config = Config( - retries={"max_attempts": 5, "mode": "standard"}, - connect_timeout=120, # Increase timeout to 120 seconds - read_timeout=120, # Timeout for reading data - signature_version=botocore.UNSIGNED, # Unsigned requests for public data - ) - - s3_resource = boto3.resource( - "s3", - config=retry_config, - endpoint_url=Files.S3_URL, + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files_from_s3`.""" + return PrideProvider.download_files_from_s3( + file_list_json, output_folder, skip_if_downloaded_already, ) - bucket = s3_resource.Bucket(Files.S3_BUCKET) - - for file in file_list_json: - try: - # Determine S3 or FTP path - download_url = ( - file["publicFileLocations"][0]["value"] - if file["publicFileLocations"][0]["name"] == "FTP Protocol" - else file["publicFileLocations"][1]["value"] - ) - - ftp_base_url = 
"ftp://ftp.pride.ebi.ac.uk/pride/data/archive/" - s3_path = download_url.replace(ftp_base_url, "") - new_file_path = Files.get_output_file_name(download_url, file, output_folder) - - if skip_if_downloaded_already == True and os.path.exists(new_file_path): - logging.info("Skipping download as file already exists") - continue - - logging.debug(f"Downloading From S3: {s3_path}") - - # Get file size for progress tracking - obj = bucket.Object(s3_path) - total_size = obj.content_length - - # Initialize progress bar - progress = Progress(total_size, new_file_path) - - # Download with progress bar and retry handling - for attempt in range(5): - try: - bucket.download_file(s3_path, new_file_path, Callback=progress) - progress.close() - logging.info(f"Successfully downloaded {new_file_path}") - break - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "404": - logging.error("The object does not exist.") - break - else: - logging.error(f"Download failed: {e}") - if attempt < 4: - time.sleep(2**attempt) # Exponential backoff - logging.info(f"Retrying... ({attempt + 1}/5)") - else: - raise - except Exception as e: - logging.error(f"Failed to download {file['fileName']}: {e}") def get_submitted_file_path_prefix(self, accession): - """ - At pride repository, public data is disseminated according to a proper structure. - I.e. base/path/ + yyyy/mm/accession/ + submitted/ - This extracts the yyyy/mm/accession path fragment from the API by examine the file path - of a public file. - I.e. 
ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2018/10/PXD008644/7550GI_Y.raw - :param accession: PRIDE accession - :return: path fragment (eg: 2018/10/PXD008644) - """ - results = self.get_all_raw_file_list(accession) - first_file = results[0]["publicFileLocations"][0]["value"] - path_fragment = re.search(r"\d{4}/\d{2}/PXD\d*", first_file).group() - return path_fragment + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.get_submitted_file_path_prefix`.""" + return PrideProvider().get_submitted_file_path_prefix(accession) def download_file_by_name( self, @@ -941,20 +463,24 @@ def download_file_by_name( :param checksum_check: Download checksum for a given project. """ - if not (os.path.isdir(output_folder)): + if not os.path.isdir(output_folder): os.mkdir(output_folder) + provider = registry.resolve(accession) + ## Check type of project - if self.is_massive_accession(accession): - logging.info("Downloading file from public MassIVE dataset {}".format(accession)) + if provider.name in ("massive", "jpost", "iprox"): + logging.info( + "Downloading file from public direct-download dataset {}".format(accession) + ) response = self.get_file_from_api(accession, file_name) if not response: raise Exception( - "File name {} not found in MassIVE dataset {}".format(file_name, accession) + "File name {} not found in dataset {}".format(file_name, accession) ) - self._download_massive_file_records( + provider.download_files( accession=accession, - file_records=response, + records=response, output_folder=output_folder, skip_if_downloaded_already=skip_if_downloaded_already, protocol=protocol, @@ -1012,137 +538,27 @@ def get_file_from_api(self, accession, file_name) -> List[Dict]: :param file_name: file name :return: file in json format """ - try: - if self.is_massive_accession(accession): - files = self._list_massive_public_files(accession) - return [f for f in files if f["fileName"] == file_name] - files = self.stream_all_files_by_project(accession) - file = [f for f in 
files if f["fileName"] == file_name] - return file + records = registry.resolve(accession).list_files(accession) + return [r for r in records if r["fileName"] == file_name] except Exception as e: raise Exception("File not found " + str(e)) def download_private_file_name(self, accession, file_name, output_folder, username, password): - """ - Get the information for a given private file to be downloaded from the api. - :param accession: Project accession - :param file_name: The file name to be downloaded - :param username: Username with access to the dataset - :param password: Password for user with access to the dataset - """ - - auth = Authentication() - auth_token = auth.get_token(username, password) - validate_token = auth.validate_token(auth_token) - logging.info("Valid token after login: {}".format(validate_token)) - - url = self.API_PRIVATE_URL + "/projects/{}/files?search={}".format(accession, file_name) - content = requests.get(url, headers={"Authorization": "Bearer {}".format(auth_token)}) - if content.ok and content.status_code == 200: - json_file = content.json() - if ( - "_embedded" in json_file - and "files" in json_file["_embedded"] - and len(json_file["_embedded"]["files"]) == 1 - ): - download_url = json_file["_embedded"]["files"][0]["_links"]["download"]["href"] - logging.info(download_url) - - # Create a clean filename to save the downloaded file - new_file_path = os.path.join(output_folder, f"{file_name}") - - session = Util.create_session_with_retries() # Create session with retries - # Check if the file already exists - if os.path.exists(new_file_path): - resume_header = {"Range": f"bytes={os.path.getsize(new_file_path)}-"} - mode = "ab" # Append to file - resume_size = os.path.getsize(new_file_path) - else: - resume_header = {} - mode = "wb" # Write new file - resume_size = 0 - - with session.get( - download_url, stream=True, headers=resume_header, timeout=(10, 60) - ) as r: - r.raise_for_status() - total_size = 
int(r.headers.get("content-length", 0)) + resume_size - block_size = 1024 * 1024 # 1 MB chunks - - with tqdm( - total=total_size, - unit="B", - unit_scale=True, - desc=new_file_path, - initial=resume_size, - ) as pbar: - with open(new_file_path, mode) as f: - for chunk in r.iter_content(chunk_size=block_size): - if chunk: - f.write(chunk) - pbar.update(len(chunk)) - - logging.info(f"Successfully downloaded {new_file_path}") - - else: - logging.info( - "File name {} found more than once for the given project {}".format( - file_name, accession - ) - ) - else: - logging.info( - f"File name {file_name} now found in the project {accession}, or user don't have access" - ) - raise Exception( - f"File name {file_name} now found in the project {accession}, or user don't have access" - ) + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_private_file_name`.""" + return PrideProvider().download_private_file_name( + accession, file_name, output_folder, username, password, + ) @staticmethod def get_ascp_binary(): - """ - Detect the OS and architecture, and return the appropriate ascp binary path. - - Returns: - str: Path to the correct ascp binary. 
- """ - os_type = platform.system().lower() - arch, _ = platform.architecture() - aspera_dir = importlib.resources.files("pridepy").joinpath("aspera/") - - if os_type == "linux": - if arch == "32bit": - return os.path.join(aspera_dir, "linux-32", "ascp") - elif arch == "64bit": - return os.path.join(aspera_dir, "linux-64", "ascp") - elif os_type == "darwin": # macOS (intel-based) - return os.path.join(aspera_dir, "mac-intel", "ascp") - elif os_type == "windows": - if arch == "32bit": - return os.path.join(aspera_dir, "windows-32", "ascp.exe") - elif arch == "64bit": - return os.path.join(aspera_dir, "windows-64", "ascp.exe") - else: - raise OSError(f"Unsupported OS or architecture: {os_type}, {arch}") + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.get_ascp_binary`.""" + return PrideProvider.get_ascp_binary() @staticmethod def save_checksum_file(accession, output_folder): - """ - Download and persist the checksum manifest for a PRIDE accession. - """ - os.makedirs(output_folder, exist_ok=True) - url = f"{Files.V3_API_BASE_URL}/files/checksum/{accession}" - headers = {"accept": "text/plain"} - request = urllib.request.Request(url, headers=headers, method="GET") - logging.info(f"Fetching checksum file from {url}") - with urllib.request.urlopen(request) as response: - data = response.read().decode("utf-8") - # Save the data to a .tsv file - output_path = os.path.join(output_folder, f"{accession}-checksum.tsv") - with open(output_path, "w", encoding="utf-8") as file: - file.write(data) - return output_path + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.save_checksum_file`.""" + return PrideProvider.save_checksum_file(accession, output_folder) @staticmethod def _batch_download_by_protocol( @@ -1154,44 +570,21 @@ def _batch_download_by_protocol( parallel_files: int = 1, checksum_map: Optional[Dict[str, str]] = None, ) -> None: + """Shim — see :meth:`pridepy.providers.pride.PrideProvider._batch_download_by_protocol`. 
+ + Tests patch this method via ``patch.object(Files, "_batch_download_by_protocol")``; + :class:`PrideProvider` calls back through ``Files.X`` so those patches + keep intercepting. """ - Transfer a batch of files with one protocol, reusing a single - connection where the underlying helper supports it (FTP, S3). - """ - if not file_list: - return - if protocol == "ftp": - Files.download_files_from_ftp( - file_list, - output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - ) - return - if protocol == "aspera": - Files.download_files_from_aspera( - file_list, - output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - maximum_bandwidth=aspera_maximum_bandwidth, - ) - return - if protocol == "globus": - Files.download_files_from_globus( - file_list, - output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - parallel_files=parallel_files, - checksum_map=checksum_map or {}, - ) - return - if protocol == "s3": - Files.download_files_from_s3( - file_list, - output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - ) - return - raise ValueError(f"Unsupported protocol: {protocol}") + return PrideProvider._batch_download_by_protocol( + file_list, + output_folder, + protocol, + skip_if_downloaded_already, + aspera_maximum_bandwidth, + parallel_files=parallel_files, + checksum_map=checksum_map, + ) @staticmethod def _download_with_fallback( @@ -1203,52 +596,16 @@ def _download_with_fallback( max_protocol_retries: int = 2, parallel_files: int = 1, ) -> bool: - """ - Download one file by trying each protocol in sequence, validating - after every attempt. Intended as the per-file fallback path; batch - download of the primary protocol is handled separately. 
- """ - local_path = Files._resolve_local_path(file_record, output_folder) - - for protocol in protocol_sequence: - for attempt in range(1, max_protocol_retries + 1): - logging.info( - f"Downloading {file_record['fileName']} via {protocol} " - f"(attempt {attempt}/{max_protocol_retries})" - ) - try: - Files._remove_if_exists(local_path) - Files._batch_download_by_protocol( - [file_record], - output_folder, - protocol, - skip_if_downloaded_already=False, - aspera_maximum_bandwidth=aspera_maximum_bandwidth, - parallel_files=parallel_files, - ) - except Exception as error: - logging.error( - f"Protocol {protocol} failed for {file_record['fileName']}: {error}" - ) - - valid, reason = Files.validate_download(local_path, expected_checksum) - if valid: - logging.info( - f"File {file_record['fileName']} downloaded successfully via {protocol}" - ) - return True - - logging.warning( - f"Validation failed for {file_record['fileName']} via {protocol}: {reason}" - ) - Files._remove_if_exists(local_path) - - logging.warning( - f"Protocol {protocol} exhausted for {file_record['fileName']}, switching protocol." - ) - - logging.error(f"All protocol attempts failed for {file_record['fileName']}") - return False + """Shim — see :meth:`pridepy.providers.pride.PrideProvider._download_with_fallback`.""" + return PrideProvider._download_with_fallback( + file_record, + output_folder, + protocol_sequence, + expected_checksum, + aspera_maximum_bandwidth, + max_protocol_retries=max_protocol_retries, + parallel_files=parallel_files, + ) @staticmethod def download_files( @@ -1261,94 +618,17 @@ def download_files( checksum_check=False, parallel_files: int = 1, ): - """ - Download files using either FTP or Aspera transfer protocol. 
- :param file_list_json: File list in JSON format - :param accession: Project accession - :param output_folder: Folder to download the files - :param protocol: ftp, aspera, globus - :param aspera_maximum_bandwidth: parameter in Aspera sets the maximum bandwidth for the transfer. - :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. - """ - protocols_supported = ["ftp", "aspera", "globus", "s3"] - if protocol not in protocols_supported: - logging.error("Protocol should be one of ftp, aspera, globus, s3") - return - - os.makedirs(output_folder, exist_ok=True) - - checksum_map: Dict[str, str] = {} - if checksum_check: - checksum_file_path = Files.save_checksum_file(accession, output_folder) - checksum_map = Files.read_checksum_file(checksum_file_path) - logging.info(f"Loaded checksums for {len(checksum_map)} files") - - if not file_list_json: - return - - protocol_sequence = Files._protocol_sequence(protocol) - primary_protocol = protocol_sequence[0] - # Retry with the primary protocol first, then fall back to others - fallback_sequence = protocol_sequence - - # Phase 1: batch download with the requested protocol. Reuses a single - # FTP/S3 connection for all files (the previous behaviour) instead of - # paying the per-file reconnect cost in the common happy path. 
- logging.info( - f"Downloading {len(file_list_json)} file(s) via {primary_protocol} (batch)" + """Shim — see :meth:`pridepy.providers.pride.PrideProvider._download_files_batch`.""" + return PrideProvider._download_files_batch( + file_list_json, + accession, + output_folder, + skip_if_downloaded_already, + protocol=protocol, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, + checksum_check=checksum_check, + parallel_files=parallel_files, ) - try: - Files._batch_download_by_protocol( - file_list_json, - output_folder, - primary_protocol, - skip_if_downloaded_already=skip_if_downloaded_already, - aspera_maximum_bandwidth=aspera_maximum_bandwidth, - parallel_files=parallel_files, - checksum_map=checksum_map, - ) - except Exception as exc: - logging.warning( - f"Batch {primary_protocol} run hit an error; will retry individual failures: {exc}" - ) - - # Phase 2: validate every file and fall back per-file for the ones - # that are missing or invalid. - logging.info("Phase 2: validating %d downloaded file(s)", len(file_list_json)) - failed_files: List[str] = [] - for i, file_record in enumerate(file_list_json, 1): - expected_checksum = checksum_map.get(file_record["fileName"]) - local_path = Files._resolve_local_path(file_record, output_folder) - logging.info("Validating [%d/%d] %s", i, len(file_list_json), file_record["fileName"]) - valid, reason = Files.validate_download(local_path, expected_checksum) - if valid: - continue - - logging.warning( - f"{file_record['fileName']} invalid after {primary_protocol} ({reason})" - ) - if "checksum mismatch" in reason: - Files._remove_if_exists(local_path) - - if not fallback_sequence: - failed_files.append(file_record.get("fileName", "")) - continue - - success = Files._download_with_fallback( - file_record=file_record, - output_folder=output_folder, - protocol_sequence=fallback_sequence, - expected_checksum=expected_checksum, - aspera_maximum_bandwidth=aspera_maximum_bandwidth, - parallel_files=parallel_files, - ) - if not 
success: - failed_files.append(file_record.get("fileName", "")) - - if failed_files: - failed_summary = ", ".join(failed_files) - logging.error(f"Failed to download {len(failed_files)} file(s): {failed_summary}") - raise RuntimeError(f"Failed to download {len(failed_files)} file(s): {failed_summary}") def download_files_by_list( self, @@ -1361,55 +641,13 @@ def download_files_by_list( checksum_check: bool = False, parallel_files: int = 1, ) -> None: - """Download a subset of project files identified by a filename list. - - Resolves each requested filename via the project metadata API and - delegates to :meth:`download_files` so the existing batch + protocol - fallback engine is reused. - - :param accession: PRIDE or MassIVE project accession (public) - :param file_names: filenames to download - :param output_folder: directory to write downloaded files into - :param skip_if_downloaded_already: skip files already present locally - :param protocol: preferred protocol; falls back across others on failure - :param aspera_maximum_bandwidth: aspera ascp bandwidth cap - :param checksum_check: download project checksums and validate - :param parallel_files: number of files to download simultaneously for globus - :raises ValueError: if ``file_names`` is empty or none match the project - """ - if not file_names: - raise ValueError("file_names must contain at least one filename") - - if self.is_massive_accession(accession): - all_files = self._list_massive_public_files(accession) - else: - all_files = self.stream_all_files_by_project(accession) - requested = set(file_names) - matched = [f for f in all_files if f.get("fileName") in requested] - missing = sorted(requested - {f.get("fileName") for f in matched}) - if missing: - logging.warning("Files not found in project %s: %s", accession, missing) - if not matched: - raise ValueError( - f"No matching files in project {accession} for: {sorted(requested)}" - ) - - if self.is_massive_accession(accession): - 
self._download_massive_file_records( - accession=accession, - file_records=matched, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - protocol=protocol, - ) - return - - self.download_files( - matched, - accession, - output_folder, - skip_if_downloaded_already, - protocol, + """Shim — see :func:`pridepy.commands.by_list.download_files_by_list`.""" + return by_list.download_files_by_list( + accession=accession, + file_names=file_names, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, aspera_maximum_bandwidth=aspera_maximum_bandwidth, checksum_check=checksum_check, parallel_files=parallel_files, @@ -1417,14 +655,8 @@ def download_files_by_list( @staticmethod def _extract_pride_accession(url: str) -> Optional[str]: - """Extract a PRIDE accession (PXD/PRD followed by digits) from a URL path. - - PRIDE archive URLs follow the pattern - ``…/pride/data/archive/YYYY/MM//filename``. - Returns ``None`` when no accession can be identified. - """ - match = re.search(r"((?:PXD|PRD)\d{4,})", url) - return match.group(1) if match else None + """Shim — see :func:`pridepy.commands.by_url._extract_pride_accession`.""" + return by_url._extract_pride_accession(url) @staticmethod def download_files_by_url( @@ -1435,116 +667,20 @@ def download_files_by_url( parallel_files: int = 1, checksum_check: bool = False, ) -> None: - """Download files from a list of raw URLs, dispatched by URL scheme. - - Supported schemes: ``http``, ``https``, ``ftp``. Each URL is downloaded - independently; per-URL errors are logged, then aggregated and re-raised - as a single :class:`RuntimeError` so callers see a complete failure - summary. 
- - :param urls: fully-qualified URLs (each contains its scheme) - :param output_folder: directory to write downloaded files into - :param skip_if_downloaded_already: skip URLs whose target file exists - :param protocol: ``ftp`` (default) for single-connection per URL scheme; - ``globus`` for resume-capable http/https downloads (single-connection stream) - (no effect on ftp:// URLs which always use single-connection FTP) - :param checksum_check: validate downloads against PRIDE checksum API; - accessions are inferred from URL paths (only PRIDE URLs supported) - :raises ValueError: if ``urls`` is empty - :raises RuntimeError: if one or more URLs failed - """ - if not urls: - raise ValueError("urls must contain at least one URL") - - os.makedirs(output_folder, exist_ok=True) - - parallel_files = min(parallel_files, 3, len(urls)) - failures: List[Tuple[str, str]] = [] - if parallel_files < 2: - for url in urls: - try: - Files._download_single_url( - url, output_folder, skip_if_downloaded_already, protocol, - ) - except Exception as exc: # pylint: disable=broad-except - logging.error("Failed to download %s: %s", url, exc) - failures.append((url, str(exc))) - else: - logging.info( - "Downloading %d URL(s) with %d parallel workers", - len(urls), parallel_files, - ) - with ThreadPoolExecutor(max_workers=parallel_files) as executor: - futures = { - executor.submit( - Files._download_single_url, - url, output_folder, skip_if_downloaded_already, protocol, - position=idx, - ): url - for idx, url in enumerate(urls) - } - for future in as_completed(futures): - url = futures[future] - try: - future.result() - except Exception as exc: # pylint: disable=broad-except - logging.error("Failed to download %s: %s", url, exc) - failures.append((url, str(exc))) - - if failures: - summary = ", ".join(f"{u} ({e})" for u, e in failures) - raise RuntimeError( - f"Failed to download {len(failures)} URL(s): {summary}" - ) - - if checksum_check: - Files._validate_urls_checksums(urls, 
output_folder) + """Shim — see :func:`pridepy.commands.by_url.download_files_by_url`.""" + return by_url.download_files_by_url( + urls=urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, + parallel_files=parallel_files, + checksum_check=checksum_check, + ) @staticmethod def _validate_urls_checksums(urls: List[str], output_folder: str) -> None: - """Validate downloaded files against PRIDE checksum API. - - Accessions are inferred from URL paths via - :meth:`_extract_pride_accession`. URLs that do not contain a - recognisable PRIDE accession are skipped with a warning. - - :raises RuntimeError: if one or more files fail validation - """ - accession_urls: Dict[str, List[str]] = {} - for url in urls: - acc = Files._extract_pride_accession(url) - if acc: - accession_urls.setdefault(acc, []).append(url) - else: - logging.warning( - "Cannot infer PRIDE accession from URL, skipping checksum: %s", url - ) - - validation_failures: List[str] = [] - for acc, acc_urls in accession_urls.items(): - checksum_file_path = Files.save_checksum_file(acc, output_folder) - checksum_map = Files.read_checksum_file(checksum_file_path) - logging.info( - "Loaded checksums for %d files (project %s)", - len(checksum_map), acc, - ) - for url in acc_urls: - file_name = os.path.basename(urlparse(url).path) - target = os.path.join(output_folder, file_name) - expected = checksum_map.get(file_name) - logging.info("Validating %s", file_name) - valid, reason = Files.validate_download(target, expected) - if not valid: - logging.error("Validation failed for %s: %s", file_name, reason) - validation_failures.append(f"{file_name} ({reason})") - else: - logging.info("Checksum OK: %s", file_name) - - if validation_failures: - raise RuntimeError( - f"Checksum validation failed for {len(validation_failures)} file(s): " - + ", ".join(validation_failures) - ) + """Shim — see :func:`pridepy.commands.by_url._validate_urls_checksums`.""" + return 
by_url._validate_urls_checksums(urls, output_folder) @staticmethod def _download_single_url( @@ -1554,94 +690,23 @@ def _download_single_url( protocol: str = "ftp", position: int = 0, ) -> str: - """Download one URL, dispatched by scheme; return the local file path.""" - parsed = urlparse(url) - if not (parsed.scheme or "").lower(): - raise ValueError(f"URL missing scheme: {url}") - - file_name = os.path.basename(parsed.path) - if not file_name: - raise ValueError(f"Cannot derive filename from URL: {url}") - - target = os.path.join(output_folder, file_name) - if skip_if_exists and os.path.isfile(target) and os.path.getsize(target) > 0: - logging.info("Skipping %s: already downloaded", file_name) - return target - - Files._dispatch_url_scheme(parsed, target, protocol, position=position) - - ok, reason = Files.validate_download(target) - if not ok: - Files._remove_if_exists(target) - raise RuntimeError(f"Download invalid: {reason} ({target})") - return target + """Shim — see :func:`pridepy.commands.by_url._download_single_url`.""" + return by_url._download_single_url(url, output_folder, skip_if_exists, protocol, position) @staticmethod def _dispatch_url_scheme(parsed, target: str, protocol: str = "ftp", position: int = 0) -> None: - """Route a parsed URL to its protocol-specific downloader. - - ``protocol='globus'`` swaps the http/https single-connection streamer - for :meth:`_parallel_download` (single-connection with progress bar). - ftp:// URLs are unaffected. 
- """ - scheme = (parsed.scheme or "").lower() - if scheme in ("http", "https"): - if protocol == "globus": - Files._parallel_download(parsed.geturl(), target, position=position) - else: - Files._http_download_url(parsed.geturl(), target) - elif scheme == "ftp": - Files._ftp_download_url(parsed, target) - else: - raise ValueError(f"Unsupported URL scheme: {scheme}") + """Shim — see :func:`pridepy.commands.by_url._dispatch_url_scheme`.""" + return by_url._dispatch_url_scheme(parsed, target, protocol=protocol, position=position) @staticmethod def _http_download_url(url: str, target: str) -> None: - """Stream an http/https URL into ``target`` with a progress bar.""" - session = Util.create_session_with_retries() - with session.get(url, stream=True, timeout=60) as response: - response.raise_for_status() - total = int(response.headers.get("Content-Length", 0)) - with open(target, "wb") as out, tqdm( - total=total, - unit="B", - unit_scale=True, - desc=os.path.basename(target), - ) as pbar: - for chunk in response.iter_content(chunk_size=8192): - if chunk: - out.write(chunk) - pbar.update(len(chunk)) + """Shim — see :func:`pridepy.commands.by_url._http_download_url`.""" + return by_url._http_download_url(url, target) @staticmethod def _ftp_download_url(parsed, target: str) -> None: - """Download a single file from an ftp:// URL with a progress bar.""" - host = parsed.hostname - if not host: - raise ValueError(f"FTP URL missing host: {parsed.geturl()}") - port = parsed.port or 21 - user = parsed.username or "anonymous" - pwd = parsed.password or "anonymous@" - remote_path = parsed.path - with FTP() as ftp: - ftp.connect(host, port, timeout=60) - ftp.login(user, pwd) - try: - total = ftp.size(remote_path) or 0 - except ftplib.error_perm: - total = 0 - with open(target, "wb") as out, tqdm( - total=total, - unit="B", - unit_scale=True, - desc=os.path.basename(target), - ) as pbar: - - def _callback(data: bytes) -> None: - out.write(data) - pbar.update(len(data)) - - 
ftp.retrbinary(f"RETR {remote_path}", _callback) + """Shim — see :func:`pridepy.commands.by_url._ftp_download_url`.""" + return by_url._ftp_download_url(parsed, target) def download_all_category_files( self, @@ -1669,25 +734,17 @@ def download_all_category_files( """ if categories is None: categories = [category] if category else ["RAW"] - raw_files = self.get_all_category_file_list(accession, categories) - if self.is_massive_accession(accession): - self._download_massive_file_records( - accession=accession, - file_records=raw_files, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - protocol=protocol, - ) - return - self.download_files( - raw_files, - accession, - output_folder, - skip_if_downloaded_already, - protocol, - aspera_maximum_bandwidth=aspera_maximum_bandwidth, - checksum_check=checksum_check, + records = self.get_all_category_file_list(accession, categories) + provider = registry.resolve(accession) + provider.download_files( + accession=accession, + records=records, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, parallel_files=parallel_files, + checksum_check=checksum_check, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, ) def get_all_category_file_list( @@ -1702,17 +759,9 @@ def get_all_category_file_list( """ if isinstance(categories, str): categories = [categories] - category_set = {category.upper() for category in categories} - - if self.is_massive_accession(accession): - record_files = self._list_massive_public_files(accession) - else: - record_files = self.stream_all_files_by_project(accession) - - category_files = [ - file for file in record_files if file["fileCategory"]["value"] in category_set - ] - return category_files + category_set = {c.upper() for c in categories} + records = registry.resolve(accession).list_files(accession) + return [r for r in records if r["fileCategory"]["value"] in category_set] # ------------------------------- 
# ProteomeXchange support @@ -1720,53 +769,13 @@ def get_all_category_file_list( @staticmethod def _normalize_px_xml_url(px_id_or_url: str) -> str: - """ - Build the ProteomeXchange XML endpoint from a dataset accession or a dataset web URL. - Examples accepted: - - PXD039236 - - https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236 - - https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236&anything - """ - if px_id_or_url.startswith("http://") or px_id_or_url.startswith("https://"): - parsed = urlparse(px_id_or_url) - # keep the ID param value if present; otherwise fallback to the path tail - query = parsed.query or "" - if "ID=" in query: - id_value = [q.split("=", 1)[1] for q in query.split("&") if q.startswith("ID=")] - if id_value: - return ( - f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={id_value[0]}&outputMode=XML&test=no" - ) - # If the input URL already requests XML, just ensure flags - if parsed.path.endswith("/cgi/GetDataset"): - return ( - f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?{query}&outputMode=XML&test=no" - ) - # Assume it's a plain accession if not a URL - return ( - f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={px_id_or_url}&outputMode=XML&test=no" - ) + """Shim — see :meth:`pridepy.providers.proteomexchange.ProteomeXchangeProvider._normalize_px_xml_url`.""" + return ProteomeXchangeProvider._normalize_px_xml_url(px_id_or_url) @staticmethod - def _parse_px_xml_for_raw_file_urls(px_xml_url: str) -> List[str]: - """ - Parse the PX XML and return a list of associated raw file URIs. - We extract cvParam with name "Associated raw file URI" under each DatasetFile. 
- """ - headers = {"Accept": "application/xml"} - response = Util.get_api_call(px_xml_url, headers) - response.raise_for_status() - root = ET.fromstring(response.content) - - urls: List[str] = [] - # The XML namespace is often absent in PX XML; access elements directly - for dataset_file in root.iter("DatasetFile"): - for cv in dataset_file.findall("cvParam"): - name = cv.attrib.get("name") - value = cv.attrib.get("value") - if name == "Associated raw file URI" and value: - urls.append(value) - return urls + def _parse_px_xml_for_raw_file_urls(px_xml_url: str): + """Shim — see :meth:`pridepy.providers.proteomexchange.ProteomeXchangeProvider._parse_px_xml_for_raw_file_urls`.""" + return ProteomeXchangeProvider._parse_px_xml_for_raw_file_urls(px_xml_url) def download_px_raw_files( self, @@ -1774,32 +783,78 @@ def download_px_raw_files( output_folder: str, skip_if_downloaded_already: bool = True, ) -> None: - """ - Download all raw files referenced by a ProteomeXchange dataset. - Prefer FTP when the URL is ftp://, otherwise use HTTP(S). Supports resume and skip. 
- """ - if not os.path.isdir(output_folder): - os.makedirs(output_folder, exist_ok=True) + """Shim — see :meth:`pridepy.providers.proteomexchange.ProteomeXchangeProvider.download_from_accession_or_url`.""" + return ProteomeXchangeProvider().download_from_accession_or_url( + px_id_or_url, output_folder, skip_if_downloaded_already + ) - px_xml_url = self._normalize_px_xml_url(px_id_or_url) - logging.info(f"Fetching PX XML: {px_xml_url}") - urls = self._parse_px_xml_for_raw_file_urls(px_xml_url) - if not urls: - logging.info("No Associated raw file URIs found in PX XML") - return + @staticmethod + def _local_path_for_url(download_url: str, output_folder: str) -> str: + """Shim — see :func:`pridepy.providers.transport._local_path_for_url`.""" + return transport._local_path_for_url(download_url=download_url, output_folder=output_folder) - ftp_urls = [u for u in urls if u.lower().startswith("ftp://")] - http_urls = [u for u in urls if u.lower().startswith("http://") or u.lower().startswith("https://")] + @staticmethod + def _download_one_ftp_path( + ftp: FTP, + ftp_path: str, + local_path: str, + skip_if_downloaded_already: bool, + max_download_retries: int, + position: int = 0, + ) -> None: + """Shim — see :func:`pridepy.providers.transport._download_one_ftp_path`.""" + return transport._download_one_ftp_path( + ftp=ftp, + ftp_path=ftp_path, + local_path=local_path, + skip_if_downloaded_already=skip_if_downloaded_already, + max_download_retries=max_download_retries, + position=position, + ) - if ftp_urls: - self.download_ftp_urls(ftp_urls, output_folder, skip_if_downloaded_already) - if http_urls: - self.download_http_urls(http_urls, output_folder, skip_if_downloaded_already) + @staticmethod + def _download_ftp_paths_serial( + host: str, + paths: List[str], + output_folder: str, + skip_if_downloaded_already: bool, + use_tls: bool, + max_connection_retries: int, + max_download_retries: int, + ) -> None: + """Shim — see 
:func:`pridepy.providers.transport._download_ftp_paths_serial`.""" + return transport._download_ftp_paths_serial( + host=host, + paths=paths, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + use_tls=use_tls, + max_connection_retries=max_connection_retries, + max_download_retries=max_download_retries, + ) @staticmethod - def _local_path_for_url(download_url: str, output_folder: str) -> str: - filename = os.path.basename(urlparse(download_url).path) - return os.path.join(output_folder, filename) + def _download_ftp_paths_parallel( + host: str, + paths: List[str], + output_folder: str, + skip_if_downloaded_already: bool, + use_tls: bool, + max_connection_retries: int, + max_download_retries: int, + parallel_files: int, + ) -> None: + """Shim — see :func:`pridepy.providers.transport._download_ftp_paths_parallel`.""" + return transport._download_ftp_paths_parallel( + host=host, + paths=paths, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + use_tls=use_tls, + max_connection_retries=max_connection_retries, + max_download_retries=max_download_retries, + parallel_files=parallel_files, + ) @staticmethod def download_ftp_urls( @@ -1808,144 +863,50 @@ def download_ftp_urls( skip_if_downloaded_already: bool, max_connection_retries: int = 3, max_download_retries: int = 3, + use_tls: bool = False, + parallel_files: int = 1, ) -> None: - """ - Download a list of FTP URLs using a single connection, with retries and progress bars. 
- """ - if not os.path.isdir(output_folder): - os.makedirs(output_folder, exist_ok=True) - - def connect_ftp(host: str): - ftp = FTP(host, timeout=30) - ftp.login() - ftp.set_pasv(True) - logging.info(f"Connected to FTP host: {host}") - return ftp - - # Group URLs by host to reuse connections efficiently - host_to_paths: Dict[str, List[str]] = {} - for url in ftp_urls: - parsed = urlparse(url) - host_to_paths.setdefault(parsed.hostname, []).append(parsed.path.lstrip("/")) - - for host, paths in host_to_paths.items(): - connection_attempt = 0 - while connection_attempt < max_connection_retries: - try: - ftp = connect_ftp(host) - for ftp_path in paths: - try: - local_path = os.path.join(output_folder, os.path.basename(ftp_path)) - if skip_if_downloaded_already and os.path.exists(local_path): - logging.info("Skipping download as file already exists") - continue - - logging.info(f"Starting FTP download: {host}/{ftp_path}") - download_attempt = 0 - while download_attempt < max_download_retries: - try: - total_size = ftp.size(ftp_path) - # Try to resume using REST if partial file exists - if os.path.exists(local_path): - current_size = os.path.getsize(local_path) - mode = "ab" - else: - current_size = 0 - mode = "wb" - - with open(local_path, mode) as f, tqdm( - total=total_size, - unit="B", - unit_scale=True, - desc=local_path, - initial=current_size, - ) as pbar: - def callback(data): - f.write(data) - pbar.update(len(data)) - - if current_size: - try: - ftp.sendcmd(f"REST {current_size}") - except Exception: - # If REST not supported, fall back to full download - current_size = 0 - f.seek(0) - f.truncate() - ftp.retrbinary(f"RETR {ftp_path}", callback) - logging.info(f"Successfully downloaded {local_path}") - break - except (socket.timeout, ftplib.error_temp, ftplib.error_perm) as e: - download_attempt += 1 - logging.error( - f"Download failed for {local_path} (attempt {download_attempt}): {str(e)}" - ) - if download_attempt >= max_download_retries: - logging.error( - 
f"Giving up on {local_path} after {max_download_retries} attempts." - ) - break - except Exception as e: - logging.error(f"Unexpected error while processing FTP path {ftp_path}: {str(e)}") - ftp.quit() - logging.info(f"Disconnected from FTP host: {host}") - break - except (socket.timeout, ftplib.error_temp, ftplib.error_perm, socket.error) as e: - connection_attempt += 1 - logging.error(f"FTP connection failed (attempt {connection_attempt}): {str(e)}") - if connection_attempt < max_connection_retries: - logging.info("Retrying connection...") - time.sleep(5) - else: - logging.error( - f"Giving up after {max_connection_retries} failed connection attempts to {host}." - ) + """Shim — see :func:`pridepy.providers.transport.download_ftp_urls`.""" + return transport.download_ftp_urls( + ftp_urls=ftp_urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + max_connection_retries=max_connection_retries, + max_download_retries=max_download_retries, + use_tls=use_tls, + parallel_files=parallel_files, + ) + + @staticmethod + def _http_download_one( + url: str, + output_folder: str, + skip_if_downloaded_already: bool, + max_retries: int = 3, + position: int = 0, + ) -> None: + """Shim — see :func:`pridepy.providers.transport._http_download_one`.""" + return transport._http_download_one( + url=url, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + max_retries=max_retries, + position=position, + ) @staticmethod def download_http_urls( http_urls: List[str], output_folder: str, skip_if_downloaded_already: bool, + parallel_files: int = 1, + max_retries: int = 3, ) -> None: - """ - Download a list of HTTP(S) URLs with resume support and progress bars. 
- """ - if not os.path.isdir(output_folder): - os.makedirs(output_folder, exist_ok=True) - - session = Util.create_session_with_retries() - for url in http_urls: - try: - local_path = Files._local_path_for_url(url, output_folder) - if skip_if_downloaded_already and os.path.exists(local_path): - logging.info("Skipping download as file already exists") - continue - - if os.path.exists(local_path): - resume_size = os.path.getsize(local_path) - headers = {"Range": f"bytes={resume_size}-"} - mode = "ab" - else: - resume_size = 0 - headers = {} - mode = "wb" - - with session.get(url, stream=True, headers=headers, timeout=(10, 60)) as r: - r.raise_for_status() - total_size = int(r.headers.get("content-length", 0)) + resume_size - block_size = 1024 * 1024 - with tqdm( - total=total_size, - unit="B", - unit_scale=True, - desc=local_path, - initial=resume_size, - ) as pbar: - with open(local_path, mode) as f: - for chunk in r.iter_content(chunk_size=block_size): - if chunk: - f.write(chunk) - pbar.update(len(chunk)) - logging.info(f"Successfully downloaded {local_path}") - except Exception as e: - logging.error(f"HTTP download failed for {url}: {str(e)}") + """Shim — see :func:`pridepy.providers.transport.download_http_urls`.""" + return transport.download_http_urls( + http_urls=http_urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + parallel_files=parallel_files, + max_retries=max_retries, + ) diff --git a/pridepy/providers/__init__.py b/pridepy/providers/__init__.py new file mode 100644 index 0000000..ee1de17 --- /dev/null +++ b/pridepy/providers/__init__.py @@ -0,0 +1,7 @@ +"""Per-repository provider classes used by :class:`pridepy.files.files.Files`. + +Each module under this package owns the listing, transport choice, and +record-construction logic for one repository: PRIDE, MassIVE, JPOST, iProX. 
+The :mod:`registry` module maps an accession to the right provider; the +:mod:`transport` module hosts the shared FTP/FTPS/HTTPS download plumbing. +""" diff --git a/pridepy/providers/base.py b/pridepy/providers/base.py new file mode 100644 index 0000000..f9fa8bc --- /dev/null +++ b/pridepy/providers/base.py @@ -0,0 +1,107 @@ +"""Abstract base classes for pridepy providers.""" +from abc import ABC, abstractmethod +from typing import ClassVar, Dict, List, Optional + + +class Provider(ABC): + """Abstract base for every repository pridepy can list and download from.""" + + name: ClassVar[str] # "pride", "massive", "jpost", "iprox" + + @staticmethod + @abstractmethod + def matches(accession: str) -> bool: + """Return True if this provider should handle ``accession``.""" + + @abstractmethod + def list_files(self, accession: str) -> List[Dict]: + """Return pridepy file records for the dataset. + + Each record is a dict shaped like the PRIDE V3 API file response, + with at minimum: ``accession``, ``fileName``, ``fileCategory`` + (with nested ``value``), ``publicFileLocations`` (list of + ``{"name": ..., "value": <url>}``). + """ + + @abstractmethod + def download_files( + self, + accession: str, + records: List[Dict], + output_folder: str, + skip_if_downloaded_already: bool, + protocol: str, + parallel_files: int = 1, + checksum_check: bool = False, + aspera_maximum_bandwidth: str = "100M", + username: Optional[str] = None, + password: Optional[str] = None, + ) -> None: + """Download the given records into ``output_folder``.""" + + +class BaseDirectDownloadProvider(Provider): + """Shared ``download_files`` for MassIVE / JPOST / iProX. + + Subclasses set the ``use_tls`` class var (True for MassIVE FTPS, False for + JPOST plain FTP) and override :meth:`list_files`. The shared + ``download_files`` implementation partitions record URLs by scheme: + ``ftp://`` URLs are handed to :meth:`Files.download_ftp_urls`; ``http(s)://`` + URLs go to :meth:`Files.download_http_urls`. 
It calls **back** into + ``Files`` so that test patches on ``Files.download_ftp_urls`` / + ``Files.download_http_urls`` continue to intercept the calls. + """ + + use_tls: ClassVar[bool] = False + + def download_files( + self, + accession: str, + records: List[Dict], + output_folder: str, + skip_if_downloaded_already: bool, + protocol: str, + parallel_files: int = 1, + checksum_check: bool = False, + aspera_maximum_bandwidth: str = "100M", + username: Optional[str] = None, + password: Optional[str] = None, + ) -> None: + # Lazy import: providers know about Files (the facade) only via the + # public attributes that tests may patch; avoid module-load cycle. + from pridepy.files.files import Files + + if protocol not in ("ftp", "https", "http"): + import logging + logging.warning( + "Direct downloads currently use ftp / https only. " + f"Ignoring requested protocol '{protocol}' for {accession}." + ) + + all_urls = [Files._get_download_url(record, "ftp") for record in records] + ftp_urls = [u for u in all_urls if u.lower().startswith("ftp://")] + http_urls = [ + u for u in all_urls if u.lower().startswith(("http://", "https://")) + ] + if not ftp_urls and not http_urls: + import logging + logging.info( + f"No files matched for direct-download dataset {accession}" + ) + return + + if ftp_urls: + Files.download_ftp_urls( + ftp_urls=ftp_urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + use_tls=self.use_tls, + parallel_files=parallel_files, + ) + if http_urls: + Files.download_http_urls( + http_urls=http_urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + parallel_files=parallel_files, + ) diff --git a/pridepy/providers/iprox.py b/pridepy/providers/iprox.py new file mode 100644 index 0000000..292307c --- /dev/null +++ b/pridepy/providers/iprox.py @@ -0,0 +1,129 @@ +"""iProX direct-download provider. 
@registry.register
class IproxProvider(BaseDirectDownloadProvider):
    """Direct-download provider for iProX (``IPX...``) datasets.

    Files are discovered by fetching the dataset's ProteomeXchange XML from
    ``PX_XML_URL_TEMPLATE`` and reading the ``cvParam`` entries of every
    ``DatasetFile`` element. Each ``http(s)://`` URI found there becomes one
    pridepy file record.
    """

    name: ClassVar[str] = "iprox"
    use_tls: ClassVar[bool] = False  # download.iprox.org serves over plain HTTP

    DOWNLOAD_BASE_URL: ClassVar[str] = "http://download.iprox.org/"
    PX_XML_URL_TEMPLATE: ClassVar[str] = (
        "http://download.iprox.org/{accession}/PX_{accession}.xml"
    )
    # The PX XML carries the same PSI-MS cvParam "name" values as JPOST PROXI,
    # so JpostProvider's category map is shared rather than duplicated.
    PX_CATEGORY_MAP: ClassVar[Dict[str, str]] = JpostProvider.PROXI_CATEGORY_MAP

    @staticmethod
    def matches(accession: str) -> bool:
        """Tell whether ``accession`` has the shape of an iProX accession."""
        if accession:
            return re.fullmatch(r"IPX\d{7,10}", accession.upper()) is not None
        return False

    @staticmethod
    def _get_public_root(accession: str) -> str:
        """Return the dataset's root path on the download host."""
        return "/" + accession.upper()

    @classmethod
    def _get_public_ftp_url(cls, accession: str, remote_path: str) -> str:
        """Build the public URL of ``remote_path`` inside the dataset.

        The ``_get_public_ftp_url`` name mirrors the other providers; the
        returned URL is http(s), and the download dispatcher routes by scheme.
        """
        dataset_root = cls._get_public_root(accession).rstrip("/")
        if remote_path.startswith(dataset_root):
            tail = remote_path[len(dataset_root):].lstrip("/")
        else:
            tail = remote_path
        return f"{cls.DOWNLOAD_BASE_URL}{accession.upper()}/{tail}"

    @classmethod
    def _build_file_record(
        cls, accession: str, https_url: str, category_from_px: Optional[str] = None
    ) -> Dict:
        """Turn one iProX file URL into a pridepy file record.

        ``category_from_px`` is the ``cvParam`` ``name`` taken from the
        ProteomeXchange XML (e.g. ``"Associated raw file URI"``). When it is
        a key of ``PX_CATEGORY_MAP`` it decides the category; otherwise the
        first path segment is mapped with MassIVE's collection heuristic.
        """
        from pridepy.providers.massive import MassiveProvider

        dataset_id = accession.upper()
        url_path = urlparse(https_url).path
        dataset_prefix = f"/{dataset_id}/"

        relative_path = url_path
        if relative_path.startswith(dataset_prefix):
            relative_path = relative_path[len(dataset_prefix):]
        relative_path = relative_path.lstrip("/")

        collection = ""
        if relative_path:
            collection = relative_path.split("/", 1)[0]

        category = cls.PX_CATEGORY_MAP.get(category_from_px) if category_from_px else None
        if category is None:
            category = MassiveProvider._map_collection_to_category(collection)

        return {
            "accession": dataset_id,
            "fileName": os.path.basename(url_path),
            "fileCategory": {"value": category},
            # "FTP Protocol" is the label the download dispatcher looks up to
            # find a file URL; for iProX the value is an http(s) URL, and
            # BaseDirectDownloadProvider.download_files routes by URL scheme.
            "publicFileLocations": [{"name": "FTP Protocol", "value": https_url}],
            "relativePath": relative_path,
            "collection": collection,
            "source": "iProX",
        }

    @staticmethod
    def _iter_px_uris(root):
        """Yield ``(cvParam name, URI)`` for every http(s) ``...URI`` cvParam."""
        for dataset_file in root.iter("DatasetFile"):
            for cv_param in dataset_file.findall("cvParam"):
                label = cv_param.attrib.get("name")
                uri = cv_param.attrib.get("value")
                if uri and label and label.endswith("URI"):
                    if uri.lower().startswith(("http://", "https://")):
                        yield label, uri

    def list_files(self, accession: str) -> List[Dict]:
        """Fetch the dataset's PX XML and return one record per file URI.

        Raises ``RuntimeError`` when the XML cannot be parsed or yields no
        http(s) URIs; HTTP errors propagate from ``raise_for_status``.
        """
        normalized = accession.upper()
        xml_url = self.PX_XML_URL_TEMPLATE.format(accession=normalized)
        logging.info(f"Fetching iProX PX XML: {xml_url}")
        response = requests.get(xml_url, timeout=30)
        response.raise_for_status()
        try:
            root = ET.fromstring(response.content)
        except ET.ParseError as parse_error:
            raise RuntimeError(
                f"Unable to parse iProX PX XML for {normalized}: {parse_error}"
            ) from parse_error

        records: List[Dict] = [
            self._build_file_record(normalized, uri, category_from_px=label)
            for label, uri in self._iter_px_uris(root)
        ]
        if records:
            return records
        raise RuntimeError(
            f"iProX PX XML for {normalized} contained no downloadable HTTPS URIs"
        )
@registry.register
class JpostProvider(BaseDirectDownloadProvider):
    """Direct-download provider for JPOST (``JPST...``) datasets.

    Listing goes through the PROXI JSON endpoint first and falls back to
    walking the FTP tree when PROXI fails. File URLs are ``ftp://`` and are
    transferred without TLS (``use_tls`` is False).
    """

    name: ClassVar[str] = "jpost"
    use_tls: ClassVar[bool] = False

    ARCHIVE_FTP: ClassVar[str] = "ftp.jpostdb.org"
    ARCHIVE_FTP_URL_PREFIX: ClassVar[str] = "ftp://ftp.jpostdb.org/"
    PROXI_BASE_URL: ClassVar[str] = "https://repository.jpostdb.org/proxi/datasets/"

    # PROXI datasetFiles "name" label -> pridepy file category.
    PROXI_CATEGORY_MAP: ClassVar[Dict[str, str]] = {
        "Associated raw file URI": "RAW",
        "Result file URI": "RESULT",
        "Search engine output file URI": "SEARCH",
        "Peak list file URI": "PEAK",
        "Spectrum library file URI": "SPECTRUM_LIBRARY",
        "Sequence database URI": "FASTA",
        "Quantification file URI": "RESULT",
    }

    @staticmethod
    def matches(accession: str) -> bool:
        """Tell whether ``accession`` has the shape of a JPOST accession."""
        if accession:
            return re.fullmatch(r"JPST\d{6}", accession.upper()) is not None
        return False

    @staticmethod
    def _get_public_root(accession: str) -> str:
        """Return the dataset's root directory on the FTP server."""
        return "/" + accession.upper()

    @classmethod
    def _get_public_ftp_url(cls, accession: str, remote_path: str) -> str:
        """Build the public ``ftp://`` URL of ``remote_path`` inside the dataset."""
        dataset_root = cls._get_public_root(accession).rstrip("/")
        if remote_path.startswith(dataset_root):
            tail = remote_path[len(dataset_root):].lstrip("/")
        else:
            tail = remote_path
        return f"{cls.ARCHIVE_FTP_URL_PREFIX}{accession.upper()}/{tail}"

    @classmethod
    def _build_file_record(
        cls, accession: str, ftp_url: str, category_from_proxi: Optional[str] = None
    ) -> Dict:
        """Turn one JPOST FTP URL into a pridepy file record.

        A ``category_from_proxi`` label that is a key of
        ``PROXI_CATEGORY_MAP`` (e.g. ``"Associated raw file URI"``) decides
        the category. Otherwise the first path segment below the dataset
        root is mapped with the same heuristic MassIVE uses.
        """
        # MassIVE's collection -> category mapping is the fallback heuristic.
        from pridepy.providers.massive import MassiveProvider

        dataset_id = accession.upper()
        url_path = urlparse(ftp_url).path
        dataset_prefix = f"/{dataset_id}/"

        relative_path = url_path
        if relative_path.startswith(dataset_prefix):
            relative_path = relative_path[len(dataset_prefix):]
        relative_path = relative_path.lstrip("/")

        collection = ""
        if relative_path:
            collection = relative_path.split("/", 1)[0]

        category = (
            cls.PROXI_CATEGORY_MAP.get(category_from_proxi)
            if category_from_proxi
            else None
        )
        if category is None:
            category = MassiveProvider._map_collection_to_category(collection)

        return {
            "accession": dataset_id,
            "fileName": os.path.basename(url_path),
            "fileCategory": {"value": category},
            "publicFileLocations": [{"name": "FTP Protocol", "value": ftp_url}],
            "relativePath": relative_path,
            "collection": collection,
            "source": "JPOST",
        }

    def list_files(self, accession: str) -> List[Dict]:
        """List dataset files: PROXI JSON first, FTP tree walk as fallback."""
        normalized = accession.upper()
        try:
            proxi_records = self._list_via_proxi(normalized)
        except Exception as proxi_error:
            logging.warning(
                f"JPOST PROXI listing failed for {normalized} "
                f"({proxi_error}); falling back to FTP tree walk."
            )
        else:
            return proxi_records

        from pridepy.providers import transport

        remote_files = transport._list_ftp_repo_files(
            host=self.ARCHIVE_FTP,
            remote_root=self._get_public_root(normalized),
            error_label=f"JPOST dataset {normalized}",
        )
        records: List[Dict] = []
        for remote_file in remote_files:
            ftp_url = self._get_public_ftp_url(normalized, remote_file)
            records.append(self._build_file_record(normalized, ftp_url))
        return records

    def _list_via_proxi(self, accession: str) -> List[Dict]:
        """Read the PROXI dataset document and build a record per ``ftp://`` file.

        Raises ``RuntimeError`` when no ``datasetFiles`` entry carries an
        ``ftp://`` value; HTTP errors propagate from ``raise_for_status``.
        """
        import json as _json

        proxi_url = f"{self.PROXI_BASE_URL}{accession}"
        logging.info(f"Fetching JPOST PROXI metadata: {proxi_url}")
        response = requests.get(
            proxi_url,
            headers={"Accept": "application/json"},
            timeout=30,
        )
        response.raise_for_status()
        payload = _json.loads(response.content)

        records: List[Dict] = []
        for raw_entry in payload.get("datasetFiles") or []:
            entry = raw_entry or {}
            uri = entry.get("value")
            if uri and uri.startswith("ftp://"):
                records.append(
                    self._build_file_record(
                        accession,
                        uri,
                        category_from_proxi=entry.get("name"),
                    )
                )
        if records:
            return records
        raise RuntimeError(
            f"JPOST PROXI returned no FTP file URIs for {accession}"
        )
# First path segment below the dataset root -> pridepy file category.
MASSIVE_CATEGORY_MAP = {
    "raw": "RAW",
    "peak": "PEAK",
    "ccms_peak": "PEAK",
    "search": "SEARCH",
    "result": "RESULT",
    "ccms_result": "RESULT",
    "quant": "RESULT",
    "fasta": "FASTA",
    "spectrum_library": "SPECTRUM_LIBRARY",
    "library": "SPECTRUM_LIBRARY",
}


@registry.register
class MassiveProvider(BaseDirectDownloadProvider):
    """Direct-download provider for MassIVE (``MSV...`` / ``RMSV...``) datasets.

    Files are listed by walking the dataset's tree on ``ARCHIVE_FTP`` with
    TLS enabled, and downloaded through the shared direct-download path with
    ``use_tls`` set to True.
    """

    name: ClassVar[str] = "massive"
    use_tls: ClassVar[bool] = True

    ARCHIVE_FTP: ClassVar[str] = "massive-ftp.ucsd.edu"
    ARCHIVE_FTP_URL_PREFIX: ClassVar[str] = "ftp://massive-ftp.ucsd.edu/v01/"

    @staticmethod
    def matches(accession: str) -> bool:
        """Tell whether ``accession`` has the shape of a MassIVE accession."""
        if accession:
            return re.fullmatch(r"R?MSV\d{9}", accession.upper()) is not None
        return False

    @staticmethod
    def _get_public_root(accession: str) -> str:
        """Return the dataset's root directory on the FTP server."""
        return "/v01/" + accession.upper()

    @classmethod
    def _get_public_ftp_url(cls, accession: str, remote_path: str) -> str:
        """Build the public ``ftp://`` URL of ``remote_path`` inside the dataset."""
        dataset_root = cls._get_public_root(accession).rstrip("/")
        if remote_path.startswith(dataset_root):
            tail = remote_path[len(dataset_root):].lstrip("/")
        else:
            tail = remote_path
        return f"{cls.ARCHIVE_FTP_URL_PREFIX}{accession.upper()}/{tail}"

    @staticmethod
    def _map_collection_to_category(collection: str) -> str:
        """Map a collection folder name to a category, defaulting to ``OTHER``."""
        key = collection.lower()
        return MASSIVE_CATEGORY_MAP.get(key, "OTHER")

    @classmethod
    def _build_file_record(cls, accession: str, ftp_url: str) -> Dict:
        """Turn one MassIVE FTP URL into a pridepy file record."""
        dataset_id = accession.upper()
        url_path = urlparse(ftp_url).path
        dataset_prefix = f"/v01/{dataset_id}/"

        relative_path = url_path
        if relative_path.startswith(dataset_prefix):
            relative_path = relative_path[len(dataset_prefix):]
        relative_path = relative_path.lstrip("/")

        # The collection is the first path segment below the dataset root.
        collection = ""
        if relative_path:
            collection = relative_path.split("/", 1)[0]

        return {
            "accession": dataset_id,
            "fileName": os.path.basename(url_path),
            "fileCategory": {"value": cls._map_collection_to_category(collection)},
            "publicFileLocations": [{"name": "FTP Protocol", "value": ftp_url}],
            "relativePath": relative_path,
            "collection": collection,
            "source": "MassIVE",
        }

    def list_files(self, accession: str) -> List[Dict]:
        """Walk the dataset's FTPS tree and return one record per remote file."""
        from pridepy.providers import transport

        normalized = accession.upper()
        remote_files = transport._list_ftp_repo_files(
            host=self.ARCHIVE_FTP,
            remote_root=self._get_public_root(normalized),
            error_label=f"MassIVE dataset {normalized}",
            use_tls=True,
        )
        records: List[Dict] = []
        for remote_file in remote_files:
            ftp_url = self._get_public_ftp_url(normalized, remote_file)
            records.append(self._build_file_record(normalized, ftp_url))
        return records
+""" +import ftplib +import importlib.resources +import logging +import os +import platform +import re +import socket +import subprocess +import time +import urllib +import urllib.request +from concurrent.futures import ThreadPoolExecutor, as_completed +from ftplib import FTP +from typing import ClassVar, Dict, List, Optional +from urllib.parse import urlparse + +import boto3 +import botocore +import requests +from botocore.config import Config +from tqdm import tqdm + +from pridepy.authentication.authentication import Authentication +from pridepy.providers import registry +from pridepy.providers.base import Provider +from pridepy.providers.util import Progress +from pridepy.util.api_handling import Util + + +@registry.register +class PrideProvider(Provider): + """PRIDE Archive provider with multi-protocol fallback orchestration.""" + + name: ClassVar[str] = "pride" + + V3_API_BASE_URL: ClassVar[str] = "https://www.ebi.ac.uk/pride/ws/archive/v3" + API_BASE_URL: ClassVar[str] = "https://www.ebi.ac.uk/pride/ws/archive/v3" + API_PRIVATE_URL: ClassVar[str] = "https://www.ebi.ac.uk/pride/private/ws/archive/v2" + ARCHIVE_FTP: ClassVar[str] = "ftp.pride.ebi.ac.uk" + ARCHIVE_FTP_URL_PREFIX: ClassVar[str] = "ftp://ftp.pride.ebi.ac.uk/" + ARCHIVE_HTTPS_URL_PREFIX: ClassVar[str] = "https://ftp.pride.ebi.ac.uk/" + S3_URL: ClassVar[str] = "https://hh.fire.sdo.ebi.ac.uk" + S3_BUCKET: ClassVar[str] = "pride-public" + PROTOCOL_ORDER: ClassVar[List[str]] = ["aspera", "s3", "ftp", "globus"] + + @staticmethod + def matches(accession: str) -> bool: + """Return True when ``accession`` is a PRIDE dataset accession.""" + if not accession: + return False + return bool(re.fullmatch(r"(?:PXD|PRD)\d+", accession.upper())) + + # ------------------------------------------------------------------ + # Listing + # ------------------------------------------------------------------ + + async def stream_all_files_metadata(self, output_file, accession=None): + """ + get stream all project files from 
def get_submitted_file_path_prefix(self, accession):
    """
    Return the ``yyyy/mm/accession`` path fragment of a public PRIDE dataset.

    Public data is published under ``<base path>/yyyy/mm/<accession>/``. The
    fragment is read from the first public location of the dataset's first
    raw file, e.g.
    ``ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2018/10/PXD008644/7550GI_Y.raw``
    gives ``2018/10/PXD008644``.

    :param accession: PRIDE accession
    :return: path fragment (eg: 2018/10/PXD008644)
    :raises ValueError: if the dataset has no raw files, or the first file
        location does not contain a ``yyyy/mm/<accession>`` fragment
    """
    # Use Files facade so test patches on get_all_raw_file_list keep working.
    from pridepy.files.files import Files

    results = Files().get_all_raw_file_list(accession)
    if not results:
        raise ValueError(
            f"No raw files found for {accession}; "
            "cannot derive the submitted file path prefix"
        )
    first_file = results[0]["publicFileLocations"][0]["value"]
    # Accept both prefixes that ``matches`` recognises (PXD and PRD), not PXD only.
    fragment = re.search(r"\d{4}/\d{2}/(?:PXD|PRD)\d*", first_file)
    if fragment is None:
        raise ValueError(
            f"Could not find a yyyy/mm/accession path fragment in '{first_file}'"
        )
    return fragment.group()
+ """ + os.makedirs(output_folder, exist_ok=True) + url = f"{PrideProvider.V3_API_BASE_URL}/files/checksum/{accession}" + headers = {"accept": "text/plain"} + request = urllib.request.Request(url, headers=headers, method="GET") + logging.info(f"Fetching checksum file from {url}") + with urllib.request.urlopen(request) as response: + data = response.read().decode("utf-8") + # Save the data to a .tsv file + output_path = os.path.join(output_folder, f"{accession}-checksum.tsv") + with open(output_path, "w", encoding="utf-8") as file: + file.write(data) + return output_path + + # ------------------------------------------------------------------ + # Per-protocol single-file workers + # ------------------------------------------------------------------ + + @staticmethod + def _globus_download_one(file, output_folder, skip_if_downloaded_already, max_retries=6, position=0): + """Download a single file via globus; used as a worker target.""" + # Use Files facade so test patches on Files helpers keep working. 
@staticmethod
def download_files_from_ftp(
    file_list_json,
    output_folder,
    skip_if_downloaded_already,
    max_connection_retries=3,
    max_download_retries=3,
):
    """
    Download files over one anonymous FTP connection, with a progress bar per file.

    Per-file failures are logged and the loop moves on to the next file;
    nothing is raised to the caller for a failed file.

    :param file_list_json: file list in JSON format
    :param output_folder: folder to download the files (created if missing)
    :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded.
    :param max_connection_retries: Number of attempts to reconnect to the FTP server if the connection is lost.
    :param max_download_retries: Number of attempts to retry the download of a file in case of failure.
    """
    # Use the Files facade for the output-path helper (lazy import).
    from pridepy.files.files import Files

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)

    def connect_ftp():
        """Helper function to establish FTP connection."""
        ftp = FTP(PrideProvider.ARCHIVE_FTP, timeout=30)
        ftp.login()  # Anonymous login
        ftp.set_pasv(True)  # Enable passive mode
        logging.info(f"Connected to FTP host: {PrideProvider.ARCHIVE_FTP}")
        return ftp

    # Outer loop: (re)establish the connection, up to max_connection_retries times.
    # A reconnect restarts the whole file list from the first entry.
    connection_attempt = 0
    while connection_attempt < max_connection_retries:
        try:
            ftp = connect_ftp()
            for file in file_list_json:
                try:
                    # Get FTP download URL: the first location if it is labelled
                    # "FTP Protocol", otherwise the second location is used.
                    if file["publicFileLocations"][0]["name"] == "FTP Protocol":
                        download_url = file["publicFileLocations"][0]["value"]
                    else:
                        download_url = file["publicFileLocations"][1]["value"]

                    logging.debug("ftp_filepath:" + download_url)

                    # Get output file path
                    new_file_path = Files.get_output_file_name(
                        download_url, file, output_folder
                    )

                    if skip_if_downloaded_already and os.path.exists(new_file_path):
                        logging.info("Skipping download as file already exists")
                        continue

                    # Extract the server-side path from the download URL
                    # (percent-decoded, without the leading slash)
                    parsed_url = urlparse(download_url)
                    ftp_file_path = urllib.parse.unquote(parsed_url.path.lstrip("/"))

                    logging.info(f"Starting FTP download: {ftp_file_path}")

                    # Retry download in case of failure. Every attempt reopens
                    # the target with "wb", so a retry restarts from byte 0.
                    download_attempt = 0
                    while download_attempt < max_download_retries:
                        try:
                            # Get file size for progress tracking
                            total_size = ftp.size(ftp_file_path)
                            logging.info(f"File size: {total_size} bytes")

                            # Initialize progress bar
                            with open(new_file_path, "wb") as f:
                                with tqdm(
                                    total=total_size,
                                    unit="B",
                                    unit_scale=True,
                                    desc=new_file_path,
                                ) as pbar:

                                    def callback(data):
                                        f.write(data)
                                        pbar.update(len(data))

                                    # Retrieve the file with progress callback
                                    ftp.retrbinary(f"RETR {ftp_file_path}", callback)

                            logging.info(f"Successfully downloaded {new_file_path}")
                            break  # Exit download retry loop if successful
                        except (
                            socket.timeout,
                            ftplib.error_temp,
                            ftplib.error_perm,
                        ) as e:
                            download_attempt += 1
                            logging.error(
                                f"Download failed for {new_file_path} (attempt {download_attempt}): {str(e)}"
                            )
                            if download_attempt >= max_download_retries:
                                logging.error(
                                    f"Giving up on {new_file_path} after {max_download_retries} attempts."
                                )
                                break  # Give up on this file after max retries
                except (KeyError, IndexError) as e:
                    logging.error(f"Failed to process file due to missing data: {str(e)}")
                except Exception as e:
                    # NOTE(review): any error not handled by the retry loop above
                    # (for example an OSError from a dropped connection) ends up
                    # here and is only logged, so the reconnect handler below is
                    # reachable only from connect_ftp() and ftp.quit() -- confirm
                    # this is intended.
                    logging.error(f"Unexpected error while processing file: {str(e)}")
            ftp.quit()  # Close FTP connection after all files are downloaded
            logging.info(f"Disconnected from FTP host: {PrideProvider.ARCHIVE_FTP}")
            break  # Exit connection retry loop if everything was successful
        except (
            socket.timeout,
            ftplib.error_temp,
            ftplib.error_perm,
            socket.error,
        ) as e:
            connection_attempt += 1
            logging.error(f"FTP connection failed (attempt {connection_attempt}): {str(e)}")
            if connection_attempt < max_connection_retries:
                logging.info("Retrying connection...")
                time.sleep(5)  # Fixed pause before the next connection attempt
            else:
                logging.error(
                    f"Giving up after {max_connection_retries} failed connection attempts."
                )
                break
+ :param file_list_json: file list in json format + :param output_folder: folder to download the files + :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. + :param parallel_files: number of files to download simultaneously + :param checksum_map: mapping of file name to expected MD5 checksum + """ + # Use Files facade so test patches on Files._globus_download_one etc. keep working. + from pridepy.files.files import Files + + if checksum_map is None: + checksum_map = {} + + if not (os.path.isdir(output_folder)): + os.makedirs(output_folder, exist_ok=True) + + # --- Phase 0: pre-filter files that need downloading ----------------- + files_to_download: List[Dict] = [] + for file in file_list_json: + download_url = Files._get_download_url(file, "globus") + new_file_path = Files.get_output_file_name(download_url, file, output_folder) + if skip_if_downloaded_already and os.path.exists(new_file_path): + expected_cs = checksum_map.get(file.get("fileName", "")) + if expected_cs: + valid, reason = Files.validate_download(new_file_path, expected_cs) + if not valid: + logging.warning(f"Corrupted file detected ({reason}), will re-download: {new_file_path}") + files_to_download.append(file) + continue + logging.info(f"Skipping download as file already exists: {new_file_path}") + continue + files_to_download.append(file) + + if not files_to_download: + logging.info("All files already downloaded, nothing to do.") + return + + logging.info( + f"{len(file_list_json) - len(files_to_download)} file(s) skipped, " + f"{len(files_to_download)} file(s) to download" + ) + + # --- Phase 1: download (skip check already done, pass False) --------- + parallel_files = min(parallel_files, 3, len(files_to_download)) + if parallel_files < 2: + for file in files_to_download: + try: + Files._globus_download_one( + file, output_folder, False + ) + new_file_path = Files.get_output_file_name( + Files._get_download_url(file, "globus"), file, 
@staticmethod
def download_files_from_s3(
    file_list_json: List[Dict], output_folder: str, skip_if_downloaded_already
):
    """
    Download files from the public PRIDE S3 bucket with a progress bar and retries.

    The object key is derived from each record's FTP location by stripping
    the ``ftp://ftp.pride.ebi.ac.uk/pride/data/archive/`` prefix. A failure
    on one file is logged and the loop continues with the next file.

    :param file_list_json: file list in JSON format
    :param output_folder: folder to download the files (created if missing)
    :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded.
    """
    from pridepy.files.files import Files

    os.makedirs(output_folder, exist_ok=True)

    # Retry and timeout config
    retry_config = Config(
        retries={"max_attempts": 5, "mode": "standard"},
        connect_timeout=120,  # Increase timeout to 120 seconds
        read_timeout=120,  # Timeout for reading data
        signature_version=botocore.UNSIGNED,  # Unsigned requests for public data
    )

    s3_resource = boto3.resource(
        "s3",
        config=retry_config,
        endpoint_url=PrideProvider.S3_URL,
    )
    bucket = s3_resource.Bucket(PrideProvider.S3_BUCKET)

    ftp_base_url = "ftp://ftp.pride.ebi.ac.uk/pride/data/archive/"

    for file in file_list_json:
        try:
            # The FTP location is the first entry when labelled "FTP Protocol",
            # otherwise the second entry is used.
            locations = file["publicFileLocations"]
            if locations[0]["name"] == "FTP Protocol":
                download_url = locations[0]["value"]
            else:
                download_url = locations[1]["value"]

            s3_path = download_url.replace(ftp_base_url, "")
            new_file_path = Files.get_output_file_name(download_url, file, output_folder)

            if skip_if_downloaded_already and os.path.exists(new_file_path):
                logging.info("Skipping download as file already exists")
                continue

            logging.debug(f"Downloading From S3: {s3_path}")

            # Get file size for progress tracking
            obj = bucket.Object(s3_path)
            total_size = obj.content_length

            # Initialize progress bar
            progress = Progress(total_size, new_file_path)
            try:
                # Download with progress bar and retry handling
                for attempt in range(5):
                    try:
                        bucket.download_file(s3_path, new_file_path, Callback=progress)
                        logging.info(f"Successfully downloaded {new_file_path}")
                        break
                    except botocore.exceptions.ClientError as e:
                        if e.response["Error"]["Code"] == "404":
                            logging.error("The object does not exist.")
                            break
                        logging.error(f"Download failed: {e}")
                        if attempt < 4:
                            time.sleep(2**attempt)  # Exponential backoff
                            logging.info(f"Retrying... ({attempt + 1}/5)")
                        else:
                            raise
            finally:
                # Close the bar on every exit path (success, 404, exhausted
                # retries), not only after a successful download.
                progress.close()
        except Exception as e:
            logging.error(f"Failed to download {file['fileName']}: {e}")
r.raise_for_status() + total_size = int(r.headers.get("content-length", 0)) + resume_size + block_size = 1024 * 1024 # 1 MB chunks + + with tqdm( + total=total_size, + unit="B", + unit_scale=True, + desc=new_file_path, + initial=resume_size, + ) as pbar: + with open(new_file_path, mode) as f: + for chunk in r.iter_content(chunk_size=block_size): + if chunk: + f.write(chunk) + pbar.update(len(chunk)) + + logging.info(f"Successfully downloaded {new_file_path}") + + else: + logging.info( + "File name {} found more than once for the given project {}".format( + file_name, accession + ) + ) + else: + logging.info( + f"File name {file_name} now found in the project {accession}, or user don't have access" + ) + raise Exception( + f"File name {file_name} now found in the project {accession}, or user don't have access" + ) + + # ------------------------------------------------------------------ + # Multi-protocol orchestrator + # ------------------------------------------------------------------ + + @staticmethod + def _batch_download_by_protocol( + file_list: List[Dict], + output_folder: str, + protocol: str, + skip_if_downloaded_already: bool, + aspera_maximum_bandwidth: str, + parallel_files: int = 1, + checksum_map: Optional[Dict[str, str]] = None, + ) -> None: + """ + Transfer a batch of files with one protocol, reusing a single + connection where the underlying helper supports it (FTP, S3). + """ + # Use Files facade so test patches on each per-protocol helper keep working. 
+ from pridepy.files.files import Files + + if not file_list: + return + if protocol == "ftp": + Files.download_files_from_ftp( + file_list, + output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + ) + return + if protocol == "aspera": + Files.download_files_from_aspera( + file_list, + output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + maximum_bandwidth=aspera_maximum_bandwidth, + ) + return + if protocol == "globus": + Files.download_files_from_globus( + file_list, + output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + parallel_files=parallel_files, + checksum_map=checksum_map or {}, + ) + return + if protocol == "s3": + Files.download_files_from_s3( + file_list, + output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + ) + return + raise ValueError(f"Unsupported protocol: {protocol}") + + @staticmethod + def _download_with_fallback( + file_record: Dict, + output_folder: str, + protocol_sequence: List[str], + expected_checksum: Optional[str], + aspera_maximum_bandwidth: str, + max_protocol_retries: int = 2, + parallel_files: int = 1, + ) -> bool: + """ + Download one file by trying each protocol in sequence, validating + after every attempt. Intended as the per-file fallback path; batch + download of the primary protocol is handled separately. + """ + # Patch-sensitive: call through Files so test patches intercept. 
+ from pridepy.files.files import Files + + local_path = Files._resolve_local_path(file_record, output_folder) + + for protocol in protocol_sequence: + for attempt in range(1, max_protocol_retries + 1): + logging.info( + f"Downloading {file_record['fileName']} via {protocol} " + f"(attempt {attempt}/{max_protocol_retries})" + ) + try: + Files._remove_if_exists(local_path) + Files._batch_download_by_protocol( + [file_record], + output_folder, + protocol, + skip_if_downloaded_already=False, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, + parallel_files=parallel_files, + ) + except Exception as error: + logging.error( + f"Protocol {protocol} failed for {file_record['fileName']}: {error}" + ) + + valid, reason = Files.validate_download(local_path, expected_checksum) + if valid: + logging.info( + f"File {file_record['fileName']} downloaded successfully via {protocol}" + ) + return True + + logging.warning( + f"Validation failed for {file_record['fileName']} via {protocol}: {reason}" + ) + Files._remove_if_exists(local_path) + + logging.warning( + f"Protocol {protocol} exhausted for {file_record['fileName']}, switching protocol." 
def download_files(
    self,
    accession,
    records: List[Dict],
    output_folder: str,
    skip_if_downloaded_already,
    protocol: str = "ftp",
    aspera_maximum_bandwidth: str = "100M",
    checksum_check: bool = False,
    parallel_files: int = 1,
    username: Optional[str] = None,
    password: Optional[str] = None,
):
    """Implement Provider.download_files — maps to the legacy static batch downloader.

    ``username`` and ``password`` are accepted but not forwarded to the
    batch downloader.
    """
    PrideProvider._download_files_batch(
        file_list_json=records,
        accession=accession,
        output_folder=output_folder,
        skip_if_downloaded_already=skip_if_downloaded_already,
        protocol=protocol,
        aspera_maximum_bandwidth=aspera_maximum_bandwidth,
        checksum_check=checksum_check,
        parallel_files=parallel_files,
    )


@staticmethod
def _download_files_batch(
    file_list_json: List[Dict],
    accession,
    output_folder: str,
    skip_if_downloaded_already,
    protocol: str = "ftp",
    aspera_maximum_bandwidth: str = "100M",  # Aspera maximum bandwidth
    checksum_check=False,
    parallel_files: int = 1,
):
    """
    Download a batch of files, then validate each one and retry failures.

    Phase 1 transfers the whole list with the first protocol of
    ``Files._protocol_sequence(protocol)``. Phase 2 validates every file and
    re-downloads the missing or invalid ones one by one through
    ``Files._download_with_fallback``.

    An unsupported ``protocol`` is logged as an error and the method returns
    without downloading anything.

    :param file_list_json: File list in JSON format
    :param accession: Project accession (used to fetch the checksum file)
    :param output_folder: Folder to download the files (created when missing)
    :param skip_if_downloaded_already: Boolean value to skip the download if
        the file has already been downloaded (batch phase only).
    :param protocol: ftp, aspera, globus or s3
    :param aspera_maximum_bandwidth: parameter in Aspera sets the maximum
        bandwidth for the transfer.
    :param checksum_check: when true, download the project's checksum file
        and validate each file's MD5 against it.
    :param parallel_files: forwarded to the per-protocol helpers.
    :raises RuntimeError: when at least one file is still missing or invalid
        after the per-file fallback.
    """
    # Patch-sensitive: call _batch_download_by_protocol and
    # _download_with_fallback through Files so test patches intercept.
    from pridepy.files.files import Files

    protocols_supported = ["ftp", "aspera", "globus", "s3"]
    if protocol not in protocols_supported:
        logging.error("Protocol should be one of ftp, aspera, globus, s3")
        return

    os.makedirs(output_folder, exist_ok=True)

    checksum_map: Dict[str, str] = {}
    if checksum_check:
        checksum_file_path = Files.save_checksum_file(accession, output_folder)
        checksum_map = Files.read_checksum_file(checksum_file_path)
        logging.info(f"Loaded checksums for {len(checksum_map)} files")

    if not file_list_json:
        return

    # The same sequence drives both phases: its first entry is the batch
    # protocol, and the per-file fallback retries it before trying the rest.
    protocol_sequence = Files._protocol_sequence(protocol)
    primary_protocol = protocol_sequence[0]

    # Phase 1: batch download with the requested protocol. Reuses a single
    # FTP/S3 connection for all files instead of paying the per-file
    # reconnect cost in the common happy path.
    logging.info(
        f"Downloading {len(file_list_json)} file(s) via {primary_protocol} (batch)"
    )
    try:
        Files._batch_download_by_protocol(
            file_list_json,
            output_folder,
            primary_protocol,
            skip_if_downloaded_already=skip_if_downloaded_already,
            aspera_maximum_bandwidth=aspera_maximum_bandwidth,
            parallel_files=parallel_files,
            checksum_map=checksum_map,
        )
    except Exception as exc:
        # Deliberately non-fatal: phase 2 finds out which files are missing.
        logging.warning(
            f"Batch {primary_protocol} run hit an error; will retry individual failures: {exc}"
        )

    # Phase 2: validate every file and fall back per-file for the ones
    # that are missing or invalid.
    logging.info("Phase 2: validating %d downloaded file(s)", len(file_list_json))
    failed_files: List[str] = []
    for i, file_record in enumerate(file_list_json, 1):
        file_name = file_record["fileName"]
        expected_checksum = checksum_map.get(file_name)
        local_path = Files._resolve_local_path(file_record, output_folder)
        logging.info("Validating [%d/%d] %s", i, len(file_list_json), file_name)
        valid, reason = Files.validate_download(local_path, expected_checksum)
        if valid:
            continue

        logging.warning(f"{file_name} invalid after {primary_protocol} ({reason})")
        if "checksum mismatch" in reason:
            Files._remove_if_exists(local_path)

        success = Files._download_with_fallback(
            file_record=file_record,
            output_folder=output_folder,
            protocol_sequence=protocol_sequence,
            expected_checksum=expected_checksum,
            aspera_maximum_bandwidth=aspera_maximum_bandwidth,
            parallel_files=parallel_files,
        )
        if not success:
            failed_files.append(file_name)

    if failed_files:
        failed_summary = ", ".join(failed_files)
        logging.error(f"Failed to download {len(failed_files)} file(s): {failed_summary}")
        raise RuntimeError(f"Failed to download {len(failed_files)} file(s): {failed_summary}")
class ProteomeXchangeProvider(Provider):
    """Provider that lists and downloads raw files through the ProteomeXchange XML."""

    name: ClassVar[str] = "proteomexchange"

    @staticmethod
    def matches(accession: str) -> bool:
        """Return True for PXD/PRD accessions or ProteomeCentral URLs.

        Not used by :mod:`pridepy.providers.registry` (this provider is
        deliberately not auto-registered). Provided for parity with the
        ``Provider`` interface and so direct callers can introspect whether
        a given input looks like something ProteomeXchange knows how to
        handle.
        """
        if not accession:
            return False
        if accession.lower().startswith(("http://", "https://")):
            return "proteomexchange" in accession.lower() or "cgi/GetDataset" in accession
        return bool(re.fullmatch(r"(?:PXD|PRD)\d+", accession.upper()))

    @staticmethod
    def _normalize_px_xml_url(px_id_or_url: str) -> str:
        """Build the ProteomeXchange XML endpoint URL from an accession or URL.

        Examples accepted:
        - ``PXD039236``
        - ``https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236``
        - ``https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236&anything``

        Surrounding whitespace is ignored and the URL scheme is matched
        case-insensitively, in line with :meth:`matches`. Any input that is
        not a recognised URL is used verbatim as the ``ID`` value.
        """
        endpoint = "https://proteomecentral.proteomexchange.org/cgi/GetDataset"
        xml_flags = "outputMode=XML&test=no"
        candidate = px_id_or_url.strip()

        if candidate.lower().startswith(("http://", "https://")):
            parsed = urlparse(candidate)
            query = parsed.query or ""
            # Prefer an explicit, non-empty ID=... query parameter.
            for pair in query.split("&"):
                if pair.startswith("ID=") and pair[3:]:
                    return f"{endpoint}?ID={pair[3:]}&{xml_flags}"
            if parsed.path.endswith("/cgi/GetDataset"):
                if query:
                    return f"{endpoint}?{query}&{xml_flags}"
                # No query at all: avoid emitting a stray "?&".
                return f"{endpoint}?{xml_flags}"
        return f"{endpoint}?ID={candidate}&{xml_flags}"

    @staticmethod
    def _parse_px_xml_for_raw_file_urls(px_xml_url: str) -> List[str]:
        """Fetch the PX XML and return every ``Associated raw file URI`` value.

        :raises: whatever ``response.raise_for_status()`` raises on an HTTP
            error, and ``xml.etree.ElementTree.ParseError`` on malformed XML.
        """
        headers = {"Accept": "application/xml"}
        response = Util.get_api_call(px_xml_url, headers)
        response.raise_for_status()
        root = ET.fromstring(response.content)

        urls: List[str] = []
        for dataset_file in root.iter("DatasetFile"):
            for cv in dataset_file.findall("cvParam"):
                name = cv.attrib.get("name")
                value = cv.attrib.get("value")
                if name == "Associated raw file URI" and value:
                    urls.append(value)
        return urls

    def list_files(self, accession: str) -> List[Dict]:
        """Return the dataset's raw-file URIs as minimal file records.

        The PX XML doesn't expose checksums or rich category labels, so
        each record carries just enough to drive the downloader. Every URI
        is stored under the ``"FTP Protocol"`` location name regardless of
        its actual scheme; :meth:`download_files` routes by the URL itself.
        """
        px_xml_url = self._normalize_px_xml_url(accession)
        logging.info(f"Fetching PX XML: {px_xml_url}")
        urls = self._parse_px_xml_for_raw_file_urls(px_xml_url)
        records: List[Dict] = []
        for url in urls:
            parsed = urlparse(url)
            records.append(
                {
                    "accession": accession,
                    "fileName": os.path.basename(parsed.path),
                    "fileCategory": {"value": "RAW"},
                    "publicFileLocations": [
                        {"name": "FTP Protocol", "value": url}
                    ],
                    "source": "ProteomeXchange",
                }
            )
        return records

    def download_files(
        self,
        accession: str,
        records: List[Dict],
        output_folder: str,
        skip_if_downloaded_already: bool,
        protocol: str,
        parallel_files: int = 1,
        checksum_check: bool = False,
        aspera_maximum_bandwidth: str = "100M",
        username: Optional[str] = None,
        password: Optional[str] = None,
    ) -> None:
        """Partition record URLs by scheme and route to the matching transport.

        Routes ftp:// records to :meth:`Files.download_ftp_urls` and
        http(s):// records to :meth:`Files.download_http_urls`, going
        through the Files facade so test patches like
        ``patch.object(Files, "download_ftp_urls")`` continue to intercept.

        URLs with any other scheme are skipped with a warning. Only
        ``records``, ``output_folder`` and ``skip_if_downloaded_already``
        influence the transfer; the remaining parameters are accepted but
        not used here.
        """
        from pridepy.files.files import Files  # lazy: avoid module-load cycle

        if not os.path.isdir(output_folder):
            os.makedirs(output_folder, exist_ok=True)

        ftp_urls: List[str] = []
        http_urls: List[str] = []
        for record in records:
            if not record.get("publicFileLocations"):
                continue
            url = record["publicFileLocations"][0]["value"]
            lowered = url.lower()
            if lowered.startswith("ftp://"):
                ftp_urls.append(url)
            elif lowered.startswith(("http://", "https://")):
                http_urls.append(url)
            else:
                # Previously dropped silently; make the skip visible.
                logging.warning(f"Skipping {url}: unsupported URL scheme")

        if ftp_urls:
            Files.download_ftp_urls(
                ftp_urls, output_folder, skip_if_downloaded_already
            )
        if http_urls:
            Files.download_http_urls(
                http_urls, output_folder, skip_if_downloaded_already
            )

    def download_from_accession_or_url(
        self,
        px_id_or_url: str,
        output_folder: str,
        skip_if_downloaded_already: bool = True,
    ) -> None:
        """End-to-end: resolve XML, list files, partition by scheme, download.

        Convenience for the ``download-px-raw-files`` CLI command — combines
        :meth:`list_files` and :meth:`download_files` with the original
        ``download_px_raw_files`` defaults (skip-if-downloaded-already
        defaults to ``True``, no parallel workers).
        """
        records = self.list_files(px_id_or_url)
        if not records:
            logging.info("No Associated raw file URIs found in PX XML")
            return
        self.download_files(
            accession=px_id_or_url,
            records=records,
            output_folder=output_folder,
            skip_if_downloaded_already=skip_if_downloaded_already,
            protocol="ftp",
        )
def register(provider_cls: Type[Provider]) -> Type[Provider]:
    """Add ``provider_cls`` to the registry once and hand it back.

    Returning the class lets this function double as a class decorator.
    """
    already_registered = provider_cls in _PROVIDERS
    if not already_registered:
        _PROVIDERS.append(provider_cls)
    return provider_cls


def resolve(accession: str) -> Provider:
    """Instantiate the first registered provider whose ``matches`` accepts ``accession``.

    :raises ValueError: when no registered provider matches.
    """
    # Registration order is the priority order, so the first hit wins.
    chosen = next(
        (candidate for candidate in _PROVIDERS if candidate.matches(accession)),
        None,
    )
    if chosen is None:
        raise ValueError(f"No provider registered for accession {accession!r}")
    return chosen()


def is_known(accession: str) -> bool:
    """Tell whether at least one registered provider matches ``accession``."""
    return any(candidate.matches(accession) for candidate in _PROVIDERS)
def _local_path_for_url(download_url: str, output_folder: str) -> str:
    """Return ``output_folder`` joined with the last path segment of ``download_url``."""
    filename = os.path.basename(urlparse(download_url).path)
    return os.path.join(output_folder, filename)


def _open_ftp_connection(host: str, use_tls: bool, timeout: int = 30) -> FTP:
    """
    Open an anonymous FTP connection, transparently using FTPS when the
    server requires TLS (e.g., MassIVE). When ``use_tls`` is False but the
    server answers ``login`` with a temporary error mentioning TLS, the
    connection is retried with FTPS so callers don't need to know the
    policy in advance.

    The control socket is closed before any login failure is re-raised.

    :returns: a logged-in connection switched to passive mode.
    """

    def _close_quietly(connection: FTP) -> None:
        # Best effort: a failing close must not hide the original error.
        try:
            connection.close()
        except Exception:
            pass

    def _login_tls() -> FTP:
        secure = ftplib.FTP_TLS(host, timeout=timeout)
        try:
            secure.login()
            secure.prot_p()
        except Exception:
            _close_quietly(secure)
            raise
        return secure

    if use_tls:
        ftp: FTP = _login_tls()
    else:
        ftp = FTP(host, timeout=timeout)
        try:
            ftp.login()
        except ftplib.error_temp as e:
            _close_quietly(ftp)
            if "TLS" not in str(e).upper():
                raise
            ftp = _login_tls()
        except Exception:
            _close_quietly(ftp)
            raise
    ftp.set_pasv(True)
    return ftp


def _walk_ftp_tree(ftp: FTP, remote_dir: str) -> List[str]:
    """
    Recursively list files under a remote FTP directory.

    Uses ``MLSD`` when the server supports it and falls back to parsing
    ``LIST`` output otherwise. Only a failure of the ``MLSD`` call itself
    triggers the fallback; errors raised while descending into
    subdirectories propagate to the caller.

    :returns: file paths built as ``remote_dir/<name>/...``.
    """
    import posixpath

    base = remote_dir.rstrip("/")
    file_paths: List[str] = []

    # Keep the try narrow: wrapping the recursion would mistake a failing
    # subdirectory for "MLSD unsupported" and re-walk this directory via
    # LIST on top of the entries already collected.
    try:
        entries = list(ftp.mlsd(remote_dir))
    except (AttributeError, ftplib.error_perm):
        entries = None

    if entries is not None:
        for name, facts in entries:
            if name in {".", ".."}:
                continue
            child_path = posixpath.join(base, name)
            entry_type = facts.get("type")
            if entry_type == "dir":
                file_paths.extend(_walk_ftp_tree(ftp, child_path))
            elif entry_type == "file":
                file_paths.append(child_path)
        return file_paths

    current_dir = ftp.pwd()
    listing: List[str] = []
    try:
        ftp.cwd(remote_dir)
        ftp.retrlines("LIST", listing.append)
    finally:
        ftp.cwd(current_dir)

    # Recurse only after the working directory is restored, so a relative
    # ``remote_dir`` still resolves from the same starting point.
    for entry in listing:
        parts = entry.split(maxsplit=8)
        if len(parts) < 9:
            continue
        name = parts[8]
        if name in {".", ".."}:
            continue
        child_path = posixpath.join(base, name)
        if entry.startswith("d"):
            file_paths.extend(_walk_ftp_tree(ftp, child_path))
        else:
            file_paths.append(child_path)
    return file_paths
def _download_one_ftp_path(
    ftp: FTP,
    ftp_path: str,
    local_path: str,
    skip_if_downloaded_already: bool,
    max_download_retries: int,
    position: int = 0,
) -> None:
    """
    Download a single FTP path over an existing connection, with REST resume
    and per-file retry. Raises on giving up so the caller can decide what to do.

    :param ftp: logged-in connection to the host serving ``ftp_path``.
    :param ftp_path: remote path passed to ``SIZE`` and ``RETR``.
    :param local_path: destination file; a non-empty existing file is
        treated as a partial download and resumed.
    :param skip_if_downloaded_already: return at once when ``local_path`` exists.
    :param max_download_retries: number of attempts before giving up.
    :param position: row of the progress bar (used by parallel workers).
    :raises RuntimeError: after ``max_download_retries`` failed attempts.
    """
    if skip_if_downloaded_already and os.path.exists(local_path):
        logging.info(f"Skipping download as file already exists: {local_path}")
        return

    attempt = 0
    last_error: Optional[Exception] = None
    allow_resume = True
    while attempt < max_download_retries:
        resumed = False
        received = 0
        try:
            total_size = ftp.size(ftp_path)
            size_known = isinstance(total_size, int) and total_size > 0
            current_size = os.path.getsize(local_path) if os.path.exists(local_path) else 0

            if size_known and current_size == total_size:
                # Nothing left to transfer.
                logging.info(f"Successfully downloaded {local_path}")
                return
            if not allow_resume or (size_known and current_size > total_size):
                # Resume was refused, or the local file is longer than the
                # remote one: start again from an empty file.
                current_size = 0

            resumed = current_size > 0
            with open(local_path, "ab" if resumed else "wb") as f, tqdm(
                total=total_size,
                unit="B",
                unit_scale=True,
                desc=local_path,
                initial=current_size,
                position=position,
                leave=True,
            ) as pbar:
                def callback(data):
                    nonlocal received
                    f.write(data)
                    received += len(data)
                    pbar.update(len(data))

                if resumed:
                    # Let ftplib issue REST right before RETR. A REST sent by
                    # hand is followed by TYPE I and PASV, after which the
                    # restart marker is not guaranteed to survive and the
                    # whole file could be appended to the partial one.
                    ftp.retrbinary(f"RETR {ftp_path}", callback, rest=current_size)
                else:
                    ftp.retrbinary(f"RETR {ftp_path}", callback)

            # Post-transfer integrity check: server-reported size must match
            # the local size. Catches half-finished transfers that retrbinary
            # didn't raise on (e.g. server closed the data channel early).
            # The next iteration resumes, or restarts if the file overshot.
            if total_size:
                final_size = os.path.getsize(local_path)
                if final_size != total_size:
                    attempt += 1
                    logging.error(
                        f"Size mismatch for {local_path}: "
                        f"got {final_size} bytes, expected {total_size} "
                        f"(attempt {attempt})"
                    )
                    continue
            logging.info(f"Successfully downloaded {local_path}")
            return
        except (socket.timeout, ftplib.error_temp, ftplib.error_perm) as e:
            attempt += 1
            last_error = e
            if resumed and received == 0 and not isinstance(e, socket.timeout):
                # The server rejected the resumed transfer before sending
                # any data; retry as a full download.
                allow_resume = False
            logging.error(
                f"Download failed for {local_path} (attempt {attempt}): {e}"
            )
    raise RuntimeError(
        f"Giving up on {local_path} after {max_download_retries} attempts"
    ) from last_error
def _download_ftp_paths_parallel(
    host: str,
    paths: List[str],
    output_folder: str,
    skip_if_downloaded_already: bool,
    use_tls: bool,
    max_connection_retries: int,
    max_download_retries: int,
    parallel_files: int,
) -> None:
    """
    Fetch ``paths`` from ``host`` with a pool of ``parallel_files`` threads.

    Every task opens (and closes) its own FTP connection, so transfers do
    not serialize on a shared control channel. Failures are logged, never
    raised to the caller.
    """

    def _disconnect(connection) -> None:
        # Polite QUIT first; fall back to dropping the socket.
        try:
            connection.quit()
        except Exception:
            try:
                connection.close()
            except Exception:
                pass

    def worker(ftp_path: str, position: int) -> None:
        target = os.path.join(output_folder, os.path.basename(ftp_path))
        if skip_if_downloaded_already and os.path.exists(target):
            logging.info(f"Skipping download as file already exists: {target}")
            return

        for connection_attempt in range(1, max_connection_retries + 1):
            try:
                connection = _open_ftp_connection(host, use_tls=use_tls)
                try:
                    # The skip check already ran above, hence False here.
                    _download_one_ftp_path(
                        ftp=connection,
                        ftp_path=ftp_path,
                        local_path=target,
                        skip_if_downloaded_already=False,
                        max_download_retries=max_download_retries,
                        position=position,
                    )
                    return
                finally:
                    _disconnect(connection)
            except (socket.timeout, ftplib.error_temp, ftplib.error_perm, OSError) as e:
                logging.error(
                    f"FTP connection failed for {ftp_path} (attempt {connection_attempt}): {e}"
                )
                if connection_attempt < max_connection_retries:
                    time.sleep(5)
        logging.error(f"Giving up on {ftp_path} from {host}")

    with ThreadPoolExecutor(max_workers=parallel_files) as executor:
        pending = []
        for idx, path in enumerate(paths):
            pending.append(executor.submit(worker, path, idx))
        for finished in as_completed(pending):
            try:
                finished.result()
            except Exception as e:
                logging.error(f"Parallel FTP download error: {e}")
def _parallel_download(url, file_path, position=0):
    """Stream ``url`` into ``file_path`` over one HTTP connection.

    A HEAD request probes the size and Range support first. When a partial
    file exists and the server advertises byte ranges, the transfer resumes
    from the local size; in every other case the file is rewritten from
    the start."""
    chunk_bytes = 8 * 1024 * 1024
    session = Util.create_session_with_retries()

    total_size = 0
    accept_ranges = "none"
    try:
        probe = session.head(url, timeout=(30, 30))
        probe.raise_for_status()
        probed_size = int(probe.headers.get("content-length", 0))
        probed_ranges = probe.headers.get("accept-ranges", "none").strip().lower()
    except (requests.RequestException, ValueError) as exc:
        logging.info(f"HEAD request failed, falling back to single connection: {exc}")
    else:
        total_size = probed_size
        accept_ranges = probed_ranges

    offset = 0
    if os.path.exists(file_path) and accept_ranges == "bytes" and total_size > 0:
        offset = os.path.getsize(file_path)
        if offset >= total_size:
            logging.info(f"File already complete: {file_path}")
            return
        if offset > 0:
            logging.info(f"Resuming download from {offset} bytes: {file_path}")

    request_headers = {}
    if offset > 0:
        request_headers = {"Range": f"bytes={offset}-"}

    with session.get(url, headers=request_headers, stream=True, timeout=(30, 60)) as r:
        r.raise_for_status()
        if offset > 0 and r.status_code != 206:
            # Anything but 206 means the body is the whole file again.
            logging.warning("Server did not honor Range request (status %s), restarting download", r.status_code)
            offset = 0
        with tqdm(total=total_size, unit="B", unit_scale=True, desc=file_path,
                  initial=offset, position=position, leave=True) as pbar:
            write_mode = "wb" if offset <= 0 else "ab"
            with open(file_path, write_mode, buffering=chunk_bytes) as out:
                for piece in r.iter_content(chunk_size=chunk_bytes):
                    if not piece:
                        continue
                    out.write(piece)
                    pbar.update(len(piece))
def download_http_urls(
    http_urls: List[str],
    output_folder: str,
    skip_if_downloaded_already: bool,
    parallel_files: int = 1,
    max_retries: int = 3,
) -> None:
    """
    Fetch every URL in ``http_urls`` into ``output_folder``.

    Each file goes through ``_http_download_one`` (resume plus per-file
    retries). With ``parallel_files`` > 1 the files are spread over a
    :class:`ThreadPoolExecutor`, capped at one worker per URL; otherwise
    they are fetched one after another. Per-file failures are logged and
    do not stop the remaining downloads.
    """
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder, exist_ok=True)

    if not http_urls:
        return

    workers = max(1, min(parallel_files, len(http_urls)))

    if workers <= 1:
        # Serial path: one URL at a time, progress bar on the default row.
        for url in http_urls:
            try:
                _http_download_one(
                    url,
                    output_folder,
                    skip_if_downloaded_already,
                    max_retries,
                )
            except Exception as e:
                logging.error(f"HTTP download failed for {url}: {e}")
        return

    logging.info(
        f"Downloading {len(http_urls)} HTTP(S) file(s) with {workers} parallel workers"
    )
    with ThreadPoolExecutor(max_workers=workers) as executor:
        pending = []
        for idx, url in enumerate(http_urls):
            # idx doubles as the progress-bar row of this file.
            pending.append(
                executor.submit(
                    _http_download_one,
                    url,
                    output_folder,
                    skip_if_downloaded_already,
                    max_retries,
                    idx,
                )
            )
        for finished in as_completed(pending):
            try:
                finished.result()
            except Exception as e:
                logging.error(f"Parallel HTTP download error: {e}")
+""" +import hashlib +import logging +import os +from typing import Dict, List, Optional, Tuple + +from tqdm import tqdm + + +class Progress: + def __init__(self, total_size, file_name): + self.pbar = tqdm( + total=total_size, + unit="B", + unit_scale=True, + desc="Downloading {}".format(file_name), + ) + + def __call__(self, bytes_amount): + self.pbar.update(bytes_amount) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.pbar.close() + + def close(self): + self.pbar.close() + + +def _find_tsv_columns(header: str) -> Optional[Tuple[int, int]]: + """Return (name_idx, checksum_idx) from a TSV header, or None.""" + cols = [col.strip().lower() for col in header.split("\t")] + required_cols = {"file-name", "file-md5checksum", "file-size"} + if not required_cols.issubset(set(cols)): + return None + return cols.index("file-name"), cols.index("file-md5checksum") + + +def _is_md5_checksum(value: str) -> bool: + return len(value) == 32 and all(char in "0123456789abcdef" for char in value) + + +def read_checksum_file(checksum_file_path: str) -> Dict[str, str]: + """ + Read PRIDE API checksum TSV and build {file_name: md5} map. 
+ Expected format: File-Name\tFile-MD5Checksum\tFile-Size + """ + checksums: Dict[str, str] = {} + if not checksum_file_path or not os.path.exists(checksum_file_path): + return checksums + + with open(checksum_file_path, "r", encoding="utf-8") as f: + header = f.readline().strip() + if not header: + return checksums + + col_indices = _find_tsv_columns(header) + if col_indices is None: + logging.warning(f"Unrecognized checksum file format: {header}") + return checksums + + name_idx, checksum_idx = col_indices + min_cols = max(name_idx, checksum_idx) + 1 + for line in f: + parts = line.strip().split("\t") + if len(parts) >= min_cols: + fn = os.path.basename(parts[name_idx].strip()) + cs = parts[checksum_idx].strip().lower() + if fn and _is_md5_checksum(cs): + checksums[fn] = cs + + return checksums + + +def compute_md5(file_path: str, chunk_size: int = 4 * 1024 * 1024) -> str: + """ + Compute an MD5 checksum for integrity validation, not for security use. + """ + try: + md5 = hashlib.md5(usedforsecurity=False) + except TypeError: + md5 = hashlib.md5() + with open(file_path, "rb") as file_handle: + while True: + chunk = file_handle.read(chunk_size) + if not chunk: + break + md5.update(chunk) + return md5.hexdigest() + + +def validate_download(file_path: str, expected_checksum: Optional[str] = None) -> Tuple[bool, str]: + """ + Validate a local file exists, is non-empty, and checksum matches when provided. + """ + if not os.path.exists(file_path): + return False, "file does not exist" + if os.path.getsize(file_path) == 0: + return False, "file is empty" + if expected_checksum: + actual_checksum = compute_md5(file_path) + if actual_checksum.lower() != expected_checksum.lower(): + return False, ( + f"checksum mismatch (expected={expected_checksum.lower()}, actual={actual_checksum.lower()})" + ) + return True, "ok" + + +def _remove_if_exists(file_path: str) -> None: + """ + Remove a file if it already exists locally. 
+ """ + if os.path.exists(file_path): + os.remove(file_path) + + +def _get_download_url(file_record: Dict, protocol: str) -> str: + """ + Resolve the public download URL for a file and protocol. + + Raises ValueError when the requested protocol has no suitable location. + Aspera requires a dedicated "Aspera Protocol" entry; ftp/s3/globus + derive their URL from the "FTP Protocol" entry (falling back to an + arbitrary non-Aspera location would produce a URL the caller cannot + actually transfer with). + """ + from pridepy.files.files import Files + + locations = file_record.get("publicFileLocations", []) + if not locations: + raise ValueError("No public file locations present") + + aspera_url = None + ftp_url = None + for location in locations: + name = location.get("name") + if name == "Aspera Protocol": + aspera_url = location.get("value") + elif name == "FTP Protocol": + ftp_url = location.get("value") + + if protocol == "aspera": + if not aspera_url: + raise ValueError("Aspera URL not available") + return aspera_url + + if not ftp_url: + raise ValueError("FTP URL not available") + if protocol == "ftp": + return ftp_url + if protocol == "globus": + return ftp_url.replace( + Files.PRIDE_ARCHIVE_FTP_URL_PREFIX, + Files.PRIDE_ARCHIVE_HTTPS_URL_PREFIX, + 1, + ) + if protocol == "s3": + return ftp_url + raise ValueError(f"Unsupported protocol: {protocol}") + + +def _resolve_local_path(file_record: Dict, output_folder: str) -> str: + """ + Compute the canonical local path for a file regardless of transfer protocol. 
+ """ + from pridepy.files.files import Files + + try: + canonical_url = _get_download_url(file_record, "ftp") + except ValueError: + canonical_url = "" + if canonical_url: + return Files.get_output_file_name(canonical_url, file_record, output_folder) + return os.path.join(output_folder, file_record["fileName"]) diff --git a/pridepy/tests/test_download_by_list.py b/pridepy/tests/test_download_by_list.py index df81914..5115b4e 100644 --- a/pridepy/tests/test_download_by_list.py +++ b/pridepy/tests/test_download_by_list.py @@ -14,6 +14,7 @@ from pridepy.files.files import Files from pridepy.pridepy import _read_filename_arguments +from pridepy.providers.pride import PrideProvider class TestDownloadFilesByList(TestCase): @@ -36,8 +37,8 @@ def test_filters_metadata_and_delegates(self): {"fileName": "c.raw"}, ] with patch.object( - files_obj, "stream_all_files_by_project", return_value=api_response - ), patch.object(files_obj, "download_files") as mock_download: + PrideProvider, "list_files", return_value=api_response + ), patch.object(PrideProvider, "download_files") as mock_download: files_obj.download_files_by_list( accession="PXD001819", file_names=["a.raw", "c.raw"], @@ -46,16 +47,16 @@ def test_filters_metadata_and_delegates(self): protocol="ftp", ) - args, _ = mock_download.call_args - matched = args[0] + _, kwargs = mock_download.call_args + matched = kwargs["records"] assert {f["fileName"] for f in matched} == {"a.raw", "c.raw"} def test_warns_on_partial_match(self): files_obj = Files() api_response = [{"fileName": "a.raw"}] with patch.object( - files_obj, "stream_all_files_by_project", return_value=api_response - ), patch.object(files_obj, "download_files") as mock_download, self.assertLogs( + PrideProvider, "list_files", return_value=api_response + ), patch.object(PrideProvider, "download_files") as mock_download, self.assertLogs( level="WARNING" ) as log_ctx: files_obj.download_files_by_list( @@ -71,7 +72,7 @@ def test_warns_on_partial_match(self): def 
test_raises_when_no_files_match(self): files_obj = Files() with patch.object( - files_obj, "stream_all_files_by_project", return_value=[] + PrideProvider, "list_files", return_value=[] ): with pytest.raises(ValueError, match="No matching files"): files_obj.download_files_by_list( diff --git a/pridepy/tests/test_download_resilience.py b/pridepy/tests/test_download_resilience.py index 21b1603..0f86013 100644 --- a/pridepy/tests/test_download_resilience.py +++ b/pridepy/tests/test_download_resilience.py @@ -268,3 +268,43 @@ def test_download_files_raises_when_any_file_fails(self): skip_if_downloaded_already=False, protocol="ftp", ) + + def test_facade_dispatches_pride_through_registry_to_fallback(self): + """Files().download_all_raw_files for a PXD accession must flow: + Files facade -> Registry.resolve -> PrideProvider.download_files + -> _batch_download_by_protocol (mocked). + + Patching Files._batch_download_by_protocol proves the patch intercepts + (i.e. PrideProvider calls *back* through Files, preserving the test + contract for the multi-protocol orchestrator). 
def _make_fake_ftp(expected_size, write_bytes_per_call):
    """Build a ``MagicMock`` standing in for an FTP connection.

    ``size()`` reports ``expected_size``. The N-th ``retrbinary`` call
    passes ``b"x"`` repeated ``write_bytes_per_call[N]`` times to the
    supplied callback, so each attempt can deliver a different payload
    length. The running call index is kept on the mock as ``_call_count``.
    """

    def _serve(cmd, callback):
        attempt = fake._call_count
        fake._call_count = attempt + 1
        callback(b"x" * write_bytes_per_call[attempt])

    fake = MagicMock()
    fake.size.return_value = expected_size
    fake.sendcmd = MagicMock()
    fake._call_count = 0
    fake.retrbinary.side_effect = _serve
    return fake


class TestSizeMismatchValidation(TestCase):
    """Local size is compared with the server-reported size after transfer."""

    @staticmethod
    def _fetch(ftp, local_path):
        # Single shared invocation so each test differs only in its fake.
        Files._download_one_ftp_path(
            ftp=ftp,
            ftp_path="/JPST000001/f.bin",
            local_path=local_path,
            skip_if_downloaded_already=False,
            max_download_retries=3,
        )

    def test_size_mismatch_is_retried_then_succeeds(self):
        """50 of 100 bytes on the first attempt, the remaining 50 on the retry."""
        ftp = _make_fake_ftp(expected_size=100, write_bytes_per_call=[50, 50])
        with tempfile.TemporaryDirectory() as tmp:
            local_path = os.path.join(tmp, "f.bin")

            self._fetch(ftp, local_path)

            assert os.path.getsize(local_path) == 100
            assert ftp.retrbinary.call_count == 2
            # No REST on the first attempt (empty file); REST 50 on the second.
            issued = [call.args[0] for call in ftp.sendcmd.call_args_list]
            assert issued == ["REST 50"]

    def test_size_mismatch_after_retries_raises(self):
        """Three undersized attempts end in RuntimeError."""
        ftp = _make_fake_ftp(expected_size=100, write_bytes_per_call=[10, 10, 10])
        with tempfile.TemporaryDirectory() as tmp:
            local_path = os.path.join(tmp, "f.bin")

            with pytest.raises(RuntimeError, match="Giving up"):
                self._fetch(ftp, local_path)

            assert ftp.retrbinary.call_count == 3

    def test_correct_size_returns_without_retry(self):
        """A full-size first attempt needs exactly one transfer."""
        ftp = _make_fake_ftp(expected_size=50, write_bytes_per_call=[50])
        with tempfile.TemporaryDirectory() as tmp:
            local_path = os.path.join(tmp, "f.bin")

            self._fetch(ftp, local_path)

            assert os.path.getsize(local_path) == 50
            assert ftp.retrbinary.call_count == 1
+""" +import tempfile +from unittest import TestCase +from unittest.mock import MagicMock, patch + +from pridepy.files.files import Files + + +IPROX_XML_FIXTURE = """ + + + + + + + + + + + + + + + + + + + + + +""".encode("utf-8") + + +class TestIProXFiles(TestCase): + def test_is_iprox_accession_matches_ipx_format(self): + assert Files.is_iprox_accession("IPX0000123") + assert Files.is_iprox_accession("IPX0000123000") + assert Files.is_iprox_accession("ipx1234567") + assert not Files.is_iprox_accession("PXD000012") + assert not Files.is_iprox_accession("MSV000012345") + assert not Files.is_iprox_accession("JPST000001") + assert not Files.is_iprox_accession("IPX12") + assert not Files.is_iprox_accession("") + assert not Files.is_iprox_accession(None) + + def test_iprox_is_a_direct_download_accession(self): + assert Files.is_direct_download_accession("IPX0017413000") + + def test_build_iprox_file_record_maps_px_cv_to_category(self): + record = Files._build_iprox_file_record( + "IPX0017413000", + "http://download.iprox.org/IPX0017413000/IPX0017413001/sample.raw", + category_from_px="Associated raw file URI", + ) + assert record["fileName"] == "sample.raw" + assert record["fileCategory"]["value"] == "RAW" + assert record["source"] == "iProX" + # _download_direct_download_records dispatches by URL scheme, so the + # publicFileLocations URL must still be the HTTPS download URL. + assert record["publicFileLocations"][0]["value"].startswith("http://") + + def test_list_iprox_public_files_parses_px_xml(self): + files = Files() + fake_response = MagicMock() + fake_response.content = IPROX_XML_FIXTURE + fake_response.raise_for_status = MagicMock() + with patch( + "pridepy.files.files.requests.get", return_value=fake_response + ) as req_mock: + records = files._list_iprox_public_files("IPX0017413000") + + # The fetch hits the deterministic PX XML URL. 
+ req_mock.assert_called_once() + called_url = req_mock.call_args[0][0] + assert called_url == ( + "http://download.iprox.org/IPX0017413000/PX_IPX0017413000.xml" + ) + + # 3 valid HTTPS records; the ftp:// "Other URI" cvParam was filtered out. + assert len(records) == 3 + cats = {r["fileName"]: r["fileCategory"]["value"] for r in records} + assert cats == { + "sample1.raw": "RAW", + "sample2.raw": "RAW", + "results.tsv": "SEARCH", + } + for r in records: + assert r["source"] == "iProX" + assert r["publicFileLocations"][0]["value"].startswith("http://") + + def test_get_all_raw_file_list_filters_iprox_records(self): + files = Files() + fake_response = MagicMock() + fake_response.content = IPROX_XML_FIXTURE + fake_response.raise_for_status = MagicMock() + with patch( + "pridepy.files.files.requests.get", return_value=fake_response + ), patch.object(Files, "stream_all_files_by_project") as pride_mock: + raw_files = files.get_all_raw_file_list("IPX0017413000") + + pride_mock.assert_not_called() + assert {r["fileName"] for r in raw_files} == {"sample1.raw", "sample2.raw"} + + def test_download_file_by_name_routes_iprox_to_http_urls(self): + files = Files() + fake_response = MagicMock() + fake_response.content = IPROX_XML_FIXTURE + fake_response.raise_for_status = MagicMock() + with tempfile.TemporaryDirectory() as tmp_dir, patch( + "pridepy.files.files.requests.get", return_value=fake_response + ), patch.object(Files, "download_http_urls") as http_mock, patch.object( + Files, "download_ftp_urls" + ) as ftp_mock: + files.download_file_by_name( + accession="IPX0017413000", + file_name="results.tsv", + output_folder=tmp_dir, + skip_if_downloaded_already=False, + protocol="ftp", + username=None, + password=None, + aspera_maximum_bandwidth="100M", + checksum_check=False, + ) + + # iProX is HTTPS, not FTP — FTP path must not be called. 
class TestJPOSTFiles(TestCase):
    """JPOST accession detection, record building, listing and dispatch."""

    @staticmethod
    def _record(url):
        # Every record-based test below uses the same dataset accession.
        return Files._build_jpost_file_record("JPST000001", url)

    def test_is_jpost_accession(self):
        for accepted in ("JPST000001", "jpst123456"):
            assert Files.is_jpost_accession(accepted)
        for rejected in ("PXD000012", "MSV000012345", "JPST12"):
            assert not Files.is_jpost_accession(rejected)

    def test_is_direct_download_accession_includes_jpost(self):
        assert Files.is_direct_download_accession("JPST000001")

    def test_build_jpost_file_record_maps_collection_to_category(self):
        built = self._record("ftp://ftp.jpostdb.org/JPST000001/peak/sample.mzML")

        observed = (
            built["fileName"],
            built["collection"],
            built["fileCategory"]["value"],
            built["source"],
        )
        assert observed == ("sample.mzML", "peak", "PEAK", "JPOST")

    def test_build_jpost_file_record_marks_raw_collection_as_raw(self):
        built = self._record("ftp://ftp.jpostdb.org/JPST000001/raw/run01.raw")

        assert (built["collection"], built["fileCategory"]["value"]) == ("raw", "RAW")

    def test_get_all_raw_file_list_filters_jpost_records(self):
        files = Files()
        listing = [
            self._record("ftp://ftp.jpostdb.org/JPST000001/raw/run1.raw"),
            self._record("ftp://ftp.jpostdb.org/JPST000001/result/results.tsv"),
        ]

        with patch.object(JpostProvider, "list_files", return_value=listing):
            result = files.get_all_raw_file_list("JPST000001")

        # Exactly one record survives the RAW filter.
        assert [entry["fileName"] for entry in result] == ["run1.raw"]

    def test_download_file_by_name_uses_jpost_ftp_listing(self):
        files = Files()
        remote_url = "ftp://ftp.jpostdb.org/JPST000001/raw/folder/sample.raw"
        file_record = self._record(remote_url)

        with tempfile.TemporaryDirectory() as tmp_dir:
            with patch.object(JpostProvider, "list_files", return_value=[file_record]):
                with patch.object(Files, "download_ftp_urls") as download_mock:
                    files.download_file_by_name(
                        accession="JPST000001",
                        file_name="sample.raw",
                        output_folder=tmp_dir,
                        skip_if_downloaded_already=False,
                        protocol="ftp",
                        username=None,
                        password=None,
                        aspera_maximum_bandwidth="100M",
                        checksum_check=False,
                    )

            download_mock.assert_called_once_with(
                ftp_urls=[remote_url],
                output_folder=tmp_dir,
                skip_if_downloaded_already=False,
                use_tls=False,
                parallel_files=1,
            )

    def test_proxi_listing_maps_cv_name_to_category(self):
        files = Files()
        entries = [
            (
                "PRIDE:0000404",
                "Associated raw file URI",
                "ftp://ftp.jpostdb.org/JPST002311/sample01.raw",
            ),
            (
                "PRIDE:0000408",
                "Search engine output file URI",
                "ftp://ftp.jpostdb.org/JPST002311/sample01.sne",
            ),
            (
                "PRIDE:0000999",
                "Some unknown CV",
                "ftp://ftp.jpostdb.org/JPST002311/misc/sample01.txt",
            ),
            (
                "PRIDE:0000404",
                "Associated raw file URI",
                "https://example.org/not-ftp.raw",
            ),
        ]
        proxi_response = {
            "datasetFiles": [
                {"accession": cv_accession, "name": cv_name, "value": uri}
                for cv_accession, cv_name, uri in entries
            ]
        }
        fake_response = MagicMock()
        fake_response.content = json.dumps(proxi_response).encode("utf-8")
        fake_response.raise_for_status = MagicMock()

        with patch("pridepy.files.files.requests.get", return_value=fake_response) as req_mock:
            records = files._list_jpost_public_files_via_proxi("JPST002311")

        req_mock.assert_called_once()
        requested = req_mock.call_args[0][0]
        assert requested == "https://repository.jpostdb.org/proxi/datasets/JPST002311"
        # The https:// entry is dropped; the three ftp:// entries remain.
        assert len(records) == 3
        categories = {r["fileName"]: r["fileCategory"]["value"] for r in records}
        # The unknown CV term ends up as OTHER (collection "misc").
        assert categories == {
            "sample01.raw": "RAW",
            "sample01.sne": "SEARCH",
            "sample01.txt": "OTHER",
        }

    def test_proxi_falls_back_to_ftp_walk_on_error(self):
        files = Files()
        # Built but not used further, as in the test's original form.
        self._record("ftp://ftp.jpostdb.org/JPST000001/raw/x.raw")

        proxi_down = patch.object(
            Files,
            "_list_jpost_public_files_via_proxi",
            side_effect=RuntimeError("proxi down"),
        )
        ftp_walk = patch.object(
            Files, "_list_ftp_repo_files", return_value=["/JPST000001/raw/x.raw"]
        )
        with proxi_down, ftp_walk as ftp_mock:
            result = files._list_jpost_public_files("JPST000001")

        ftp_mock.assert_called_once()
        assert len(result) == 1
        only = result[0]
        assert only["fileName"] == "x.raw"
        assert only["source"] == "JPOST"
len(result) == 1 @@ -80,7 +81,7 @@ def test_download_file_by_name_uses_massive_ftp_listing(self): ) with tempfile.TemporaryDirectory() as tmp_dir: - with patch.object(Files, "_list_massive_public_files", return_value=[file_record]), patch.object( + with patch.object(MassiveProvider, "list_files", return_value=[file_record]), patch.object( Files, "download_ftp_urls" ) as download_mock: files.download_file_by_name( @@ -99,4 +100,78 @@ def test_download_file_by_name_uses_massive_ftp_listing(self): ftp_urls=["ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/folder/sample.raw"], output_folder=tmp_dir, skip_if_downloaded_already=False, + use_tls=True, + parallel_files=1, ) + + def test_repo_uses_tls_true_for_massive_false_for_jpost(self): + assert Files._repo_uses_tls("MSV000012345") is True + assert Files._repo_uses_tls("JPST000001") is False + assert Files._repo_uses_tls("PXD000012") is False + + def test_download_all_raw_files_threads_parallel_files_for_massive(self): + files = Files() + massive_records = [ + Files._build_massive_file_record( + "MSV000012345", + f"ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/run{i}.raw", + ) + for i in range(3) + ] + + with tempfile.TemporaryDirectory() as tmp_dir: + with patch.object( + MassiveProvider, "list_files", return_value=massive_records + ), patch.object(Files, "download_ftp_urls") as download_mock: + files.download_all_raw_files( + accession="MSV000012345", + output_folder=tmp_dir, + skip_if_downloaded_already=False, + protocol="ftp", + aspera_maximum_bandwidth="100M", + checksum_check=False, + parallel_files=3, + ) + + kwargs = download_mock.call_args.kwargs + assert kwargs["use_tls"] is True + assert kwargs["parallel_files"] == 3 + + def test_base_direct_download_provider_partitions_urls_by_scheme(self): + """Records mixing ftp:// and http(s):// route to the right transport.""" + from pridepy.providers.massive import MassiveProvider + + provider = MassiveProvider() + records = [ + Files._build_massive_file_record( + 
"MSV000012345", + "ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/a.raw", + ), + # Synthetic http record to verify partitioning (real MassIVE uses ftp). + { + "accession": "MSV000012345", + "fileName": "b.raw", + "fileCategory": {"value": "RAW"}, + "publicFileLocations": [ + {"name": "FTP Protocol", "value": "http://example.org/b.raw"} + ], + }, + ] + with patch.object(Files, "download_ftp_urls") as ftp_mock, \ + patch.object(Files, "download_http_urls") as http_mock: + provider.download_files( + accession="MSV000012345", + records=records, + output_folder="/tmp/test", + skip_if_downloaded_already=False, + protocol="ftp", + parallel_files=1, + ) + + ftp_mock.assert_called_once() + assert ftp_mock.call_args.kwargs["use_tls"] is True + assert ftp_mock.call_args.kwargs["ftp_urls"] == [ + "ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/a.raw" + ] + http_mock.assert_called_once() + assert http_mock.call_args.kwargs["http_urls"] == ["http://example.org/b.raw"] diff --git a/pyproject.toml b/pyproject.toml index 90f40ae..f5b74ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pridepy" -version = "0.0.15" +version = "0.0.17" description = "Python Client library for PRIDE Rest API" readme = "README.md" requires-python = ">=3.9"