From d471e0b26729ac4f2d12d0e12f9b71b5251f0288 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 11:03:23 +0100 Subject: [PATCH 01/21] Add direct downloads for JPOST and iProX accessions; bump to 0.0.16 Extends the direct-download support introduced for MassIVE in PR #98 to two more proteomics repositories whose datasets are often standalone (no ProteomeXchange accession): - JPOST (Japan ProteOme STandard Repository): JPST\d{6} accessions, listed and downloaded from ftp.jpostdb.org. - iProX (Integrated Proteome resources): IPX\d{7,10} accessions, listed and downloaded from ftp.iprox.cn. Refactor: - Add is_direct_download_accession() unifying the MSV/JPST/IPX checks, plus _list_direct_download_files() and _download_direct_download_records() dispatchers. All call sites (get_all_raw_file_list, download_all_raw_files, download_all_category_files, get_file_from_api, download_file_by_name, download_files_by_list, get_all_category_file_list) now go through the unified entry points. - Extract _list_ftp_repo_files() helper so the FTP connection lifecycle (connect / login / passive / walk / quit) lives in one place. As part of that, fix the FTP-constructor-outside-try issue flagged in the PR #98 review: a connect failure no longer triggers NameError in finally. - Keep is_massive_accession, _list_massive_public_files, and _download_massive_file_records as thin backward-compatible wrappers so existing tests and external callers continue to work. Tests: add test_jpost_files.py and test_iprox_files.py mirroring the MassIVE coverage (regex match, record building, raw-only filtering, and the download_file_by_name happy path). All 19 direct-download tests pass. Version: 0.0.16. 
--- README.md | 26 +++- pridepy/files/files.py | 251 ++++++++++++++++++++++++++---- pridepy/tests/test_iprox_files.py | 89 +++++++++++ pridepy/tests/test_jpost_files.py | 88 +++++++++++ pyproject.toml | 2 +- 5 files changed, 416 insertions(+), 40 deletions(-) create mode 100644 pridepy/tests/test_iprox_files.py create mode 100644 pridepy/tests/test_jpost_files.py diff --git a/README.md b/README.md index 0715bc6..a661153 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ You can: - download public and private PRIDE files -- download public MassIVE datasets directly from `MSV...` accessions +- download public MassIVE (`MSV...`), JPOST (`JPST...`), and iProX (`IPX...`) datasets directly from their native FTP archives - download by category (`RAW`, `SEARCH`, `RESULT`, etc.) - stream project and file metadata - search projects by keyword and filters @@ -80,15 +80,26 @@ pridepy download-all-public-raw-files \ --checksum-check ``` -### 3) Download a public MassIVE dataset directly +### 3) Download a public MassIVE, JPOST, or iProX dataset directly ```bash +# MassIVE pridepy download-all-public-raw-files \ -a MSV000082297 \ -o ./downloads/MSV000082297 + +# JPOST +pridepy download-all-public-raw-files \ + -a JPST000123 \ + -o ./downloads/JPST000123 + +# iProX +pridepy download-all-public-raw-files \ + -a IPX0000123000 \ + -o ./downloads/IPX0000123000 ``` -For direct `MSV...` downloads, `pridepy` enumerates the dataset from MassIVE's public FTP tree. Raw downloads follow MassIVE's own collection layout, so `download-all-public-raw-files` downloads the files stored under the dataset's `raw/` collection. +For these direct downloads, `pridepy` enumerates the dataset from the repository's public FTP tree (MassIVE at `massive-ftp.ucsd.edu`, JPOST at `ftp.jpostdb.org`, iProX at `ftp.iprox.cn`). Raw downloads follow each repository's own collection layout, so `download-all-public-raw-files` downloads the files stored under the dataset's `raw/` collection. 
### 4) Download only selected categories @@ -99,7 +110,7 @@ pridepy download-all-public-category-files \ -c RAW,SEARCH ``` -You can also request a specific MassIVE collection through the same category interface: +You can also request a specific MassIVE / JPOST / iProX collection through the same category interface: ```bash pridepy download-all-public-category-files \ @@ -244,14 +255,15 @@ print(f"RAW files: {len(raw_files)}") print(raw_files[0]["fileName"]) ``` -For MassIVE accessions, the same method returns the files found under the dataset's `raw/` collection: +For MassIVE / JPOST / iProX accessions, the same method returns the files found under the dataset's `raw/` collection: ```python from pridepy.files.files import Files files = Files() -raw_files = files.get_all_raw_file_list("MSV000082297") -print(f"MassIVE raw files: {len(raw_files)}") +for accession in ("MSV000082297", "JPST000123", "IPX0000123000"): + raw_files = files.get_all_raw_file_list(accession) + print(f"{accession} raw files: {len(raw_files)}") ``` ### Example: search projects diff --git a/pridepy/files/files.py b/pridepy/files/files.py index fbcf2f9..1e5ac75 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -63,6 +63,10 @@ class Files: PRIDE_ARCHIVE_HTTPS_URL_PREFIX = "https://ftp.pride.ebi.ac.uk/" MASSIVE_ARCHIVE_FTP = "massive-ftp.ucsd.edu" MASSIVE_ARCHIVE_FTP_URL_PREFIX = "ftp://massive-ftp.ucsd.edu/v01/" + JPOST_ARCHIVE_FTP = "ftp.jpostdb.org" + JPOST_ARCHIVE_FTP_URL_PREFIX = "ftp://ftp.jpostdb.org/" + IPROX_ARCHIVE_FTP = "ftp.iprox.cn" + IPROX_ARCHIVE_FTP_URL_PREFIX = "ftp://ftp.iprox.cn/" S3_URL = "https://hh.fire.sdo.ebi.ac.uk" S3_BUCKET = "pride-public" PROTOCOL_ORDER = ["aspera", "s3", "ftp", "globus"] @@ -280,6 +284,98 @@ def _build_massive_file_record(accession: str, ftp_url: str) -> Dict: "source": "MassIVE", } + @staticmethod + def is_jpost_accession(accession: str) -> bool: + """ + Return True when the accession looks like a JPOST dataset accession. 
+ """ + if not accession: + return False + return bool(re.fullmatch(r"JPST\d{6}", accession.upper())) + + @staticmethod + def _get_jpost_public_root(accession: str) -> str: + return f"/{accession.upper()}" + + @staticmethod + def _get_jpost_public_ftp_url(accession: str, remote_path: str) -> str: + root_path = Files._get_jpost_public_root(accession).rstrip("/") + relative_path = remote_path + if remote_path.startswith(root_path): + relative_path = remote_path[len(root_path) :].lstrip("/") + return f"{Files.JPOST_ARCHIVE_FTP_URL_PREFIX}{accession.upper()}/{relative_path}" + + @staticmethod + def _build_jpost_file_record(accession: str, ftp_url: str) -> Dict: + parsed = urlparse(ftp_url) + root_prefix = f"/{accession.upper()}/" + relative_path = parsed.path + if relative_path.startswith(root_prefix): + relative_path = relative_path[len(root_prefix) :] + relative_path = relative_path.lstrip("/") + collection = relative_path.split("/", 1)[0] if relative_path else "" + return { + "accession": accession.upper(), + "fileName": os.path.basename(parsed.path), + "fileCategory": {"value": Files._map_massive_collection_to_category(collection)}, + "publicFileLocations": [{"name": "FTP Protocol", "value": ftp_url}], + "relativePath": relative_path, + "collection": collection, + "source": "JPOST", + } + + @staticmethod + def is_iprox_accession(accession: str) -> bool: + """ + Return True when the accession looks like an iProX dataset accession. 
+ """ + if not accession: + return False + return bool(re.fullmatch(r"IPX\d{7,10}", accession.upper())) + + @staticmethod + def _get_iprox_public_root(accession: str) -> str: + return f"/{accession.upper()}" + + @staticmethod + def _get_iprox_public_ftp_url(accession: str, remote_path: str) -> str: + root_path = Files._get_iprox_public_root(accession).rstrip("/") + relative_path = remote_path + if remote_path.startswith(root_path): + relative_path = remote_path[len(root_path) :].lstrip("/") + return f"{Files.IPROX_ARCHIVE_FTP_URL_PREFIX}{accession.upper()}/{relative_path}" + + @staticmethod + def _build_iprox_file_record(accession: str, ftp_url: str) -> Dict: + parsed = urlparse(ftp_url) + root_prefix = f"/{accession.upper()}/" + relative_path = parsed.path + if relative_path.startswith(root_prefix): + relative_path = relative_path[len(root_prefix) :] + relative_path = relative_path.lstrip("/") + collection = relative_path.split("/", 1)[0] if relative_path else "" + return { + "accession": accession.upper(), + "fileName": os.path.basename(parsed.path), + "fileCategory": {"value": Files._map_massive_collection_to_category(collection)}, + "publicFileLocations": [{"name": "FTP Protocol", "value": ftp_url}], + "relativePath": relative_path, + "collection": collection, + "source": "iProX", + } + + @staticmethod + def is_direct_download_accession(accession: str) -> bool: + """ + Return True when the accession is served by a public FTP repository + that pridepy supports via direct downloads (no ProteomeXchange API). 
+ """ + return ( + Files.is_massive_accession(accession) + or Files.is_jpost_accession(accession) + or Files.is_iprox_accession(accession) + ) + @staticmethod def _walk_ftp_tree(ftp: FTP, remote_dir: str) -> List[str]: """ @@ -321,28 +417,46 @@ def _walk_ftp_tree(ftp: FTP, remote_dir: str) -> List[str]: ftp.cwd(current_dir) return file_paths - def _list_massive_public_files(self, accession: str) -> List[Dict]: + def _list_ftp_repo_files( + self, host: str, remote_root: str, error_label: str + ) -> List[str]: """ - Discover all public files for a MassIVE dataset from its anonymous FTP tree. + Connect to an anonymous FTP host, walk a directory tree, and return file paths. + Centralizes connection lifecycle so the constructor failure case doesn't mask + the underlying error in ``finally`` (see PR #98 review). """ - normalized_accession = accession.upper() - remote_root = self._get_massive_public_root(normalized_accession) - ftp = FTP(self.MASSIVE_ARCHIVE_FTP, timeout=30) + ftp: Optional[FTP] = None try: + ftp = FTP(host, timeout=30) ftp.login() ftp.set_pasv(True) - logging.info(f"Connected to FTP host: {self.MASSIVE_ARCHIVE_FTP}") - remote_files = self._walk_ftp_tree(ftp, remote_root) + logging.info(f"Connected to FTP host: {host}") + return self._walk_ftp_tree(ftp, remote_root) except Exception as error: raise RuntimeError( - f"Unable to list public files for MassIVE dataset {normalized_accession}: {error}" + f"Unable to list public files for {error_label}: {error}" ) from error finally: - try: - ftp.quit() - except Exception: - ftp.close() + if ftp is not None: + try: + ftp.quit() + except Exception: + try: + ftp.close() + except Exception: + pass + def _list_massive_public_files(self, accession: str) -> List[Dict]: + """ + Discover all public files for a MassIVE dataset from its anonymous FTP tree. 
+ """ + normalized_accession = accession.upper() + remote_root = self._get_massive_public_root(normalized_accession) + remote_files = self._list_ftp_repo_files( + host=self.MASSIVE_ARCHIVE_FTP, + remote_root=remote_root, + error_label=f"MassIVE dataset {normalized_accession}", + ) return [ self._build_massive_file_record( normalized_accession, @@ -362,15 +476,86 @@ def _download_massive_file_records( """ Download public MassIVE files via anonymous FTP. """ + self._download_direct_download_records( + accession=accession, + file_records=file_records, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, + ) + + def _list_jpost_public_files(self, accession: str) -> List[Dict]: + """ + Discover all public files for a JPOST dataset from its anonymous FTP tree. + """ + normalized_accession = accession.upper() + remote_root = self._get_jpost_public_root(normalized_accession) + remote_files = self._list_ftp_repo_files( + host=self.JPOST_ARCHIVE_FTP, + remote_root=remote_root, + error_label=f"JPOST dataset {normalized_accession}", + ) + return [ + self._build_jpost_file_record( + normalized_accession, + self._get_jpost_public_ftp_url(normalized_accession, remote_file), + ) + for remote_file in remote_files + ] + + def _list_iprox_public_files(self, accession: str) -> List[Dict]: + """ + Discover all public files for an iProX dataset from its anonymous FTP tree. 
+ """ + normalized_accession = accession.upper() + remote_root = self._get_iprox_public_root(normalized_accession) + remote_files = self._list_ftp_repo_files( + host=self.IPROX_ARCHIVE_FTP, + remote_root=remote_root, + error_label=f"iProX dataset {normalized_accession}", + ) + return [ + self._build_iprox_file_record( + normalized_accession, + self._get_iprox_public_ftp_url(normalized_accession, remote_file), + ) + for remote_file in remote_files + ] + + def _list_direct_download_files(self, accession: str) -> List[Dict]: + """ + Dispatch to the right FTP-based listing for a direct-download repository. + """ + if self.is_massive_accession(accession): + return self._list_massive_public_files(accession) + if self.is_jpost_accession(accession): + return self._list_jpost_public_files(accession) + if self.is_iprox_accession(accession): + return self._list_iprox_public_files(accession) + raise ValueError( + f"Accession {accession} is not a direct-download repository accession" + ) + + def _download_direct_download_records( + self, + accession: str, + file_records: List[Dict], + output_folder: str, + skip_if_downloaded_already: bool, + protocol: str, + ) -> None: + """ + Download files from a direct-download repository (MassIVE/JPOST/iProX) via anonymous FTP. + """ if protocol != "ftp": logging.warning( - "MassIVE direct downloads currently use ftp only. " + "Direct downloads currently use ftp only. " f"Ignoring requested protocol '{protocol}' for {accession}." 
) ftp_urls = [self._get_download_url(file_record, "ftp") for file_record in file_records] if not ftp_urls: - logging.info(f"No files matched for MassIVE dataset {accession}") + logging.info(f"No files matched for direct-download dataset {accession}") return self.download_ftp_urls( @@ -413,8 +598,8 @@ def get_all_raw_file_list(self, project_accession): :param project_accession: PRIDE accession :return: raw file list in JSON format """ - if self.is_massive_accession(project_accession): - record_files = self._list_massive_public_files(project_accession) + if self.is_direct_download_accession(project_accession): + record_files = self._list_direct_download_files(project_accession) return [ file for file in record_files if file["fileCategory"]["value"] == "RAW" ] @@ -451,8 +636,8 @@ def download_all_raw_files( raw_files = self.get_all_raw_file_list(accession) - if self.is_massive_accession(accession): - self._download_massive_file_records( + if self.is_direct_download_accession(accession): + self._download_direct_download_records( accession=accession, file_records=raw_files, output_folder=output_folder, @@ -945,14 +1130,16 @@ def download_file_by_name( os.mkdir(output_folder) ## Check type of project - if self.is_massive_accession(accession): - logging.info("Downloading file from public MassIVE dataset {}".format(accession)) + if self.is_direct_download_accession(accession): + logging.info( + "Downloading file from public direct-download dataset {}".format(accession) + ) response = self.get_file_from_api(accession, file_name) if not response: raise Exception( - "File name {} not found in MassIVE dataset {}".format(file_name, accession) + "File name {} not found in dataset {}".format(file_name, accession) ) - self._download_massive_file_records( + self._download_direct_download_records( accession=accession, file_records=response, output_folder=output_folder, @@ -1014,8 +1201,8 @@ def get_file_from_api(self, accession, file_name) -> List[Dict]: """ try: - if 
self.is_massive_accession(accession): - files = self._list_massive_public_files(accession) + if self.is_direct_download_accession(accession): + files = self._list_direct_download_files(accession) return [f for f in files if f["fileName"] == file_name] files = self.stream_all_files_by_project(accession) file = [f for f in files if f["fileName"] == file_name] @@ -1380,8 +1567,8 @@ def download_files_by_list( if not file_names: raise ValueError("file_names must contain at least one filename") - if self.is_massive_accession(accession): - all_files = self._list_massive_public_files(accession) + if self.is_direct_download_accession(accession): + all_files = self._list_direct_download_files(accession) else: all_files = self.stream_all_files_by_project(accession) requested = set(file_names) @@ -1394,8 +1581,8 @@ def download_files_by_list( f"No matching files in project {accession} for: {sorted(requested)}" ) - if self.is_massive_accession(accession): - self._download_massive_file_records( + if self.is_direct_download_accession(accession): + self._download_direct_download_records( accession=accession, file_records=matched, output_folder=output_folder, @@ -1670,8 +1857,8 @@ def download_all_category_files( if categories is None: categories = [category] if category else ["RAW"] raw_files = self.get_all_category_file_list(accession, categories) - if self.is_massive_accession(accession): - self._download_massive_file_records( + if self.is_direct_download_accession(accession): + self._download_direct_download_records( accession=accession, file_records=raw_files, output_folder=output_folder, @@ -1704,8 +1891,8 @@ def get_all_category_file_list( categories = [categories] category_set = {category.upper() for category in categories} - if self.is_massive_accession(accession): - record_files = self._list_massive_public_files(accession) + if self.is_direct_download_accession(accession): + record_files = self._list_direct_download_files(accession) else: record_files = 
self.stream_all_files_by_project(accession) diff --git a/pridepy/tests/test_iprox_files.py b/pridepy/tests/test_iprox_files.py new file mode 100644 index 0000000..0af7194 --- /dev/null +++ b/pridepy/tests/test_iprox_files.py @@ -0,0 +1,89 @@ +import tempfile +from unittest import TestCase +from unittest.mock import patch + +from pridepy.files.files import Files + + +class TestIProXFiles(TestCase): + def test_is_iprox_accession(self): + assert Files.is_iprox_accession("IPX0000123") + assert Files.is_iprox_accession("IPX0000123000") + assert Files.is_iprox_accession("ipx1234567") + assert not Files.is_iprox_accession("PXD000012") + assert not Files.is_iprox_accession("MSV000012345") + assert not Files.is_iprox_accession("IPX12") + + def test_is_direct_download_accession_includes_iprox(self): + assert Files.is_direct_download_accession("IPX0000123000") + + def test_build_iprox_file_record_maps_collection_to_category(self): + record = Files._build_iprox_file_record( + "IPX0000123000", + "ftp://ftp.iprox.cn/IPX0000123000/peak/sample.mzML", + ) + + assert record["fileName"] == "sample.mzML" + assert record["collection"] == "peak" + assert record["fileCategory"]["value"] == "PEAK" + assert record["source"] == "iProX" + + def test_build_iprox_file_record_marks_raw_collection_as_raw(self): + record = Files._build_iprox_file_record( + "IPX0000123000", + "ftp://ftp.iprox.cn/IPX0000123000/raw/run01.raw", + ) + + assert record["collection"] == "raw" + assert record["fileCategory"]["value"] == "RAW" + + def test_get_all_raw_file_list_filters_iprox_records(self): + files = Files() + iprox_records = [ + Files._build_iprox_file_record( + "IPX0000123000", + "ftp://ftp.iprox.cn/IPX0000123000/raw/run1.raw", + ), + Files._build_iprox_file_record( + "IPX0000123000", + "ftp://ftp.iprox.cn/IPX0000123000/result/results.tsv", + ), + ] + + with patch.object(Files, "_list_iprox_public_files", return_value=iprox_records), patch.object( + Files, "stream_all_files_by_project" + ) as pride_mock: 
+ result = files.get_all_raw_file_list("IPX0000123000") + + pride_mock.assert_not_called() + assert len(result) == 1 + assert {file["fileName"] for file in result} == {"run1.raw"} + + def test_download_file_by_name_uses_iprox_ftp_listing(self): + files = Files() + file_record = Files._build_iprox_file_record( + "IPX0000123000", + "ftp://ftp.iprox.cn/IPX0000123000/raw/folder/sample.raw", + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + with patch.object( + Files, "_list_iprox_public_files", return_value=[file_record] + ), patch.object(Files, "download_ftp_urls") as download_mock: + files.download_file_by_name( + accession="IPX0000123000", + file_name="sample.raw", + output_folder=tmp_dir, + skip_if_downloaded_already=False, + protocol="ftp", + username=None, + password=None, + aspera_maximum_bandwidth="100M", + checksum_check=False, + ) + + download_mock.assert_called_once_with( + ftp_urls=["ftp://ftp.iprox.cn/IPX0000123000/raw/folder/sample.raw"], + output_folder=tmp_dir, + skip_if_downloaded_already=False, + ) diff --git a/pridepy/tests/test_jpost_files.py b/pridepy/tests/test_jpost_files.py new file mode 100644 index 0000000..d41cb4f --- /dev/null +++ b/pridepy/tests/test_jpost_files.py @@ -0,0 +1,88 @@ +import tempfile +from unittest import TestCase +from unittest.mock import patch + +from pridepy.files.files import Files + + +class TestJPOSTFiles(TestCase): + def test_is_jpost_accession(self): + assert Files.is_jpost_accession("JPST000001") + assert Files.is_jpost_accession("jpst123456") + assert not Files.is_jpost_accession("PXD000012") + assert not Files.is_jpost_accession("MSV000012345") + assert not Files.is_jpost_accession("JPST12") + + def test_is_direct_download_accession_includes_jpost(self): + assert Files.is_direct_download_accession("JPST000001") + + def test_build_jpost_file_record_maps_collection_to_category(self): + record = Files._build_jpost_file_record( + "JPST000001", + "ftp://ftp.jpostdb.org/JPST000001/peak/sample.mzML", + ) + + 
assert record["fileName"] == "sample.mzML" + assert record["collection"] == "peak" + assert record["fileCategory"]["value"] == "PEAK" + assert record["source"] == "JPOST" + + def test_build_jpost_file_record_marks_raw_collection_as_raw(self): + record = Files._build_jpost_file_record( + "JPST000001", + "ftp://ftp.jpostdb.org/JPST000001/raw/run01.raw", + ) + + assert record["collection"] == "raw" + assert record["fileCategory"]["value"] == "RAW" + + def test_get_all_raw_file_list_filters_jpost_records(self): + files = Files() + jpost_records = [ + Files._build_jpost_file_record( + "JPST000001", + "ftp://ftp.jpostdb.org/JPST000001/raw/run1.raw", + ), + Files._build_jpost_file_record( + "JPST000001", + "ftp://ftp.jpostdb.org/JPST000001/result/results.tsv", + ), + ] + + with patch.object(Files, "_list_jpost_public_files", return_value=jpost_records), patch.object( + Files, "stream_all_files_by_project" + ) as pride_mock: + result = files.get_all_raw_file_list("JPST000001") + + pride_mock.assert_not_called() + assert len(result) == 1 + assert {file["fileName"] for file in result} == {"run1.raw"} + + def test_download_file_by_name_uses_jpost_ftp_listing(self): + files = Files() + file_record = Files._build_jpost_file_record( + "JPST000001", + "ftp://ftp.jpostdb.org/JPST000001/raw/folder/sample.raw", + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + with patch.object( + Files, "_list_jpost_public_files", return_value=[file_record] + ), patch.object(Files, "download_ftp_urls") as download_mock: + files.download_file_by_name( + accession="JPST000001", + file_name="sample.raw", + output_folder=tmp_dir, + skip_if_downloaded_already=False, + protocol="ftp", + username=None, + password=None, + aspera_maximum_bandwidth="100M", + checksum_check=False, + ) + + download_mock.assert_called_once_with( + ftp_urls=["ftp://ftp.jpostdb.org/JPST000001/raw/folder/sample.raw"], + output_folder=tmp_dir, + skip_if_downloaded_already=False, + ) diff --git a/pyproject.toml 
b/pyproject.toml index 90f40ae..4a95f24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pridepy" -version = "0.0.15" +version = "0.0.16" description = "Python Client library for PRIDE Rest API" readme = "README.md" requires-python = ">=3.9" From 3f8ed3fb2c2a5484a3a3b955969d1efc6f90f351 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 11:29:11 +0100 Subject: [PATCH 02/21] Direct downloads: FTPS for MassIVE, parallelism, defer iProX Address feedback that direct downloads should match the PRIDE feature set (resume, parallel, retries) and that MassIVE's actual FTP server requires TLS. Live findings: - massive-ftp.ucsd.edu now rejects plain anonymous FTP with 421 TLS is required. The merged PR #98 code was effectively broken against the live server. Switch to FTP_TLS + PROT P. - ftp.jpostdb.org accepts plain anonymous FTP; keep as-is. - ftp.iprox.cn does not resolve (DNS fail) and no other iProX FTP host responds. iProX is HTTPS-only and needs a different transport (REST API). Defer iProX support; user to provide the endpoint. Implementation: - New static _open_ftp_connection(host, use_tls) opens FTP or FTP_TLS with the right TLS setup, and transparently falls back to FTPS if a plain FTP server replies 'TLS is required'. - _list_ftp_repo_files() and download_ftp_urls() both grow a use_tls flag. _repo_uses_tls(accession) wires this from the repo type (MassIVE = True, JPOST = False). - download_ftp_urls() grows parallel_files: when >1, a ThreadPoolExecutor runs that many FTP workers per host, each with its own connection. Existing serial single-connection-per-host path is preserved for parallel_files <= 1. - Extracted _download_one_ftp_path() with REST-based resume + per-file retries; _download_ftp_paths_serial() / _download_ftp_paths_parallel() pick the right scheduling. REST resume verified live (3 KB pre-stage -> 10 KB final, MD5 matches full file). 
- _download_direct_download_records() now accepts parallel_files and forwards it (along with use_tls derived from the accession) to download_ftp_urls. All call sites (download_all_raw_files, download_all_category_files, download_files_by_list) thread the user-supplied -w/--parallel-files through. Tests: - test_jpost_files / test_massive_files updated to assert the new kwargs (use_tls, parallel_files). - New test_repo_uses_tls_true_for_massive_false_for_jpost and test_download_all_raw_files_threads_parallel_files_for_massive. - test_iprox_files.py removed (iProX is deferred). - All 15 unit tests pass. Live testing: - MassIVE: listed MSV000080175 (44 files), single-file download of params.xml (10315 B, MD5 43d87368d705c3f380c1d030b14850c4), REST resume from a 3000 B partial, and 3-worker parallel download of files from MSV000080175 + MSV000078335 all succeeded. - JPOST: rate-limited from this IP ('421 too many connections'). Code path is structurally identical to MassIVE (same FTP helper), routing covered by unit tests; deferred to a follow-up live check. --- README.md | 17 +- pridepy/files/files.py | 428 ++++++++++++++++++---------- pridepy/tests/test_iprox_files.py | 89 ------ pridepy/tests/test_jpost_files.py | 2 + pridepy/tests/test_massive_files.py | 35 +++ 5 files changed, 317 insertions(+), 254 deletions(-) delete mode 100644 pridepy/tests/test_iprox_files.py diff --git a/README.md b/README.md index a661153..6139748 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ You can: - download public and private PRIDE files -- download public MassIVE (`MSV...`), JPOST (`JPST...`), and iProX (`IPX...`) datasets directly from their native FTP archives +- download public MassIVE (`MSV...`) and JPOST (`JPST...`) datasets directly from their native FTP archives - download by category (`RAW`, `SEARCH`, `RESULT`, etc.) 
- stream project and file metadata - search projects by keyword and filters @@ -80,7 +80,7 @@ pridepy download-all-public-raw-files \ --checksum-check ``` -### 3) Download a public MassIVE, JPOST, or iProX dataset directly +### 3) Download a public MassIVE or JPOST dataset directly ```bash # MassIVE @@ -92,14 +92,9 @@ pridepy download-all-public-raw-files \ pridepy download-all-public-raw-files \ -a JPST000123 \ -o ./downloads/JPST000123 - -# iProX -pridepy download-all-public-raw-files \ - -a IPX0000123000 \ - -o ./downloads/IPX0000123000 ``` -For these direct downloads, `pridepy` enumerates the dataset from the repository's public FTP tree (MassIVE at `massive-ftp.ucsd.edu`, JPOST at `ftp.jpostdb.org`, iProX at `ftp.iprox.cn`). Raw downloads follow each repository's own collection layout, so `download-all-public-raw-files` downloads the files stored under the dataset's `raw/` collection. +For these direct downloads, `pridepy` enumerates the dataset from the repository's public FTP tree (MassIVE at `massive-ftp.ucsd.edu` over FTPS, JPOST at `ftp.jpostdb.org` over plain FTP). Raw downloads follow each repository's own collection layout, so `download-all-public-raw-files` downloads the files stored under the dataset's `raw/` collection. 
### 4) Download only selected categories @@ -110,7 +105,7 @@ pridepy download-all-public-category-files \ -c RAW,SEARCH ``` -You can also request a specific MassIVE / JPOST / iProX collection through the same category interface: +You can also request a specific MassIVE / JPOST collection through the same category interface: ```bash pridepy download-all-public-category-files \ @@ -255,13 +250,13 @@ print(f"RAW files: {len(raw_files)}") print(raw_files[0]["fileName"]) ``` -For MassIVE / JPOST / iProX accessions, the same method returns the files found under the dataset's `raw/` collection: +For MassIVE / JPOST accessions, the same method returns the files found under the dataset's `raw/` collection: ```python from pridepy.files.files import Files files = Files() -for accession in ("MSV000082297", "JPST000123", "IPX0000123000"): +for accession in ("MSV000082297", "JPST000123"): raw_files = files.get_all_raw_file_list(accession) print(f"{accession} raw files: {len(raw_files)}") ``` diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 1e5ac75..464e761 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -65,8 +65,6 @@ class Files: MASSIVE_ARCHIVE_FTP_URL_PREFIX = "ftp://massive-ftp.ucsd.edu/v01/" JPOST_ARCHIVE_FTP = "ftp.jpostdb.org" JPOST_ARCHIVE_FTP_URL_PREFIX = "ftp://ftp.jpostdb.org/" - IPROX_ARCHIVE_FTP = "ftp.iprox.cn" - IPROX_ARCHIVE_FTP_URL_PREFIX = "ftp://ftp.iprox.cn/" S3_URL = "https://hh.fire.sdo.ebi.ac.uk" S3_BUCKET = "pride-public" PROTOCOL_ORDER = ["aspera", "s3", "ftp", "globus"] @@ -324,46 +322,6 @@ def _build_jpost_file_record(accession: str, ftp_url: str) -> Dict: "source": "JPOST", } - @staticmethod - def is_iprox_accession(accession: str) -> bool: - """ - Return True when the accession looks like an iProX dataset accession. 
- """ - if not accession: - return False - return bool(re.fullmatch(r"IPX\d{7,10}", accession.upper())) - - @staticmethod - def _get_iprox_public_root(accession: str) -> str: - return f"/{accession.upper()}" - - @staticmethod - def _get_iprox_public_ftp_url(accession: str, remote_path: str) -> str: - root_path = Files._get_iprox_public_root(accession).rstrip("/") - relative_path = remote_path - if remote_path.startswith(root_path): - relative_path = remote_path[len(root_path) :].lstrip("/") - return f"{Files.IPROX_ARCHIVE_FTP_URL_PREFIX}{accession.upper()}/{relative_path}" - - @staticmethod - def _build_iprox_file_record(accession: str, ftp_url: str) -> Dict: - parsed = urlparse(ftp_url) - root_prefix = f"/{accession.upper()}/" - relative_path = parsed.path - if relative_path.startswith(root_prefix): - relative_path = relative_path[len(root_prefix) :] - relative_path = relative_path.lstrip("/") - collection = relative_path.split("/", 1)[0] if relative_path else "" - return { - "accession": accession.upper(), - "fileName": os.path.basename(parsed.path), - "fileCategory": {"value": Files._map_massive_collection_to_category(collection)}, - "publicFileLocations": [{"name": "FTP Protocol", "value": ftp_url}], - "relativePath": relative_path, - "collection": collection, - "source": "iProX", - } - @staticmethod def is_direct_download_accession(accession: str) -> bool: """ @@ -373,9 +331,17 @@ def is_direct_download_accession(accession: str) -> bool: return ( Files.is_massive_accession(accession) or Files.is_jpost_accession(accession) - or Files.is_iprox_accession(accession) ) + @staticmethod + def _repo_uses_tls(accession: str) -> bool: + """ + Whether the public FTP server for ``accession`` requires FTP over TLS. + MassIVE rejects plain anonymous FTP (``421 TLS is required``); JPOST + accepts plain FTP. 
+ """ + return Files.is_massive_accession(accession) + @staticmethod def _walk_ftp_tree(ftp: FTP, remote_dir: str) -> List[str]: """ @@ -417,20 +383,55 @@ def _walk_ftp_tree(ftp: FTP, remote_dir: str) -> List[str]: ftp.cwd(current_dir) return file_paths + @staticmethod + def _open_ftp_connection(host: str, use_tls: bool, timeout: int = 30) -> FTP: + """ + Open an anonymous FTP connection, transparently using FTPS when the + server requires TLS (e.g., MassIVE). When ``use_tls`` is False but the + server replies ``421 TLS is required`` to ``login``, transparently + retry with FTPS so callers don't need to know the policy in advance. + """ + if use_tls: + ftp: FTP = ftplib.FTP_TLS(host, timeout=timeout) + ftp.login() + ftp.prot_p() + else: + ftp = FTP(host, timeout=timeout) + try: + ftp.login() + except ftplib.error_temp as e: + if "TLS" in str(e).upper(): + try: + ftp.close() + except Exception: + pass + ftp = ftplib.FTP_TLS(host, timeout=timeout) + ftp.login() + ftp.prot_p() + else: + raise + ftp.set_pasv(True) + return ftp + def _list_ftp_repo_files( - self, host: str, remote_root: str, error_label: str + self, + host: str, + remote_root: str, + error_label: str, + use_tls: bool = False, ) -> List[str]: """ - Connect to an anonymous FTP host, walk a directory tree, and return file paths. - Centralizes connection lifecycle so the constructor failure case doesn't mask - the underlying error in ``finally`` (see PR #98 review). + Connect to an anonymous FTP host (FTP or FTPS), walk a directory tree, + and return file paths. + + ``use_tls`` should be True for servers that reject plain FTP (e.g. + MassIVE). Centralizes connection lifecycle so a constructor failure + doesn't mask the underlying error in ``finally`` (PR #98 review). 
""" ftp: Optional[FTP] = None try: - ftp = FTP(host, timeout=30) - ftp.login() - ftp.set_pasv(True) - logging.info(f"Connected to FTP host: {host}") + ftp = self._open_ftp_connection(host, use_tls=use_tls) + logging.info(f"Connected to FTP host: {host} (tls={use_tls})") return self._walk_ftp_tree(ftp, remote_root) except Exception as error: raise RuntimeError( @@ -456,6 +457,7 @@ def _list_massive_public_files(self, accession: str) -> List[Dict]: host=self.MASSIVE_ARCHIVE_FTP, remote_root=remote_root, error_label=f"MassIVE dataset {normalized_accession}", + use_tls=True, ) return [ self._build_massive_file_record( @@ -472,9 +474,11 @@ def _download_massive_file_records( output_folder: str, skip_if_downloaded_already: bool, protocol: str, + parallel_files: int = 1, ) -> None: """ - Download public MassIVE files via anonymous FTP. + Download public MassIVE files via anonymous FTP (now FTPS). + Backward-compat wrapper around :meth:`_download_direct_download_records`. """ self._download_direct_download_records( accession=accession, @@ -482,6 +486,7 @@ def _download_massive_file_records( output_folder=output_folder, skip_if_downloaded_already=skip_if_downloaded_already, protocol=protocol, + parallel_files=parallel_files, ) def _list_jpost_public_files(self, accession: str) -> List[Dict]: @@ -503,25 +508,6 @@ def _list_jpost_public_files(self, accession: str) -> List[Dict]: for remote_file in remote_files ] - def _list_iprox_public_files(self, accession: str) -> List[Dict]: - """ - Discover all public files for an iProX dataset from its anonymous FTP tree. 
- """ - normalized_accession = accession.upper() - remote_root = self._get_iprox_public_root(normalized_accession) - remote_files = self._list_ftp_repo_files( - host=self.IPROX_ARCHIVE_FTP, - remote_root=remote_root, - error_label=f"iProX dataset {normalized_accession}", - ) - return [ - self._build_iprox_file_record( - normalized_accession, - self._get_iprox_public_ftp_url(normalized_accession, remote_file), - ) - for remote_file in remote_files - ] - def _list_direct_download_files(self, accession: str) -> List[Dict]: """ Dispatch to the right FTP-based listing for a direct-download repository. @@ -530,8 +516,6 @@ def _list_direct_download_files(self, accession: str) -> List[Dict]: return self._list_massive_public_files(accession) if self.is_jpost_accession(accession): return self._list_jpost_public_files(accession) - if self.is_iprox_accession(accession): - return self._list_iprox_public_files(accession) raise ValueError( f"Accession {accession} is not a direct-download repository accession" ) @@ -543,9 +527,12 @@ def _download_direct_download_records( output_folder: str, skip_if_downloaded_already: bool, protocol: str, + parallel_files: int = 1, ) -> None: """ - Download files from a direct-download repository (MassIVE/JPOST/iProX) via anonymous FTP. + Download files from a direct-download repository (MassIVE/JPOST) via + anonymous FTP. Supports REST-based resume, per-file retries, and + parallel workers (one connection per worker, capped at file count). 
""" if protocol != "ftp": logging.warning( @@ -562,6 +549,8 @@ def _download_direct_download_records( ftp_urls=ftp_urls, output_folder=output_folder, skip_if_downloaded_already=skip_if_downloaded_already, + use_tls=self._repo_uses_tls(accession), + parallel_files=parallel_files, ) async def stream_all_files_metadata(self, output_file, accession=None): @@ -643,6 +632,7 @@ def download_all_raw_files( output_folder=output_folder, skip_if_downloaded_already=skip_if_downloaded_already, protocol=protocol, + parallel_files=parallel_files, ) return @@ -1588,6 +1578,7 @@ def download_files_by_list( output_folder=output_folder, skip_if_downloaded_already=skip_if_downloaded_already, protocol=protocol, + parallel_files=parallel_files, ) return @@ -1864,6 +1855,7 @@ def download_all_category_files( output_folder=output_folder, skip_if_downloaded_already=skip_if_downloaded_already, protocol=protocol, + parallel_files=parallel_files, ) return self.download_files( @@ -1988,6 +1980,181 @@ def _local_path_for_url(download_url: str, output_folder: str) -> str: filename = os.path.basename(urlparse(download_url).path) return os.path.join(output_folder, filename) + @staticmethod + def _download_one_ftp_path( + ftp: FTP, + ftp_path: str, + local_path: str, + skip_if_downloaded_already: bool, + max_download_retries: int, + position: int = 0, + ) -> None: + """ + Download a single FTP path over an existing connection, with REST resume + and per-file retry. Raises on giving up so the caller can decide what to do. 
+ """ + if skip_if_downloaded_already and os.path.exists(local_path): + logging.info(f"Skipping download as file already exists: {local_path}") + return + + attempt = 0 + last_error: Optional[Exception] = None + while attempt < max_download_retries: + try: + total_size = ftp.size(ftp_path) + if os.path.exists(local_path): + current_size = os.path.getsize(local_path) + mode = "ab" + else: + current_size = 0 + mode = "wb" + + with open(local_path, mode) as f, tqdm( + total=total_size, + unit="B", + unit_scale=True, + desc=local_path, + initial=current_size, + position=position, + leave=True, + ) as pbar: + def callback(data): + f.write(data) + pbar.update(len(data)) + + if current_size: + try: + ftp.sendcmd(f"REST {current_size}") + except Exception: + current_size = 0 + f.seek(0) + f.truncate() + ftp.retrbinary(f"RETR {ftp_path}", callback) + logging.info(f"Successfully downloaded {local_path}") + return + except (socket.timeout, ftplib.error_temp, ftplib.error_perm) as e: + attempt += 1 + last_error = e + logging.error( + f"Download failed for {local_path} (attempt {attempt}): {e}" + ) + raise RuntimeError( + f"Giving up on {local_path} after {max_download_retries} attempts" + ) from last_error + + @staticmethod + def _download_ftp_paths_serial( + host: str, + paths: List[str], + output_folder: str, + skip_if_downloaded_already: bool, + use_tls: bool, + max_connection_retries: int, + max_download_retries: int, + ) -> None: + """Download all paths from one host over a single (reused) connection.""" + connection_attempt = 0 + while connection_attempt < max_connection_retries: + try: + ftp = Files._open_ftp_connection(host, use_tls=use_tls) + logging.info(f"Connected to FTP host: {host} (tls={use_tls})") + for ftp_path in paths: + local_path = os.path.join(output_folder, os.path.basename(ftp_path)) + try: + Files._download_one_ftp_path( + ftp=ftp, + ftp_path=ftp_path, + local_path=local_path, + skip_if_downloaded_already=skip_if_downloaded_already, + 
max_download_retries=max_download_retries, + ) + except Exception as e: + logging.error( + f"Failed to download {ftp_path} from {host}: {e}" + ) + try: + ftp.quit() + except Exception: + try: + ftp.close() + except Exception: + pass + logging.info(f"Disconnected from FTP host: {host}") + return + except (socket.timeout, ftplib.error_temp, ftplib.error_perm, OSError) as e: + connection_attempt += 1 + logging.error( + f"FTP connection failed (attempt {connection_attempt}): {e}" + ) + if connection_attempt < max_connection_retries: + logging.info("Retrying connection...") + time.sleep(5) + else: + logging.error( + f"Giving up after {max_connection_retries} failed connection attempts to {host}." + ) + + @staticmethod + def _download_ftp_paths_parallel( + host: str, + paths: List[str], + output_folder: str, + skip_if_downloaded_already: bool, + use_tls: bool, + max_connection_retries: int, + max_download_retries: int, + parallel_files: int, + ) -> None: + """ + Download paths concurrently using ``parallel_files`` workers; each + worker opens its own FTP connection so transfers don't serialize. 
+ """ + def worker(ftp_path: str, position: int) -> None: + local_path = os.path.join(output_folder, os.path.basename(ftp_path)) + if skip_if_downloaded_already and os.path.exists(local_path): + logging.info(f"Skipping download as file already exists: {local_path}") + return + connection_attempt = 0 + while connection_attempt < max_connection_retries: + try: + ftp = Files._open_ftp_connection(host, use_tls=use_tls) + try: + Files._download_one_ftp_path( + ftp=ftp, + ftp_path=ftp_path, + local_path=local_path, + skip_if_downloaded_already=False, + max_download_retries=max_download_retries, + position=position, + ) + return + finally: + try: + ftp.quit() + except Exception: + try: + ftp.close() + except Exception: + pass + except (socket.timeout, ftplib.error_temp, ftplib.error_perm, OSError) as e: + connection_attempt += 1 + logging.error( + f"FTP connection failed for {ftp_path} (attempt {connection_attempt}): {e}" + ) + if connection_attempt < max_connection_retries: + time.sleep(5) + logging.error(f"Giving up on {ftp_path} from {host}") + + with ThreadPoolExecutor(max_workers=parallel_files) as executor: + futures = [ + executor.submit(worker, path, idx) for idx, path in enumerate(paths) + ] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + logging.error(f"Parallel FTP download error: {e}") + @staticmethod def download_ftp_urls( ftp_urls: List[str], @@ -1995,98 +2162,51 @@ def download_ftp_urls( skip_if_downloaded_already: bool, max_connection_retries: int = 3, max_download_retries: int = 3, + use_tls: bool = False, + parallel_files: int = 1, ) -> None: """ - Download a list of FTP URLs using a single connection, with retries and progress bars. + Download a list of FTP URLs with retries, REST-based resume, and + optional parallel workers. + + :param use_tls: Open the FTP connection with TLS (FTP_TLS / PROT P). + Required for hosts that reject plain anonymous FTP (e.g. MassIVE). 
+ When False but the server replies ``421 TLS is required``, the + connection is transparently retried over TLS. + :param parallel_files: When >1, downloads run concurrently with that + many worker connections per host (capped at the number of files). """ if not os.path.isdir(output_folder): os.makedirs(output_folder, exist_ok=True) - def connect_ftp(host: str): - ftp = FTP(host, timeout=30) - ftp.login() - ftp.set_pasv(True) - logging.info(f"Connected to FTP host: {host}") - return ftp - - # Group URLs by host to reuse connections efficiently host_to_paths: Dict[str, List[str]] = {} for url in ftp_urls: parsed = urlparse(url) host_to_paths.setdefault(parsed.hostname, []).append(parsed.path.lstrip("/")) for host, paths in host_to_paths.items(): - connection_attempt = 0 - while connection_attempt < max_connection_retries: - try: - ftp = connect_ftp(host) - for ftp_path in paths: - try: - local_path = os.path.join(output_folder, os.path.basename(ftp_path)) - if skip_if_downloaded_already and os.path.exists(local_path): - logging.info("Skipping download as file already exists") - continue - - logging.info(f"Starting FTP download: {host}/{ftp_path}") - download_attempt = 0 - while download_attempt < max_download_retries: - try: - total_size = ftp.size(ftp_path) - # Try to resume using REST if partial file exists - if os.path.exists(local_path): - current_size = os.path.getsize(local_path) - mode = "ab" - else: - current_size = 0 - mode = "wb" - - with open(local_path, mode) as f, tqdm( - total=total_size, - unit="B", - unit_scale=True, - desc=local_path, - initial=current_size, - ) as pbar: - def callback(data): - f.write(data) - pbar.update(len(data)) - - if current_size: - try: - ftp.sendcmd(f"REST {current_size}") - except Exception: - # If REST not supported, fall back to full download - current_size = 0 - f.seek(0) - f.truncate() - ftp.retrbinary(f"RETR {ftp_path}", callback) - logging.info(f"Successfully downloaded {local_path}") - break - except (socket.timeout, 
ftplib.error_temp, ftplib.error_perm) as e: - download_attempt += 1 - logging.error( - f"Download failed for {local_path} (attempt {download_attempt}): {str(e)}" - ) - if download_attempt >= max_download_retries: - logging.error( - f"Giving up on {local_path} after {max_download_retries} attempts." - ) - break - except Exception as e: - logging.error(f"Unexpected error while processing FTP path {ftp_path}: {str(e)}") - ftp.quit() - logging.info(f"Disconnected from FTP host: {host}") - break - except (socket.timeout, ftplib.error_temp, ftplib.error_perm, socket.error) as e: - connection_attempt += 1 - logging.error(f"FTP connection failed (attempt {connection_attempt}): {str(e)}") - if connection_attempt < max_connection_retries: - logging.info("Retrying connection...") - time.sleep(5) - else: - logging.error( - f"Giving up after {max_connection_retries} failed connection attempts to {host}." - ) + workers = max(1, min(parallel_files, len(paths))) + if workers > 1: + Files._download_ftp_paths_parallel( + host=host, + paths=paths, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + use_tls=use_tls, + max_connection_retries=max_connection_retries, + max_download_retries=max_download_retries, + parallel_files=workers, + ) + else: + Files._download_ftp_paths_serial( + host=host, + paths=paths, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + use_tls=use_tls, + max_connection_retries=max_connection_retries, + max_download_retries=max_download_retries, + ) @staticmethod def download_http_urls( diff --git a/pridepy/tests/test_iprox_files.py b/pridepy/tests/test_iprox_files.py deleted file mode 100644 index 0af7194..0000000 --- a/pridepy/tests/test_iprox_files.py +++ /dev/null @@ -1,89 +0,0 @@ -import tempfile -from unittest import TestCase -from unittest.mock import patch - -from pridepy.files.files import Files - - -class TestIProXFiles(TestCase): - def test_is_iprox_accession(self): - assert 
Files.is_iprox_accession("IPX0000123") - assert Files.is_iprox_accession("IPX0000123000") - assert Files.is_iprox_accession("ipx1234567") - assert not Files.is_iprox_accession("PXD000012") - assert not Files.is_iprox_accession("MSV000012345") - assert not Files.is_iprox_accession("IPX12") - - def test_is_direct_download_accession_includes_iprox(self): - assert Files.is_direct_download_accession("IPX0000123000") - - def test_build_iprox_file_record_maps_collection_to_category(self): - record = Files._build_iprox_file_record( - "IPX0000123000", - "ftp://ftp.iprox.cn/IPX0000123000/peak/sample.mzML", - ) - - assert record["fileName"] == "sample.mzML" - assert record["collection"] == "peak" - assert record["fileCategory"]["value"] == "PEAK" - assert record["source"] == "iProX" - - def test_build_iprox_file_record_marks_raw_collection_as_raw(self): - record = Files._build_iprox_file_record( - "IPX0000123000", - "ftp://ftp.iprox.cn/IPX0000123000/raw/run01.raw", - ) - - assert record["collection"] == "raw" - assert record["fileCategory"]["value"] == "RAW" - - def test_get_all_raw_file_list_filters_iprox_records(self): - files = Files() - iprox_records = [ - Files._build_iprox_file_record( - "IPX0000123000", - "ftp://ftp.iprox.cn/IPX0000123000/raw/run1.raw", - ), - Files._build_iprox_file_record( - "IPX0000123000", - "ftp://ftp.iprox.cn/IPX0000123000/result/results.tsv", - ), - ] - - with patch.object(Files, "_list_iprox_public_files", return_value=iprox_records), patch.object( - Files, "stream_all_files_by_project" - ) as pride_mock: - result = files.get_all_raw_file_list("IPX0000123000") - - pride_mock.assert_not_called() - assert len(result) == 1 - assert {file["fileName"] for file in result} == {"run1.raw"} - - def test_download_file_by_name_uses_iprox_ftp_listing(self): - files = Files() - file_record = Files._build_iprox_file_record( - "IPX0000123000", - "ftp://ftp.iprox.cn/IPX0000123000/raw/folder/sample.raw", - ) - - with tempfile.TemporaryDirectory() as tmp_dir: - 
with patch.object( - Files, "_list_iprox_public_files", return_value=[file_record] - ), patch.object(Files, "download_ftp_urls") as download_mock: - files.download_file_by_name( - accession="IPX0000123000", - file_name="sample.raw", - output_folder=tmp_dir, - skip_if_downloaded_already=False, - protocol="ftp", - username=None, - password=None, - aspera_maximum_bandwidth="100M", - checksum_check=False, - ) - - download_mock.assert_called_once_with( - ftp_urls=["ftp://ftp.iprox.cn/IPX0000123000/raw/folder/sample.raw"], - output_folder=tmp_dir, - skip_if_downloaded_already=False, - ) diff --git a/pridepy/tests/test_jpost_files.py b/pridepy/tests/test_jpost_files.py index d41cb4f..021401e 100644 --- a/pridepy/tests/test_jpost_files.py +++ b/pridepy/tests/test_jpost_files.py @@ -85,4 +85,6 @@ def test_download_file_by_name_uses_jpost_ftp_listing(self): ftp_urls=["ftp://ftp.jpostdb.org/JPST000001/raw/folder/sample.raw"], output_folder=tmp_dir, skip_if_downloaded_already=False, + use_tls=False, + parallel_files=1, ) diff --git a/pridepy/tests/test_massive_files.py b/pridepy/tests/test_massive_files.py index f600b71..a958309 100644 --- a/pridepy/tests/test_massive_files.py +++ b/pridepy/tests/test_massive_files.py @@ -99,4 +99,39 @@ def test_download_file_by_name_uses_massive_ftp_listing(self): ftp_urls=["ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/folder/sample.raw"], output_folder=tmp_dir, skip_if_downloaded_already=False, + use_tls=True, + parallel_files=1, ) + + def test_repo_uses_tls_true_for_massive_false_for_jpost(self): + assert Files._repo_uses_tls("MSV000012345") is True + assert Files._repo_uses_tls("JPST000001") is False + assert Files._repo_uses_tls("PXD000012") is False + + def test_download_all_raw_files_threads_parallel_files_for_massive(self): + files = Files() + massive_records = [ + Files._build_massive_file_record( + "MSV000012345", + f"ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/run{i}.raw", + ) + for i in range(3) + ] + + with 
tempfile.TemporaryDirectory() as tmp_dir: + with patch.object( + Files, "_list_massive_public_files", return_value=massive_records + ), patch.object(Files, "download_ftp_urls") as download_mock: + files.download_all_raw_files( + accession="MSV000012345", + output_folder=tmp_dir, + skip_if_downloaded_already=False, + protocol="ftp", + aspera_maximum_bandwidth="100M", + checksum_check=False, + parallel_files=3, + ) + + kwargs = download_mock.call_args.kwargs + assert kwargs["use_tls"] is True + assert kwargs["parallel_files"] == 3 From a3603785826a5ea2b27b5e825f356db3571bbc46 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 11:51:06 +0100 Subject: [PATCH 03/21] JPOST PROXI listing, post-transfer size check, iProX accession guard Address deferred items from PR review: 1. JPOST PROXI listing ftp.jpostdb.org rate-limits aggressively per source IP (sticky 421 on any retry within ~10 min). For listing this is fatal because every pridepy invocation needs a fresh tree walk. JPOST publishes a JSON PROXI endpoint at https://repository.jpostdb.org/proxi/datasets/ that returns datasetFiles[*].value as ftp:// URIs alongside CV labels (Associated raw file URI, Search engine output file URI, Result file URI, Peak list file URI, ...). _list_jpost_public_files now hits PROXI first, builds file records with categories derived from the CV name (mapped via JPOST_PROXI_CATEGORY_MAP), and falls back to the FTP tree walk only if PROXI is unreachable or returns no records. Live-tested against JPST002311 -> 160 files (88 RAW, 72 SEARCH). 2. Post-transfer size check Neither MassIVE nor JPOST publishes per-file checksum manifests in a standard location, so md5 verification isn't an option for these datasets. As a lighter-weight integrity signal, _download_one_ftp_path now compares the local file size against ftp.size() after retrbinary returns and treats a mismatch as a retryable failure (next attempt resumes via REST from the current partial). 
This catches half-finished transfers where the data channel was closed early without retrbinary raising. 3. iProX accession guard Probing showed iProX REST endpoints (PMD009Controller/findByProjectId.jsonp, findFilesBySubProjectID.jsonp) all redirect unauthenticated callers to a CAS login page, and downloads use faspe:// URLs with per-session tokens. Native support is therefore blocked until iProX exposes an anonymous JSON API or pridepy carries iProX credentials. Until then, add is_iprox_accession() and _raise_if_iprox() so every public entry point (get_all_raw_file_list, download_all_raw_files, download_all_category_files, download_file_by_name, download_files_by_list, get_file_from_api) emits a clear NotImplementedError instead of silently falling through to the PRIDE API and 404-ing. Tests - test_jpost_files: PROXI listing maps CV names to PRIDE categories; PROXI failure falls back to FTP walk. - test_iprox_guard: regex coverage; assert each entry point raises NotImplementedError for IPX accessions; assert is_direct_download_accession returns False for IPX. - test_ftp_download_validation: size mismatch retries until success; repeated mismatch raises after max_download_retries; correct size skips retries. - 25 tests total (10 MassIVE + 8 JPOST + 5 iProX guard + 3 size check). Live verification (same MassIVE FTPS path as before) - MSV000080175 listing + params.xml download still produces 10315 B, MD5 43d87368d705c3f380c1d030b14850c4. - JPST002311 PROXI listing returns 160 files with correct categories. Actual file transfer is still blocked from this IP by JPOST's 421-too-many-connections rate limit; the code path is shared with MassIVE (same _download_one_ftp_path / download_ftp_urls), so a fresh IP should succeed. 
--- README.md | 10 +- pridepy/files/files.py | 150 ++++++++++++++++-- pridepy/tests/test_ftp_download_validation.py | 91 +++++++++++ pridepy/tests/test_iprox_guard.py | 62 ++++++++ pridepy/tests/test_jpost_files.py | 65 +++++++- 5 files changed, 361 insertions(+), 17 deletions(-) create mode 100644 pridepy/tests/test_ftp_download_validation.py create mode 100644 pridepy/tests/test_iprox_guard.py diff --git a/README.md b/README.md index 6139748..9609f26 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ You can: - download public and private PRIDE files -- download public MassIVE (`MSV...`) and JPOST (`JPST...`) datasets directly from their native FTP archives +- download public MassIVE (`MSV...`) and JPOST (`JPST...`) datasets directly. MassIVE goes through FTPS at `massive-ftp.ucsd.edu`; JPOST uses the JSON PROXI endpoint at `repository.jpostdb.org` for listings and `ftp.jpostdb.org` for transfers - download by category (`RAW`, `SEARCH`, `RESULT`, etc.) - stream project and file metadata - search projects by keyword and filters @@ -94,7 +94,13 @@ pridepy download-all-public-raw-files \ -o ./downloads/JPST000123 ``` -For these direct downloads, `pridepy` enumerates the dataset from the repository's public FTP tree (MassIVE at `massive-ftp.ucsd.edu` over FTPS, JPOST at `ftp.jpostdb.org` over plain FTP). Raw downloads follow each repository's own collection layout, so `download-all-public-raw-files` downloads the files stored under the dataset's `raw/` collection. +For these direct downloads, `pridepy` enumerates the dataset from the repository: +- **MassIVE** lists files by walking the FTPS tree at `massive-ftp.ucsd.edu` (TLS is required by the server). +- **JPOST** lists files through the JSON PROXI endpoint at `https://repository.jpostdb.org/proxi/datasets/` and downloads them from `ftp.jpostdb.org` over plain FTP. The PROXI listing avoids the source-IP connection limit JPOST enforces on FTP. 
+ +Raw downloads follow each repository's own collection layout, so `download-all-public-raw-files` downloads the files stored under the dataset's `raw/` collection. Direct downloads support REST-based resume, per-file retries, parallel workers (`-w N` up to 3), and post-transfer size verification against the server-reported size. + +iProX accessions (`IPX...`) are recognised so the CLI gives you a clear "not supported yet" error rather than treating them as unknown PRIDE accessions. Native iProX download support is blocked on their REST API requiring CAS authentication and downloads going through Aspera with per-session tokens; track that work upstream. ### 4) Download only selected categories diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 464e761..ecacd16 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -65,6 +65,16 @@ class Files: MASSIVE_ARCHIVE_FTP_URL_PREFIX = "ftp://massive-ftp.ucsd.edu/v01/" JPOST_ARCHIVE_FTP = "ftp.jpostdb.org" JPOST_ARCHIVE_FTP_URL_PREFIX = "ftp://ftp.jpostdb.org/" + JPOST_PROXI_BASE_URL = "https://repository.jpostdb.org/proxi/datasets/" + JPOST_PROXI_CATEGORY_MAP = { + "Associated raw file URI": "RAW", + "Result file URI": "RESULT", + "Search engine output file URI": "SEARCH", + "Peak list file URI": "PEAK", + "Spectrum library file URI": "SPECTRUM_LIBRARY", + "Sequence database URI": "FASTA", + "Quantification file URI": "RESULT", + } S3_URL = "https://hh.fire.sdo.ebi.ac.uk" S3_BUCKET = "pride-public" PROTOCOL_ORDER = ["aspera", "s3", "ftp", "globus"] @@ -304,7 +314,17 @@ def _get_jpost_public_ftp_url(accession: str, remote_path: str) -> str: return f"{Files.JPOST_ARCHIVE_FTP_URL_PREFIX}{accession.upper()}/{relative_path}" @staticmethod - def _build_jpost_file_record(accession: str, ftp_url: str) -> Dict: + def _build_jpost_file_record( + accession: str, ftp_url: str, category_from_proxi: Optional[str] = None + ) -> Dict: + """ + Build a pridepy file record for a JPOST file. 
+ + When ``category_from_proxi`` is provided (e.g. ``"Associated raw file URI"``), + the PROXI CV name takes precedence over the heuristic collection-from-path + mapping. Falls back to the same path-segment heuristic used for MassIVE + when the category isn't known. + """ parsed = urlparse(ftp_url) root_prefix = f"/{accession.upper()}/" relative_path = parsed.path @@ -312,10 +332,14 @@ def _build_jpost_file_record(accession: str, ftp_url: str) -> Dict: relative_path = relative_path[len(root_prefix) :] relative_path = relative_path.lstrip("/") collection = relative_path.split("/", 1)[0] if relative_path else "" + if category_from_proxi and category_from_proxi in Files.JPOST_PROXI_CATEGORY_MAP: + category = Files.JPOST_PROXI_CATEGORY_MAP[category_from_proxi] + else: + category = Files._map_massive_collection_to_category(collection) return { "accession": accession.upper(), "fileName": os.path.basename(parsed.path), - "fileCategory": {"value": Files._map_massive_collection_to_category(collection)}, + "fileCategory": {"value": category}, "publicFileLocations": [{"name": "FTP Protocol", "value": ftp_url}], "relativePath": relative_path, "collection": collection, @@ -333,6 +357,34 @@ def is_direct_download_accession(accession: str) -> bool: or Files.is_jpost_accession(accession) ) + @staticmethod + def is_iprox_accession(accession: str) -> bool: + """ + Return True when the accession looks like an iProX dataset accession + (``IPX`` followed by 7-10 digits). iProX is recognised so the CLI can + emit a clear error rather than treating IPX as an unknown PRIDE + accession; direct downloads from iProX are not yet supported because + their listing API requires CAS authentication and downloads go through + Aspera with per-session tokens. 
+ """ + if not accession: + return False + return bool(re.fullmatch(r"IPX\d{7,10}", accession.upper())) + + @staticmethod + def _raise_if_iprox(accession: str) -> None: + """ + Raise a clear ``NotImplementedError`` when a user passes an iProX + accession. iProX downloads need CAS authentication and Aspera-tokenised + ``faspe://`` URLs which pridepy does not handle yet. + """ + if Files.is_iprox_accession(accession): + raise NotImplementedError( + f"iProX accession {accession} is recognised but not yet supported. " + "iProX requires CAS authentication and Aspera-tokenised downloads; " + "track this in pridepy or use the iProX web interface for now." + ) + @staticmethod def _repo_uses_tls(accession: str) -> bool: """ @@ -491,22 +543,71 @@ def _download_massive_file_records( def _list_jpost_public_files(self, accession: str) -> List[Dict]: """ - Discover all public files for a JPOST dataset from its anonymous FTP tree. + Discover all public files for a JPOST dataset. + + Prefers the JPOST PROXI JSON endpoint at + ``https://repository.jpostdb.org/proxi/datasets/`` since it + returns file URLs with category labels and avoids the anonymous-FTP + rate limit that ``ftp.jpostdb.org`` applies per source IP. Falls back + to walking the FTP tree if PROXI is unreachable or returns no files. """ normalized_accession = accession.upper() - remote_root = self._get_jpost_public_root(normalized_accession) - remote_files = self._list_ftp_repo_files( - host=self.JPOST_ARCHIVE_FTP, - remote_root=remote_root, - error_label=f"JPOST dataset {normalized_accession}", + try: + return self._list_jpost_public_files_via_proxi(normalized_accession) + except Exception as proxi_error: + logging.warning( + f"JPOST PROXI listing failed for {normalized_accession} " + f"({proxi_error}); falling back to FTP tree walk." 
+ ) + remote_root = self._get_jpost_public_root(normalized_accession) + remote_files = self._list_ftp_repo_files( + host=self.JPOST_ARCHIVE_FTP, + remote_root=remote_root, + error_label=f"JPOST dataset {normalized_accession}", + ) + return [ + self._build_jpost_file_record( + normalized_accession, + self._get_jpost_public_ftp_url(normalized_accession, remote_file), + ) + for remote_file in remote_files + ] + + def _list_jpost_public_files_via_proxi(self, accession: str) -> List[Dict]: + """ + Fetch the JPOST PROXI dataset metadata and turn each ``datasetFiles`` + entry into a pridepy file record. The PROXI ``name`` field is mapped to + a PRIDE-style category so existing RAW/SEARCH/RESULT filtering works. + """ + import json as _json + + proxi_url = f"{self.JPOST_PROXI_BASE_URL}{accession}" + logging.info(f"Fetching JPOST PROXI metadata: {proxi_url}") + response = requests.get( + proxi_url, + headers={"Accept": "application/json"}, + timeout=30, ) - return [ - self._build_jpost_file_record( - normalized_accession, - self._get_jpost_public_ftp_url(normalized_accession, remote_file), + response.raise_for_status() + data = _json.loads(response.content) + dataset_files = data.get("datasetFiles") or [] + records: List[Dict] = [] + for entry in dataset_files: + value = (entry or {}).get("value") + if not value or not value.startswith("ftp://"): + continue + records.append( + self._build_jpost_file_record( + accession, + value, + category_from_proxi=(entry or {}).get("name"), + ) ) - for remote_file in remote_files - ] + if not records: + raise RuntimeError( + f"JPOST PROXI returned no FTP file URIs for {accession}" + ) + return records def _list_direct_download_files(self, accession: str) -> List[Dict]: """ @@ -587,6 +688,7 @@ def get_all_raw_file_list(self, project_accession): :param project_accession: PRIDE accession :return: raw file list in JSON format """ + self._raise_if_iprox(project_accession) if self.is_direct_download_accession(project_accession): record_files = 
self._list_direct_download_files(project_accession) return [ @@ -619,6 +721,7 @@ def download_all_raw_files( :param checksum_check: Download checksum for a given project. :return: None """ + self._raise_if_iprox(accession) if not (os.path.isdir(output_folder)): os.mkdir(output_folder) @@ -1115,6 +1218,7 @@ def download_file_by_name( :param aspera_maximum_bandwidth: Aspera maximum bandwidth :param checksum_check: Download checksum for a given project. """ + self._raise_if_iprox(accession) if not (os.path.isdir(output_folder)): os.mkdir(output_folder) @@ -1189,6 +1293,7 @@ def get_file_from_api(self, accession, file_name) -> List[Dict]: :param file_name: file name :return: file in json format """ + self._raise_if_iprox(accession) try: if self.is_direct_download_accession(accession): @@ -1556,6 +1661,7 @@ def download_files_by_list( """ if not file_names: raise ValueError("file_names must contain at least one filename") + self._raise_if_iprox(accession) if self.is_direct_download_accession(accession): all_files = self._list_direct_download_files(accession) @@ -1845,6 +1951,7 @@ def download_all_category_files( :param categories: List of file categories to download. :param category: Single file category (deprecated, use categories instead). """ + self._raise_if_iprox(accession) if categories is None: categories = [category] if category else ["RAW"] raw_files = self.get_all_category_file_list(accession, categories) @@ -2030,6 +2137,21 @@ def callback(data): f.seek(0) f.truncate() ftp.retrbinary(f"RETR {ftp_path}", callback) + + # Post-transfer integrity check: server-reported size must match + # the local size. Catches half-finished transfers that retrbinary + # didn't raise on (e.g. server closed the data channel early). + # The next iteration will REST-resume from where we left off. 
+ if total_size: + final_size = os.path.getsize(local_path) + if final_size != total_size: + attempt += 1 + logging.error( + f"Size mismatch for {local_path}: " + f"got {final_size} bytes, expected {total_size} " + f"(attempt {attempt})" + ) + continue logging.info(f"Successfully downloaded {local_path}") return except (socket.timeout, ftplib.error_temp, ftplib.error_perm) as e: diff --git a/pridepy/tests/test_ftp_download_validation.py b/pridepy/tests/test_ftp_download_validation.py new file mode 100644 index 0000000..10bbfb5 --- /dev/null +++ b/pridepy/tests/test_ftp_download_validation.py @@ -0,0 +1,91 @@ +"""Coverage for the size-mismatch detection added to ``_download_one_ftp_path``. + +The FTP server's ``SIZE`` reply is the only integrity signal direct downloads +have (MassIVE/JPOST don't publish per-file MD5 manifests like PRIDE). After +``retrbinary`` returns, we re-check the local size against the server-reported +size and treat a mismatch as a retryable failure. +""" +import os +import tempfile +from unittest import TestCase +from unittest.mock import MagicMock + +import pytest + +from pridepy.files.files import Files + + +def _make_fake_ftp(expected_size, write_bytes_per_call): + """Return a MagicMock FTP that writes ``write_bytes_per_call`` bytes per call. + + ``retrbinary`` is invoked once per attempt; we record how many attempts + happened by counting calls and produce a different payload size for each. 
+ """ + fake = MagicMock() + fake.size.return_value = expected_size + fake.sendcmd = MagicMock() + fake._call_count = 0 + + def retrbinary(cmd, callback): + idx = fake._call_count + fake._call_count += 1 + payload = b"x" * write_bytes_per_call[idx] + callback(payload) + + fake.retrbinary.side_effect = retrbinary + return fake + + +class TestSizeMismatchValidation(TestCase): + def test_size_mismatch_is_retried_then_succeeds(self): + """First attempt returns 50 bytes (expected 100) -> retry, second yields 50 more -> 100, OK.""" + with tempfile.TemporaryDirectory() as tmp: + local_path = os.path.join(tmp, "f.bin") + ftp = _make_fake_ftp(expected_size=100, write_bytes_per_call=[50, 50]) + + Files._download_one_ftp_path( + ftp=ftp, + ftp_path="/JPST000001/f.bin", + local_path=local_path, + skip_if_downloaded_already=False, + max_download_retries=3, + ) + + assert os.path.getsize(local_path) == 100 + assert ftp.retrbinary.call_count == 2 + # First attempt: file empty, no REST. Second: file has 50 bytes, REST 50 issued. 
+ sendcmd_args = [call.args[0] for call in ftp.sendcmd.call_args_list] + assert sendcmd_args == ["REST 50"] + + def test_size_mismatch_after_retries_raises(self): + """Three attempts all undersize -> RuntimeError after giving up.""" + with tempfile.TemporaryDirectory() as tmp: + local_path = os.path.join(tmp, "f.bin") + ftp = _make_fake_ftp(expected_size=100, write_bytes_per_call=[10, 10, 10]) + + with pytest.raises(RuntimeError, match="Giving up"): + Files._download_one_ftp_path( + ftp=ftp, + ftp_path="/JPST000001/f.bin", + local_path=local_path, + skip_if_downloaded_already=False, + max_download_retries=3, + ) + + assert ftp.retrbinary.call_count == 3 + + def test_correct_size_returns_without_retry(self): + with tempfile.TemporaryDirectory() as tmp: + local_path = os.path.join(tmp, "f.bin") + ftp = _make_fake_ftp(expected_size=50, write_bytes_per_call=[50]) + + Files._download_one_ftp_path( + ftp=ftp, + ftp_path="/JPST000001/f.bin", + local_path=local_path, + skip_if_downloaded_already=False, + max_download_retries=3, + ) + + assert os.path.getsize(local_path) == 50 + assert ftp.retrbinary.call_count == 1 diff --git a/pridepy/tests/test_iprox_guard.py b/pridepy/tests/test_iprox_guard.py new file mode 100644 index 0000000..f28f490 --- /dev/null +++ b/pridepy/tests/test_iprox_guard.py @@ -0,0 +1,62 @@ +"""iProX accession recognition and unsupported-accession guard. + +iProX direct downloads are not implemented (the iProX REST API gates listing +behind CAS authentication and files are served over Aspera with per-session +tokens). pridepy still recognises the accession format so the user gets a +clear ``NotImplementedError`` instead of a confusing PRIDE-API 404. 
+""" +import tempfile +from unittest import TestCase + +import pytest + +from pridepy.files.files import Files + + +class TestIProXGuard(TestCase): + def test_is_iprox_accession_matches_ipx_format(self): + assert Files.is_iprox_accession("IPX0000123") + assert Files.is_iprox_accession("IPX0000123000") + assert Files.is_iprox_accession("ipx1234567") + assert not Files.is_iprox_accession("PXD000012") + assert not Files.is_iprox_accession("MSV000012345") + assert not Files.is_iprox_accession("JPST000001") + assert not Files.is_iprox_accession("IPX12") + assert not Files.is_iprox_accession("") + assert not Files.is_iprox_accession(None) + + def test_iprox_is_not_a_direct_download_accession(self): + assert Files.is_direct_download_accession("IPX0000123000") is False + + def test_get_all_raw_file_list_raises_for_iprox(self): + files = Files() + with pytest.raises(NotImplementedError, match="iProX"): + files.get_all_raw_file_list("IPX0006033000") + + def test_download_file_by_name_raises_for_iprox(self): + files = Files() + with tempfile.TemporaryDirectory() as tmp_dir: + with pytest.raises(NotImplementedError, match="iProX"): + files.download_file_by_name( + accession="IPX0006033000", + file_name="foo.raw", + output_folder=tmp_dir, + skip_if_downloaded_already=False, + protocol="ftp", + username=None, + password=None, + aspera_maximum_bandwidth="100M", + checksum_check=False, + ) + + def test_download_all_raw_files_raises_for_iprox(self): + files = Files() + with tempfile.TemporaryDirectory() as tmp_dir: + with pytest.raises(NotImplementedError, match="iProX"): + files.download_all_raw_files( + accession="IPX0006033000", + output_folder=tmp_dir, + skip_if_downloaded_already=False, + protocol="ftp", + aspera_maximum_bandwidth="100M", + ) diff --git a/pridepy/tests/test_jpost_files.py b/pridepy/tests/test_jpost_files.py index 021401e..678adda 100644 --- a/pridepy/tests/test_jpost_files.py +++ b/pridepy/tests/test_jpost_files.py @@ -1,6 +1,7 @@ +import json import tempfile 
from unittest import TestCase -from unittest.mock import patch +from unittest.mock import MagicMock, patch from pridepy.files.files import Files @@ -88,3 +89,65 @@ def test_download_file_by_name_uses_jpost_ftp_listing(self): use_tls=False, parallel_files=1, ) + + def test_proxi_listing_maps_cv_name_to_category(self): + files = Files() + proxi_response = { + "datasetFiles": [ + { + "accession": "PRIDE:0000404", + "name": "Associated raw file URI", + "value": "ftp://ftp.jpostdb.org/JPST002311/sample01.raw", + }, + { + "accession": "PRIDE:0000408", + "name": "Search engine output file URI", + "value": "ftp://ftp.jpostdb.org/JPST002311/sample01.sne", + }, + { + "accession": "PRIDE:0000999", + "name": "Some unknown CV", + "value": "ftp://ftp.jpostdb.org/JPST002311/misc/sample01.txt", + }, + { + "accession": "PRIDE:0000404", + "name": "Associated raw file URI", + "value": "https://example.org/not-ftp.raw", + }, + ] + } + fake_response = MagicMock() + fake_response.content = json.dumps(proxi_response).encode("utf-8") + fake_response.raise_for_status = MagicMock() + with patch("pridepy.files.files.requests.get", return_value=fake_response) as req_mock: + records = files._list_jpost_public_files_via_proxi("JPST002311") + + req_mock.assert_called_once() + call_url = req_mock.call_args[0][0] + assert call_url == "https://repository.jpostdb.org/proxi/datasets/JPST002311" + # Non-FTP URI ignored; three FTP entries kept. + assert len(records) == 3 + cats = {r["fileName"]: r["fileCategory"]["value"] for r in records} + assert cats["sample01.raw"] == "RAW" + assert cats["sample01.sne"] == "SEARCH" + # Unknown CV falls back to path-based heuristic (collection "misc" -> OTHER). 
+ assert cats["sample01.txt"] == "OTHER" + + def test_proxi_falls_back_to_ftp_walk_on_error(self): + files = Files() + ftp_record = Files._build_jpost_file_record( + "JPST000001", "ftp://ftp.jpostdb.org/JPST000001/raw/x.raw" + ) + with patch.object( + Files, + "_list_jpost_public_files_via_proxi", + side_effect=RuntimeError("proxi down"), + ), patch.object( + Files, "_list_ftp_repo_files", return_value=["/JPST000001/raw/x.raw"] + ) as ftp_mock: + result = files._list_jpost_public_files("JPST000001") + + ftp_mock.assert_called_once() + assert len(result) == 1 + assert result[0]["fileName"] == "x.raw" + assert result[0]["source"] == "JPOST" From df165ab64ee1ec3e5779064e47cd32c255a4b0b0 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 12:23:40 +0100 Subject: [PATCH 04/21] iProX direct downloads via PX XML + anonymous HTTPS at download.iprox.org User pointed out the iProX dataset XML endpoint: https://www.iprox.cn/FAF016Controller/readXml.jsonp?fileId=file__xml Probing showed something simpler works: iProX publishes the ProteomeXchange XML for every public dataset at a deterministic, anonymous-accessible path on download.iprox.org. No CAS auth, no Aspera tokens, no fileId discovery needed: http://download.iprox.org/{accession}/PX_{accession}.xml The XML embeds Associated raw file URI / Search engine output file URI cvParams pointing at HTTPS file URLs on the same host (Accept-Ranges: bytes, so we get resume for free). The earlier 'iProX is auth-gated' finding was specifically about the iProX UI's CAS-protected JSON endpoints (PMD009Controller/findBySubProjectId.jsonp etc.); the public download server is anonymous. Changes ------- - IPROX_DOWNLOAD_BASE_URL, IPROX_PX_XML_URL_TEMPLATE, IPROX_PX_CATEGORY_MAP added (the latter is the same CV map JPOST PROXI uses). - _build_iprox_file_record() and _list_iprox_public_files() added, mirroring the JPOST helpers but using the PX XML schema. - is_direct_download_accession() now returns True for IPX accessions. 
- _raise_if_iprox() removed entirely. All entry points that previously called it no longer do, so IPX accessions flow through the unified direct-download dispatcher. - _list_direct_download_files() dispatches IPX -> _list_iprox_public_files. - _download_direct_download_records() now partitions URLs by scheme: ftp:// records go through download_ftp_urls (MassIVE/JPOST), http(s):// records through download_http_urls (iProX). A dataset whose records somehow contain both flows through both paths correctly. - download_http_urls() grew parallel_files + max_retries kwargs and a ThreadPoolExecutor path. The per-file worker _http_download_one() wraps _parallel_download() (reused from the globus codepath) with retry, so iProX gets the same HEAD-then-Range resume and restart-on- non-206 behaviour we already use for PRIDE HTTPS. Tests (26 pass) --------------- - test_iprox_guard.py renamed to test_iprox_files.py and rewritten as positive-path tests: regex coverage, IPX is a direct-download accession, PX XML parsing maps cvParam name -> PRIDE category, RAW filtering ignores non-FTP/HTTPS URIs, download_file_by_name routes iProX URLs to download_http_urls (not download_ftp_urls) with parallel_files=1. - All previously-added MassIVE / JPOST / size-validation tests still pass (20 + 6 iProX = 26 total). Live verification (this branch, against download.iprox.org) ----------------------------------------------------------- - IPX0017413000 listing: 7 files, correctly categorised (RAW + SEARCH). - download_file_by_name(IPX0017413000, protein_annotation_profile.xlsx): 3,253,230 B downloaded, MD5 c17baf230ffde1e2837ec4eb32dcea68, valid XLSX (PK header). - Range-based resume: pre-staged 1,000,000 B partial, completed to 3,253,230 B with the same MD5. 
- Parallel HTTPS: 3 worker downloads from IPX0017413000 (xlsx + 2 RAW) ran concurrently; cancelled mid-flight after observing all 3 files growing in parallel (xlsx finished at 3.25 MB, Tumor_NK1.raw was at 142 MB and Control_NK3.raw at 419 MB before cancellation). --- README.md | 24 +-- pridepy/files/files.py | 290 ++++++++++++++++++++++-------- pridepy/tests/test_iprox_files.py | 148 +++++++++++++++ pridepy/tests/test_iprox_guard.py | 62 ------- 4 files changed, 374 insertions(+), 150 deletions(-) create mode 100644 pridepy/tests/test_iprox_files.py delete mode 100644 pridepy/tests/test_iprox_guard.py diff --git a/README.md b/README.md index 9609f26..27bab43 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ You can: - download public and private PRIDE files -- download public MassIVE (`MSV...`) and JPOST (`JPST...`) datasets directly. MassIVE goes through FTPS at `massive-ftp.ucsd.edu`; JPOST uses the JSON PROXI endpoint at `repository.jpostdb.org` for listings and `ftp.jpostdb.org` for transfers +- download public MassIVE (`MSV...`), JPOST (`JPST...`), and iProX (`IPX...`) datasets directly. MassIVE goes through FTPS at `massive-ftp.ucsd.edu`; JPOST uses the JSON PROXI endpoint at `repository.jpostdb.org` for listings and `ftp.jpostdb.org` for transfers; iProX fetches the dataset's ProteomeXchange XML from `download.iprox.org` and downloads files over anonymous HTTPS - download by category (`RAW`, `SEARCH`, `RESULT`, etc.) 
stream project and file metadata - search projects by keyword and filters @@ -80,7 +80,7 @@ pridepy download-all-public-raw-files \ --checksum-check ``` -### 3) Download a public MassIVE or JPOST dataset directly +### 3) Download a public MassIVE, JPOST, or iProX dataset directly ```bash # MassIVE @@ -90,17 +90,21 @@ pridepy download-all-public-raw-files \ # JPOST pridepy download-all-public-raw-files \ - -a JPST000123 \ - -o ./downloads/JPST000123 + -a JPST002311 \ + -o ./downloads/JPST002311 + +# iProX +pridepy download-all-public-raw-files \ + -a IPX0017413000 \ + -o ./downloads/IPX0017413000 ``` For these direct downloads, `pridepy` enumerates the dataset from the repository: - **MassIVE** lists files by walking the FTPS tree at `massive-ftp.ucsd.edu` (TLS is required by the server). - **JPOST** lists files through the JSON PROXI endpoint at `https://repository.jpostdb.org/proxi/datasets/` and downloads them from `ftp.jpostdb.org` over plain FTP. The PROXI listing avoids the source-IP connection limit JPOST enforces on FTP. +- **iProX** fetches the dataset's ProteomeXchange XML from `http://download.iprox.org/{accession}/PX_{accession}.xml`, then downloads each referenced file from the same host over anonymous HTTPS. iProX exposes Aspera (`faspe://`) with username/password for very large bulk transfers; `pridepy` uses the public HTTPS endpoint instead so no iProX credentials are required. -Raw downloads follow each repository's own collection layout, so `download-all-public-raw-files` downloads the files stored under the dataset's `raw/` collection. Direct downloads support REST-based resume, per-file retries, parallel workers (`-w N` up to 3), and post-transfer size verification against the server-reported size. - -iProX accessions (`IPX...`) are recognised so the CLI gives you a clear "not supported yet" error rather than treating them as unknown PRIDE accessions. 
Native iProX download support is blocked on their REST API requiring CAS authentication and downloads going through Aspera with per-session tokens; track that work upstream. +Raw downloads follow each repository's own collection layout, so `download-all-public-raw-files` downloads the files stored under the dataset's `raw/` collection. Direct downloads support resume (REST for FTP, byte-Range for HTTPS), per-file retries, parallel workers (`-w N` up to 3), and post-transfer size verification against the server-reported size. ### 4) Download only selected categories @@ -111,7 +115,7 @@ pridepy download-all-public-category-files \ -c RAW,SEARCH ``` -You can also request a specific MassIVE / JPOST collection through the same category interface: +You can also request a specific MassIVE / JPOST / iProX collection through the same category interface: ```bash pridepy download-all-public-category-files \ @@ -256,13 +260,13 @@ print(f"RAW files: {len(raw_files)}") print(raw_files[0]["fileName"]) ``` -For MassIVE / JPOST accessions, the same method returns the files found under the dataset's `raw/` collection: +For MassIVE / JPOST / iProX accessions, the same method returns the files found under the dataset's `raw/` collection: ```python from pridepy.files.files import Files files = Files() -for accession in ("MSV000082297", "JPST000123"): +for accession in ("MSV000082297", "JPST002311", "IPX0017413000"): raw_files = files.get_all_raw_file_list(accession) print(f"{accession} raw files: {len(raw_files)}") ``` diff --git a/pridepy/files/files.py b/pridepy/files/files.py index ecacd16..0f8e029 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -75,6 +75,14 @@ class Files: "Sequence database URI": "FASTA", "Quantification file URI": "RESULT", } + IPROX_DOWNLOAD_BASE_URL = "http://download.iprox.org/" + IPROX_PX_XML_URL_TEMPLATE = ( + "http://download.iprox.org/{accession}/PX_{accession}.xml" + ) + # iProX PX XML uses the same PSI-MS cvParam "name" values as 
JPOST, so the + # JPOST PROXI category map applies. PX XML cvParam "Associated raw file URI" + # is the canonical raw-file label per the PSI-MS CV (MS:1002846). + IPROX_PX_CATEGORY_MAP = JPOST_PROXI_CATEGORY_MAP S3_URL = "https://hh.fire.sdo.ebi.ac.uk" S3_BUCKET = "pride-public" PROTOCOL_ORDER = ["aspera", "s3", "ftp", "globus"] @@ -346,45 +354,69 @@ def _build_jpost_file_record( "source": "JPOST", } + @staticmethod + def _build_iprox_file_record( + accession: str, https_url: str, category_from_px: Optional[str] = None + ) -> Dict: + """ + Build a pridepy file record for an iProX file. iProX exposes files + over anonymous HTTPS at + ``http://download.iprox.org/{accession}/{collection}/{file}``; + ``category_from_px`` is the ``cvParam`` ``name`` from the dataset's + ProteomeXchange XML (e.g. ``"Associated raw file URI"``). + """ + parsed = urlparse(https_url) + root_prefix = f"/{accession.upper()}/" + relative_path = parsed.path + if relative_path.startswith(root_prefix): + relative_path = relative_path[len(root_prefix) :] + relative_path = relative_path.lstrip("/") + collection = relative_path.split("/", 1)[0] if relative_path else "" + if category_from_px and category_from_px in Files.IPROX_PX_CATEGORY_MAP: + category = Files.IPROX_PX_CATEGORY_MAP[category_from_px] + else: + category = Files._map_massive_collection_to_category(collection) + return { + "accession": accession.upper(), + "fileName": os.path.basename(parsed.path), + "fileCategory": {"value": category}, + # ``FTP Protocol`` is the existing label the download dispatcher + # uses to locate a file URL; here it actually points at HTTPS. + # ``_download_direct_download_records`` routes by URL scheme. 
+ "publicFileLocations": [{"name": "FTP Protocol", "value": https_url}], + "relativePath": relative_path, + "collection": collection, + "source": "iProX", + } + @staticmethod def is_direct_download_accession(accession: str) -> bool: """ - Return True when the accession is served by a public FTP repository - that pridepy supports via direct downloads (no ProteomeXchange API). + Return True when the accession is served by a public repository that + pridepy supports via direct downloads (no ProteomeXchange API). + MassIVE and JPOST use FTP(S); iProX uses anonymous HTTPS via + ``download.iprox.org``. """ return ( Files.is_massive_accession(accession) or Files.is_jpost_accession(accession) + or Files.is_iprox_accession(accession) ) @staticmethod def is_iprox_accession(accession: str) -> bool: """ Return True when the accession looks like an iProX dataset accession - (``IPX`` followed by 7-10 digits). iProX is recognised so the CLI can - emit a clear error rather than treating IPX as an unknown PRIDE - accession; direct downloads from iProX are not yet supported because - their listing API requires CAS authentication and downloads go through - Aspera with per-session tokens. + (``IPX`` followed by 7-10 digits). iProX exposes the dataset + ProteomeXchange XML at + ``http://download.iprox.org/{accession}/PX_{accession}.xml`` and the + referenced files are downloadable from ``download.iprox.org`` over + anonymous HTTPS with byte-range support. """ if not accession: return False return bool(re.fullmatch(r"IPX\d{7,10}", accession.upper())) - @staticmethod - def _raise_if_iprox(accession: str) -> None: - """ - Raise a clear ``NotImplementedError`` when a user passes an iProX - accession. iProX downloads need CAS authentication and Aspera-tokenised - ``faspe://`` URLs which pridepy does not handle yet. - """ - if Files.is_iprox_accession(accession): - raise NotImplementedError( - f"iProX accession {accession} is recognised but not yet supported. 
" "iProX requires CAS authentication and Aspera-tokenised downloads; " - "track this in pridepy or use the iProX web interface for now." - ) - @staticmethod def _repo_uses_tls(accession: str) -> bool: """ @@ -609,14 +641,67 @@ def _list_jpost_public_files_via_proxi(self, accession: str) -> List[Dict]: ) return records + def _list_iprox_public_files(self, accession: str) -> List[Dict]: + """ + Discover all public files for an iProX dataset. + + iProX publishes the ProteomeXchange XML for every public dataset at a + deterministic path on its anonymous HTTPS download server:: + + http://download.iprox.org/{accession}/PX_{accession}.xml + + We fetch that XML, walk every ``DatasetFile`` element's ``cvParam`` entries, + and turn each ``Associated raw file URI`` (and sibling URIs for + search-engine output, result files, etc.) into a pridepy file record. + File downloads themselves go through plain HTTPS on the same host, + which supports ``Range`` requests for resume. + """ + normalized_accession = accession.upper() + xml_url = self.IPROX_PX_XML_URL_TEMPLATE.format(accession=normalized_accession) + logging.info(f"Fetching iProX PX XML: {xml_url}") + response = requests.get(xml_url, timeout=30) + response.raise_for_status() + try: + root = ET.fromstring(response.content) + except ET.ParseError as parse_error: + raise RuntimeError( + f"Unable to parse iProX PX XML for {normalized_accession}: {parse_error}" + ) from parse_error + + records: List[Dict] = [] + for dataset_file in root.iter("DatasetFile"): + for cv in dataset_file.findall("cvParam"): + name = cv.attrib.get("name") + value = cv.attrib.get("value") + if not value or not name or not name.endswith("URI"): + continue + if not value.lower().startswith(("http://", "https://")): + continue + records.append( + self._build_iprox_file_record( + normalized_accession, + value, + category_from_px=name, + ) + ) + if not records: + raise RuntimeError( + f"iProX PX XML for {normalized_accession} contained no downloadable HTTPS URIs" + ) + 
return records + def 
_list_direct_download_files(self, accession: str) -> List[Dict]: """ - Dispatch to the right FTP-based listing for a direct-download repository. + Dispatch to the right listing transport for a direct-download + repository: MassIVE walks FTPS, JPOST uses PROXI JSON over HTTPS with + an FTP fallback, iProX uses the dataset's PX XML over HTTPS. """ if self.is_massive_accession(accession): return self._list_massive_public_files(accession) if self.is_jpost_accession(accession): return self._list_jpost_public_files(accession) + if self.is_iprox_accession(accession): + return self._list_iprox_public_files(accession) raise ValueError( f"Accession {accession} is not a direct-download repository accession" ) @@ -631,28 +716,42 @@ def _download_direct_download_records( parallel_files: int = 1, ) -> None: """ - Download files from a direct-download repository (MassIVE/JPOST) via - anonymous FTP. Supports REST-based resume, per-file retries, and - parallel workers (one connection per worker, capped at file count). + Download files from a direct-download repository. + + MassIVE and JPOST use anonymous FTP(S) with REST-based resume and + per-host parallel workers. iProX uses anonymous HTTPS via + ``download.iprox.org`` with ``Range``-based resume and per-file + parallel workers. URLs are partitioned by scheme so a mixed batch + (e.g. a JPOST PX XML that ever pointed at HTTPS) routes correctly. """ - if protocol != "ftp": + if protocol not in ("ftp", "https", "http"): logging.warning( - "Direct downloads currently use ftp only. " + "Direct downloads currently use ftp / https only. " f"Ignoring requested protocol '{protocol}' for {accession}." 
) - ftp_urls = [self._get_download_url(file_record, "ftp") for file_record in file_records] - if not ftp_urls: + all_urls = [self._get_download_url(record, "ftp") for record in file_records] + ftp_urls = [u for u in all_urls if u.lower().startswith("ftp://")] + http_urls = [u for u in all_urls if u.lower().startswith(("http://", "https://"))] + if not ftp_urls and not http_urls: logging.info(f"No files matched for direct-download dataset {accession}") return - self.download_ftp_urls( - ftp_urls=ftp_urls, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - use_tls=self._repo_uses_tls(accession), - parallel_files=parallel_files, - ) + if ftp_urls: + self.download_ftp_urls( + ftp_urls=ftp_urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + use_tls=self._repo_uses_tls(accession), + parallel_files=parallel_files, + ) + if http_urls: + self.download_http_urls( + http_urls=http_urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + parallel_files=parallel_files, + ) async def stream_all_files_metadata(self, output_file, accession=None): """ @@ -688,7 +787,6 @@ def get_all_raw_file_list(self, project_accession): :param project_accession: PRIDE accession :return: raw file list in JSON format """ - self._raise_if_iprox(project_accession) if self.is_direct_download_accession(project_accession): record_files = self._list_direct_download_files(project_accession) return [ @@ -721,7 +819,6 @@ def download_all_raw_files( :param checksum_check: Download checksum for a given project. :return: None """ - self._raise_if_iprox(accession) if not (os.path.isdir(output_folder)): os.mkdir(output_folder) @@ -1218,7 +1315,6 @@ def download_file_by_name( :param aspera_maximum_bandwidth: Aspera maximum bandwidth :param checksum_check: Download checksum for a given project. 
""" - self._raise_if_iprox(accession) if not (os.path.isdir(output_folder)): os.mkdir(output_folder) @@ -1293,7 +1389,6 @@ def get_file_from_api(self, accession, file_name) -> List[Dict]: :param file_name: file name :return: file in json format """ - self._raise_if_iprox(accession) try: if self.is_direct_download_accession(accession): @@ -1661,7 +1756,6 @@ def download_files_by_list( """ if not file_names: raise ValueError("file_names must contain at least one filename") - self._raise_if_iprox(accession) if self.is_direct_download_accession(accession): all_files = self._list_direct_download_files(accession) @@ -1951,7 +2045,6 @@ def download_all_category_files( :param categories: List of file categories to download. :param category: Single file category (deprecated, use categories instead). """ - self._raise_if_iprox(accession) if categories is None: categories = [category] if category else ["RAW"] raw_files = self.get_all_category_file_list(accession, categories) @@ -2330,51 +2423,92 @@ def download_ftp_urls( max_download_retries=max_download_retries, ) + @staticmethod + def _http_download_one( + url: str, + output_folder: str, + skip_if_downloaded_already: bool, + max_retries: int = 3, + position: int = 0, + ) -> None: + """ + Download a single HTTP(S) URL with HEAD-then-Range resume and retry. + Used as the worker target for both the serial loop and the parallel + ThreadPoolExecutor path. Reuses :meth:`_parallel_download` so the same + resume / restart-on-non-206 behaviour is shared with globus downloads. 
+ """ + local_path = Files._local_path_for_url(url, output_folder) + if skip_if_downloaded_already and os.path.exists(local_path): + logging.info(f"Skipping download as file already exists: {local_path}") + return + last_error: Optional[Exception] = None + for attempt in range(1, max_retries + 1): + try: + Files._parallel_download(url, local_path, position=position) + logging.info(f"Successfully downloaded {local_path}") + return + except Exception as e: + last_error = e + logging.warning( + f"HTTP download attempt {attempt}/{max_retries} failed for {url}: {e}" + ) + raise RuntimeError( + f"Giving up on {local_path} after {max_retries} HTTP attempts" + ) from last_error + @staticmethod def download_http_urls( http_urls: List[str], output_folder: str, skip_if_downloaded_already: bool, + parallel_files: int = 1, + max_retries: int = 3, ) -> None: """ - Download a list of HTTP(S) URLs with resume support and progress bars. + Download a list of HTTP(S) URLs with HEAD-then-Range resume, per-file + retries, and an optional ``parallel_files`` worker pool. + + When ``parallel_files`` > 1, downloads run concurrently using a + :class:`ThreadPoolExecutor`. Each worker manages its own file (a new + ``requests`` session is opened inside ``_parallel_download``) so the + only shared resource is the output directory. 
""" if not os.path.isdir(output_folder): os.makedirs(output_folder, exist_ok=True) - session = Util.create_session_with_retries() - for url in http_urls: - try: - local_path = Files._local_path_for_url(url, output_folder) - if skip_if_downloaded_already and os.path.exists(local_path): - logging.info("Skipping download as file already exists") - continue - - if os.path.exists(local_path): - resume_size = os.path.getsize(local_path) - headers = {"Range": f"bytes={resume_size}-"} - mode = "ab" - else: - resume_size = 0 - headers = {} - mode = "wb" + if not http_urls: + return - with session.get(url, stream=True, headers=headers, timeout=(10, 60)) as r: - r.raise_for_status() - total_size = int(r.headers.get("content-length", 0)) + resume_size - block_size = 1024 * 1024 - with tqdm( - total=total_size, - unit="B", - unit_scale=True, - desc=local_path, - initial=resume_size, - ) as pbar: - with open(local_path, mode) as f: - for chunk in r.iter_content(chunk_size=block_size): - if chunk: - f.write(chunk) - pbar.update(len(chunk)) - logging.info(f"Successfully downloaded {local_path}") - except Exception as e: - logging.error(f"HTTP download failed for {url}: {str(e)}") + workers = max(1, min(parallel_files, len(http_urls))) + if workers > 1: + logging.info( + f"Downloading {len(http_urls)} HTTP(S) file(s) with {workers} parallel workers" + ) + with ThreadPoolExecutor(max_workers=workers) as executor: + futures = [ + executor.submit( + Files._http_download_one, + url, + output_folder, + skip_if_downloaded_already, + max_retries, + idx, + ) + for idx, url in enumerate(http_urls) + ] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + logging.error(f"Parallel HTTP download error: {e}") + else: + for url in http_urls: + try: + Files._http_download_one( + url, + output_folder, + skip_if_downloaded_already, + max_retries, + ) + except Exception as e: + logging.error(f"HTTP download failed for {url}: {e}") diff --git 
a/pridepy/tests/test_iprox_files.py b/pridepy/tests/test_iprox_files.py new file mode 100644 index 0000000..dfdcd21 --- /dev/null +++ b/pridepy/tests/test_iprox_files.py @@ -0,0 +1,148 @@ +"""iProX direct-download support. + +iProX publishes the ProteomeXchange XML for each dataset at a deterministic +path on its anonymous HTTPS download server:: + + http://download.iprox.org/<accession>/PX_<accession>.xml + +The referenced files are served from the same host over HTTPS with byte-range +support, so resume and parallel downloads use the same plumbing as PRIDE +HTTP(S) transfers. +""" +import tempfile +from unittest import TestCase +from unittest.mock import MagicMock, patch + +from pridepy.files.files import Files + + +IPROX_XML_FIXTURE = """ + + + + + + + + + + + + + + + + + + + + + +""".encode("utf-8") + + +class TestIProXFiles(TestCase): + def test_is_iprox_accession_matches_ipx_format(self): + assert Files.is_iprox_accession("IPX0000123") + assert Files.is_iprox_accession("IPX0000123000") + assert Files.is_iprox_accession("ipx1234567") + assert not Files.is_iprox_accession("PXD000012") + assert not Files.is_iprox_accession("MSV000012345") + assert not Files.is_iprox_accession("JPST000001") + assert not Files.is_iprox_accession("IPX12") + assert not Files.is_iprox_accession("") + assert not Files.is_iprox_accession(None) + + def test_iprox_is_a_direct_download_accession(self): + assert Files.is_direct_download_accession("IPX0017413000") + + def test_build_iprox_file_record_maps_px_cv_to_category(self): + record = Files._build_iprox_file_record( + "IPX0017413000", + "http://download.iprox.org/IPX0017413000/IPX0017413001/sample.raw", + category_from_px="Associated raw file URI", + ) + assert record["fileName"] == "sample.raw" + assert record["fileCategory"]["value"] == "RAW" + assert record["source"] == "iProX" + # _download_direct_download_records dispatches by URL scheme, so the + # publicFileLocations URL must still be the HTTPS download URL. 
+ assert record["publicFileLocations"][0]["value"].startswith("http://") + + def test_list_iprox_public_files_parses_px_xml(self): + files = Files() + fake_response = MagicMock() + fake_response.content = IPROX_XML_FIXTURE + fake_response.raise_for_status = MagicMock() + with patch( + "pridepy.files.files.requests.get", return_value=fake_response + ) as req_mock: + records = files._list_iprox_public_files("IPX0017413000") + + # The fetch hits the deterministic PX XML URL. + req_mock.assert_called_once() + called_url = req_mock.call_args[0][0] + assert called_url == ( + "http://download.iprox.org/IPX0017413000/PX_IPX0017413000.xml" + ) + + # 3 valid HTTPS records; the ftp:// "Other URI" cvParam was filtered out. + assert len(records) == 3 + cats = {r["fileName"]: r["fileCategory"]["value"] for r in records} + assert cats == { + "sample1.raw": "RAW", + "sample2.raw": "RAW", + "results.tsv": "SEARCH", + } + for r in records: + assert r["source"] == "iProX" + assert r["publicFileLocations"][0]["value"].startswith("http://") + + def test_get_all_raw_file_list_filters_iprox_records(self): + files = Files() + fake_response = MagicMock() + fake_response.content = IPROX_XML_FIXTURE + fake_response.raise_for_status = MagicMock() + with patch( + "pridepy.files.files.requests.get", return_value=fake_response + ), patch.object(Files, "stream_all_files_by_project") as pride_mock: + raw_files = files.get_all_raw_file_list("IPX0017413000") + + pride_mock.assert_not_called() + assert {r["fileName"] for r in raw_files} == {"sample1.raw", "sample2.raw"} + + def test_download_file_by_name_routes_iprox_to_http_urls(self): + files = Files() + fake_response = MagicMock() + fake_response.content = IPROX_XML_FIXTURE + fake_response.raise_for_status = MagicMock() + with tempfile.TemporaryDirectory() as tmp_dir, patch( + "pridepy.files.files.requests.get", return_value=fake_response + ), patch.object(Files, "download_http_urls") as http_mock, patch.object( + Files, "download_ftp_urls" + ) as 
ftp_mock: + files.download_file_by_name( + accession="IPX0017413000", + file_name="results.tsv", + output_folder=tmp_dir, + skip_if_downloaded_already=False, + protocol="ftp", + username=None, + password=None, + aspera_maximum_bandwidth="100M", + checksum_check=False, + ) + + # iProX is HTTPS, not FTP — FTP path must not be called. + ftp_mock.assert_not_called() + http_mock.assert_called_once() + kwargs = http_mock.call_args.kwargs + assert kwargs["http_urls"] == [ + "http://download.iprox.org/IPX0017413000/IPX0017413001/results.tsv" + ] + assert kwargs["parallel_files"] == 1 + assert kwargs["skip_if_downloaded_already"] is False diff --git a/pridepy/tests/test_iprox_guard.py b/pridepy/tests/test_iprox_guard.py deleted file mode 100644 index f28f490..0000000 --- a/pridepy/tests/test_iprox_guard.py +++ /dev/null @@ -1,62 +0,0 @@ -"""iProX accession recognition and unsupported-accession guard. - -iProX direct downloads are not implemented (the iProX REST API gates listing -behind CAS authentication and files are served over Aspera with per-session -tokens). pridepy still recognises the accession format so the user gets a -clear ``NotImplementedError`` instead of a confusing PRIDE-API 404. 
-""" -import tempfile -from unittest import TestCase - -import pytest - -from pridepy.files.files import Files - - -class TestIProXGuard(TestCase): - def test_is_iprox_accession_matches_ipx_format(self): - assert Files.is_iprox_accession("IPX0000123") - assert Files.is_iprox_accession("IPX0000123000") - assert Files.is_iprox_accession("ipx1234567") - assert not Files.is_iprox_accession("PXD000012") - assert not Files.is_iprox_accession("MSV000012345") - assert not Files.is_iprox_accession("JPST000001") - assert not Files.is_iprox_accession("IPX12") - assert not Files.is_iprox_accession("") - assert not Files.is_iprox_accession(None) - - def test_iprox_is_not_a_direct_download_accession(self): - assert Files.is_direct_download_accession("IPX0000123000") is False - - def test_get_all_raw_file_list_raises_for_iprox(self): - files = Files() - with pytest.raises(NotImplementedError, match="iProX"): - files.get_all_raw_file_list("IPX0006033000") - - def test_download_file_by_name_raises_for_iprox(self): - files = Files() - with tempfile.TemporaryDirectory() as tmp_dir: - with pytest.raises(NotImplementedError, match="iProX"): - files.download_file_by_name( - accession="IPX0006033000", - file_name="foo.raw", - output_folder=tmp_dir, - skip_if_downloaded_already=False, - protocol="ftp", - username=None, - password=None, - aspera_maximum_bandwidth="100M", - checksum_check=False, - ) - - def test_download_all_raw_files_raises_for_iprox(self): - files = Files() - with tempfile.TemporaryDirectory() as tmp_dir: - with pytest.raises(NotImplementedError, match="iProX"): - files.download_all_raw_files( - accession="IPX0006033000", - output_folder=tmp_dir, - skip_if_downloaded_already=False, - protocol="ftp", - aspera_maximum_bandwidth="100M", - ) From eedbef1b46dce227afdcb69bbc5061dd59eba608 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 15:36:25 +0100 Subject: [PATCH 05/21] refactor(providers): scaffold providers/ package with Provider ABC + Registry 
Empty scaffolding for the per-provider refactor (spec: docs/specs/2026-05-27-files-py-provider-refactor-design.md). Introduces the Provider abstract base class, BaseDirectDownloadProvider with a shared download_files() that partitions URLs by scheme and routes through Files.download_ftp_urls / Files.download_http_urls (preserving test patches), and a Registry for accession -> provider resolution. No behaviour change. No providers registered yet. --- pridepy/providers/__init__.py | 7 +++ pridepy/providers/base.py | 107 ++++++++++++++++++++++++++++++++++ pridepy/providers/registry.py | 38 ++++++++++++ 3 files changed, 152 insertions(+) create mode 100644 pridepy/providers/__init__.py create mode 100644 pridepy/providers/base.py create mode 100644 pridepy/providers/registry.py diff --git a/pridepy/providers/__init__.py b/pridepy/providers/__init__.py new file mode 100644 index 0000000..ee1de17 --- /dev/null +++ b/pridepy/providers/__init__.py @@ -0,0 +1,7 @@ +"""Per-repository provider classes used by :class:`pridepy.files.files.Files`. + +Each module under this package owns the listing, transport choice, and +record-construction logic for one repository: PRIDE, MassIVE, JPOST, iProX. +The :mod:`registry` module maps an accession to the right provider; the +:mod:`transport` module hosts the shared FTP/FTPS/HTTPS download plumbing. 
+""" diff --git a/pridepy/providers/base.py b/pridepy/providers/base.py new file mode 100644 index 0000000..f9fa8bc --- /dev/null +++ b/pridepy/providers/base.py @@ -0,0 +1,107 @@ +"""Abstract base classes for pridepy providers.""" +from abc import ABC, abstractmethod +from typing import ClassVar, Dict, List, Optional + + +class Provider(ABC): + """Abstract base for every repository pridepy can list and download from.""" + + name: ClassVar[str] # "pride", "massive", "jpost", "iprox" + + @staticmethod + @abstractmethod + def matches(accession: str) -> bool: + """Return True if this provider should handle ``accession``.""" + + @abstractmethod + def list_files(self, accession: str) -> List[Dict]: + """Return pridepy file records for the dataset. + + Each record is a dict shaped like the PRIDE V3 API file response, + with at minimum: ``accession``, ``fileName``, ``fileCategory`` + (with nested ``value``), ``publicFileLocations`` (list of + ``{"name": ..., "value": <url>}``). + """ + + @abstractmethod + def download_files( + self, + accession: str, + records: List[Dict], + output_folder: str, + skip_if_downloaded_already: bool, + protocol: str, + parallel_files: int = 1, + checksum_check: bool = False, + aspera_maximum_bandwidth: str = "100M", + username: Optional[str] = None, + password: Optional[str] = None, + ) -> None: + """Download the given records into ``output_folder``.""" + + +class BaseDirectDownloadProvider(Provider): + """Shared ``download_files`` for MassIVE / JPOST / iProX. + + Subclasses set the ``use_tls`` class var (True for MassIVE FTPS, False for + JPOST plain FTP) and override :meth:`list_files`. The shared + ``download_files`` implementation partitions record URLs by scheme: + ``ftp://`` URLs are handed to :meth:`Files.download_ftp_urls`; ``http(s)://`` + URLs go to :meth:`Files.download_http_urls`. It calls **back** into + ``Files`` so that test patches on ``Files.download_ftp_urls`` / + ``Files.download_http_urls`` continue to intercept the calls. 
+ """ + + use_tls: ClassVar[bool] = False + + def download_files( + self, + accession: str, + records: List[Dict], + output_folder: str, + skip_if_downloaded_already: bool, + protocol: str, + parallel_files: int = 1, + checksum_check: bool = False, + aspera_maximum_bandwidth: str = "100M", + username: Optional[str] = None, + password: Optional[str] = None, + ) -> None: + # Lazy import: providers know about Files (the facade) only via the + # public attributes that tests may patch; avoid module-load cycle. + from pridepy.files.files import Files + + if protocol not in ("ftp", "https", "http"): + import logging + logging.warning( + "Direct downloads currently use ftp / https only. " + f"Ignoring requested protocol '{protocol}' for {accession}." + ) + + all_urls = [Files._get_download_url(record, "ftp") for record in records] + ftp_urls = [u for u in all_urls if u.lower().startswith("ftp://")] + http_urls = [ + u for u in all_urls if u.lower().startswith(("http://", "https://")) + ] + if not ftp_urls and not http_urls: + import logging + logging.info( + f"No files matched for direct-download dataset {accession}" + ) + return + + if ftp_urls: + Files.download_ftp_urls( + ftp_urls=ftp_urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + use_tls=self.use_tls, + parallel_files=parallel_files, + ) + if http_urls: + Files.download_http_urls( + http_urls=http_urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + parallel_files=parallel_files, + ) diff --git a/pridepy/providers/registry.py b/pridepy/providers/registry.py new file mode 100644 index 0000000..7d2c20d --- /dev/null +++ b/pridepy/providers/registry.py @@ -0,0 +1,38 @@ +"""Accession-to-provider resolution. + +Providers are tried in priority order; direct-download repositories +(MassIVE / JPOST / iProX) are tried first because their accession patterns +are unambiguous. 
PRIDE is tried last and acts as the catch-all for +``PXD\\d+`` / ``PRD\\d+`` accessions. +""" +from typing import List, Type + +from pridepy.providers.base import Provider + +_PROVIDERS: List[Type[Provider]] = [] # populated by individual provider modules + + +def register(provider_cls: Type[Provider]) -> Type[Provider]: + """Register a provider class. Usable as a decorator.""" + if provider_cls not in _PROVIDERS: + _PROVIDERS.append(provider_cls) + return provider_cls + + +def resolve(accession: str) -> Provider: + """Return a provider instance that matches ``accession``. + + :raises ValueError: when no registered provider matches. + """ + for cls in _PROVIDERS: + if cls.matches(accession): + return cls() + raise ValueError(f"No provider registered for accession {accession!r}") + + +def is_known(accession: str) -> bool: + """Return True if any registered provider matches ``accession``.""" + for cls in _PROVIDERS: + if cls.matches(accession): + return True + return False From 912287d9db0f0f8867de5cc665aba827aca7225f Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 15:46:41 +0100 Subject: [PATCH 06/21] refactor(providers): move FTP/HTTPS transport into providers/transport.py Pulled download_ftp_urls, download_http_urls, and their helpers (_open_ftp_connection, _walk_ftp_tree, _list_ftp_repo_files, _download_one_ftp_path, _download_ftp_paths_serial/_parallel, _http_download_one, _parallel_download, _local_path_for_url) out of the Files class verbatim into providers/transport.py. Files keeps a shim staticmethod for each function that does a lazy import and delegates, so existing test patches like patch.object(Files, 'download_ftp_urls') keep intercepting calls. No behaviour change. Test suite green. 
--- pridepy/files/files.py | 487 ++++++------------------------- pridepy/providers/transport.py | 504 +++++++++++++++++++++++++++++++++ 2 files changed, 583 insertions(+), 408 deletions(-) create mode 100644 pridepy/providers/transport.py diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 0f8e029..0de339a 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -428,108 +428,21 @@ def _repo_uses_tls(accession: str) -> bool: @staticmethod def _walk_ftp_tree(ftp: FTP, remote_dir: str) -> List[str]: - """ - Recursively list files under a remote FTP directory. - """ - file_paths: List[str] = [] - try: - entries = list(ftp.mlsd(remote_dir)) - for name, facts in entries: - if name in {".", ".."}: - continue - child_path = posixpath.join(remote_dir.rstrip("/"), name) - if facts.get("type") == "dir": - file_paths.extend(Files._walk_ftp_tree(ftp, child_path)) - elif facts.get("type") == "file": - file_paths.append(child_path) - return file_paths - except (AttributeError, ftplib.error_perm): - pass - - current_dir = ftp.pwd() - listing: List[str] = [] - try: - ftp.cwd(remote_dir) - ftp.retrlines("LIST", listing.append) - for entry in listing: - parts = entry.split(maxsplit=8) - if len(parts) < 9: - continue - name = parts[8] - if name in {".", ".."}: - continue - child_path = posixpath.join(remote_dir.rstrip("/"), name) - if entry.startswith("d"): - file_paths.extend(Files._walk_ftp_tree(ftp, child_path)) - else: - file_paths.append(child_path) - finally: - ftp.cwd(current_dir) - return file_paths + """Shim — see :func:`pridepy.providers.transport._walk_ftp_tree`.""" + from pridepy.providers import transport + return transport._walk_ftp_tree(ftp=ftp, remote_dir=remote_dir) @staticmethod def _open_ftp_connection(host: str, use_tls: bool, timeout: int = 30) -> FTP: - """ - Open an anonymous FTP connection, transparently using FTPS when the - server requires TLS (e.g., MassIVE). 
When ``use_tls`` is False but the - server replies ``421 TLS is required`` to ``login``, transparently - retry with FTPS so callers don't need to know the policy in advance. - """ - if use_tls: - ftp: FTP = ftplib.FTP_TLS(host, timeout=timeout) - ftp.login() - ftp.prot_p() - else: - ftp = FTP(host, timeout=timeout) - try: - ftp.login() - except ftplib.error_temp as e: - if "TLS" in str(e).upper(): - try: - ftp.close() - except Exception: - pass - ftp = ftplib.FTP_TLS(host, timeout=timeout) - ftp.login() - ftp.prot_p() - else: - raise - ftp.set_pasv(True) - return ftp + """Shim — see :func:`pridepy.providers.transport._open_ftp_connection`.""" + from pridepy.providers import transport + return transport._open_ftp_connection(host=host, use_tls=use_tls, timeout=timeout) - def _list_ftp_repo_files( - self, - host: str, - remote_root: str, - error_label: str, - use_tls: bool = False, - ) -> List[str]: - """ - Connect to an anonymous FTP host (FTP or FTPS), walk a directory tree, - and return file paths. - - ``use_tls`` should be True for servers that reject plain FTP (e.g. - MassIVE). Centralizes connection lifecycle so a constructor failure - doesn't mask the underlying error in ``finally`` (PR #98 review). 
- """ - ftp: Optional[FTP] = None - try: - ftp = self._open_ftp_connection(host, use_tls=use_tls) - logging.info(f"Connected to FTP host: {host} (tls={use_tls})") - return self._walk_ftp_tree(ftp, remote_root) - except Exception as error: - raise RuntimeError( - f"Unable to list public files for {error_label}: {error}" - ) from error - finally: - if ftp is not None: - try: - ftp.quit() - except Exception: - try: - ftp.close() - except Exception: - pass + @staticmethod + def _list_ftp_repo_files(host, remote_root, error_label, use_tls=False): + """Shim — see :func:`pridepy.providers.transport._list_ftp_repo_files`.""" + from pridepy.providers import transport + return transport._list_ftp_repo_files(host=host, remote_root=remote_root, error_label=error_label, use_tls=use_tls) def _list_massive_public_files(self, accession: str) -> List[Dict]: """ @@ -1061,43 +974,9 @@ def _download_range(url, file_path, start, end, pbar, max_retries=3): @staticmethod def _parallel_download(url, file_path, position=0): - """Download a file via a single-connection HTTP stream with optional resume. 
- If a partial file exists and the server supports Range requests, resumes - from where it left off; otherwise restarts from scratch.""" - session = Util.create_session_with_retries() - try: - head = session.head(url, timeout=(30, 30)) - head.raise_for_status() - total_size = int(head.headers.get("content-length", 0)) - accept_ranges = head.headers.get("accept-ranges", "none").strip().lower() - except (requests.RequestException, ValueError) as exc: - logging.info(f"HEAD request failed, falling back to single connection: {exc}") - total_size = 0 - accept_ranges = "none" - - resume_size = 0 - if os.path.exists(file_path) and accept_ranges == "bytes" and total_size > 0: - resume_size = os.path.getsize(file_path) - if resume_size >= total_size: - logging.info(f"File already complete: {file_path}") - return - if resume_size > 0: - logging.info(f"Resuming download from {resume_size} bytes: {file_path}") - - headers = {"Range": f"bytes={resume_size}-"} if resume_size > 0 else {} - with session.get(url, headers=headers, stream=True, timeout=(30, 60)) as r: - r.raise_for_status() - if resume_size > 0 and r.status_code != 206: - logging.warning("Server did not honor Range request (status %s), restarting download", r.status_code) - resume_size = 0 - with tqdm(total=total_size, unit="B", unit_scale=True, desc=file_path, - initial=resume_size, position=position, leave=True) as pbar: - mode = "ab" if resume_size > 0 else "wb" - with open(file_path, mode, buffering=8 * 1024 * 1024) as f: - for chunk in r.iter_content(chunk_size=8 * 1024 * 1024): - if chunk: - f.write(chunk) - pbar.update(len(chunk)) + """Shim — see :func:`pridepy.providers.transport._parallel_download`.""" + from pridepy.providers import transport + return transport._parallel_download(url=url, file_path=file_path, position=position) @staticmethod def _globus_download_one(file, output_folder, skip_if_downloaded_already, max_retries=6, position=0): @@ -2177,8 +2056,9 @@ def download_px_raw_files( @staticmethod def 
_local_path_for_url(download_url: str, output_folder: str) -> str: - filename = os.path.basename(urlparse(download_url).path) - return os.path.join(output_folder, filename) + """Shim — see :func:`pridepy.providers.transport._local_path_for_url`.""" + from pridepy.providers import transport + return transport._local_path_for_url(download_url=download_url, output_folder=output_folder) @staticmethod def _download_one_ftp_path( @@ -2189,73 +2069,16 @@ def _download_one_ftp_path( max_download_retries: int, position: int = 0, ) -> None: - """ - Download a single FTP path over an existing connection, with REST resume - and per-file retry. Raises on giving up so the caller can decide what to do. - """ - if skip_if_downloaded_already and os.path.exists(local_path): - logging.info(f"Skipping download as file already exists: {local_path}") - return - - attempt = 0 - last_error: Optional[Exception] = None - while attempt < max_download_retries: - try: - total_size = ftp.size(ftp_path) - if os.path.exists(local_path): - current_size = os.path.getsize(local_path) - mode = "ab" - else: - current_size = 0 - mode = "wb" - - with open(local_path, mode) as f, tqdm( - total=total_size, - unit="B", - unit_scale=True, - desc=local_path, - initial=current_size, - position=position, - leave=True, - ) as pbar: - def callback(data): - f.write(data) - pbar.update(len(data)) - - if current_size: - try: - ftp.sendcmd(f"REST {current_size}") - except Exception: - current_size = 0 - f.seek(0) - f.truncate() - ftp.retrbinary(f"RETR {ftp_path}", callback) - - # Post-transfer integrity check: server-reported size must match - # the local size. Catches half-finished transfers that retrbinary - # didn't raise on (e.g. server closed the data channel early). - # The next iteration will REST-resume from where we left off. 
- if total_size: - final_size = os.path.getsize(local_path) - if final_size != total_size: - attempt += 1 - logging.error( - f"Size mismatch for {local_path}: " - f"got {final_size} bytes, expected {total_size} " - f"(attempt {attempt})" - ) - continue - logging.info(f"Successfully downloaded {local_path}") - return - except (socket.timeout, ftplib.error_temp, ftplib.error_perm) as e: - attempt += 1 - last_error = e - logging.error( - f"Download failed for {local_path} (attempt {attempt}): {e}" - ) - raise RuntimeError( - f"Giving up on {local_path} after {max_download_retries} attempts" - ) from last_error + """Shim — see :func:`pridepy.providers.transport._download_one_ftp_path`.""" + from pridepy.providers import transport + return transport._download_one_ftp_path( + ftp=ftp, + ftp_path=ftp_path, + local_path=local_path, + skip_if_downloaded_already=skip_if_downloaded_already, + max_download_retries=max_download_retries, + position=position, + ) @staticmethod def _download_ftp_paths_serial( @@ -2267,47 +2090,17 @@ def _download_ftp_paths_serial( max_connection_retries: int, max_download_retries: int, ) -> None: - """Download all paths from one host over a single (reused) connection.""" - connection_attempt = 0 - while connection_attempt < max_connection_retries: - try: - ftp = Files._open_ftp_connection(host, use_tls=use_tls) - logging.info(f"Connected to FTP host: {host} (tls={use_tls})") - for ftp_path in paths: - local_path = os.path.join(output_folder, os.path.basename(ftp_path)) - try: - Files._download_one_ftp_path( - ftp=ftp, - ftp_path=ftp_path, - local_path=local_path, - skip_if_downloaded_already=skip_if_downloaded_already, - max_download_retries=max_download_retries, - ) - except Exception as e: - logging.error( - f"Failed to download {ftp_path} from {host}: {e}" - ) - try: - ftp.quit() - except Exception: - try: - ftp.close() - except Exception: - pass - logging.info(f"Disconnected from FTP host: {host}") - return - except (socket.timeout, 
ftplib.error_temp, ftplib.error_perm, OSError) as e: - connection_attempt += 1 - logging.error( - f"FTP connection failed (attempt {connection_attempt}): {e}" - ) - if connection_attempt < max_connection_retries: - logging.info("Retrying connection...") - time.sleep(5) - else: - logging.error( - f"Giving up after {max_connection_retries} failed connection attempts to {host}." - ) + """Shim — see :func:`pridepy.providers.transport._download_ftp_paths_serial`.""" + from pridepy.providers import transport + return transport._download_ftp_paths_serial( + host=host, + paths=paths, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + use_tls=use_tls, + max_connection_retries=max_connection_retries, + max_download_retries=max_download_retries, + ) @staticmethod def _download_ftp_paths_parallel( @@ -2320,55 +2113,18 @@ def _download_ftp_paths_parallel( max_download_retries: int, parallel_files: int, ) -> None: - """ - Download paths concurrently using ``parallel_files`` workers; each - worker opens its own FTP connection so transfers don't serialize. 
- """ - def worker(ftp_path: str, position: int) -> None: - local_path = os.path.join(output_folder, os.path.basename(ftp_path)) - if skip_if_downloaded_already and os.path.exists(local_path): - logging.info(f"Skipping download as file already exists: {local_path}") - return - connection_attempt = 0 - while connection_attempt < max_connection_retries: - try: - ftp = Files._open_ftp_connection(host, use_tls=use_tls) - try: - Files._download_one_ftp_path( - ftp=ftp, - ftp_path=ftp_path, - local_path=local_path, - skip_if_downloaded_already=False, - max_download_retries=max_download_retries, - position=position, - ) - return - finally: - try: - ftp.quit() - except Exception: - try: - ftp.close() - except Exception: - pass - except (socket.timeout, ftplib.error_temp, ftplib.error_perm, OSError) as e: - connection_attempt += 1 - logging.error( - f"FTP connection failed for {ftp_path} (attempt {connection_attempt}): {e}" - ) - if connection_attempt < max_connection_retries: - time.sleep(5) - logging.error(f"Giving up on {ftp_path} from {host}") - - with ThreadPoolExecutor(max_workers=parallel_files) as executor: - futures = [ - executor.submit(worker, path, idx) for idx, path in enumerate(paths) - ] - for future in as_completed(futures): - try: - future.result() - except Exception as e: - logging.error(f"Parallel FTP download error: {e}") + """Shim — see :func:`pridepy.providers.transport._download_ftp_paths_parallel`.""" + from pridepy.providers import transport + return transport._download_ftp_paths_parallel( + host=host, + paths=paths, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + use_tls=use_tls, + max_connection_retries=max_connection_retries, + max_download_retries=max_download_retries, + parallel_files=parallel_files, + ) @staticmethod def download_ftp_urls( @@ -2380,48 +2136,17 @@ def download_ftp_urls( use_tls: bool = False, parallel_files: int = 1, ) -> None: - """ - Download a list of FTP URLs with retries, 
REST-based resume, and - optional parallel workers. - - :param use_tls: Open the FTP connection with TLS (FTP_TLS / PROT P). - Required for hosts that reject plain anonymous FTP (e.g. MassIVE). - When False but the server replies ``421 TLS is required``, the - connection is transparently retried over TLS. - :param parallel_files: When >1, downloads run concurrently with that - many worker connections per host (capped at the number of files). - """ - if not os.path.isdir(output_folder): - os.makedirs(output_folder, exist_ok=True) - - host_to_paths: Dict[str, List[str]] = {} - for url in ftp_urls: - parsed = urlparse(url) - host_to_paths.setdefault(parsed.hostname, []).append(parsed.path.lstrip("/")) - - for host, paths in host_to_paths.items(): - workers = max(1, min(parallel_files, len(paths))) - if workers > 1: - Files._download_ftp_paths_parallel( - host=host, - paths=paths, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - use_tls=use_tls, - max_connection_retries=max_connection_retries, - max_download_retries=max_download_retries, - parallel_files=workers, - ) - else: - Files._download_ftp_paths_serial( - host=host, - paths=paths, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - use_tls=use_tls, - max_connection_retries=max_connection_retries, - max_download_retries=max_download_retries, - ) + """Shim — see :func:`pridepy.providers.transport.download_ftp_urls`.""" + from pridepy.providers import transport + return transport.download_ftp_urls( + ftp_urls=ftp_urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + max_connection_retries=max_connection_retries, + max_download_retries=max_download_retries, + use_tls=use_tls, + parallel_files=parallel_files, + ) @staticmethod def _http_download_one( @@ -2431,30 +2156,15 @@ def _http_download_one( max_retries: int = 3, position: int = 0, ) -> None: - """ - Download a single HTTP(S) URL with 
HEAD-then-Range resume and retry. - Used as the worker target for both the serial loop and the parallel - ThreadPoolExecutor path. Reuses :meth:`_parallel_download` so the same - resume / restart-on-non-206 behaviour is shared with globus downloads. - """ - local_path = Files._local_path_for_url(url, output_folder) - if skip_if_downloaded_already and os.path.exists(local_path): - logging.info(f"Skipping download as file already exists: {local_path}") - return - last_error: Optional[Exception] = None - for attempt in range(1, max_retries + 1): - try: - Files._parallel_download(url, local_path, position=position) - logging.info(f"Successfully downloaded {local_path}") - return - except Exception as e: - last_error = e - logging.warning( - f"HTTP download attempt {attempt}/{max_retries} failed for {url}: {e}" - ) - raise RuntimeError( - f"Giving up on {local_path} after {max_retries} HTTP attempts" - ) from last_error + """Shim — see :func:`pridepy.providers.transport._http_download_one`.""" + from pridepy.providers import transport + return transport._http_download_one( + url=url, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + max_retries=max_retries, + position=position, + ) @staticmethod def download_http_urls( @@ -2464,51 +2174,12 @@ def download_http_urls( parallel_files: int = 1, max_retries: int = 3, ) -> None: - """ - Download a list of HTTP(S) URLs with HEAD-then-Range resume, per-file - retries, and an optional ``parallel_files`` worker pool. - - When ``parallel_files`` > 1, downloads run concurrently using a - :class:`ThreadPoolExecutor`. Each worker manages its own file (a new - ``requests`` session is opened inside ``_parallel_download``) so the - only shared resource is the output directory. 
- """ - if not os.path.isdir(output_folder): - os.makedirs(output_folder, exist_ok=True) - - if not http_urls: - return - - workers = max(1, min(parallel_files, len(http_urls))) - if workers > 1: - logging.info( - f"Downloading {len(http_urls)} HTTP(S) file(s) with {workers} parallel workers" - ) - with ThreadPoolExecutor(max_workers=workers) as executor: - futures = [ - executor.submit( - Files._http_download_one, - url, - output_folder, - skip_if_downloaded_already, - max_retries, - idx, - ) - for idx, url in enumerate(http_urls) - ] - for future in as_completed(futures): - try: - future.result() - except Exception as e: - logging.error(f"Parallel HTTP download error: {e}") - else: - for url in http_urls: - try: - Files._http_download_one( - url, - output_folder, - skip_if_downloaded_already, - max_retries, - ) - except Exception as e: - logging.error(f"HTTP download failed for {url}: {e}") + """Shim — see :func:`pridepy.providers.transport.download_http_urls`.""" + from pridepy.providers import transport + return transport.download_http_urls( + http_urls=http_urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + parallel_files=parallel_files, + max_retries=max_retries, + ) diff --git a/pridepy/providers/transport.py b/pridepy/providers/transport.py new file mode 100644 index 0000000..6649657 --- /dev/null +++ b/pridepy/providers/transport.py @@ -0,0 +1,504 @@ +"""Shared FTP / FTPS / HTTPS download transport. + +Stateless helpers used by the per-repository providers (and re-exported on +:class:`pridepy.files.files.Files` for backward compatibility with tests that +patch ``Files.download_ftp_urls`` etc.). 
+""" +import ftplib +import logging +import os +import socket +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from ftplib import FTP +from typing import Dict, List, Optional +from urllib.parse import urlparse + +import requests +from tqdm import tqdm + +from pridepy.util.api_handling import Util + + +def _local_path_for_url(download_url: str, output_folder: str) -> str: + filename = os.path.basename(urlparse(download_url).path) + return os.path.join(output_folder, filename) + + +def _open_ftp_connection(host: str, use_tls: bool, timeout: int = 30) -> FTP: + """ + Open an anonymous FTP connection, transparently using FTPS when the + server requires TLS (e.g., MassIVE). When ``use_tls`` is False but the + server replies ``421 TLS is required`` to ``login``, transparently + retry with FTPS so callers don't need to know the policy in advance. + """ + if use_tls: + ftp: FTP = ftplib.FTP_TLS(host, timeout=timeout) + ftp.login() + ftp.prot_p() + else: + ftp = FTP(host, timeout=timeout) + try: + ftp.login() + except ftplib.error_temp as e: + if "TLS" in str(e).upper(): + try: + ftp.close() + except Exception: + pass + ftp = ftplib.FTP_TLS(host, timeout=timeout) + ftp.login() + ftp.prot_p() + else: + raise + ftp.set_pasv(True) + return ftp + + +def _walk_ftp_tree(ftp: FTP, remote_dir: str) -> List[str]: + """ + Recursively list files under a remote FTP directory. 
+ """ + import posixpath + file_paths: List[str] = [] + try: + entries = list(ftp.mlsd(remote_dir)) + for name, facts in entries: + if name in {".", ".."}: + continue + child_path = posixpath.join(remote_dir.rstrip("/"), name) + if facts.get("type") == "dir": + file_paths.extend(_walk_ftp_tree(ftp, child_path)) + elif facts.get("type") == "file": + file_paths.append(child_path) + return file_paths + except (AttributeError, ftplib.error_perm): + pass + + current_dir = ftp.pwd() + listing: List[str] = [] + try: + ftp.cwd(remote_dir) + ftp.retrlines("LIST", listing.append) + for entry in listing: + parts = entry.split(maxsplit=8) + if len(parts) < 9: + continue + name = parts[8] + if name in {".", ".."}: + continue + child_path = posixpath.join(remote_dir.rstrip("/"), name) + if entry.startswith("d"): + file_paths.extend(_walk_ftp_tree(ftp, child_path)) + else: + file_paths.append(child_path) + finally: + ftp.cwd(current_dir) + return file_paths + + +def _list_ftp_repo_files( + host: str, + remote_root: str, + error_label: str, + use_tls: bool = False, +) -> List[str]: + """ + Connect to an anonymous FTP host (FTP or FTPS), walk a directory tree, + and return file paths. + + ``use_tls`` should be True for servers that reject plain FTP (e.g. + MassIVE). Centralizes connection lifecycle so a constructor failure + doesn't mask the underlying error in ``finally`` (PR #98 review). 
+ """ + ftp: Optional[FTP] = None + try: + ftp = _open_ftp_connection(host, use_tls=use_tls) + logging.info(f"Connected to FTP host: {host} (tls={use_tls})") + return _walk_ftp_tree(ftp, remote_root) + except Exception as error: + raise RuntimeError( + f"Unable to list public files for {error_label}: {error}" + ) from error + finally: + if ftp is not None: + try: + ftp.quit() + except Exception: + try: + ftp.close() + except Exception: + pass + + +def _download_one_ftp_path( + ftp: FTP, + ftp_path: str, + local_path: str, + skip_if_downloaded_already: bool, + max_download_retries: int, + position: int = 0, +) -> None: + """ + Download a single FTP path over an existing connection, with REST resume + and per-file retry. Raises on giving up so the caller can decide what to do. + """ + if skip_if_downloaded_already and os.path.exists(local_path): + logging.info(f"Skipping download as file already exists: {local_path}") + return + + attempt = 0 + last_error: Optional[Exception] = None + while attempt < max_download_retries: + try: + total_size = ftp.size(ftp_path) + if os.path.exists(local_path): + current_size = os.path.getsize(local_path) + mode = "ab" + else: + current_size = 0 + mode = "wb" + + with open(local_path, mode) as f, tqdm( + total=total_size, + unit="B", + unit_scale=True, + desc=local_path, + initial=current_size, + position=position, + leave=True, + ) as pbar: + def callback(data): + f.write(data) + pbar.update(len(data)) + + if current_size: + try: + ftp.sendcmd(f"REST {current_size}") + except Exception: + current_size = 0 + f.seek(0) + f.truncate() + ftp.retrbinary(f"RETR {ftp_path}", callback) + + # Post-transfer integrity check: server-reported size must match + # the local size. Catches half-finished transfers that retrbinary + # didn't raise on (e.g. server closed the data channel early). + # The next iteration will REST-resume from where we left off. 
+ if total_size: + final_size = os.path.getsize(local_path) + if final_size != total_size: + attempt += 1 + logging.error( + f"Size mismatch for {local_path}: " + f"got {final_size} bytes, expected {total_size} " + f"(attempt {attempt})" + ) + continue + logging.info(f"Successfully downloaded {local_path}") + return + except (socket.timeout, ftplib.error_temp, ftplib.error_perm) as e: + attempt += 1 + last_error = e + logging.error( + f"Download failed for {local_path} (attempt {attempt}): {e}" + ) + raise RuntimeError( + f"Giving up on {local_path} after {max_download_retries} attempts" + ) from last_error + + +def _download_ftp_paths_serial( + host: str, + paths: List[str], + output_folder: str, + skip_if_downloaded_already: bool, + use_tls: bool, + max_connection_retries: int, + max_download_retries: int, +) -> None: + """Download all paths from one host over a single (reused) connection.""" + connection_attempt = 0 + while connection_attempt < max_connection_retries: + try: + ftp = _open_ftp_connection(host, use_tls=use_tls) + logging.info(f"Connected to FTP host: {host} (tls={use_tls})") + for ftp_path in paths: + local_path = os.path.join(output_folder, os.path.basename(ftp_path)) + try: + _download_one_ftp_path( + ftp=ftp, + ftp_path=ftp_path, + local_path=local_path, + skip_if_downloaded_already=skip_if_downloaded_already, + max_download_retries=max_download_retries, + ) + except Exception as e: + logging.error( + f"Failed to download {ftp_path} from {host}: {e}" + ) + try: + ftp.quit() + except Exception: + try: + ftp.close() + except Exception: + pass + logging.info(f"Disconnected from FTP host: {host}") + return + except (socket.timeout, ftplib.error_temp, ftplib.error_perm, OSError) as e: + connection_attempt += 1 + logging.error( + f"FTP connection failed (attempt {connection_attempt}): {e}" + ) + if connection_attempt < max_connection_retries: + logging.info("Retrying connection...") + time.sleep(5) + else: + logging.error( + f"Giving up after 
{max_connection_retries} failed connection attempts to {host}." + ) + + +def _download_ftp_paths_parallel( + host: str, + paths: List[str], + output_folder: str, + skip_if_downloaded_already: bool, + use_tls: bool, + max_connection_retries: int, + max_download_retries: int, + parallel_files: int, +) -> None: + """ + Download paths concurrently using ``parallel_files`` workers; each + worker opens its own FTP connection so transfers don't serialize. + """ + def worker(ftp_path: str, position: int) -> None: + local_path = os.path.join(output_folder, os.path.basename(ftp_path)) + if skip_if_downloaded_already and os.path.exists(local_path): + logging.info(f"Skipping download as file already exists: {local_path}") + return + connection_attempt = 0 + while connection_attempt < max_connection_retries: + try: + ftp = _open_ftp_connection(host, use_tls=use_tls) + try: + _download_one_ftp_path( + ftp=ftp, + ftp_path=ftp_path, + local_path=local_path, + skip_if_downloaded_already=False, + max_download_retries=max_download_retries, + position=position, + ) + return + finally: + try: + ftp.quit() + except Exception: + try: + ftp.close() + except Exception: + pass + except (socket.timeout, ftplib.error_temp, ftplib.error_perm, OSError) as e: + connection_attempt += 1 + logging.error( + f"FTP connection failed for {ftp_path} (attempt {connection_attempt}): {e}" + ) + if connection_attempt < max_connection_retries: + time.sleep(5) + logging.error(f"Giving up on {ftp_path} from {host}") + + with ThreadPoolExecutor(max_workers=parallel_files) as executor: + futures = [ + executor.submit(worker, path, idx) for idx, path in enumerate(paths) + ] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + logging.error(f"Parallel FTP download error: {e}") + + +def download_ftp_urls( + ftp_urls: List[str], + output_folder: str, + skip_if_downloaded_already: bool, + max_connection_retries: int = 3, + max_download_retries: int = 3, + use_tls: bool = False, + 
parallel_files: int = 1, +) -> None: + """ + Download a list of FTP URLs with retries, REST-based resume, and + optional parallel workers. + + :param use_tls: Open the FTP connection with TLS (FTP_TLS / PROT P). + Required for hosts that reject plain anonymous FTP (e.g. MassIVE). + When False but the server replies ``421 TLS is required``, the + connection is transparently retried over TLS. + :param parallel_files: When >1, downloads run concurrently with that + many worker connections per host (capped at the number of files). + """ + if not os.path.isdir(output_folder): + os.makedirs(output_folder, exist_ok=True) + + host_to_paths: Dict[str, List[str]] = {} + for url in ftp_urls: + parsed = urlparse(url) + host_to_paths.setdefault(parsed.hostname, []).append(parsed.path.lstrip("/")) + + for host, paths in host_to_paths.items(): + workers = max(1, min(parallel_files, len(paths))) + if workers > 1: + _download_ftp_paths_parallel( + host=host, + paths=paths, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + use_tls=use_tls, + max_connection_retries=max_connection_retries, + max_download_retries=max_download_retries, + parallel_files=workers, + ) + else: + _download_ftp_paths_serial( + host=host, + paths=paths, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + use_tls=use_tls, + max_connection_retries=max_connection_retries, + max_download_retries=max_download_retries, + ) + + +def _parallel_download(url, file_path, position=0): + """Download a file via a single-connection HTTP stream with optional resume. 
+ If a partial file exists and the server supports Range requests, resumes + from where it left off; otherwise restarts from scratch.""" + session = Util.create_session_with_retries() + try: + head = session.head(url, timeout=(30, 30)) + head.raise_for_status() + total_size = int(head.headers.get("content-length", 0)) + accept_ranges = head.headers.get("accept-ranges", "none").strip().lower() + except (requests.RequestException, ValueError) as exc: + logging.info(f"HEAD request failed, falling back to single connection: {exc}") + total_size = 0 + accept_ranges = "none" + + resume_size = 0 + if os.path.exists(file_path) and accept_ranges == "bytes" and total_size > 0: + resume_size = os.path.getsize(file_path) + if resume_size >= total_size: + logging.info(f"File already complete: {file_path}") + return + if resume_size > 0: + logging.info(f"Resuming download from {resume_size} bytes: {file_path}") + + headers = {"Range": f"bytes={resume_size}-"} if resume_size > 0 else {} + with session.get(url, headers=headers, stream=True, timeout=(30, 60)) as r: + r.raise_for_status() + if resume_size > 0 and r.status_code != 206: + logging.warning("Server did not honor Range request (status %s), restarting download", r.status_code) + resume_size = 0 + with tqdm(total=total_size, unit="B", unit_scale=True, desc=file_path, + initial=resume_size, position=position, leave=True) as pbar: + mode = "ab" if resume_size > 0 else "wb" + with open(file_path, mode, buffering=8 * 1024 * 1024) as f: + for chunk in r.iter_content(chunk_size=8 * 1024 * 1024): + if chunk: + f.write(chunk) + pbar.update(len(chunk)) + + +def _http_download_one( + url: str, + output_folder: str, + skip_if_downloaded_already: bool, + max_retries: int = 3, + position: int = 0, +) -> None: + """ + Download a single HTTP(S) URL with HEAD-then-Range resume and retry. + Used as the worker target for both the serial loop and the parallel + ThreadPoolExecutor path. 
Reuses :meth:`_parallel_download` so the same + resume / restart-on-non-206 behaviour is shared with globus downloads. + """ + local_path = _local_path_for_url(url, output_folder) + if skip_if_downloaded_already and os.path.exists(local_path): + logging.info(f"Skipping download as file already exists: {local_path}") + return + last_error: Optional[Exception] = None + for attempt in range(1, max_retries + 1): + try: + _parallel_download(url, local_path, position=position) + logging.info(f"Successfully downloaded {local_path}") + return + except Exception as e: + last_error = e + logging.warning( + f"HTTP download attempt {attempt}/{max_retries} failed for {url}: {e}" + ) + raise RuntimeError( + f"Giving up on {local_path} after {max_retries} HTTP attempts" + ) from last_error + + +def download_http_urls( + http_urls: List[str], + output_folder: str, + skip_if_downloaded_already: bool, + parallel_files: int = 1, + max_retries: int = 3, +) -> None: + """ + Download a list of HTTP(S) URLs with HEAD-then-Range resume, per-file + retries, and an optional ``parallel_files`` worker pool. + + When ``parallel_files`` > 1, downloads run concurrently using a + :class:`ThreadPoolExecutor`. Each worker manages its own file (a new + ``requests`` session is opened inside ``_parallel_download``) so the + only shared resource is the output directory. 
+ """ + if not os.path.isdir(output_folder): + os.makedirs(output_folder, exist_ok=True) + + if not http_urls: + return + + workers = max(1, min(parallel_files, len(http_urls))) + if workers > 1: + logging.info( + f"Downloading {len(http_urls)} HTTP(S) file(s) with {workers} parallel workers" + ) + with ThreadPoolExecutor(max_workers=workers) as executor: + futures = [ + executor.submit( + _http_download_one, + url, + output_folder, + skip_if_downloaded_already, + max_retries, + idx, + ) + for idx, url in enumerate(http_urls) + ] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + logging.error(f"Parallel HTTP download error: {e}") + else: + for url in http_urls: + try: + _http_download_one( + url, + output_folder, + skip_if_downloaded_already, + max_retries, + ) + except Exception as e: + logging.error(f"HTTP download failed for {url}: {e}") From e16c8709ede0bc9d9c7d846e21c32c5c73743ae5 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 15:53:10 +0100 Subject: [PATCH 07/21] refactor(providers): move cross-cutting utilities into providers/util.py Moved Progress, _find_tsv_columns, _is_md5_checksum, read_checksum_file, compute_md5, validate_download, _remove_if_exists, _get_download_url, _resolve_local_path from Files into providers/util.py. Files keeps shim re-exports so existing references keep working. No behaviour change. Test suite green. 
--- pridepy/files/files.py | 166 ++++++---------------------------- pridepy/providers/util.py | 183 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+), 139 deletions(-) create mode 100644 pridepy/providers/util.py diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 0de339a..e919915 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -28,26 +28,9 @@ from pridepy.util.api_handling import Util -class Progress: - def __init__(self, total_size, file_name): - self.pbar = tqdm( - total=total_size, - unit="B", - unit_scale=True, - desc="Downloading {}".format(file_name), - ) - - def __call__(self, bytes_amount): - self.pbar.update(bytes_amount) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.pbar.close() - - def close(self): - self.pbar.close() +# Re-export from providers.util so external `from pridepy.files.files import Progress` +# still works. +from pridepy.providers.util import Progress # noqa: F401 class Files: @@ -105,146 +88,51 @@ def __init__(self): @staticmethod def _find_tsv_columns(header: str) -> Optional[Tuple[int, int]]: - """Return (name_idx, checksum_idx) from a TSV header, or None.""" - cols = [col.strip().lower() for col in header.split("\t")] - required_cols = {"file-name", "file-md5checksum", "file-size"} - if not required_cols.issubset(set(cols)): - return None - return cols.index("file-name"), cols.index("file-md5checksum") + """Shim — see :func:`pridepy.providers.util._find_tsv_columns`.""" + from pridepy.providers import util + return util._find_tsv_columns(header) @staticmethod def _is_md5_checksum(value: str) -> bool: - return len(value) == 32 and all(char in "0123456789abcdef" for char in value) + """Shim — see :func:`pridepy.providers.util._is_md5_checksum`.""" + from pridepy.providers import util + return util._is_md5_checksum(value) @staticmethod def read_checksum_file(checksum_file_path: str) -> Dict[str, str]: - """ - Read PRIDE API 
checksum TSV and build {file_name: md5} map. - Expected format: File-Name\tFile-MD5Checksum\tFile-Size - """ - checksums: Dict[str, str] = {} - if not checksum_file_path or not os.path.exists(checksum_file_path): - return checksums - - with open(checksum_file_path, "r", encoding="utf-8") as f: - header = f.readline().strip() - if not header: - return checksums - - col_indices = Files._find_tsv_columns(header) - if col_indices is None: - logging.warning(f"Unrecognized checksum file format: {header}") - return checksums - - name_idx, checksum_idx = col_indices - min_cols = max(name_idx, checksum_idx) + 1 - for line in f: - parts = line.strip().split("\t") - if len(parts) >= min_cols: - fn = os.path.basename(parts[name_idx].strip()) - cs = parts[checksum_idx].strip().lower() - if fn and Files._is_md5_checksum(cs): - checksums[fn] = cs - - return checksums + """Shim — see :func:`pridepy.providers.util.read_checksum_file`.""" + from pridepy.providers import util + return util.read_checksum_file(checksum_file_path) @staticmethod def compute_md5(file_path: str, chunk_size: int = 4 * 1024 * 1024) -> str: - """ - Compute an MD5 checksum for integrity validation, not for security use. - """ - try: - md5 = hashlib.md5(usedforsecurity=False) - except TypeError: - md5 = hashlib.md5() - with open(file_path, "rb") as file_handle: - while True: - chunk = file_handle.read(chunk_size) - if not chunk: - break - md5.update(chunk) - return md5.hexdigest() + """Shim — see :func:`pridepy.providers.util.compute_md5`.""" + from pridepy.providers import util + return util.compute_md5(file_path, chunk_size) @staticmethod def validate_download(file_path: str, expected_checksum: Optional[str] = None) -> Tuple[bool, str]: - """ - Validate a local file exists, is non-empty, and checksum matches when provided. 
- """ - if not os.path.exists(file_path): - return False, "file does not exist" - if os.path.getsize(file_path) == 0: - return False, "file is empty" - if expected_checksum: - actual_checksum = Files.compute_md5(file_path) - if actual_checksum.lower() != expected_checksum.lower(): - return False, ( - f"checksum mismatch (expected={expected_checksum.lower()}, actual={actual_checksum.lower()})" - ) - return True, "ok" + """Shim — see :func:`pridepy.providers.util.validate_download`.""" + from pridepy.providers import util + return util.validate_download(file_path, expected_checksum) @staticmethod def _remove_if_exists(file_path: str) -> None: - """ - Remove a file if it already exists locally. - """ - if os.path.exists(file_path): - os.remove(file_path) + """Shim — see :func:`pridepy.providers.util._remove_if_exists`.""" + from pridepy.providers import util + return util._remove_if_exists(file_path) @staticmethod def _get_download_url(file_record: Dict, protocol: str) -> str: - """ - Resolve the public download URL for a file and protocol. - - Raises ValueError when the requested protocol has no suitable location. - Aspera requires a dedicated "Aspera Protocol" entry; ftp/s3/globus - derive their URL from the "FTP Protocol" entry (falling back to an - arbitrary non-Aspera location would produce a URL the caller cannot - actually transfer with). 
- """ - locations = file_record.get("publicFileLocations", []) - if not locations: - raise ValueError("No public file locations present") - - aspera_url = None - ftp_url = None - for location in locations: - name = location.get("name") - if name == "Aspera Protocol": - aspera_url = location.get("value") - elif name == "FTP Protocol": - ftp_url = location.get("value") - - if protocol == "aspera": - if not aspera_url: - raise ValueError("Aspera URL not available") - return aspera_url - - if not ftp_url: - raise ValueError("FTP URL not available") - if protocol == "ftp": - return ftp_url - if protocol == "globus": - return ftp_url.replace( - Files.PRIDE_ARCHIVE_FTP_URL_PREFIX, - Files.PRIDE_ARCHIVE_HTTPS_URL_PREFIX, - 1, - ) - if protocol == "s3": - return ftp_url - raise ValueError(f"Unsupported protocol: {protocol}") + """Shim — see :func:`pridepy.providers.util._get_download_url`.""" + from pridepy.providers import util + return util._get_download_url(file_record, protocol) @staticmethod def _resolve_local_path(file_record: Dict, output_folder: str) -> str: - """ - Compute the canonical local path for a file regardless of transfer protocol. - """ - try: - canonical_url = Files._get_download_url(file_record, "ftp") - except ValueError: - canonical_url = "" - if canonical_url: - return Files.get_output_file_name(canonical_url, file_record, output_folder) - return os.path.join(output_folder, file_record["fileName"]) + """Shim — see :func:`pridepy.providers.util._resolve_local_path`.""" + from pridepy.providers import util + return util._resolve_local_path(file_record, output_folder) @staticmethod def _protocol_sequence(protocol: str) -> List[str]: diff --git a/pridepy/providers/util.py b/pridepy/providers/util.py new file mode 100644 index 0000000..0fc5791 --- /dev/null +++ b/pridepy/providers/util.py @@ -0,0 +1,183 @@ +"""Cross-cutting utilities used by providers and the Files facade. 
+ +Pure functions (and one tiny Progress class) for checksums, record-shape +helpers, and download progress. Originally on ``Files`` as @staticmethods; +moved here so providers can use them without depending on Files at import +time, and Files keeps shim re-exports for backward compatibility with +existing test patches. +""" +import hashlib +import logging +import os +from typing import Dict, List, Optional, Tuple + +from tqdm import tqdm + + +class Progress: + def __init__(self, total_size, file_name): + self.pbar = tqdm( + total=total_size, + unit="B", + unit_scale=True, + desc="Downloading {}".format(file_name), + ) + + def __call__(self, bytes_amount): + self.pbar.update(bytes_amount) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.pbar.close() + + def close(self): + self.pbar.close() + + +def _find_tsv_columns(header: str) -> Optional[Tuple[int, int]]: + """Return (name_idx, checksum_idx) from a TSV header, or None.""" + cols = [col.strip().lower() for col in header.split("\t")] + required_cols = {"file-name", "file-md5checksum", "file-size"} + if not required_cols.issubset(set(cols)): + return None + return cols.index("file-name"), cols.index("file-md5checksum") + + +def _is_md5_checksum(value: str) -> bool: + return len(value) == 32 and all(char in "0123456789abcdef" for char in value) + + +def read_checksum_file(checksum_file_path: str) -> Dict[str, str]: + """ + Read PRIDE API checksum TSV and build {file_name: md5} map. 
+ Expected format: File-Name\tFile-MD5Checksum\tFile-Size + """ + checksums: Dict[str, str] = {} + if not checksum_file_path or not os.path.exists(checksum_file_path): + return checksums + + with open(checksum_file_path, "r", encoding="utf-8") as f: + header = f.readline().strip() + if not header: + return checksums + + col_indices = _find_tsv_columns(header) + if col_indices is None: + logging.warning(f"Unrecognized checksum file format: {header}") + return checksums + + name_idx, checksum_idx = col_indices + min_cols = max(name_idx, checksum_idx) + 1 + for line in f: + parts = line.strip().split("\t") + if len(parts) >= min_cols: + fn = os.path.basename(parts[name_idx].strip()) + cs = parts[checksum_idx].strip().lower() + if fn and _is_md5_checksum(cs): + checksums[fn] = cs + + return checksums + + +def compute_md5(file_path: str, chunk_size: int = 4 * 1024 * 1024) -> str: + """ + Compute an MD5 checksum for integrity validation, not for security use. + """ + try: + md5 = hashlib.md5(usedforsecurity=False) + except TypeError: + md5 = hashlib.md5() + with open(file_path, "rb") as file_handle: + while True: + chunk = file_handle.read(chunk_size) + if not chunk: + break + md5.update(chunk) + return md5.hexdigest() + + +def validate_download(file_path: str, expected_checksum: Optional[str] = None) -> Tuple[bool, str]: + """ + Validate a local file exists, is non-empty, and checksum matches when provided. + """ + if not os.path.exists(file_path): + return False, "file does not exist" + if os.path.getsize(file_path) == 0: + return False, "file is empty" + if expected_checksum: + actual_checksum = compute_md5(file_path) + if actual_checksum.lower() != expected_checksum.lower(): + return False, ( + f"checksum mismatch (expected={expected_checksum.lower()}, actual={actual_checksum.lower()})" + ) + return True, "ok" + + +def _remove_if_exists(file_path: str) -> None: + """ + Remove a file if it already exists locally. 
+ """ + if os.path.exists(file_path): + os.remove(file_path) + + +def _get_download_url(file_record: Dict, protocol: str) -> str: + """ + Resolve the public download URL for a file and protocol. + + Raises ValueError when the requested protocol has no suitable location. + Aspera requires a dedicated "Aspera Protocol" entry; ftp/s3/globus + derive their URL from the "FTP Protocol" entry (falling back to an + arbitrary non-Aspera location would produce a URL the caller cannot + actually transfer with). + """ + from pridepy.files.files import Files + + locations = file_record.get("publicFileLocations", []) + if not locations: + raise ValueError("No public file locations present") + + aspera_url = None + ftp_url = None + for location in locations: + name = location.get("name") + if name == "Aspera Protocol": + aspera_url = location.get("value") + elif name == "FTP Protocol": + ftp_url = location.get("value") + + if protocol == "aspera": + if not aspera_url: + raise ValueError("Aspera URL not available") + return aspera_url + + if not ftp_url: + raise ValueError("FTP URL not available") + if protocol == "ftp": + return ftp_url + if protocol == "globus": + return ftp_url.replace( + Files.PRIDE_ARCHIVE_FTP_URL_PREFIX, + Files.PRIDE_ARCHIVE_HTTPS_URL_PREFIX, + 1, + ) + if protocol == "s3": + return ftp_url + raise ValueError(f"Unsupported protocol: {protocol}") + + +def _resolve_local_path(file_record: Dict, output_folder: str) -> str: + """ + Compute the canonical local path for a file regardless of transfer protocol. 
+ """ + from pridepy.files.files import Files + + try: + canonical_url = _get_download_url(file_record, "ftp") + except ValueError: + canonical_url = "" + if canonical_url: + return Files.get_output_file_name(canonical_url, file_record, output_folder) + return os.path.join(output_folder, file_record["fileName"]) From 41539e7ab898b06d96605eec159bbec4a6fa13cc Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 15:59:01 +0100 Subject: [PATCH 08/21] refactor(providers): extract MassiveProvider Moved MassIVE listing + record building from Files into providers/massive.py as MassiveProvider(BaseDirectDownloadProvider). Provider is registered with the Registry. Files keeps shim methods (is_massive_accession, _list_massive_public_files, _build_massive_file_record, _get_massive_public_root, _get_massive_public_ftp_url, _map_massive_collection_to_category) that delegate to the provider. MASSIVE_CATEGORY_MAP / MASSIVE_ARCHIVE_FTP constants remain on Files as class-attribute re-exports. All 10 MassIVE tests pass without modification. Full suite green. --- pridepy/files/files.py | 85 +++++++++---------------------- pridepy/providers/massive.py | 97 ++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 62 deletions(-) create mode 100644 pridepy/providers/massive.py diff --git a/pridepy/files/files.py b/pridepy/files/files.py index e919915..328150e 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -44,8 +44,15 @@ class Files: PRIDE_ARCHIVE_FTP = "ftp.pride.ebi.ac.uk" PRIDE_ARCHIVE_FTP_URL_PREFIX = "ftp://ftp.pride.ebi.ac.uk/" PRIDE_ARCHIVE_HTTPS_URL_PREFIX = "https://ftp.pride.ebi.ac.uk/" - MASSIVE_ARCHIVE_FTP = "massive-ftp.ucsd.edu" - MASSIVE_ARCHIVE_FTP_URL_PREFIX = "ftp://massive-ftp.ucsd.edu/v01/" + # Re-exported from providers/massive.py — kept here for back-compat. 
+ from pridepy.providers.massive import ( # noqa: E402 + MASSIVE_CATEGORY_MAP as _MASSIVE_CATEGORY_MAP, + MassiveProvider as _MassiveProvider, + ) + MASSIVE_CATEGORY_MAP = _MASSIVE_CATEGORY_MAP + MASSIVE_ARCHIVE_FTP = _MassiveProvider.ARCHIVE_FTP + MASSIVE_ARCHIVE_FTP_URL_PREFIX = _MassiveProvider.ARCHIVE_FTP_URL_PREFIX + del _MASSIVE_CATEGORY_MAP, _MassiveProvider JPOST_ARCHIVE_FTP = "ftp.jpostdb.org" JPOST_ARCHIVE_FTP_URL_PREFIX = "ftp://ftp.jpostdb.org/" JPOST_PROXI_BASE_URL = "https://repository.jpostdb.org/proxi/datasets/" @@ -69,18 +76,6 @@ class Files: S3_URL = "https://hh.fire.sdo.ebi.ac.uk" S3_BUCKET = "pride-public" PROTOCOL_ORDER = ["aspera", "s3", "ftp", "globus"] - MASSIVE_CATEGORY_MAP = { - "raw": "RAW", - "peak": "PEAK", - "ccms_peak": "PEAK", - "search": "SEARCH", - "result": "RESULT", - "ccms_result": "RESULT", - "quant": "RESULT", - "fasta": "FASTA", - "spectrum_library": "SPECTRUM_LIBRARY", - "library": "SPECTRUM_LIBRARY", - } logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") def __init__(self): @@ -145,48 +140,29 @@ def _protocol_sequence(protocol: str) -> List[str]: @staticmethod def is_massive_accession(accession: str) -> bool: - """ - Return True when the accession looks like a MassIVE dataset accession. 
- """ - if not accession: - return False - return bool(re.fullmatch(r"R?MSV\d{9}", accession.upper())) + """Shim — see :meth:`pridepy.providers.massive.MassiveProvider.matches`.""" + from pridepy.providers.massive import MassiveProvider + return MassiveProvider.matches(accession) @staticmethod def _get_massive_public_root(accession: str) -> str: - normalized_accession = accession.upper() - return f"/v01/{normalized_accession}" + from pridepy.providers.massive import MassiveProvider + return MassiveProvider._get_public_root(accession) @staticmethod def _get_massive_public_ftp_url(accession: str, remote_path: str) -> str: - root_path = Files._get_massive_public_root(accession).rstrip("/") - relative_path = remote_path - if remote_path.startswith(root_path): - relative_path = remote_path[len(root_path) :].lstrip("/") - return f"{Files.MASSIVE_ARCHIVE_FTP_URL_PREFIX}{accession.upper()}/{relative_path}" + from pridepy.providers.massive import MassiveProvider + return MassiveProvider._get_public_ftp_url(accession, remote_path) @staticmethod def _map_massive_collection_to_category(collection: str) -> str: - return Files.MASSIVE_CATEGORY_MAP.get(collection.lower(), "OTHER") + from pridepy.providers.massive import MassiveProvider + return MassiveProvider._map_collection_to_category(collection) @staticmethod def _build_massive_file_record(accession: str, ftp_url: str) -> Dict: - parsed = urlparse(ftp_url) - root_prefix = f"/v01/{accession.upper()}/" - relative_path = parsed.path - if relative_path.startswith(root_prefix): - relative_path = relative_path[len(root_prefix) :] - relative_path = relative_path.lstrip("/") - collection = relative_path.split("/", 1)[0] if relative_path else "" - return { - "accession": accession.upper(), - "fileName": os.path.basename(parsed.path), - "fileCategory": {"value": Files._map_massive_collection_to_category(collection)}, - "publicFileLocations": [{"name": "FTP Protocol", "value": ftp_url}], - "relativePath": relative_path, - "collection": 
collection, - "source": "MassIVE", - } + from pridepy.providers.massive import MassiveProvider + return MassiveProvider._build_file_record(accession, ftp_url) @staticmethod def is_jpost_accession(accession: str) -> bool: @@ -333,24 +309,9 @@ def _list_ftp_repo_files(host, remote_root, error_label, use_tls=False): return transport._list_ftp_repo_files(host=host, remote_root=remote_root, error_label=error_label, use_tls=use_tls) def _list_massive_public_files(self, accession: str) -> List[Dict]: - """ - Discover all public files for a MassIVE dataset from its anonymous FTP tree. - """ - normalized_accession = accession.upper() - remote_root = self._get_massive_public_root(normalized_accession) - remote_files = self._list_ftp_repo_files( - host=self.MASSIVE_ARCHIVE_FTP, - remote_root=remote_root, - error_label=f"MassIVE dataset {normalized_accession}", - use_tls=True, - ) - return [ - self._build_massive_file_record( - normalized_accession, - self._get_massive_public_ftp_url(normalized_accession, remote_file), - ) - for remote_file in remote_files - ] + """Shim — see :meth:`pridepy.providers.massive.MassiveProvider.list_files`.""" + from pridepy.providers.massive import MassiveProvider + return MassiveProvider().list_files(accession) def _download_massive_file_records( self, diff --git a/pridepy/providers/massive.py b/pridepy/providers/massive.py new file mode 100644 index 0000000..cdc466b --- /dev/null +++ b/pridepy/providers/massive.py @@ -0,0 +1,97 @@ +"""MassIVE direct-download provider. + +Lists files by walking the FTPS tree at massive-ftp.ucsd.edu (TLS is +required by the server). Downloads files via the shared transport layer +with ``use_tls=True``. 
+""" +import os +import re +from typing import ClassVar, Dict, List +from urllib.parse import urlparse + +from pridepy.providers import registry +from pridepy.providers.base import BaseDirectDownloadProvider + + +MASSIVE_CATEGORY_MAP = { + "raw": "RAW", + "peak": "PEAK", + "ccms_peak": "PEAK", + "search": "SEARCH", + "result": "RESULT", + "ccms_result": "RESULT", + "quant": "RESULT", + "fasta": "FASTA", + "spectrum_library": "SPECTRUM_LIBRARY", + "library": "SPECTRUM_LIBRARY", +} + + +@registry.register +class MassiveProvider(BaseDirectDownloadProvider): + name: ClassVar[str] = "massive" + use_tls: ClassVar[bool] = True + + ARCHIVE_FTP: ClassVar[str] = "massive-ftp.ucsd.edu" + ARCHIVE_FTP_URL_PREFIX: ClassVar[str] = "ftp://massive-ftp.ucsd.edu/v01/" + + @staticmethod + def matches(accession: str) -> bool: + """Return True when ``accession`` is a MassIVE dataset accession.""" + if not accession: + return False + return bool(re.fullmatch(r"R?MSV\d{9}", accession.upper())) + + @staticmethod + def _get_public_root(accession: str) -> str: + return f"/v01/{accession.upper()}" + + @classmethod + def _get_public_ftp_url(cls, accession: str, remote_path: str) -> str: + root_path = cls._get_public_root(accession).rstrip("/") + relative_path = remote_path + if remote_path.startswith(root_path): + relative_path = remote_path[len(root_path):].lstrip("/") + return f"{cls.ARCHIVE_FTP_URL_PREFIX}{accession.upper()}/{relative_path}" + + @staticmethod + def _map_collection_to_category(collection: str) -> str: + return MASSIVE_CATEGORY_MAP.get(collection.lower(), "OTHER") + + @classmethod + def _build_file_record(cls, accession: str, ftp_url: str) -> Dict: + """Build a pridepy file record from an FTP URL inside the dataset.""" + parsed = urlparse(ftp_url) + root_prefix = f"/v01/{accession.upper()}/" + relative_path = parsed.path + if relative_path.startswith(root_prefix): + relative_path = relative_path[len(root_prefix):] + relative_path = relative_path.lstrip("/") + collection = 
relative_path.split("/", 1)[0] if relative_path else "" + return { + "accession": accession.upper(), + "fileName": os.path.basename(parsed.path), + "fileCategory": {"value": cls._map_collection_to_category(collection)}, + "publicFileLocations": [{"name": "FTP Protocol", "value": ftp_url}], + "relativePath": relative_path, + "collection": collection, + "source": "MassIVE", + } + + def list_files(self, accession: str) -> List[Dict]: + from pridepy.providers import transport + normalized = accession.upper() + remote_root = self._get_public_root(normalized) + remote_files = transport._list_ftp_repo_files( + host=self.ARCHIVE_FTP, + remote_root=remote_root, + error_label=f"MassIVE dataset {normalized}", + use_tls=True, + ) + return [ + self._build_file_record( + normalized, + self._get_public_ftp_url(normalized, remote_file), + ) + for remote_file in remote_files + ] From 2010f6375083227ae637b4b84e0d0987c67b0af4 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 16:06:04 +0100 Subject: [PATCH 09/21] refactor(providers): extract JpostProvider with PROXI + FTP listing Moved JPOST listing (PROXI JSON primary + FTP tree walk fallback) and record building from Files into providers/jpost.py as JpostProvider(BaseDirectDownloadProvider). Provider registered with the Registry. Files keeps shim methods (is_jpost_accession, _list_jpost_public_files, _list_jpost_public_files_via_proxi, _build_jpost_file_record, _get_jpost_public_root, _get_jpost_public_ftp_url) that delegate to the provider. JPOST_ARCHIVE_FTP / JPOST_PROXI_BASE_URL / JPOST_PROXI_CATEGORY_MAP constants remain on Files as class-attribute re-exports. All JPOST tests pass without modification. Full suite green. 
--- pridepy/files/files.py | 123 +++++++----------------------- pridepy/providers/jpost.py | 150 +++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 97 deletions(-) create mode 100644 pridepy/providers/jpost.py diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 328150e..69d477e 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -53,18 +53,12 @@ class Files: MASSIVE_ARCHIVE_FTP = _MassiveProvider.ARCHIVE_FTP MASSIVE_ARCHIVE_FTP_URL_PREFIX = _MassiveProvider.ARCHIVE_FTP_URL_PREFIX del _MASSIVE_CATEGORY_MAP, _MassiveProvider - JPOST_ARCHIVE_FTP = "ftp.jpostdb.org" - JPOST_ARCHIVE_FTP_URL_PREFIX = "ftp://ftp.jpostdb.org/" - JPOST_PROXI_BASE_URL = "https://repository.jpostdb.org/proxi/datasets/" - JPOST_PROXI_CATEGORY_MAP = { - "Associated raw file URI": "RAW", - "Result file URI": "RESULT", - "Search engine output file URI": "SEARCH", - "Peak list file URI": "PEAK", - "Spectrum library file URI": "SPECTRUM_LIBRARY", - "Sequence database URI": "FASTA", - "Quantification file URI": "RESULT", - } + from pridepy.providers.jpost import JpostProvider as _JpostProvider + JPOST_ARCHIVE_FTP = _JpostProvider.ARCHIVE_FTP + JPOST_ARCHIVE_FTP_URL_PREFIX = _JpostProvider.ARCHIVE_FTP_URL_PREFIX + JPOST_PROXI_BASE_URL = _JpostProvider.PROXI_BASE_URL + JPOST_PROXI_CATEGORY_MAP = _JpostProvider.PROXI_CATEGORY_MAP + del _JpostProvider IPROX_DOWNLOAD_BASE_URL = "http://download.iprox.org/" IPROX_PX_XML_URL_TEMPLATE = ( "http://download.iprox.org/{accession}/PX_{accession}.xml" @@ -166,57 +160,24 @@ def _build_massive_file_record(accession: str, ftp_url: str) -> Dict: @staticmethod def is_jpost_accession(accession: str) -> bool: - """ - Return True when the accession looks like a JPOST dataset accession. 
- """ - if not accession: - return False - return bool(re.fullmatch(r"JPST\d{6}", accession.upper())) + """Shim — see :meth:`pridepy.providers.jpost.JpostProvider.matches`.""" + from pridepy.providers.jpost import JpostProvider + return JpostProvider.matches(accession) @staticmethod def _get_jpost_public_root(accession: str) -> str: - return f"/{accession.upper()}" + from pridepy.providers.jpost import JpostProvider + return JpostProvider._get_public_root(accession) @staticmethod def _get_jpost_public_ftp_url(accession: str, remote_path: str) -> str: - root_path = Files._get_jpost_public_root(accession).rstrip("/") - relative_path = remote_path - if remote_path.startswith(root_path): - relative_path = remote_path[len(root_path) :].lstrip("/") - return f"{Files.JPOST_ARCHIVE_FTP_URL_PREFIX}{accession.upper()}/{relative_path}" + from pridepy.providers.jpost import JpostProvider + return JpostProvider._get_public_ftp_url(accession, remote_path) @staticmethod - def _build_jpost_file_record( - accession: str, ftp_url: str, category_from_proxi: Optional[str] = None - ) -> Dict: - """ - Build a pridepy file record for a JPOST file. - - When ``category_from_proxi`` is provided (e.g. ``"Associated raw file URI"``), - the PROXI CV name takes precedence over the heuristic collection-from-path - mapping. Falls back to the same path-segment heuristic used for MassIVE - when the category isn't known. 
- """ - parsed = urlparse(ftp_url) - root_prefix = f"/{accession.upper()}/" - relative_path = parsed.path - if relative_path.startswith(root_prefix): - relative_path = relative_path[len(root_prefix) :] - relative_path = relative_path.lstrip("/") - collection = relative_path.split("/", 1)[0] if relative_path else "" - if category_from_proxi and category_from_proxi in Files.JPOST_PROXI_CATEGORY_MAP: - category = Files.JPOST_PROXI_CATEGORY_MAP[category_from_proxi] - else: - category = Files._map_massive_collection_to_category(collection) - return { - "accession": accession.upper(), - "fileName": os.path.basename(parsed.path), - "fileCategory": {"value": category}, - "publicFileLocations": [{"name": "FTP Protocol", "value": ftp_url}], - "relativePath": relative_path, - "collection": collection, - "source": "JPOST", - } + def _build_jpost_file_record(accession, ftp_url, category_from_proxi=None): + from pridepy.providers.jpost import JpostProvider + return JpostProvider._build_file_record(accession, ftp_url, category_from_proxi) @staticmethod def _build_iprox_file_record( @@ -339,12 +300,11 @@ def _list_jpost_public_files(self, accession: str) -> List[Dict]: """ Discover all public files for a JPOST dataset. - Prefers the JPOST PROXI JSON endpoint at - ``https://repository.jpostdb.org/proxi/datasets/`` since it - returns file URLs with category labels and avoids the anonymous-FTP - rate limit that ``ftp.jpostdb.org`` applies per source IP. Falls back - to walking the FTP tree if PROXI is unreachable or returns no files. + Delegates to JpostProvider but routes via the shim methods so that + test patches on ``_list_jpost_public_files_via_proxi`` and + ``_list_ftp_repo_files`` continue to intercept. 
""" + from pridepy.providers.jpost import JpostProvider normalized_accession = accession.upper() try: return self._list_jpost_public_files_via_proxi(normalized_accession) @@ -353,55 +313,24 @@ def _list_jpost_public_files(self, accession: str) -> List[Dict]: f"JPOST PROXI listing failed for {normalized_accession} " f"({proxi_error}); falling back to FTP tree walk." ) - remote_root = self._get_jpost_public_root(normalized_accession) + remote_root = JpostProvider._get_public_root(normalized_accession) remote_files = self._list_ftp_repo_files( - host=self.JPOST_ARCHIVE_FTP, + host=JpostProvider.ARCHIVE_FTP, remote_root=remote_root, error_label=f"JPOST dataset {normalized_accession}", ) return [ self._build_jpost_file_record( normalized_accession, - self._get_jpost_public_ftp_url(normalized_accession, remote_file), + JpostProvider._get_public_ftp_url(normalized_accession, remote_file), ) for remote_file in remote_files ] def _list_jpost_public_files_via_proxi(self, accession: str) -> List[Dict]: - """ - Fetch the JPOST PROXI dataset metadata and turn each ``datasetFiles`` - entry into a pridepy file record. The PROXI ``name`` field is mapped to - a PRIDE-style category so existing RAW/SEARCH/RESULT filtering works. 
- """ - import json as _json - - proxi_url = f"{self.JPOST_PROXI_BASE_URL}{accession}" - logging.info(f"Fetching JPOST PROXI metadata: {proxi_url}") - response = requests.get( - proxi_url, - headers={"Accept": "application/json"}, - timeout=30, - ) - response.raise_for_status() - data = _json.loads(response.content) - dataset_files = data.get("datasetFiles") or [] - records: List[Dict] = [] - for entry in dataset_files: - value = (entry or {}).get("value") - if not value or not value.startswith("ftp://"): - continue - records.append( - self._build_jpost_file_record( - accession, - value, - category_from_proxi=(entry or {}).get("name"), - ) - ) - if not records: - raise RuntimeError( - f"JPOST PROXI returned no FTP file URIs for {accession}" - ) - return records + """Shim — see :meth:`pridepy.providers.jpost.JpostProvider._list_via_proxi`.""" + from pridepy.providers.jpost import JpostProvider + return JpostProvider()._list_via_proxi(accession) def _list_iprox_public_files(self, accession: str) -> List[Dict]: """ diff --git a/pridepy/providers/jpost.py b/pridepy/providers/jpost.py new file mode 100644 index 0000000..a9ab23e --- /dev/null +++ b/pridepy/providers/jpost.py @@ -0,0 +1,150 @@ +"""JPOST direct-download provider. + +PRIMARY listing: PROXI JSON at repository.jpostdb.org. The PROXI endpoint +returns ``datasetFiles[*].value`` as ``ftp://`` URLs alongside CV labels +(Associated raw file URI, Search engine output file URI, etc.) which map +cleanly to PRIDE file categories. + +FALLBACK listing: when PROXI fails, walk the FTP tree at ftp.jpostdb.org. +This is needed because JPOST's FTP server rate-limits aggressively per +source IP (sticky 421-too-many-connections); the PROXI path lets us avoid +walking the FTP tree just for a listing. 
+""" +import logging +import os +import re +from typing import ClassVar, Dict, List, Optional +from urllib.parse import urlparse + +import requests + +from pridepy.providers import registry +from pridepy.providers.base import BaseDirectDownloadProvider + + +@registry.register +class JpostProvider(BaseDirectDownloadProvider): + name: ClassVar[str] = "jpost" + use_tls: ClassVar[bool] = False + + ARCHIVE_FTP: ClassVar[str] = "ftp.jpostdb.org" + ARCHIVE_FTP_URL_PREFIX: ClassVar[str] = "ftp://ftp.jpostdb.org/" + PROXI_BASE_URL: ClassVar[str] = "https://repository.jpostdb.org/proxi/datasets/" + + PROXI_CATEGORY_MAP: ClassVar[Dict[str, str]] = { + "Associated raw file URI": "RAW", + "Result file URI": "RESULT", + "Search engine output file URI": "SEARCH", + "Peak list file URI": "PEAK", + "Spectrum library file URI": "SPECTRUM_LIBRARY", + "Sequence database URI": "FASTA", + "Quantification file URI": "RESULT", + } + + @staticmethod + def matches(accession: str) -> bool: + if not accession: + return False + return bool(re.fullmatch(r"JPST\d{6}", accession.upper())) + + @staticmethod + def _get_public_root(accession: str) -> str: + return f"/{accession.upper()}" + + @classmethod + def _get_public_ftp_url(cls, accession: str, remote_path: str) -> str: + root_path = cls._get_public_root(accession).rstrip("/") + relative_path = remote_path + if remote_path.startswith(root_path): + relative_path = remote_path[len(root_path):].lstrip("/") + return f"{cls.ARCHIVE_FTP_URL_PREFIX}{accession.upper()}/{relative_path}" + + @classmethod + def _build_file_record( + cls, accession: str, ftp_url: str, category_from_proxi: Optional[str] = None + ) -> Dict: + """Build a pridepy file record from an FTP URL. + + When ``category_from_proxi`` is provided (e.g. ``"Associated raw file URI"``), + the PROXI CV name takes precedence over the heuristic collection-from-path + mapping. Falls back to the same path-segment heuristic used for MassIVE + when the category isn't known. 
+ """ + # Import the MassIVE collection->category map for the fallback heuristic. + from pridepy.providers.massive import MassiveProvider + parsed = urlparse(ftp_url) + root_prefix = f"/{accession.upper()}/" + relative_path = parsed.path + if relative_path.startswith(root_prefix): + relative_path = relative_path[len(root_prefix):] + relative_path = relative_path.lstrip("/") + collection = relative_path.split("/", 1)[0] if relative_path else "" + if category_from_proxi and category_from_proxi in cls.PROXI_CATEGORY_MAP: + category = cls.PROXI_CATEGORY_MAP[category_from_proxi] + else: + category = MassiveProvider._map_collection_to_category(collection) + return { + "accession": accession.upper(), + "fileName": os.path.basename(parsed.path), + "fileCategory": {"value": category}, + "publicFileLocations": [{"name": "FTP Protocol", "value": ftp_url}], + "relativePath": relative_path, + "collection": collection, + "source": "JPOST", + } + + def list_files(self, accession: str) -> List[Dict]: + """PRIMARY: PROXI JSON. FALLBACK: FTP tree walk.""" + normalized = accession.upper() + try: + return self._list_via_proxi(normalized) + except Exception as proxi_error: + logging.warning( + f"JPOST PROXI listing failed for {normalized} " + f"({proxi_error}); falling back to FTP tree walk." 
+ ) + from pridepy.providers import transport + remote_root = self._get_public_root(normalized) + remote_files = transport._list_ftp_repo_files( + host=self.ARCHIVE_FTP, + remote_root=remote_root, + error_label=f"JPOST dataset {normalized}", + ) + return [ + self._build_file_record( + normalized, + self._get_public_ftp_url(normalized, remote_file), + ) + for remote_file in remote_files + ] + + def _list_via_proxi(self, accession: str) -> List[Dict]: + """Fetch JPOST PROXI dataset metadata and turn each datasetFiles entry into a file record.""" + import json as _json + proxi_url = f"{self.PROXI_BASE_URL}{accession}" + logging.info(f"Fetching JPOST PROXI metadata: {proxi_url}") + response = requests.get( + proxi_url, + headers={"Accept": "application/json"}, + timeout=30, + ) + response.raise_for_status() + data = _json.loads(response.content) + dataset_files = data.get("datasetFiles") or [] + records: List[Dict] = [] + for entry in dataset_files: + value = (entry or {}).get("value") + if not value or not value.startswith("ftp://"): + continue + records.append( + self._build_file_record( + accession, + value, + category_from_proxi=(entry or {}).get("name"), + ) + ) + if not records: + raise RuntimeError( + f"JPOST PROXI returned no FTP file URIs for {accession}" + ) + return records From e128a16d8754977aad843a40f565944975098e7c Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 16:19:49 +0100 Subject: [PATCH 10/21] refactor(providers): extract IproxProvider with PX XML listing Moved iProX listing (PX XML from download.iprox.org) and record building from Files into providers/iprox.py as IproxProvider(BaseDirectDownloadProvider). Provider registered with the Registry. Files keeps shim methods (is_iprox_accession, _list_iprox_public_files, _build_iprox_file_record, _get_iprox_public_root, _get_iprox_public_ftp_url) that delegate to the provider. 
IPROX_DOWNLOAD_BASE_URL / IPROX_PX_XML_URL_TEMPLATE / IPROX_PX_CATEGORY_MAP constants remain on Files as class-attribute re-exports. _download_direct_download_records on Files now dispatches via the registry instead of branching manually. _list_direct_download_files keeps shim dispatching so existing test patches on _list_massive_public_files and _list_jpost_public_files continue to intercept the calls. All iProX tests pass without modification. Full suite green. --- pridepy/files/files.py | 176 ++++++++----------------------------- pridepy/providers/iprox.py | 129 +++++++++++++++++++++++++++ 2 files changed, 167 insertions(+), 138 deletions(-) create mode 100644 pridepy/providers/iprox.py diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 69d477e..afe6067 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -59,14 +59,11 @@ class Files: JPOST_PROXI_BASE_URL = _JpostProvider.PROXI_BASE_URL JPOST_PROXI_CATEGORY_MAP = _JpostProvider.PROXI_CATEGORY_MAP del _JpostProvider - IPROX_DOWNLOAD_BASE_URL = "http://download.iprox.org/" - IPROX_PX_XML_URL_TEMPLATE = ( - "http://download.iprox.org/{accession}/PX_{accession}.xml" - ) - # iProX PX XML uses the same PSI-MS cvParam "name" values as JPOST, so the - # JPOST PROXI category map applies. PX XML cvParam "Associated raw file URI" - # is the canonical raw-file label per the PSI-MS CV (MS:1002846). 
- IPROX_PX_CATEGORY_MAP = JPOST_PROXI_CATEGORY_MAP + from pridepy.providers.iprox import IproxProvider as _IproxProvider + IPROX_DOWNLOAD_BASE_URL = _IproxProvider.DOWNLOAD_BASE_URL + IPROX_PX_XML_URL_TEMPLATE = _IproxProvider.PX_XML_URL_TEMPLATE + IPROX_PX_CATEGORY_MAP = _IproxProvider.PX_CATEGORY_MAP + del _IproxProvider S3_URL = "https://hh.fire.sdo.ebi.ac.uk" S3_BUCKET = "pride-public" PROTOCOL_ORDER = ["aspera", "s3", "ftp", "globus"] @@ -180,67 +177,32 @@ def _build_jpost_file_record(accession, ftp_url, category_from_proxi=None): return JpostProvider._build_file_record(accession, ftp_url, category_from_proxi) @staticmethod - def _build_iprox_file_record( - accession: str, https_url: str, category_from_px: Optional[str] = None - ) -> Dict: - """ - Build a pridepy file record for an iProX file. iProX exposes files - over anonymous HTTPS at - ``http://download.iprox.org///``; - ``category_from_px`` is the ``cvParam`` ``name`` from the dataset's - ProteomeXchange XML (e.g. ``"Associated raw file URI"``). - """ - parsed = urlparse(https_url) - root_prefix = f"/{accession.upper()}/" - relative_path = parsed.path - if relative_path.startswith(root_prefix): - relative_path = relative_path[len(root_prefix) :] - relative_path = relative_path.lstrip("/") - collection = relative_path.split("/", 1)[0] if relative_path else "" - if category_from_px and category_from_px in Files.IPROX_PX_CATEGORY_MAP: - category = Files.IPROX_PX_CATEGORY_MAP[category_from_px] - else: - category = Files._map_massive_collection_to_category(collection) - return { - "accession": accession.upper(), - "fileName": os.path.basename(parsed.path), - "fileCategory": {"value": category}, - # ``FTP Protocol`` is the existing label the download dispatcher - # uses to locate a file URL; here it actually points at HTTPS. - # ``_download_direct_download_records`` routes by URL scheme. 
- "publicFileLocations": [{"name": "FTP Protocol", "value": https_url}], - "relativePath": relative_path, - "collection": collection, - "source": "iProX", - } + def _build_iprox_file_record(accession, https_url, category_from_px=None): + """Shim — see :meth:`pridepy.providers.iprox.IproxProvider._build_file_record`.""" + from pridepy.providers.iprox import IproxProvider + return IproxProvider._build_file_record(accession, https_url, category_from_px) + + @staticmethod + def _get_iprox_public_root(accession: str) -> str: + from pridepy.providers.iprox import IproxProvider + return IproxProvider._get_public_root(accession) + + @staticmethod + def _get_iprox_public_ftp_url(accession: str, remote_path: str) -> str: + from pridepy.providers.iprox import IproxProvider + return IproxProvider._get_public_ftp_url(accession, remote_path) @staticmethod def is_direct_download_accession(accession: str) -> bool: - """ - Return True when the accession is served by a public repository that - pridepy supports via direct downloads (no ProteomeXchange API). - MassIVE and JPOST use FTP(S); iProX uses anonymous HTTPS via - ``download.iprox.org``. - """ - return ( - Files.is_massive_accession(accession) - or Files.is_jpost_accession(accession) - or Files.is_iprox_accession(accession) - ) + """Shim — True for any registered direct-download provider (MSV/JPST/IPX).""" + from pridepy.providers import registry + return registry.is_known(accession) @staticmethod def is_iprox_accession(accession: str) -> bool: - """ - Return True when the accession looks like an iProX dataset accession - (``IPX`` followed by 7-10 digits). iProX exposes the dataset - ProteomeXchange XML at - ``http://download.iprox.org//PX_.xml`` and the - referenced files are downloadable from ``download.iprox.org`` over - anonymous HTTPS with byte-range support. 
- """ - if not accession: - return False - return bool(re.fullmatch(r"IPX\d{7,10}", accession.upper())) + """Shim — see :meth:`pridepy.providers.iprox.IproxProvider.matches`.""" + from pridepy.providers.iprox import IproxProvider + return IproxProvider.matches(accession) @staticmethod def _repo_uses_tls(accession: str) -> bool: @@ -333,53 +295,9 @@ def _list_jpost_public_files_via_proxi(self, accession: str) -> List[Dict]: return JpostProvider()._list_via_proxi(accession) def _list_iprox_public_files(self, accession: str) -> List[Dict]: - """ - Discover all public files for an iProX dataset. - - iProX publishes the ProteomeXchange XML for every public dataset at a - deterministic path on its anonymous HTTPS download server:: - - http://download.iprox.org//PX_.xml - - We fetch that XML, walk every ````'s ``cvParam`` entries, - and turn each ``Associated raw file URI`` (and sibling URIs for - search-engine output, result files, etc.) into a pridepy file record. - File downloads themselves go through plain HTTPS on the same host, - which supports ``Range`` requests for resume. 
- """ - normalized_accession = accession.upper() - xml_url = self.IPROX_PX_XML_URL_TEMPLATE.format(accession=normalized_accession) - logging.info(f"Fetching iProX PX XML: {xml_url}") - response = requests.get(xml_url, timeout=30) - response.raise_for_status() - try: - root = ET.fromstring(response.content) - except ET.ParseError as parse_error: - raise RuntimeError( - f"Unable to parse iProX PX XML for {normalized_accession}: {parse_error}" - ) from parse_error - - records: List[Dict] = [] - for dataset_file in root.iter("DatasetFile"): - for cv in dataset_file.findall("cvParam"): - name = cv.attrib.get("name") - value = cv.attrib.get("value") - if not value or not name or not name.endswith("URI"): - continue - if not value.lower().startswith(("http://", "https://")): - continue - records.append( - self._build_iprox_file_record( - normalized_accession, - value, - category_from_px=name, - ) - ) - if not records: - raise RuntimeError( - f"iProX PX XML for {normalized_accession} contained no downloadable HTTPS URIs" - ) - return records + """Shim — see :meth:`pridepy.providers.iprox.IproxProvider.list_files`.""" + from pridepy.providers.iprox import IproxProvider + return IproxProvider().list_files(accession) def _list_direct_download_files(self, accession: str) -> List[Dict]: """ @@ -414,35 +332,17 @@ def _download_direct_download_records( ``download.iprox.org`` with ``Range``-based resume and per-file parallel workers. URLs are partitioned by scheme so a mixed batch (e.g. a JPOST PX XML that ever pointed at HTTPS) routes correctly. + Dispatches via the provider registry. """ - if protocol not in ("ftp", "https", "http"): - logging.warning( - "Direct downloads currently use ftp / https only. " - f"Ignoring requested protocol '{protocol}' for {accession}." 
- ) - - all_urls = [self._get_download_url(record, "ftp") for record in file_records] - ftp_urls = [u for u in all_urls if u.lower().startswith("ftp://")] - http_urls = [u for u in all_urls if u.lower().startswith(("http://", "https://"))] - if not ftp_urls and not http_urls: - logging.info(f"No files matched for direct-download dataset {accession}") - return - - if ftp_urls: - self.download_ftp_urls( - ftp_urls=ftp_urls, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - use_tls=self._repo_uses_tls(accession), - parallel_files=parallel_files, - ) - if http_urls: - self.download_http_urls( - http_urls=http_urls, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - parallel_files=parallel_files, - ) + from pridepy.providers import registry + return registry.resolve(accession).download_files( + accession=accession, + records=file_records, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, + parallel_files=parallel_files, + ) async def stream_all_files_metadata(self, output_file, accession=None): """ diff --git a/pridepy/providers/iprox.py b/pridepy/providers/iprox.py new file mode 100644 index 0000000..292307c --- /dev/null +++ b/pridepy/providers/iprox.py @@ -0,0 +1,129 @@ +"""iProX direct-download provider. + +iProX publishes the ProteomeXchange XML for each dataset at a +deterministic path on its anonymous HTTPS download server:: + + http://download.iprox.org/{accession}/PX_{accession}.xml + +We fetch the XML, walk every ``DatasetFile``'s ``cvParam`` entries, and +turn each ``Associated raw file URI`` (and sibling URIs for search-engine +output, result files, etc.) into a pridepy file record. File downloads +themselves go through plain HTTPS on the same host, which supports +``Range`` requests for resume.
+""" +import logging +import os +import re +import xml.etree.ElementTree as ET +from typing import ClassVar, Dict, List, Optional +from urllib.parse import urlparse + +import requests + +from pridepy.providers import registry +from pridepy.providers.base import BaseDirectDownloadProvider +from pridepy.providers.jpost import JpostProvider + + +@registry.register +class IproxProvider(BaseDirectDownloadProvider): + name: ClassVar[str] = "iprox" + use_tls: ClassVar[bool] = False # download.iprox.org serves over plain HTTP + + DOWNLOAD_BASE_URL: ClassVar[str] = "http://download.iprox.org/" + PX_XML_URL_TEMPLATE: ClassVar[str] = ( + "http://download.iprox.org/{accession}/PX_{accession}.xml" + ) + # iProX PX XML uses the same PSI-MS cvParam "name" values as JPOST PROXI, + # so we reuse JpostProvider's category map. + PX_CATEGORY_MAP: ClassVar[Dict[str, str]] = JpostProvider.PROXI_CATEGORY_MAP + + @staticmethod + def matches(accession: str) -> bool: + """Return True when ``accession`` looks like an iProX dataset accession.""" + if not accession: + return False + return bool(re.fullmatch(r"IPX\d{7,10}", accession.upper())) + + @staticmethod + def _get_public_root(accession: str) -> str: + return f"/{accession.upper()}" + + @classmethod + def _get_public_ftp_url(cls, accession: str, remote_path: str) -> str: + # NOTE: name kept as `_get_public_ftp_url` for parity with other providers, + # but iProX URLs are http(s) not ftp. The dispatcher routes by scheme. + root_path = cls._get_public_root(accession).rstrip("/") + relative_path = remote_path + if remote_path.startswith(root_path): + relative_path = remote_path[len(root_path):].lstrip("/") + return f"{cls.DOWNLOAD_BASE_URL}{accession.upper()}/{relative_path}" + + @classmethod + def _build_file_record( + cls, accession: str, https_url: str, category_from_px: Optional[str] = None + ) -> Dict: + """Build a pridepy file record for an iProX file. 
+ + ``category_from_px`` is the ``cvParam`` ``name`` from the dataset's + ProteomeXchange XML (e.g. ``"Associated raw file URI"``). + """ + from pridepy.providers.massive import MassiveProvider + parsed = urlparse(https_url) + root_prefix = f"/{accession.upper()}/" + relative_path = parsed.path + if relative_path.startswith(root_prefix): + relative_path = relative_path[len(root_prefix):] + relative_path = relative_path.lstrip("/") + collection = relative_path.split("/", 1)[0] if relative_path else "" + if category_from_px and category_from_px in cls.PX_CATEGORY_MAP: + category = cls.PX_CATEGORY_MAP[category_from_px] + else: + category = MassiveProvider._map_collection_to_category(collection) + return { + "accession": accession.upper(), + "fileName": os.path.basename(parsed.path), + "fileCategory": {"value": category}, + # "FTP Protocol" is the existing label the download dispatcher uses + # to locate a file URL; here it actually points at HTTPS. + # BaseDirectDownloadProvider.download_files routes by URL scheme. 
+ "publicFileLocations": [{"name": "FTP Protocol", "value": https_url}], + "relativePath": relative_path, + "collection": collection, + "source": "iProX", + } + + def list_files(self, accession: str) -> List[Dict]: + normalized = accession.upper() + xml_url = self.PX_XML_URL_TEMPLATE.format(accession=normalized) + logging.info(f"Fetching iProX PX XML: {xml_url}") + response = requests.get(xml_url, timeout=30) + response.raise_for_status() + try: + root = ET.fromstring(response.content) + except ET.ParseError as parse_error: + raise RuntimeError( + f"Unable to parse iProX PX XML for {normalized}: {parse_error}" + ) from parse_error + + records: List[Dict] = [] + for dataset_file in root.iter("DatasetFile"): + for cv in dataset_file.findall("cvParam"): + name = cv.attrib.get("name") + value = cv.attrib.get("value") + if not value or not name or not name.endswith("URI"): + continue + if not value.lower().startswith(("http://", "https://")): + continue + records.append( + self._build_file_record( + normalized, + value, + category_from_px=name, + ) + ) + if not records: + raise RuntimeError( + f"iProX PX XML for {normalized} contained no downloadable HTTPS URIs" + ) + return records From 835d77b998958ece069508228b797a60eb21a8cc Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 16:24:14 +0100 Subject: [PATCH 11/21] test(providers): verify BaseDirectDownloadProvider URL-scheme partitioning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mixed ftp:// + http(s):// records partition correctly: ftp URLs go to Files.download_ftp_urls with use_tls=True (the MassIVE setting), http URLs go to Files.download_http_urls. Both calls intercepted by patch.object(Files, ...) — proving the provider routes back through Files rather than calling transport directly (preserves test patches). 
--- pridepy/tests/test_massive_files.py | 39 +++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/pridepy/tests/test_massive_files.py b/pridepy/tests/test_massive_files.py index a958309..fd6a4ac 100644 --- a/pridepy/tests/test_massive_files.py +++ b/pridepy/tests/test_massive_files.py @@ -135,3 +135,42 @@ def test_download_all_raw_files_threads_parallel_files_for_massive(self): kwargs = download_mock.call_args.kwargs assert kwargs["use_tls"] is True assert kwargs["parallel_files"] == 3 + + def test_base_direct_download_provider_partitions_urls_by_scheme(self): + """Records mixing ftp:// and http(s):// route to the right transport.""" + from pridepy.providers.massive import MassiveProvider + + provider = MassiveProvider() + records = [ + Files._build_massive_file_record( + "MSV000012345", + "ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/a.raw", + ), + # Synthetic http record to verify partitioning (real MassIVE uses ftp). + { + "accession": "MSV000012345", + "fileName": "b.raw", + "fileCategory": {"value": "RAW"}, + "publicFileLocations": [ + {"name": "FTP Protocol", "value": "http://example.org/b.raw"} + ], + }, + ] + with patch.object(Files, "download_ftp_urls") as ftp_mock, \ + patch.object(Files, "download_http_urls") as http_mock: + provider.download_files( + accession="MSV000012345", + records=records, + output_folder="/tmp/test", + skip_if_downloaded_already=False, + protocol="ftp", + parallel_files=1, + ) + + ftp_mock.assert_called_once() + assert ftp_mock.call_args.kwargs["use_tls"] is True + assert ftp_mock.call_args.kwargs["ftp_urls"] == [ + "ftp://massive-ftp.ucsd.edu/v01/MSV000012345/raw/a.raw" + ] + http_mock.assert_called_once() + assert http_mock.call_args.kwargs["http_urls"] == ["http://example.org/b.raw"] From 3e358ae64304d11d314ed14fcafb15f90f3a3454 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 16:34:57 +0100 Subject: [PATCH 12/21] refactor(providers): extract PrideProvider with multi-protocol 
fallback Moved PRIDE-specific logic (V3 API listing, multi-protocol batch downloader with aspera/s3/ftp/globus fallback, private-dataset path, submitter helpers, Globus/S3 per-protocol downloaders, legacy single-connection FTP) from Files into providers/pride.py as PrideProvider(Provider). ~510 LOC moved out of files.py. Files keeps shim methods for every patched helper (_batch_download_by_protocol, _download_with_fallback, _globus_download_one, download_files_from_globus, download_files_from_s3, download_files_from_ftp, download_private_file_name, get_ascp_binary, save_checksum_file, stream_all_files_by_project, stream_all_files_metadata, get_submitted_file_path_prefix, _protocol_sequence, download_files). PRIDE class constants (V3_API_BASE_URL, API_BASE_URL, API_PRIVATE_URL, PRIDE_ARCHIVE_FTP, *_URL_PREFIX, S3_URL, S3_BUCKET, PROTOCOL_ORDER) remain on Files as re-exports. PrideProvider's internal calls to patch-sensitive helpers go through Files.X (lazy import) so existing test patches in test_download_resilience.py keep working without modification. is_direct_download_accession updated to exclude PRIDE (returns True only for MSV/JPST/IPX) now that PrideProvider is registered too. All 8 patch-sensitive tests in test_download_resilience.py pass. Full suite green at 67 passed, 4 skipped. 
--- pridepy/files/files.py | 729 +++++----------------------------- pridepy/providers/pride.py | 790 +++++++++++++++++++++++++++++++++++++ 2 files changed, 899 insertions(+), 620 deletions(-) create mode 100644 pridepy/providers/pride.py diff --git a/pridepy/files/files.py b/pridepy/files/files.py index afe6067..b4cc3b9 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -1,30 +1,19 @@ #!/usr/bin/env python import ftplib -import hashlib -import importlib.resources import logging import os -import platform -import posixpath import re -import subprocess import urllib import urllib.request -import time from concurrent.futures import ThreadPoolExecutor, as_completed from ftplib import FTP from typing import Dict, List, Optional, Tuple -import socket from urllib.parse import urlparse import xml.etree.ElementTree as ET -import boto3 -import botocore import requests -from botocore.config import Config from tqdm import tqdm -from pridepy.authentication.authentication import Authentication from pridepy.util.api_handling import Util @@ -38,12 +27,18 @@ class Files: This class handles PRIDE API files endpoint. """ - V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3" - API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3" - API_PRIVATE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2" - PRIDE_ARCHIVE_FTP = "ftp.pride.ebi.ac.uk" - PRIDE_ARCHIVE_FTP_URL_PREFIX = "ftp://ftp.pride.ebi.ac.uk/" - PRIDE_ARCHIVE_HTTPS_URL_PREFIX = "https://ftp.pride.ebi.ac.uk/" + # Re-exported from providers/pride.py — kept here for back-compat. 
+ from pridepy.providers.pride import PrideProvider as _PrideProvider + V3_API_BASE_URL = _PrideProvider.V3_API_BASE_URL + API_BASE_URL = _PrideProvider.API_BASE_URL + API_PRIVATE_URL = _PrideProvider.API_PRIVATE_URL + PRIDE_ARCHIVE_FTP = _PrideProvider.ARCHIVE_FTP + PRIDE_ARCHIVE_FTP_URL_PREFIX = _PrideProvider.ARCHIVE_FTP_URL_PREFIX + PRIDE_ARCHIVE_HTTPS_URL_PREFIX = _PrideProvider.ARCHIVE_HTTPS_URL_PREFIX + S3_URL = _PrideProvider.S3_URL + S3_BUCKET = _PrideProvider.S3_BUCKET + PROTOCOL_ORDER = _PrideProvider.PROTOCOL_ORDER + del _PrideProvider # Re-exported from providers/massive.py — kept here for back-compat. from pridepy.providers.massive import ( # noqa: E402 MASSIVE_CATEGORY_MAP as _MASSIVE_CATEGORY_MAP, @@ -64,9 +59,6 @@ class Files: IPROX_PX_XML_URL_TEMPLATE = _IproxProvider.PX_XML_URL_TEMPLATE IPROX_PX_CATEGORY_MAP = _IproxProvider.PX_CATEGORY_MAP del _IproxProvider - S3_URL = "https://hh.fire.sdo.ebi.ac.uk" - S3_BUCKET = "pride-public" - PROTOCOL_ORDER = ["aspera", "s3", "ftp", "globus"] logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") def __init__(self): @@ -122,12 +114,9 @@ def _resolve_local_path(file_record: Dict, output_folder: str) -> str: @staticmethod def _protocol_sequence(protocol: str) -> List[str]: - """ - Build the ordered list of protocols to try for a requested download mode. 
- """ - if protocol not in Files.PROTOCOL_ORDER: - return [] - return [protocol] + [p for p in Files.PROTOCOL_ORDER if p != protocol] + """Shim — see :meth:`pridepy.providers.pride.PrideProvider._protocol_sequence`.""" + from pridepy.providers.pride import PrideProvider + return PrideProvider._protocol_sequence(protocol) @staticmethod def is_massive_accession(accession: str) -> bool: @@ -194,9 +183,19 @@ def _get_iprox_public_ftp_url(accession: str, remote_path: str) -> str: @staticmethod def is_direct_download_accession(accession: str) -> bool: - """Shim — True for any registered direct-download provider (MSV/JPST/IPX).""" + """Shim — True for MassIVE/JPOST/iProX (explicitly excludes PRIDE). + + PRIDE is also a registered provider but PRIDE downloads go through + the multi-protocol orchestrator (FTP/Aspera/S3/Globus with checksum + validation and fallback), not the direct-download partitioned-by-URL- + scheme path. So we filter PRIDE out here. + """ from pridepy.providers import registry - return registry.is_known(accession) + try: + provider = registry.resolve(accession) + except ValueError: + return False + return provider.name != "pride" @staticmethod def is_iprox_accession(accession: str) -> bool: @@ -345,32 +344,14 @@ def _download_direct_download_records( ) async def stream_all_files_metadata(self, output_file, accession=None): - """ - get stream all project files from PRIDE API in JSON format - """ - if accession is None: - request_url = f"{self.V3_API_BASE_URL}/files/all" - count_request_url = f"{self.V3_API_BASE_URL}/files/count" - else: - request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/all" - count_request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/count" - headers = {"Accept": "application/JSON"} - response = Util.get_api_call(count_request_url, headers) - total_records = response.json() - - regex_search_pattern = '"fileName"' - await Util.stream_response_to_file( - output_file, total_records, regex_search_pattern, 
request_url, headers - ) + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.stream_all_files_metadata`.""" + from pridepy.providers.pride import PrideProvider + return await PrideProvider().stream_all_files_metadata(output_file, accession) def stream_all_files_by_project(self, accession) -> List[Dict]: - """ - get stream all project files from PRIDE API in JSON format - """ - request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/all" - headers = {"Accept": "application/JSON"} - record_files = Util.read_json_stream(api_url=request_url, headers=headers) - return record_files + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.stream_all_files_by_project`.""" + from pridepy.providers.pride import PrideProvider + return PrideProvider().stream_all_files_by_project(accession) def get_all_raw_file_list(self, project_accession): """ @@ -446,118 +427,15 @@ def download_files_from_ftp( max_connection_retries=3, max_download_retries=3, ): - """ - Download files using a single FTP connection with a retry mechanism and a progress bar for each file. - :param file_list_json: file list in JSON format - :param output_folder: folder to download the files - :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. - :param max_connection_retries: Number of attempts to reconnect to the FTP server if the connection is lost. - :param max_download_retries: Number of attempts to retry the download of a file in case of failure. 
- """ - - if not os.path.isdir(output_folder): - os.makedirs(output_folder) - - def connect_ftp(): - """Helper function to establish FTP connection.""" - ftp = FTP(Files.PRIDE_ARCHIVE_FTP, timeout=30) - ftp.login() # Anonymous login - ftp.set_pasv(True) # Enable passive mode - logging.info(f"Connected to FTP host: {Files.PRIDE_ARCHIVE_FTP}") - return ftp - - connection_attempt = 0 - while connection_attempt < max_connection_retries: - try: - ftp = connect_ftp() - for file in file_list_json: - try: - # Get FTP download URL - if file["publicFileLocations"][0]["name"] == "FTP Protocol": - download_url = file["publicFileLocations"][0]["value"] - else: - download_url = file["publicFileLocations"][1]["value"] - - logging.debug("ftp_filepath:" + download_url) - - # Get output file path - new_file_path = Files.get_output_file_name( - download_url, file, output_folder - ) - - if skip_if_downloaded_already and os.path.exists(new_file_path): - logging.info("Skipping download as file already exists") - continue - - # Extract file path from the download URL - parsed_url = urlparse(download_url) - ftp_file_path = urllib.parse.unquote(parsed_url.path.lstrip("/")) - - logging.info(f"Starting FTP download: {ftp_file_path}") - - # Retry download in case of failure - download_attempt = 0 - while download_attempt < max_download_retries: - try: - # Get file size for progress tracking - total_size = ftp.size(ftp_file_path) - logging.info(f"File size: {total_size} bytes") - - # Initialize progress bar - with open(new_file_path, "wb") as f: - with tqdm( - total=total_size, - unit="B", - unit_scale=True, - desc=new_file_path, - ) as pbar: - - def callback(data): - f.write(data) - pbar.update(len(data)) - - # Retrieve the file with progress callback - ftp.retrbinary(f"RETR {ftp_file_path}", callback) - - logging.info(f"Successfully downloaded {new_file_path}") - break # Exit download retry loop if successful - except ( - socket.timeout, - ftplib.error_temp, - ftplib.error_perm, - ) as e: - 
download_attempt += 1 - logging.error( - f"Download failed for {new_file_path} (attempt {download_attempt}): {str(e)}" - ) - if download_attempt >= max_download_retries: - logging.error( - f"Giving up on {new_file_path} after {max_download_retries} attempts." - ) - break # Give up on this file after max retries - except (KeyError, IndexError) as e: - logging.error(f"Failed to process file due to missing data: {str(e)}") - except Exception as e: - logging.error(f"Unexpected error while processing file: {str(e)}") - ftp.quit() # Close FTP connection after all files are downloaded - logging.info(f"Disconnected from FTP host: {Files.PRIDE_ARCHIVE_FTP}") - break # Exit connection retry loop if everything was successful - except ( - socket.timeout, - ftplib.error_temp, - ftplib.error_perm, - socket.error, - ) as e: - connection_attempt += 1 - logging.error(f"FTP connection failed (attempt {connection_attempt}): {str(e)}") - if connection_attempt < max_connection_retries: - logging.info("Retrying connection...") - time.sleep(5) # Optional delay before retrying - else: - logging.error( - f"Giving up after {max_connection_retries} failed connection attempts." 
- ) - break + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files_from_ftp`.""" + from pridepy.providers.pride import PrideProvider + return PrideProvider.download_files_from_ftp( + file_list_json, + output_folder, + skip_if_downloaded_already, + max_connection_retries=max_connection_retries, + max_download_retries=max_download_retries, + ) @staticmethod def get_output_file_name(download_url, file, output_folder): @@ -658,22 +536,12 @@ def _parallel_download(url, file_path, position=0): @staticmethod def _globus_download_one(file, output_folder, skip_if_downloaded_already, max_retries=6, position=0): - """Download a single file via globus; used as a worker target.""" - download_url = Files._get_download_url(file, "globus") - new_file_path = Files.get_output_file_name(download_url, file, output_folder) - - if skip_if_downloaded_already and os.path.exists(new_file_path): - logging.info(f"Skipping download as file already exists: {new_file_path}") - return - - for attempt in range(1, max_retries + 1): - try: - Files._parallel_download(download_url, new_file_path, position=position) - return - except Exception as e: - logging.warning(f"Attempt {attempt}/{max_retries} failed for {file.get('fileName', '?')}: {e}") - if attempt == max_retries: - raise + """Shim — see :meth:`pridepy.providers.pride.PrideProvider._globus_download_one`.""" + from pridepy.providers.pride import PrideProvider + return PrideProvider._globus_download_one( + file, output_folder, skip_if_downloaded_already, + max_retries=max_retries, position=position, + ) @staticmethod def download_files_from_globus( @@ -681,172 +549,28 @@ def download_files_from_globus( parallel_files: int = 1, checksum_map: Optional[Dict[str, str]] = None, ): - """ - Download files using globus transfer url with progress bar for each file. 
- When skip_if_downloaded_already is True, files are pre-filtered so that - only missing or incomplete files are submitted to the worker pool, - ensuring the -w parallel_files parameter is fully utilised. - When checksum_map is provided, existing files are validated against - their expected checksum; corrupted files are re-downloaded. - :param file_list_json: file list in json format - :param output_folder: folder to download the files - :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. - :param parallel_files: number of files to download simultaneously - :param checksum_map: mapping of file name to expected MD5 checksum - """ - if checksum_map is None: - checksum_map = {} - - if not (os.path.isdir(output_folder)): - os.makedirs(output_folder, exist_ok=True) - - # --- Phase 0: pre-filter files that need downloading ----------------- - files_to_download: List[Dict] = [] - for file in file_list_json: - download_url = Files._get_download_url(file, "globus") - new_file_path = Files.get_output_file_name(download_url, file, output_folder) - if skip_if_downloaded_already and os.path.exists(new_file_path): - expected_cs = checksum_map.get(file.get("fileName", "")) - if expected_cs: - valid, reason = Files.validate_download(new_file_path, expected_cs) - if not valid: - logging.warning(f"Corrupted file detected ({reason}), will re-download: {new_file_path}") - files_to_download.append(file) - continue - logging.info(f"Skipping download as file already exists: {new_file_path}") - continue - files_to_download.append(file) - - if not files_to_download: - logging.info("All files already downloaded, nothing to do.") - return - - logging.info( - f"{len(file_list_json) - len(files_to_download)} file(s) skipped, " - f"{len(files_to_download)} file(s) to download" + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files_from_globus`.""" + from pridepy.providers.pride import PrideProvider + return 
PrideProvider.download_files_from_globus( + file_list_json, output_folder, skip_if_downloaded_already, + parallel_files=parallel_files, + checksum_map=checksum_map, ) - # --- Phase 1: download (skip check already done, pass False) --------- - parallel_files = min(parallel_files, 3, len(files_to_download)) - if parallel_files < 2: - for file in files_to_download: - try: - Files._globus_download_one( - file, output_folder, False - ) - new_file_path = Files.get_output_file_name( - Files._get_download_url(file, "globus"), file, output_folder - ) - logging.info(f"Successfully downloaded {new_file_path}") - except Exception as e: - logging.error(f"Download from Globus failed: {str(e)}") - else: - logging.info(f"Downloading {len(files_to_download)} file(s) with {parallel_files} parallel workers") - with ThreadPoolExecutor(max_workers=parallel_files) as executor: - futures = { - executor.submit( - Files._globus_download_one, - file, output_folder, False, - position=idx, - ): file - for idx, file in enumerate(files_to_download) - } - for future in as_completed(futures): - try: - future.result() - except Exception as e: - logging.error(f"Download from Globus failed: {str(e)}") - @staticmethod def download_files_from_s3( file_list_json: List[Dict], output_folder: str, skip_if_downloaded_already ): - """ - Download files using S3 transfer URL with a progress bar and retry logic. - :param file_list_json: file list in JSON format - :param output_folder: folder to download the files - :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. 
- """ - - if not os.path.isdir(output_folder): - os.makedirs(output_folder, exist_ok=True) - - # Retry and timeout config - retry_config = Config( - retries={"max_attempts": 5, "mode": "standard"}, - connect_timeout=120, # Increase timeout to 120 seconds - read_timeout=120, # Timeout for reading data - signature_version=botocore.UNSIGNED, # Unsigned requests for public data + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files_from_s3`.""" + from pridepy.providers.pride import PrideProvider + return PrideProvider.download_files_from_s3( + file_list_json, output_folder, skip_if_downloaded_already, ) - s3_resource = boto3.resource( - "s3", - config=retry_config, - endpoint_url=Files.S3_URL, - ) - bucket = s3_resource.Bucket(Files.S3_BUCKET) - - for file in file_list_json: - try: - # Determine S3 or FTP path - download_url = ( - file["publicFileLocations"][0]["value"] - if file["publicFileLocations"][0]["name"] == "FTP Protocol" - else file["publicFileLocations"][1]["value"] - ) - - ftp_base_url = "ftp://ftp.pride.ebi.ac.uk/pride/data/archive/" - s3_path = download_url.replace(ftp_base_url, "") - new_file_path = Files.get_output_file_name(download_url, file, output_folder) - - if skip_if_downloaded_already == True and os.path.exists(new_file_path): - logging.info("Skipping download as file already exists") - continue - - logging.debug(f"Downloading From S3: {s3_path}") - - # Get file size for progress tracking - obj = bucket.Object(s3_path) - total_size = obj.content_length - - # Initialize progress bar - progress = Progress(total_size, new_file_path) - - # Download with progress bar and retry handling - for attempt in range(5): - try: - bucket.download_file(s3_path, new_file_path, Callback=progress) - progress.close() - logging.info(f"Successfully downloaded {new_file_path}") - break - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "404": - logging.error("The object does not exist.") - break - else: - 
logging.error(f"Download failed: {e}") - if attempt < 4: - time.sleep(2**attempt) # Exponential backoff - logging.info(f"Retrying... ({attempt + 1}/5)") - else: - raise - except Exception as e: - logging.error(f"Failed to download {file['fileName']}: {e}") - def get_submitted_file_path_prefix(self, accession): - """ - At pride repository, public data is disseminated according to a proper structure. - I.e. base/path/ + yyyy/mm/accession/ + submitted/ - This extracts the yyyy/mm/accession path fragment from the API by examine the file path - of a public file. - I.e. ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2018/10/PXD008644/7550GI_Y.raw - :param accession: PRIDE accession - :return: path fragment (eg: 2018/10/PXD008644) - """ - results = self.get_all_raw_file_list(accession) - first_file = results[0]["publicFileLocations"][0]["value"] - path_fragment = re.search(r"\d{4}/\d{2}/PXD\d*", first_file).group() - return path_fragment + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.get_submitted_file_path_prefix`.""" + from pridepy.providers.pride import PrideProvider + return PrideProvider().get_submitted_file_path_prefix(accession) def download_file_by_name( self, @@ -958,125 +682,23 @@ def get_file_from_api(self, accession, file_name) -> List[Dict]: raise Exception("File not found " + str(e)) def download_private_file_name(self, accession, file_name, output_folder, username, password): - """ - Get the information for a given private file to be downloaded from the api. 
- :param accession: Project accession - :param file_name: The file name to be downloaded - :param username: Username with access to the dataset - :param password: Password for user with access to the dataset - """ - - auth = Authentication() - auth_token = auth.get_token(username, password) - validate_token = auth.validate_token(auth_token) - logging.info("Valid token after login: {}".format(validate_token)) - - url = self.API_PRIVATE_URL + "/projects/{}/files?search={}".format(accession, file_name) - content = requests.get(url, headers={"Authorization": "Bearer {}".format(auth_token)}) - if content.ok and content.status_code == 200: - json_file = content.json() - if ( - "_embedded" in json_file - and "files" in json_file["_embedded"] - and len(json_file["_embedded"]["files"]) == 1 - ): - download_url = json_file["_embedded"]["files"][0]["_links"]["download"]["href"] - logging.info(download_url) - - # Create a clean filename to save the downloaded file - new_file_path = os.path.join(output_folder, f"{file_name}") - - session = Util.create_session_with_retries() # Create session with retries - # Check if the file already exists - if os.path.exists(new_file_path): - resume_header = {"Range": f"bytes={os.path.getsize(new_file_path)}-"} - mode = "ab" # Append to file - resume_size = os.path.getsize(new_file_path) - else: - resume_header = {} - mode = "wb" # Write new file - resume_size = 0 - - with session.get( - download_url, stream=True, headers=resume_header, timeout=(10, 60) - ) as r: - r.raise_for_status() - total_size = int(r.headers.get("content-length", 0)) + resume_size - block_size = 1024 * 1024 # 1 MB chunks - - with tqdm( - total=total_size, - unit="B", - unit_scale=True, - desc=new_file_path, - initial=resume_size, - ) as pbar: - with open(new_file_path, mode) as f: - for chunk in r.iter_content(chunk_size=block_size): - if chunk: - f.write(chunk) - pbar.update(len(chunk)) - - logging.info(f"Successfully downloaded {new_file_path}") - - else: - 
logging.info( - "File name {} found more than once for the given project {}".format( - file_name, accession - ) - ) - else: - logging.info( - f"File name {file_name} now found in the project {accession}, or user don't have access" - ) - raise Exception( - f"File name {file_name} now found in the project {accession}, or user don't have access" - ) + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_private_file_name`.""" + from pridepy.providers.pride import PrideProvider + return PrideProvider().download_private_file_name( + accession, file_name, output_folder, username, password, + ) @staticmethod def get_ascp_binary(): - """ - Detect the OS and architecture, and return the appropriate ascp binary path. - - Returns: - str: Path to the correct ascp binary. - """ - os_type = platform.system().lower() - arch, _ = platform.architecture() - aspera_dir = importlib.resources.files("pridepy").joinpath("aspera/") - - if os_type == "linux": - if arch == "32bit": - return os.path.join(aspera_dir, "linux-32", "ascp") - elif arch == "64bit": - return os.path.join(aspera_dir, "linux-64", "ascp") - elif os_type == "darwin": # macOS (intel-based) - return os.path.join(aspera_dir, "mac-intel", "ascp") - elif os_type == "windows": - if arch == "32bit": - return os.path.join(aspera_dir, "windows-32", "ascp.exe") - elif arch == "64bit": - return os.path.join(aspera_dir, "windows-64", "ascp.exe") - else: - raise OSError(f"Unsupported OS or architecture: {os_type}, {arch}") + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.get_ascp_binary`.""" + from pridepy.providers.pride import PrideProvider + return PrideProvider.get_ascp_binary() @staticmethod def save_checksum_file(accession, output_folder): - """ - Download and persist the checksum manifest for a PRIDE accession. 
- """ - os.makedirs(output_folder, exist_ok=True) - url = f"{Files.V3_API_BASE_URL}/files/checksum/{accession}" - headers = {"accept": "text/plain"} - request = urllib.request.Request(url, headers=headers, method="GET") - logging.info(f"Fetching checksum file from {url}") - with urllib.request.urlopen(request) as response: - data = response.read().decode("utf-8") - # Save the data to a .tsv file - output_path = os.path.join(output_folder, f"{accession}-checksum.tsv") - with open(output_path, "w", encoding="utf-8") as file: - file.write(data) - return output_path + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.save_checksum_file`.""" + from pridepy.providers.pride import PrideProvider + return PrideProvider.save_checksum_file(accession, output_folder) @staticmethod def _batch_download_by_protocol( @@ -1088,44 +710,22 @@ def _batch_download_by_protocol( parallel_files: int = 1, checksum_map: Optional[Dict[str, str]] = None, ) -> None: + """Shim — see :meth:`pridepy.providers.pride.PrideProvider._batch_download_by_protocol`. + + Tests patch this method via ``patch.object(Files, "_batch_download_by_protocol")``; + :class:`PrideProvider` calls back through ``Files.X`` so those patches + keep intercepting. """ - Transfer a batch of files with one protocol, reusing a single - connection where the underlying helper supports it (FTP, S3). 
- """ - if not file_list: - return - if protocol == "ftp": - Files.download_files_from_ftp( - file_list, - output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - ) - return - if protocol == "aspera": - Files.download_files_from_aspera( - file_list, - output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - maximum_bandwidth=aspera_maximum_bandwidth, - ) - return - if protocol == "globus": - Files.download_files_from_globus( - file_list, - output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - parallel_files=parallel_files, - checksum_map=checksum_map or {}, - ) - return - if protocol == "s3": - Files.download_files_from_s3( - file_list, - output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - ) - return - raise ValueError(f"Unsupported protocol: {protocol}") + from pridepy.providers.pride import PrideProvider + return PrideProvider._batch_download_by_protocol( + file_list, + output_folder, + protocol, + skip_if_downloaded_already, + aspera_maximum_bandwidth, + parallel_files=parallel_files, + checksum_map=checksum_map, + ) @staticmethod def _download_with_fallback( @@ -1137,52 +737,17 @@ def _download_with_fallback( max_protocol_retries: int = 2, parallel_files: int = 1, ) -> bool: - """ - Download one file by trying each protocol in sequence, validating - after every attempt. Intended as the per-file fallback path; batch - download of the primary protocol is handled separately. 
- """ - local_path = Files._resolve_local_path(file_record, output_folder) - - for protocol in protocol_sequence: - for attempt in range(1, max_protocol_retries + 1): - logging.info( - f"Downloading {file_record['fileName']} via {protocol} " - f"(attempt {attempt}/{max_protocol_retries})" - ) - try: - Files._remove_if_exists(local_path) - Files._batch_download_by_protocol( - [file_record], - output_folder, - protocol, - skip_if_downloaded_already=False, - aspera_maximum_bandwidth=aspera_maximum_bandwidth, - parallel_files=parallel_files, - ) - except Exception as error: - logging.error( - f"Protocol {protocol} failed for {file_record['fileName']}: {error}" - ) - - valid, reason = Files.validate_download(local_path, expected_checksum) - if valid: - logging.info( - f"File {file_record['fileName']} downloaded successfully via {protocol}" - ) - return True - - logging.warning( - f"Validation failed for {file_record['fileName']} via {protocol}: {reason}" - ) - Files._remove_if_exists(local_path) - - logging.warning( - f"Protocol {protocol} exhausted for {file_record['fileName']}, switching protocol." - ) - - logging.error(f"All protocol attempts failed for {file_record['fileName']}") - return False + """Shim — see :meth:`pridepy.providers.pride.PrideProvider._download_with_fallback`.""" + from pridepy.providers.pride import PrideProvider + return PrideProvider._download_with_fallback( + file_record, + output_folder, + protocol_sequence, + expected_checksum, + aspera_maximum_bandwidth, + max_protocol_retries=max_protocol_retries, + parallel_files=parallel_files, + ) @staticmethod def download_files( @@ -1195,94 +760,18 @@ def download_files( checksum_check=False, parallel_files: int = 1, ): - """ - Download files using either FTP or Aspera transfer protocol. 
- :param file_list_json: File list in JSON format - :param accession: Project accession - :param output_folder: Folder to download the files - :param protocol: ftp, aspera, globus - :param aspera_maximum_bandwidth: parameter in Aspera sets the maximum bandwidth for the transfer. - :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. - """ - protocols_supported = ["ftp", "aspera", "globus", "s3"] - if protocol not in protocols_supported: - logging.error("Protocol should be one of ftp, aspera, globus, s3") - return - - os.makedirs(output_folder, exist_ok=True) - - checksum_map: Dict[str, str] = {} - if checksum_check: - checksum_file_path = Files.save_checksum_file(accession, output_folder) - checksum_map = Files.read_checksum_file(checksum_file_path) - logging.info(f"Loaded checksums for {len(checksum_map)} files") - - if not file_list_json: - return - - protocol_sequence = Files._protocol_sequence(protocol) - primary_protocol = protocol_sequence[0] - # Retry with the primary protocol first, then fall back to others - fallback_sequence = protocol_sequence - - # Phase 1: batch download with the requested protocol. Reuses a single - # FTP/S3 connection for all files (the previous behaviour) instead of - # paying the per-file reconnect cost in the common happy path. 
- logging.info( - f"Downloading {len(file_list_json)} file(s) via {primary_protocol} (batch)" + """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files`.""" + from pridepy.providers.pride import PrideProvider + return PrideProvider.download_files( + file_list_json, + accession, + output_folder, + skip_if_downloaded_already, + protocol=protocol, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, + checksum_check=checksum_check, + parallel_files=parallel_files, ) - try: - Files._batch_download_by_protocol( - file_list_json, - output_folder, - primary_protocol, - skip_if_downloaded_already=skip_if_downloaded_already, - aspera_maximum_bandwidth=aspera_maximum_bandwidth, - parallel_files=parallel_files, - checksum_map=checksum_map, - ) - except Exception as exc: - logging.warning( - f"Batch {primary_protocol} run hit an error; will retry individual failures: {exc}" - ) - - # Phase 2: validate every file and fall back per-file for the ones - # that are missing or invalid. - logging.info("Phase 2: validating %d downloaded file(s)", len(file_list_json)) - failed_files: List[str] = [] - for i, file_record in enumerate(file_list_json, 1): - expected_checksum = checksum_map.get(file_record["fileName"]) - local_path = Files._resolve_local_path(file_record, output_folder) - logging.info("Validating [%d/%d] %s", i, len(file_list_json), file_record["fileName"]) - valid, reason = Files.validate_download(local_path, expected_checksum) - if valid: - continue - - logging.warning( - f"{file_record['fileName']} invalid after {primary_protocol} ({reason})" - ) - if "checksum mismatch" in reason: - Files._remove_if_exists(local_path) - - if not fallback_sequence: - failed_files.append(file_record.get("fileName", "")) - continue - - success = Files._download_with_fallback( - file_record=file_record, - output_folder=output_folder, - protocol_sequence=fallback_sequence, - expected_checksum=expected_checksum, - aspera_maximum_bandwidth=aspera_maximum_bandwidth, - 
parallel_files=parallel_files, - ) - if not success: - failed_files.append(file_record.get("fileName", "")) - - if failed_files: - failed_summary = ", ".join(failed_files) - logging.error(f"Failed to download {len(failed_files)} file(s): {failed_summary}") - raise RuntimeError(f"Failed to download {len(failed_files)} file(s): {failed_summary}") def download_files_by_list( self, diff --git a/pridepy/providers/pride.py b/pridepy/providers/pride.py new file mode 100644 index 0000000..ea60a5d --- /dev/null +++ b/pridepy/providers/pride.py @@ -0,0 +1,790 @@ +"""PRIDE Archive provider. + +PRIDE has the richest behaviour of all providers: multi-protocol batch +download with aspera/s3/ftp/globus fallback, private-dataset path with +username/password auth, checksum TSV validation, and submitter-path +helpers. This module hosts all of those; the :class:`Files` facade +delegates via lightweight shim methods. + +Implementation note: PRIDE-specific helpers that the existing test suite +patches via ``patch.object(Files, "X")`` are called from inside this +provider via ``Files.X(...)`` (lazy import) — never ``self.X`` — so the +patches keep intercepting. This is a deliberate backward-compat choice +documented in the refactor plan (Task 8). 
+""" +import ftplib +import importlib.resources +import logging +import os +import platform +import re +import socket +import subprocess +import time +import urllib +import urllib.request +from concurrent.futures import ThreadPoolExecutor, as_completed +from ftplib import FTP +from typing import ClassVar, Dict, List, Optional +from urllib.parse import urlparse + +import boto3 +import botocore +import requests +from botocore.config import Config +from tqdm import tqdm + +from pridepy.authentication.authentication import Authentication +from pridepy.providers import registry +from pridepy.providers.base import Provider +from pridepy.providers.util import Progress +from pridepy.util.api_handling import Util + + +@registry.register +class PrideProvider(Provider): + """PRIDE Archive provider with multi-protocol fallback orchestration.""" + + name: ClassVar[str] = "pride" + + V3_API_BASE_URL: ClassVar[str] = "https://www.ebi.ac.uk/pride/ws/archive/v3" + API_BASE_URL: ClassVar[str] = "https://www.ebi.ac.uk/pride/ws/archive/v3" + API_PRIVATE_URL: ClassVar[str] = "https://www.ebi.ac.uk/pride/private/ws/archive/v2" + ARCHIVE_FTP: ClassVar[str] = "ftp.pride.ebi.ac.uk" + ARCHIVE_FTP_URL_PREFIX: ClassVar[str] = "ftp://ftp.pride.ebi.ac.uk/" + ARCHIVE_HTTPS_URL_PREFIX: ClassVar[str] = "https://ftp.pride.ebi.ac.uk/" + S3_URL: ClassVar[str] = "https://hh.fire.sdo.ebi.ac.uk" + S3_BUCKET: ClassVar[str] = "pride-public" + PROTOCOL_ORDER: ClassVar[List[str]] = ["aspera", "s3", "ftp", "globus"] + + @staticmethod + def matches(accession: str) -> bool: + """Return True when ``accession`` is a PRIDE dataset accession.""" + if not accession: + return False + return bool(re.fullmatch(r"(?:PXD|PRD)\d+", accession.upper())) + + # ------------------------------------------------------------------ + # Listing + # ------------------------------------------------------------------ + + async def stream_all_files_metadata(self, output_file, accession=None): + """ + get stream all project files from 
PRIDE API in JSON format + """ + if accession is None: + request_url = f"{self.V3_API_BASE_URL}/files/all" + count_request_url = f"{self.V3_API_BASE_URL}/files/count" + else: + request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/all" + count_request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/count" + headers = {"Accept": "application/JSON"} + response = Util.get_api_call(count_request_url, headers) + total_records = response.json() + + regex_search_pattern = '"fileName"' + await Util.stream_response_to_file( + output_file, total_records, regex_search_pattern, request_url, headers + ) + + def stream_all_files_by_project(self, accession) -> List[Dict]: + """ + get stream all project files from PRIDE API in JSON format + """ + request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/all" + headers = {"Accept": "application/JSON"} + record_files = Util.read_json_stream(api_url=request_url, headers=headers) + return record_files + + def list_files(self, accession: str) -> List[Dict]: + """Return PRIDE file records for the dataset.""" + return self.stream_all_files_by_project(accession) + + def get_submitted_file_path_prefix(self, accession): + """ + At pride repository, public data is disseminated according to a proper structure. + I.e. base/path/ + yyyy/mm/accession/ + submitted/ + This extracts the yyyy/mm/accession path fragment from the API by examine the file path + of a public file. + I.e. ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2018/10/PXD008644/7550GI_Y.raw + :param accession: PRIDE accession + :return: path fragment (eg: 2018/10/PXD008644) + """ + # Use Files facade so test patches on get_all_raw_file_list keep working. 
+ from pridepy.files.files import Files + results = Files().get_all_raw_file_list(accession) + first_file = results[0]["publicFileLocations"][0]["value"] + path_fragment = re.search(r"\d{4}/\d{2}/PXD\d*", first_file).group() + return path_fragment + + # ------------------------------------------------------------------ + # Static utilities + # ------------------------------------------------------------------ + + @staticmethod + def _protocol_sequence(protocol: str) -> List[str]: + """ + Build the ordered list of protocols to try for a requested download mode. + """ + if protocol not in PrideProvider.PROTOCOL_ORDER: + return [] + return [protocol] + [p for p in PrideProvider.PROTOCOL_ORDER if p != protocol] + + @staticmethod + def get_ascp_binary(): + """ + Detect the OS and architecture, and return the appropriate ascp binary path. + + Returns: + str: Path to the correct ascp binary. + """ + os_type = platform.system().lower() + arch, _ = platform.architecture() + aspera_dir = importlib.resources.files("pridepy").joinpath("aspera/") + + if os_type == "linux": + if arch == "32bit": + return os.path.join(aspera_dir, "linux-32", "ascp") + elif arch == "64bit": + return os.path.join(aspera_dir, "linux-64", "ascp") + elif os_type == "darwin": # macOS (intel-based) + return os.path.join(aspera_dir, "mac-intel", "ascp") + elif os_type == "windows": + if arch == "32bit": + return os.path.join(aspera_dir, "windows-32", "ascp.exe") + elif arch == "64bit": + return os.path.join(aspera_dir, "windows-64", "ascp.exe") + else: + raise OSError(f"Unsupported OS or architecture: {os_type}, {arch}") + + @staticmethod + def save_checksum_file(accession, output_folder): + """ + Download and persist the checksum manifest for a PRIDE accession. 
+ """ + os.makedirs(output_folder, exist_ok=True) + url = f"{PrideProvider.V3_API_BASE_URL}/files/checksum/{accession}" + headers = {"accept": "text/plain"} + request = urllib.request.Request(url, headers=headers, method="GET") + logging.info(f"Fetching checksum file from {url}") + with urllib.request.urlopen(request) as response: + data = response.read().decode("utf-8") + # Save the data to a .tsv file + output_path = os.path.join(output_folder, f"{accession}-checksum.tsv") + with open(output_path, "w", encoding="utf-8") as file: + file.write(data) + return output_path + + # ------------------------------------------------------------------ + # Per-protocol single-file workers + # ------------------------------------------------------------------ + + @staticmethod + def _globus_download_one(file, output_folder, skip_if_downloaded_already, max_retries=6, position=0): + """Download a single file via globus; used as a worker target.""" + # Use Files facade so test patches on Files helpers keep working. 
+ from pridepy.files.files import Files + + download_url = Files._get_download_url(file, "globus") + new_file_path = Files.get_output_file_name(download_url, file, output_folder) + + if skip_if_downloaded_already and os.path.exists(new_file_path): + logging.info(f"Skipping download as file already exists: {new_file_path}") + return + + for attempt in range(1, max_retries + 1): + try: + Files._parallel_download(download_url, new_file_path, position=position) + return + except Exception as e: + logging.warning(f"Attempt {attempt}/{max_retries} failed for {file.get('fileName', '?')}: {e}") + if attempt == max_retries: + raise + + # ------------------------------------------------------------------ + # Per-protocol batch helpers + # ------------------------------------------------------------------ + + @staticmethod + def download_files_from_ftp( + file_list_json, + output_folder, + skip_if_downloaded_already, + max_connection_retries=3, + max_download_retries=3, + ): + """ + Download files using a single FTP connection with a retry mechanism and a progress bar for each file. + :param file_list_json: file list in JSON format + :param output_folder: folder to download the files + :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. + :param max_connection_retries: Number of attempts to reconnect to the FTP server if the connection is lost. + :param max_download_retries: Number of attempts to retry the download of a file in case of failure. 
+ """ + from pridepy.files.files import Files + + if not os.path.isdir(output_folder): + os.makedirs(output_folder) + + def connect_ftp(): + """Helper function to establish FTP connection.""" + ftp = FTP(PrideProvider.ARCHIVE_FTP, timeout=30) + ftp.login() # Anonymous login + ftp.set_pasv(True) # Enable passive mode + logging.info(f"Connected to FTP host: {PrideProvider.ARCHIVE_FTP}") + return ftp + + connection_attempt = 0 + while connection_attempt < max_connection_retries: + try: + ftp = connect_ftp() + for file in file_list_json: + try: + # Get FTP download URL + if file["publicFileLocations"][0]["name"] == "FTP Protocol": + download_url = file["publicFileLocations"][0]["value"] + else: + download_url = file["publicFileLocations"][1]["value"] + + logging.debug("ftp_filepath:" + download_url) + + # Get output file path + new_file_path = Files.get_output_file_name( + download_url, file, output_folder + ) + + if skip_if_downloaded_already and os.path.exists(new_file_path): + logging.info("Skipping download as file already exists") + continue + + # Extract file path from the download URL + parsed_url = urlparse(download_url) + ftp_file_path = urllib.parse.unquote(parsed_url.path.lstrip("/")) + + logging.info(f"Starting FTP download: {ftp_file_path}") + + # Retry download in case of failure + download_attempt = 0 + while download_attempt < max_download_retries: + try: + # Get file size for progress tracking + total_size = ftp.size(ftp_file_path) + logging.info(f"File size: {total_size} bytes") + + # Initialize progress bar + with open(new_file_path, "wb") as f: + with tqdm( + total=total_size, + unit="B", + unit_scale=True, + desc=new_file_path, + ) as pbar: + + def callback(data): + f.write(data) + pbar.update(len(data)) + + # Retrieve the file with progress callback + ftp.retrbinary(f"RETR {ftp_file_path}", callback) + + logging.info(f"Successfully downloaded {new_file_path}") + break # Exit download retry loop if successful + except ( + socket.timeout, + 
ftplib.error_temp, + ftplib.error_perm, + ) as e: + download_attempt += 1 + logging.error( + f"Download failed for {new_file_path} (attempt {download_attempt}): {str(e)}" + ) + if download_attempt >= max_download_retries: + logging.error( + f"Giving up on {new_file_path} after {max_download_retries} attempts." + ) + break # Give up on this file after max retries + except (KeyError, IndexError) as e: + logging.error(f"Failed to process file due to missing data: {str(e)}") + except Exception as e: + logging.error(f"Unexpected error while processing file: {str(e)}") + ftp.quit() # Close FTP connection after all files are downloaded + logging.info(f"Disconnected from FTP host: {PrideProvider.ARCHIVE_FTP}") + break # Exit connection retry loop if everything was successful + except ( + socket.timeout, + ftplib.error_temp, + ftplib.error_perm, + socket.error, + ) as e: + connection_attempt += 1 + logging.error(f"FTP connection failed (attempt {connection_attempt}): {str(e)}") + if connection_attempt < max_connection_retries: + logging.info("Retrying connection...") + time.sleep(5) # Optional delay before retrying + else: + logging.error( + f"Giving up after {max_connection_retries} failed connection attempts." + ) + break + + @staticmethod + def download_files_from_globus( + file_list_json: List[Dict], output_folder, skip_if_downloaded_already, + parallel_files: int = 1, + checksum_map: Optional[Dict[str, str]] = None, + ): + """ + Download files using globus transfer url with progress bar for each file. + When skip_if_downloaded_already is True, files are pre-filtered so that + only missing or incomplete files are submitted to the worker pool, + ensuring the -w parallel_files parameter is fully utilised. + When checksum_map is provided, existing files are validated against + their expected checksum; corrupted files are re-downloaded. 
+ :param file_list_json: file list in json format + :param output_folder: folder to download the files + :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. + :param parallel_files: number of files to download simultaneously + :param checksum_map: mapping of file name to expected MD5 checksum + """ + # Use Files facade so test patches on Files._globus_download_one etc. keep working. + from pridepy.files.files import Files + + if checksum_map is None: + checksum_map = {} + + if not (os.path.isdir(output_folder)): + os.makedirs(output_folder, exist_ok=True) + + # --- Phase 0: pre-filter files that need downloading ----------------- + files_to_download: List[Dict] = [] + for file in file_list_json: + download_url = Files._get_download_url(file, "globus") + new_file_path = Files.get_output_file_name(download_url, file, output_folder) + if skip_if_downloaded_already and os.path.exists(new_file_path): + expected_cs = checksum_map.get(file.get("fileName", "")) + if expected_cs: + valid, reason = Files.validate_download(new_file_path, expected_cs) + if not valid: + logging.warning(f"Corrupted file detected ({reason}), will re-download: {new_file_path}") + files_to_download.append(file) + continue + logging.info(f"Skipping download as file already exists: {new_file_path}") + continue + files_to_download.append(file) + + if not files_to_download: + logging.info("All files already downloaded, nothing to do.") + return + + logging.info( + f"{len(file_list_json) - len(files_to_download)} file(s) skipped, " + f"{len(files_to_download)} file(s) to download" + ) + + # --- Phase 1: download (skip check already done, pass False) --------- + parallel_files = min(parallel_files, 3, len(files_to_download)) + if parallel_files < 2: + for file in files_to_download: + try: + Files._globus_download_one( + file, output_folder, False + ) + new_file_path = Files.get_output_file_name( + Files._get_download_url(file, "globus"), file, 
output_folder + ) + logging.info(f"Successfully downloaded {new_file_path}") + except Exception as e: + logging.error(f"Download from Globus failed: {str(e)}") + else: + logging.info(f"Downloading {len(files_to_download)} file(s) with {parallel_files} parallel workers") + with ThreadPoolExecutor(max_workers=parallel_files) as executor: + futures = { + executor.submit( + Files._globus_download_one, + file, output_folder, False, + position=idx, + ): file + for idx, file in enumerate(files_to_download) + } + for future in as_completed(futures): + try: + future.result() + except Exception as e: + logging.error(f"Download from Globus failed: {str(e)}") + + @staticmethod + def download_files_from_s3( + file_list_json: List[Dict], output_folder: str, skip_if_downloaded_already + ): + """ + Download files using S3 transfer URL with a progress bar and retry logic. + :param file_list_json: file list in JSON format + :param output_folder: folder to download the files + :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. 
+ """ + from pridepy.files.files import Files + + if not os.path.isdir(output_folder): + os.makedirs(output_folder, exist_ok=True) + + # Retry and timeout config + retry_config = Config( + retries={"max_attempts": 5, "mode": "standard"}, + connect_timeout=120, # Increase timeout to 120 seconds + read_timeout=120, # Timeout for reading data + signature_version=botocore.UNSIGNED, # Unsigned requests for public data + ) + + s3_resource = boto3.resource( + "s3", + config=retry_config, + endpoint_url=PrideProvider.S3_URL, + ) + bucket = s3_resource.Bucket(PrideProvider.S3_BUCKET) + + for file in file_list_json: + try: + # Determine S3 or FTP path + download_url = ( + file["publicFileLocations"][0]["value"] + if file["publicFileLocations"][0]["name"] == "FTP Protocol" + else file["publicFileLocations"][1]["value"] + ) + + ftp_base_url = "ftp://ftp.pride.ebi.ac.uk/pride/data/archive/" + s3_path = download_url.replace(ftp_base_url, "") + new_file_path = Files.get_output_file_name(download_url, file, output_folder) + + if skip_if_downloaded_already == True and os.path.exists(new_file_path): + logging.info("Skipping download as file already exists") + continue + + logging.debug(f"Downloading From S3: {s3_path}") + + # Get file size for progress tracking + obj = bucket.Object(s3_path) + total_size = obj.content_length + + # Initialize progress bar + progress = Progress(total_size, new_file_path) + + # Download with progress bar and retry handling + for attempt in range(5): + try: + bucket.download_file(s3_path, new_file_path, Callback=progress) + progress.close() + logging.info(f"Successfully downloaded {new_file_path}") + break + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + logging.error("The object does not exist.") + break + else: + logging.error(f"Download failed: {e}") + if attempt < 4: + time.sleep(2**attempt) # Exponential backoff + logging.info(f"Retrying... 
({attempt + 1}/5)") + else: + raise + except Exception as e: + logging.error(f"Failed to download {file['fileName']}: {e}") + + # ------------------------------------------------------------------ + # Private dataset download + # ------------------------------------------------------------------ + + def download_private_file_name(self, accession, file_name, output_folder, username, password): + """ + Get the information for a given private file to be downloaded from the api. + :param accession: Project accession + :param file_name: The file name to be downloaded + :param username: Username with access to the dataset + :param password: Password for user with access to the dataset + """ + + auth = Authentication() + auth_token = auth.get_token(username, password) + validate_token = auth.validate_token(auth_token) + logging.info("Valid token after login: {}".format(validate_token)) + + url = self.API_PRIVATE_URL + "/projects/{}/files?search={}".format(accession, file_name) + content = requests.get(url, headers={"Authorization": "Bearer {}".format(auth_token)}) + if content.ok and content.status_code == 200: + json_file = content.json() + if ( + "_embedded" in json_file + and "files" in json_file["_embedded"] + and len(json_file["_embedded"]["files"]) == 1 + ): + download_url = json_file["_embedded"]["files"][0]["_links"]["download"]["href"] + logging.info(download_url) + + # Create a clean filename to save the downloaded file + new_file_path = os.path.join(output_folder, f"{file_name}") + + session = Util.create_session_with_retries() # Create session with retries + # Check if the file already exists + if os.path.exists(new_file_path): + resume_header = {"Range": f"bytes={os.path.getsize(new_file_path)}-"} + mode = "ab" # Append to file + resume_size = os.path.getsize(new_file_path) + else: + resume_header = {} + mode = "wb" # Write new file + resume_size = 0 + + with session.get( + download_url, stream=True, headers=resume_header, timeout=(10, 60) + ) as r: + 
r.raise_for_status() + total_size = int(r.headers.get("content-length", 0)) + resume_size + block_size = 1024 * 1024 # 1 MB chunks + + with tqdm( + total=total_size, + unit="B", + unit_scale=True, + desc=new_file_path, + initial=resume_size, + ) as pbar: + with open(new_file_path, mode) as f: + for chunk in r.iter_content(chunk_size=block_size): + if chunk: + f.write(chunk) + pbar.update(len(chunk)) + + logging.info(f"Successfully downloaded {new_file_path}") + + else: + logging.info( + "File name {} found more than once for the given project {}".format( + file_name, accession + ) + ) + else: + logging.info( + f"File name {file_name} now found in the project {accession}, or user don't have access" + ) + raise Exception( + f"File name {file_name} now found in the project {accession}, or user don't have access" + ) + + # ------------------------------------------------------------------ + # Multi-protocol orchestrator + # ------------------------------------------------------------------ + + @staticmethod + def _batch_download_by_protocol( + file_list: List[Dict], + output_folder: str, + protocol: str, + skip_if_downloaded_already: bool, + aspera_maximum_bandwidth: str, + parallel_files: int = 1, + checksum_map: Optional[Dict[str, str]] = None, + ) -> None: + """ + Transfer a batch of files with one protocol, reusing a single + connection where the underlying helper supports it (FTP, S3). + """ + # Use Files facade so test patches on each per-protocol helper keep working. 
+ from pridepy.files.files import Files + + if not file_list: + return + if protocol == "ftp": + Files.download_files_from_ftp( + file_list, + output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + ) + return + if protocol == "aspera": + Files.download_files_from_aspera( + file_list, + output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + maximum_bandwidth=aspera_maximum_bandwidth, + ) + return + if protocol == "globus": + Files.download_files_from_globus( + file_list, + output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + parallel_files=parallel_files, + checksum_map=checksum_map or {}, + ) + return + if protocol == "s3": + Files.download_files_from_s3( + file_list, + output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + ) + return + raise ValueError(f"Unsupported protocol: {protocol}") + + @staticmethod + def _download_with_fallback( + file_record: Dict, + output_folder: str, + protocol_sequence: List[str], + expected_checksum: Optional[str], + aspera_maximum_bandwidth: str, + max_protocol_retries: int = 2, + parallel_files: int = 1, + ) -> bool: + """ + Download one file by trying each protocol in sequence, validating + after every attempt. Intended as the per-file fallback path; batch + download of the primary protocol is handled separately. + """ + # Patch-sensitive: call through Files so test patches intercept. 
+ from pridepy.files.files import Files + + local_path = Files._resolve_local_path(file_record, output_folder) + + for protocol in protocol_sequence: + for attempt in range(1, max_protocol_retries + 1): + logging.info( + f"Downloading {file_record['fileName']} via {protocol} " + f"(attempt {attempt}/{max_protocol_retries})" + ) + try: + Files._remove_if_exists(local_path) + Files._batch_download_by_protocol( + [file_record], + output_folder, + protocol, + skip_if_downloaded_already=False, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, + parallel_files=parallel_files, + ) + except Exception as error: + logging.error( + f"Protocol {protocol} failed for {file_record['fileName']}: {error}" + ) + + valid, reason = Files.validate_download(local_path, expected_checksum) + if valid: + logging.info( + f"File {file_record['fileName']} downloaded successfully via {protocol}" + ) + return True + + logging.warning( + f"Validation failed for {file_record['fileName']} via {protocol}: {reason}" + ) + Files._remove_if_exists(local_path) + + logging.warning( + f"Protocol {protocol} exhausted for {file_record['fileName']}, switching protocol." + ) + + logging.error(f"All protocol attempts failed for {file_record['fileName']}") + return False + + @staticmethod + def download_files( + file_list_json: List[Dict], + accession, + output_folder: str, + skip_if_downloaded_already, + protocol: str = "ftp", + aspera_maximum_bandwidth: str = "100M", # Aspera maximum bandwidth + checksum_check=False, + parallel_files: int = 1, + ): + """ + Download files using either FTP or Aspera transfer protocol. + :param file_list_json: File list in JSON format + :param accession: Project accession + :param output_folder: Folder to download the files + :param protocol: ftp, aspera, globus + :param aspera_maximum_bandwidth: parameter in Aspera sets the maximum bandwidth for the transfer. + :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. 
+ """ + # Patch-sensitive: call _batch_download_by_protocol and + # _download_with_fallback through Files so test patches intercept. + from pridepy.files.files import Files + + protocols_supported = ["ftp", "aspera", "globus", "s3"] + if protocol not in protocols_supported: + logging.error("Protocol should be one of ftp, aspera, globus, s3") + return + + os.makedirs(output_folder, exist_ok=True) + + checksum_map: Dict[str, str] = {} + if checksum_check: + checksum_file_path = Files.save_checksum_file(accession, output_folder) + checksum_map = Files.read_checksum_file(checksum_file_path) + logging.info(f"Loaded checksums for {len(checksum_map)} files") + + if not file_list_json: + return + + protocol_sequence = Files._protocol_sequence(protocol) + primary_protocol = protocol_sequence[0] + # Retry with the primary protocol first, then fall back to others + fallback_sequence = protocol_sequence + + # Phase 1: batch download with the requested protocol. Reuses a single + # FTP/S3 connection for all files (the previous behaviour) instead of + # paying the per-file reconnect cost in the common happy path. + logging.info( + f"Downloading {len(file_list_json)} file(s) via {primary_protocol} (batch)" + ) + try: + Files._batch_download_by_protocol( + file_list_json, + output_folder, + primary_protocol, + skip_if_downloaded_already=skip_if_downloaded_already, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, + parallel_files=parallel_files, + checksum_map=checksum_map, + ) + except Exception as exc: + logging.warning( + f"Batch {primary_protocol} run hit an error; will retry individual failures: {exc}" + ) + + # Phase 2: validate every file and fall back per-file for the ones + # that are missing or invalid. 
+ logging.info("Phase 2: validating %d downloaded file(s)", len(file_list_json)) + failed_files: List[str] = [] + for i, file_record in enumerate(file_list_json, 1): + expected_checksum = checksum_map.get(file_record["fileName"]) + local_path = Files._resolve_local_path(file_record, output_folder) + logging.info("Validating [%d/%d] %s", i, len(file_list_json), file_record["fileName"]) + valid, reason = Files.validate_download(local_path, expected_checksum) + if valid: + continue + + logging.warning( + f"{file_record['fileName']} invalid after {primary_protocol} ({reason})" + ) + if "checksum mismatch" in reason: + Files._remove_if_exists(local_path) + + if not fallback_sequence: + failed_files.append(file_record.get("fileName", "")) + continue + + success = Files._download_with_fallback( + file_record=file_record, + output_folder=output_folder, + protocol_sequence=fallback_sequence, + expected_checksum=expected_checksum, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, + parallel_files=parallel_files, + ) + if not success: + failed_files.append(file_record.get("fileName", "")) + + if failed_files: + failed_summary = ", ".join(failed_files) + logging.error(f"Failed to download {len(failed_files)} file(s): {failed_summary}") + raise RuntimeError(f"Failed to download {len(failed_files)} file(s): {failed_summary}") From 6fd743dd5bcc0334b3c802d4db3f57a2ab8bfa0d Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 16:49:27 +0100 Subject: [PATCH 13/21] refactor(providers): rewire Files facade through Registry Files public methods (get_all_raw_file_list, download_all_raw_files, download_all_category_files, get_all_category_file_list, download_file_by_name, get_file_from_api, download_files_by_list) now dispatch via registry.resolve(accession).{list,download}_files(...). Removed dead helpers: _list_direct_download_files, _download_direct_download_records. Kept _repo_uses_tls as a registry shim. Fixed _download_massive_file_records to use registry. 
PrideProvider.download_files refactored: the old static method is now _download_files_batch; a proper Provider-interface instance method download_files(self, accession, records, ...) wraps it so PRIDE routes uniformly through the registry like other providers. Tests updated: patches on Files._list_massive_public_files, Files._list_jpost_public_files, files_obj.stream_all_files_by_project, and files_obj.download_files now target the provider classes (MassiveProvider, JpostProvider, PrideProvider) directly. Full suite green. files.py size: 1254 LOC (was 1352 before Task 9). --- pridepy/files/files.py | 238 ++++++++----------------- pridepy/providers/pride.py | 27 ++- pridepy/tests/test_download_by_list.py | 15 +- pridepy/tests/test_jpost_files.py | 8 +- pridepy/tests/test_massive_files.py | 7 +- 5 files changed, 111 insertions(+), 184 deletions(-) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index b4cc3b9..e9cde55 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -205,12 +205,13 @@ def is_iprox_accession(accession: str) -> bool: @staticmethod def _repo_uses_tls(accession: str) -> bool: - """ - Whether the public FTP server for ``accession`` requires FTP over TLS. - MassIVE rejects plain anonymous FTP (``421 TLS is required``); JPOST - accepts plain FTP. - """ - return Files.is_massive_accession(accession) + """Shim — returns the resolved provider's use_tls flag (False if unknown).""" + from pridepy.providers import registry + try: + provider = registry.resolve(accession) + except ValueError: + return False + return getattr(provider, "use_tls", False) @staticmethod def _walk_ftp_tree(ftp: FTP, remote_dir: str) -> List[str]: @@ -246,11 +247,12 @@ def _download_massive_file_records( ) -> None: """ Download public MassIVE files via anonymous FTP (now FTPS). - Backward-compat wrapper around :meth:`_download_direct_download_records`. + Backward-compat shim — dispatches via the provider registry. 
""" - self._download_direct_download_records( + from pridepy.providers import registry + registry.resolve(accession).download_files( accession=accession, - file_records=file_records, + records=file_records, output_folder=output_folder, skip_if_downloaded_already=skip_if_downloaded_already, protocol=protocol, @@ -298,50 +300,6 @@ def _list_iprox_public_files(self, accession: str) -> List[Dict]: from pridepy.providers.iprox import IproxProvider return IproxProvider().list_files(accession) - def _list_direct_download_files(self, accession: str) -> List[Dict]: - """ - Dispatch to the right listing transport for a direct-download - repository: MassIVE walks FTPS, JPOST uses PROXI JSON over HTTPS with - an FTP fallback, iProX uses the dataset's PX XML over HTTPS. - """ - if self.is_massive_accession(accession): - return self._list_massive_public_files(accession) - if self.is_jpost_accession(accession): - return self._list_jpost_public_files(accession) - if self.is_iprox_accession(accession): - return self._list_iprox_public_files(accession) - raise ValueError( - f"Accession {accession} is not a direct-download repository accession" - ) - - def _download_direct_download_records( - self, - accession: str, - file_records: List[Dict], - output_folder: str, - skip_if_downloaded_already: bool, - protocol: str, - parallel_files: int = 1, - ) -> None: - """ - Download files from a direct-download repository. - - MassIVE and JPOST use anonymous FTP(S) with REST-based resume and - per-host parallel workers. iProX uses anonymous HTTPS via - ``download.iprox.org`` with ``Range``-based resume and per-file - parallel workers. URLs are partitioned by scheme so a mixed batch - (e.g. a JPOST PX XML that ever pointed at HTTPS) routes correctly. - Dispatches via the provider registry. 
- """ - from pridepy.providers import registry - return registry.resolve(accession).download_files( - accession=accession, - records=file_records, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - protocol=protocol, - parallel_files=parallel_files, - ) async def stream_all_files_metadata(self, output_file, accession=None): """Shim — see :meth:`pridepy.providers.pride.PrideProvider.stream_all_files_metadata`.""" @@ -354,22 +312,14 @@ def stream_all_files_by_project(self, accession) -> List[Dict]: return PrideProvider().stream_all_files_by_project(accession) def get_all_raw_file_list(self, project_accession): - """ - Get all raw file lists from PRIDE API for a given project_accession - :param project_accession: PRIDE accession - :return: raw file list in JSON format - """ - if self.is_direct_download_accession(project_accession): - record_files = self._list_direct_download_files(project_accession) - return [ - file for file in record_files if file["fileCategory"]["value"] == "RAW" - ] - - record_files = self.stream_all_files_by_project(project_accession) + """Get raw file list for any registered provider. - # Filter projects by fileCategory = RAW - raw_files = [file for file in record_files if file["fileCategory"]["value"] == "RAW"] - return raw_files + Returns the dataset's file records filtered to fileCategory == "RAW". + """ + from pridepy.providers import registry + provider = registry.resolve(project_accession) + records = provider.list_files(project_accession) + return [r for r in records if r["fileCategory"]["value"] == "RAW"] def download_all_raw_files( self, @@ -381,42 +331,21 @@ def download_all_raw_files( checksum_check: bool = False, parallel_files: int = 1, ): - """ - This method will download all the raw files from PRIDE PROJECT - :param output_folder: output directory where raw files will get saved - :param skip_if_downloaded_already: Boolean value to skip the download if the file has already been downloaded. 
- :param accession: PRIDE accession - :param protocol: ftp, aspera, globus - :param aspera_maximum_bandwidth: Aspera maximum bandwidth - :param checksum_check: Download checksum for a given project. - :return: None - """ - - if not (os.path.isdir(output_folder)): + """Download all RAW files for any registered provider.""" + if not os.path.isdir(output_folder): os.mkdir(output_folder) - - raw_files = self.get_all_raw_file_list(accession) - - if self.is_direct_download_accession(accession): - self._download_direct_download_records( - accession=accession, - file_records=raw_files, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - protocol=protocol, - parallel_files=parallel_files, - ) - return - - self.download_files( - raw_files, - accession, - output_folder, - skip_if_downloaded_already, - protocol, - aspera_maximum_bandwidth=aspera_maximum_bandwidth, - checksum_check=checksum_check, + from pridepy.providers import registry + provider = registry.resolve(accession) + records = self.get_all_raw_file_list(accession) + provider.download_files( + accession=accession, + records=records, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, parallel_files=parallel_files, + checksum_check=checksum_check, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, ) @staticmethod @@ -597,11 +526,14 @@ def download_file_by_name( :param checksum_check: Download checksum for a given project. 
""" - if not (os.path.isdir(output_folder)): + if not os.path.isdir(output_folder): os.mkdir(output_folder) + from pridepy.providers import registry + provider = registry.resolve(accession) + ## Check type of project - if self.is_direct_download_accession(accession): + if provider.name in ("massive", "jpost", "iprox"): logging.info( "Downloading file from public direct-download dataset {}".format(accession) ) @@ -610,9 +542,9 @@ def download_file_by_name( raise Exception( "File name {} not found in dataset {}".format(file_name, accession) ) - self._download_direct_download_records( + provider.download_files( accession=accession, - file_records=response, + records=response, output_folder=output_folder, skip_if_downloaded_already=skip_if_downloaded_already, protocol=protocol, @@ -670,14 +602,10 @@ def get_file_from_api(self, accession, file_name) -> List[Dict]: :param file_name: file name :return: file in json format """ - + from pridepy.providers import registry try: - if self.is_direct_download_accession(accession): - files = self._list_direct_download_files(accession) - return [f for f in files if f["fileName"] == file_name] - files = self.stream_all_files_by_project(accession) - file = [f for f in files if f["fileName"] == file_name] - return file + records = registry.resolve(accession).list_files(accession) + return [r for r in records if r["fileName"] == file_name] except Exception as e: raise Exception("File not found " + str(e)) @@ -760,9 +688,9 @@ def download_files( checksum_check=False, parallel_files: int = 1, ): - """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files`.""" + """Shim — see :meth:`pridepy.providers.pride.PrideProvider._download_files_batch`.""" from pridepy.providers.pride import PrideProvider - return PrideProvider.download_files( + return PrideProvider._download_files_batch( file_list_json, accession, output_folder, @@ -803,10 +731,10 @@ def download_files_by_list( if not file_names: raise ValueError("file_names must 
contain at least one filename") - if self.is_direct_download_accession(accession): - all_files = self._list_direct_download_files(accession) - else: - all_files = self.stream_all_files_by_project(accession) + from pridepy.providers import registry + provider = registry.resolve(accession) + all_files = provider.list_files(accession) + requested = set(file_names) matched = [f for f in all_files if f.get("fileName") in requested] missing = sorted(requested - {f.get("fileName") for f in matched}) @@ -817,26 +745,15 @@ def download_files_by_list( f"No matching files in project {accession} for: {sorted(requested)}" ) - if self.is_direct_download_accession(accession): - self._download_direct_download_records( - accession=accession, - file_records=matched, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - protocol=protocol, - parallel_files=parallel_files, - ) - return - - self.download_files( - matched, - accession, - output_folder, - skip_if_downloaded_already, - protocol, - aspera_maximum_bandwidth=aspera_maximum_bandwidth, - checksum_check=checksum_check, + provider.download_files( + accession=accession, + records=matched, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, parallel_files=parallel_files, + checksum_check=checksum_check, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, ) @staticmethod @@ -1093,26 +1010,18 @@ def download_all_category_files( """ if categories is None: categories = [category] if category else ["RAW"] - raw_files = self.get_all_category_file_list(accession, categories) - if self.is_direct_download_accession(accession): - self._download_direct_download_records( - accession=accession, - file_records=raw_files, - output_folder=output_folder, - skip_if_downloaded_already=skip_if_downloaded_already, - protocol=protocol, - parallel_files=parallel_files, - ) - return - self.download_files( - raw_files, - accession, - output_folder, - 
skip_if_downloaded_already, - protocol, - aspera_maximum_bandwidth=aspera_maximum_bandwidth, - checksum_check=checksum_check, + records = self.get_all_category_file_list(accession, categories) + from pridepy.providers import registry + provider = registry.resolve(accession) + provider.download_files( + accession=accession, + records=records, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, parallel_files=parallel_files, + checksum_check=checksum_check, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, ) def get_all_category_file_list( @@ -1127,17 +1036,10 @@ def get_all_category_file_list( """ if isinstance(categories, str): categories = [categories] - category_set = {category.upper() for category in categories} - - if self.is_direct_download_accession(accession): - record_files = self._list_direct_download_files(accession) - else: - record_files = self.stream_all_files_by_project(accession) - - category_files = [ - file for file in record_files if file["fileCategory"]["value"] in category_set - ] - return category_files + category_set = {c.upper() for c in categories} + from pridepy.providers import registry + records = registry.resolve(accession).list_files(accession) + return [r for r in records if r["fileCategory"]["value"] in category_set] # ------------------------------- # ProteomeXchange support diff --git a/pridepy/providers/pride.py b/pridepy/providers/pride.py index ea60a5d..c8c40ca 100644 --- a/pridepy/providers/pride.py +++ b/pridepy/providers/pride.py @@ -685,8 +685,33 @@ def _download_with_fallback( logging.error(f"All protocol attempts failed for {file_record['fileName']}") return False - @staticmethod def download_files( + self, + accession, + records: List[Dict], + output_folder: str, + skip_if_downloaded_already, + protocol: str = "ftp", + aspera_maximum_bandwidth: str = "100M", + checksum_check: bool = False, + parallel_files: int = 1, + username: Optional[str] = None, + password: 
Optional[str] = None, + ): + """Implement Provider.download_files — maps to the legacy static batch downloader.""" + PrideProvider._download_files_batch( + file_list_json=records, + accession=accession, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, + checksum_check=checksum_check, + parallel_files=parallel_files, + ) + + @staticmethod + def _download_files_batch( file_list_json: List[Dict], accession, output_folder: str, diff --git a/pridepy/tests/test_download_by_list.py b/pridepy/tests/test_download_by_list.py index df81914..5115b4e 100644 --- a/pridepy/tests/test_download_by_list.py +++ b/pridepy/tests/test_download_by_list.py @@ -14,6 +14,7 @@ from pridepy.files.files import Files from pridepy.pridepy import _read_filename_arguments +from pridepy.providers.pride import PrideProvider class TestDownloadFilesByList(TestCase): @@ -36,8 +37,8 @@ def test_filters_metadata_and_delegates(self): {"fileName": "c.raw"}, ] with patch.object( - files_obj, "stream_all_files_by_project", return_value=api_response - ), patch.object(files_obj, "download_files") as mock_download: + PrideProvider, "list_files", return_value=api_response + ), patch.object(PrideProvider, "download_files") as mock_download: files_obj.download_files_by_list( accession="PXD001819", file_names=["a.raw", "c.raw"], @@ -46,16 +47,16 @@ def test_filters_metadata_and_delegates(self): protocol="ftp", ) - args, _ = mock_download.call_args - matched = args[0] + _, kwargs = mock_download.call_args + matched = kwargs["records"] assert {f["fileName"] for f in matched} == {"a.raw", "c.raw"} def test_warns_on_partial_match(self): files_obj = Files() api_response = [{"fileName": "a.raw"}] with patch.object( - files_obj, "stream_all_files_by_project", return_value=api_response - ), patch.object(files_obj, "download_files") as mock_download, self.assertLogs( + PrideProvider, "list_files", 
return_value=api_response + ), patch.object(PrideProvider, "download_files") as mock_download, self.assertLogs( level="WARNING" ) as log_ctx: files_obj.download_files_by_list( @@ -71,7 +72,7 @@ def test_warns_on_partial_match(self): def test_raises_when_no_files_match(self): files_obj = Files() with patch.object( - files_obj, "stream_all_files_by_project", return_value=[] + PrideProvider, "list_files", return_value=[] ): with pytest.raises(ValueError, match="No matching files"): files_obj.download_files_by_list( diff --git a/pridepy/tests/test_jpost_files.py b/pridepy/tests/test_jpost_files.py index 678adda..1e4c652 100644 --- a/pridepy/tests/test_jpost_files.py +++ b/pridepy/tests/test_jpost_files.py @@ -4,6 +4,7 @@ from unittest.mock import MagicMock, patch from pridepy.files.files import Files +from pridepy.providers.jpost import JpostProvider class TestJPOSTFiles(TestCase): @@ -50,12 +51,9 @@ def test_get_all_raw_file_list_filters_jpost_records(self): ), ] - with patch.object(Files, "_list_jpost_public_files", return_value=jpost_records), patch.object( - Files, "stream_all_files_by_project" - ) as pride_mock: + with patch.object(JpostProvider, "list_files", return_value=jpost_records): result = files.get_all_raw_file_list("JPST000001") - pride_mock.assert_not_called() assert len(result) == 1 assert {file["fileName"] for file in result} == {"run1.raw"} @@ -68,7 +66,7 @@ def test_download_file_by_name_uses_jpost_ftp_listing(self): with tempfile.TemporaryDirectory() as tmp_dir: with patch.object( - Files, "_list_jpost_public_files", return_value=[file_record] + JpostProvider, "list_files", return_value=[file_record] ), patch.object(Files, "download_ftp_urls") as download_mock: files.download_file_by_name( accession="JPST000001", diff --git a/pridepy/tests/test_massive_files.py b/pridepy/tests/test_massive_files.py index fd6a4ac..a4e9278 100644 --- a/pridepy/tests/test_massive_files.py +++ b/pridepy/tests/test_massive_files.py @@ -3,6 +3,7 @@ from unittest.mock 
import patch from pridepy.files.files import Files +from pridepy.providers.massive import MassiveProvider class TestMassIVEFiles(TestCase): @@ -66,7 +67,7 @@ def test_get_all_raw_file_list_filters_massive_records(self): ), ] - with patch.object(Files, "_list_massive_public_files", return_value=massive_records): + with patch.object(MassiveProvider, "list_files", return_value=massive_records): result = files.get_all_raw_file_list("MSV000012345") assert len(result) == 1 @@ -80,7 +81,7 @@ def test_download_file_by_name_uses_massive_ftp_listing(self): ) with tempfile.TemporaryDirectory() as tmp_dir: - with patch.object(Files, "_list_massive_public_files", return_value=[file_record]), patch.object( + with patch.object(MassiveProvider, "list_files", return_value=[file_record]), patch.object( Files, "download_ftp_urls" ) as download_mock: files.download_file_by_name( @@ -120,7 +121,7 @@ def test_download_all_raw_files_threads_parallel_files_for_massive(self): with tempfile.TemporaryDirectory() as tmp_dir: with patch.object( - Files, "_list_massive_public_files", return_value=massive_records + MassiveProvider, "list_files", return_value=massive_records ), patch.object(Files, "download_ftp_urls") as download_mock: files.download_all_raw_files( accession="MSV000012345", From 6d5637bf40f22d2f84134105a86d5493275d6585 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 16:54:56 +0100 Subject: [PATCH 14/21] test: integration test for PRIDE multi-protocol fallback through facade Verifies that Files().download_all_raw_files for a PXD accession flows through Registry.resolve -> PrideProvider.download_files -> _batch_download_by_protocol (patched via Files), and that _download_with_fallback is only called when batch returns failed files. Also mocks validate_download to return success so the happy-path test correctly asserts that fallback is not invoked when all files pass validation after the primary-protocol batch run. Spec acceptance criterion #5. 
--- pridepy/tests/test_download_resilience.py | 40 +++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/pridepy/tests/test_download_resilience.py b/pridepy/tests/test_download_resilience.py index 21b1603..0f86013 100644 --- a/pridepy/tests/test_download_resilience.py +++ b/pridepy/tests/test_download_resilience.py @@ -268,3 +268,43 @@ def test_download_files_raises_when_any_file_fails(self): skip_if_downloaded_already=False, protocol="ftp", ) + + def test_facade_dispatches_pride_through_registry_to_fallback(self): + """Files().download_all_raw_files for a PXD accession must flow: + Files facade -> Registry.resolve -> PrideProvider.download_files + -> _batch_download_by_protocol (mocked). + + Patching Files._batch_download_by_protocol proves the patch intercepts + (i.e. PrideProvider calls *back* through Files, preserving the test + contract for the multi-protocol orchestrator). + """ + from pridepy.providers.pride import PrideProvider + + fake_records = [ + { + "accession": "PXD000001", + "fileName": "x.raw", + "fileCategory": {"value": "RAW"}, + "publicFileLocations": [ + {"name": "FTP Protocol", "value": "ftp://ftp.pride.ebi.ac.uk/.../x.raw"} + ], + }, + ] + + with tempfile.TemporaryDirectory() as tmp: + with patch.object(PrideProvider, "list_files", return_value=fake_records), \ + patch.object(Files, "_batch_download_by_protocol", return_value=[]) as batch_mock, \ + patch.object(Files, "validate_download", return_value=(True, "ok")), \ + patch.object(Files, "_download_with_fallback") as fallback_mock: + Files().download_all_raw_files( + accession="PXD000001", + output_folder=tmp, + skip_if_downloaded_already=False, + protocol="ftp", + aspera_maximum_bandwidth="100M", + ) + + batch_mock.assert_called_once() + # No fallback expected because all files passed validation after + # the primary-protocol batch run. 
+ fallback_mock.assert_not_called() From f60a69c367b0d7f697728d0e886aace5843c06c9 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 16:55:50 +0100 Subject: [PATCH 15/21] chore(release): bump version to 0.0.17 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4a95f24..f5b74ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pridepy" -version = "0.0.16" +version = "0.0.17" description = "Python Client library for PRIDE Rest API" readme = "README.md" requires-python = ">=3.9" From 02de4017482e305af47723f9910b919e86aaf8bc Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 18:03:53 +0100 Subject: [PATCH 16/21] refactor(commands): scaffold commands/ package Empty scaffold for the follow-up refactor that extracts cross-cutting commands (download_files_by_url, download_files_by_list, download_px_raw_files) from Files into their own modules. No code moved yet. No behaviour change. Test suite green at 68 passed, 4 skipped. --- pridepy/commands/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 pridepy/commands/__init__.py diff --git a/pridepy/commands/__init__.py b/pridepy/commands/__init__.py new file mode 100644 index 0000000..c94f89e --- /dev/null +++ b/pridepy/commands/__init__.py @@ -0,0 +1,12 @@ +"""Cross-cutting download commands. + +Each module under this package owns one user-facing command that doesn't +fit any single provider: + +- ``by_url``: download a list of explicit URLs (ftp/http/https) +- ``by_list``: download a subset of a project's files by filename +- ``proteomexchange``: download raw files from a ProteomeXchange XML + +The ``pridepy.files.files.Files`` facade keeps shim methods that +delegate here, so existing test patches on ``Files.X`` keep working. 
+""" From c165345783ece06fbdba98d43a94b416bd055f65 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 18:06:03 +0100 Subject: [PATCH 17/21] refactor(commands): move ProteomeXchange XML download into commands/proteomexchange.py Moved download_px_raw_files, _normalize_px_xml_url, _parse_px_xml_for_raw_file_urls from Files into commands/proteomexchange.py. Files keeps shim re-exports. Also removed now-unused xml.etree.ElementTree import from files.py. No behaviour change. Test suite green. --- pridepy/commands/proteomexchange.py | 94 +++++++++++++++++++++++++++++ pridepy/files/files.py | 77 +++-------------------- 2 files changed, 104 insertions(+), 67 deletions(-) create mode 100644 pridepy/commands/proteomexchange.py diff --git a/pridepy/commands/proteomexchange.py b/pridepy/commands/proteomexchange.py new file mode 100644 index 0000000..d86cd24 --- /dev/null +++ b/pridepy/commands/proteomexchange.py @@ -0,0 +1,94 @@ +"""ProteomeXchange XML download command. + +Given a PXD accession or a ProteomeXchange XML URL, parse the XML for +``Associated raw file URI`` cvParams and download each one over its +native scheme (ftp:// via FTP, http(s):// via HTTPS). +""" +import logging +import os +import xml.etree.ElementTree as ET +from typing import List +from urllib.parse import urlparse + +from pridepy.util.api_handling import Util + + +def _normalize_px_xml_url(px_id_or_url: str) -> str: + """ + Build the ProteomeXchange XML endpoint from a dataset accession or a dataset web URL. 
+ Examples accepted: + - PXD039236 + - https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236 + - https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236&anything + """ + if px_id_or_url.startswith("http://") or px_id_or_url.startswith("https://"): + parsed = urlparse(px_id_or_url) + # keep the ID param value if present; otherwise fallback to the path tail + query = parsed.query or "" + if "ID=" in query: + id_value = [q.split("=", 1)[1] for q in query.split("&") if q.startswith("ID=")] + if id_value: + return ( + f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={id_value[0]}&outputMode=XML&test=no" + ) + # If the input URL already requests XML, just ensure flags + if parsed.path.endswith("/cgi/GetDataset"): + return ( + f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?{query}&outputMode=XML&test=no" + ) + # Assume it's a plain accession if not a URL + return ( + f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={px_id_or_url}&outputMode=XML&test=no" + ) + + +def _parse_px_xml_for_raw_file_urls(px_xml_url: str) -> List[str]: + """ + Parse the PX XML and return a list of associated raw file URIs. + We extract cvParam with name "Associated raw file URI" under each DatasetFile. + """ + headers = {"Accept": "application/xml"} + response = Util.get_api_call(px_xml_url, headers) + response.raise_for_status() + root = ET.fromstring(response.content) + + urls: List[str] = [] + # The XML namespace is often absent in PX XML; access elements directly + for dataset_file in root.iter("DatasetFile"): + for cv in dataset_file.findall("cvParam"): + name = cv.attrib.get("name") + value = cv.attrib.get("value") + if name == "Associated raw file URI" and value: + urls.append(value) + return urls + + +def download_px_raw_files( + px_id_or_url: str, + output_folder: str, + skip_if_downloaded_already: bool = True, +) -> None: + """Download all raw files referenced by a ProteomeXchange dataset. 
+ + Prefers FTP when the URL is ftp://, otherwise uses HTTP(S). Supports + resume and skip. + """ + from pridepy.files.files import Files # lazy: avoid module-load cycle + + if not os.path.isdir(output_folder): + os.makedirs(output_folder, exist_ok=True) + + px_xml_url = _normalize_px_xml_url(px_id_or_url) + logging.info(f"Fetching PX XML: {px_xml_url}") + urls = _parse_px_xml_for_raw_file_urls(px_xml_url) + if not urls: + logging.info("No Associated raw file URIs found in PX XML") + return + + ftp_urls = [u for u in urls if u.lower().startswith("ftp://")] + http_urls = [u for u in urls if u.lower().startswith(("http://", "https://"))] + + if ftp_urls: + Files.download_ftp_urls(ftp_urls, output_folder, skip_if_downloaded_already) + if http_urls: + Files.download_http_urls(http_urls, output_folder, skip_if_downloaded_already) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index e9cde55..612152b 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -9,7 +9,6 @@ from ftplib import FTP from typing import Dict, List, Optional, Tuple from urllib.parse import urlparse -import xml.etree.ElementTree as ET import requests from tqdm import tqdm @@ -1047,53 +1046,15 @@ def get_all_category_file_list( @staticmethod def _normalize_px_xml_url(px_id_or_url: str) -> str: - """ - Build the ProteomeXchange XML endpoint from a dataset accession or a dataset web URL. 
- Examples accepted: - - PXD039236 - - https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236 - - https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236&anything - """ - if px_id_or_url.startswith("http://") or px_id_or_url.startswith("https://"): - parsed = urlparse(px_id_or_url) - # keep the ID param value if present; otherwise fallback to the path tail - query = parsed.query or "" - if "ID=" in query: - id_value = [q.split("=", 1)[1] for q in query.split("&") if q.startswith("ID=")] - if id_value: - return ( - f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={id_value[0]}&outputMode=XML&test=no" - ) - # If the input URL already requests XML, just ensure flags - if parsed.path.endswith("/cgi/GetDataset"): - return ( - f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?{query}&outputMode=XML&test=no" - ) - # Assume it's a plain accession if not a URL - return ( - f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={px_id_or_url}&outputMode=XML&test=no" - ) + """Shim — see :func:`pridepy.commands.proteomexchange._normalize_px_xml_url`.""" + from pridepy.commands import proteomexchange + return proteomexchange._normalize_px_xml_url(px_id_or_url) @staticmethod - def _parse_px_xml_for_raw_file_urls(px_xml_url: str) -> List[str]: - """ - Parse the PX XML and return a list of associated raw file URIs. - We extract cvParam with name "Associated raw file URI" under each DatasetFile. 
- """ - headers = {"Accept": "application/xml"} - response = Util.get_api_call(px_xml_url, headers) - response.raise_for_status() - root = ET.fromstring(response.content) - - urls: List[str] = [] - # The XML namespace is often absent in PX XML; access elements directly - for dataset_file in root.iter("DatasetFile"): - for cv in dataset_file.findall("cvParam"): - name = cv.attrib.get("name") - value = cv.attrib.get("value") - if name == "Associated raw file URI" and value: - urls.append(value) - return urls + def _parse_px_xml_for_raw_file_urls(px_xml_url: str): + """Shim — see :func:`pridepy.commands.proteomexchange._parse_px_xml_for_raw_file_urls`.""" + from pridepy.commands import proteomexchange + return proteomexchange._parse_px_xml_for_raw_file_urls(px_xml_url) def download_px_raw_files( self, @@ -1101,27 +1062,9 @@ def download_px_raw_files( output_folder: str, skip_if_downloaded_already: bool = True, ) -> None: - """ - Download all raw files referenced by a ProteomeXchange dataset. - Prefer FTP when the URL is ftp://, otherwise use HTTP(S). Supports resume and skip. 
- """ - if not os.path.isdir(output_folder): - os.makedirs(output_folder, exist_ok=True) - - px_xml_url = self._normalize_px_xml_url(px_id_or_url) - logging.info(f"Fetching PX XML: {px_xml_url}") - urls = self._parse_px_xml_for_raw_file_urls(px_xml_url) - if not urls: - logging.info("No Associated raw file URIs found in PX XML") - return - - ftp_urls = [u for u in urls if u.lower().startswith("ftp://")] - http_urls = [u for u in urls if u.lower().startswith("http://") or u.lower().startswith("https://")] - - if ftp_urls: - self.download_ftp_urls(ftp_urls, output_folder, skip_if_downloaded_already) - if http_urls: - self.download_http_urls(http_urls, output_folder, skip_if_downloaded_already) + """Shim — see :func:`pridepy.commands.proteomexchange.download_px_raw_files`.""" + from pridepy.commands import proteomexchange + return proteomexchange.download_px_raw_files(px_id_or_url, output_folder, skip_if_downloaded_already) @staticmethod def _local_path_for_url(download_url: str, output_folder: str) -> str: From 1a0a9b8d540c34e2ba6e49b21655d99d50546c26 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 18:07:35 +0100 Subject: [PATCH 18/21] refactor(commands): move download_files_by_list into commands/by_list.py Moved download_files_by_list from Files into commands/by_list.py. Files keeps a shim re-export. No behaviour change. Test suite green. 
--- pridepy/commands/by_list.py | 58 +++++++++++++++++++++++++++++++++++++ pridepy/files/files.py | 43 ++++----------------------- 2 files changed, 64 insertions(+), 37 deletions(-) create mode 100644 pridepy/commands/by_list.py diff --git a/pridepy/commands/by_list.py b/pridepy/commands/by_list.py new file mode 100644 index 0000000..e008d6e --- /dev/null +++ b/pridepy/commands/by_list.py @@ -0,0 +1,58 @@ +"""Download a subset of project files identified by a filename list.""" +import logging +from typing import List, Optional + + +def download_files_by_list( + accession: str, + file_names: List[str], + output_folder: str, + skip_if_downloaded_already: bool, + protocol: str = "ftp", + aspera_maximum_bandwidth: str = "100M", + checksum_check: bool = False, + parallel_files: int = 1, +) -> None: + """Download a subset of project files identified by a filename list. + + Resolves each requested filename via the project metadata API and + delegates to the provider's ``download_files`` so the existing batch + + protocol fallback engine is reused. 
+ + :param accession: PRIDE, MassIVE, JPOST, or iProX project accession (public) + :param file_names: filenames to download + :param output_folder: directory to write downloaded files into + :param skip_if_downloaded_already: skip files already present locally + :param protocol: preferred protocol; falls back across others on failure + :param aspera_maximum_bandwidth: aspera ascp bandwidth cap + :param checksum_check: download project checksums and validate + :param parallel_files: number of files to download simultaneously for globus + :raises ValueError: if ``file_names`` is empty or none match the project + """ + if not file_names: + raise ValueError("file_names must contain at least one filename") + + from pridepy.providers import registry # lazy + provider = registry.resolve(accession) + all_files = provider.list_files(accession) + + requested = set(file_names) + matched = [f for f in all_files if f.get("fileName") in requested] + missing = sorted(requested - {f.get("fileName") for f in matched}) + if missing: + logging.warning("Files not found in project %s: %s", accession, missing) + if not matched: + raise ValueError( + f"No matching files in project {accession} for: {sorted(requested)}" + ) + + provider.download_files( + accession=accession, + records=matched, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, + parallel_files=parallel_files, + checksum_check=checksum_check, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, + ) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 612152b..878b4c1 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -711,48 +711,17 @@ def download_files_by_list( checksum_check: bool = False, parallel_files: int = 1, ) -> None: - """Download a subset of project files identified by a filename list. 
- - Resolves each requested filename via the project metadata API and - delegates to :meth:`download_files` so the existing batch + protocol - fallback engine is reused. - - :param accession: PRIDE or MassIVE project accession (public) - :param file_names: filenames to download - :param output_folder: directory to write downloaded files into - :param skip_if_downloaded_already: skip files already present locally - :param protocol: preferred protocol; falls back across others on failure - :param aspera_maximum_bandwidth: aspera ascp bandwidth cap - :param checksum_check: download project checksums and validate - :param parallel_files: number of files to download simultaneously for globus - :raises ValueError: if ``file_names`` is empty or none match the project - """ - if not file_names: - raise ValueError("file_names must contain at least one filename") - - from pridepy.providers import registry - provider = registry.resolve(accession) - all_files = provider.list_files(accession) - - requested = set(file_names) - matched = [f for f in all_files if f.get("fileName") in requested] - missing = sorted(requested - {f.get("fileName") for f in matched}) - if missing: - logging.warning("Files not found in project %s: %s", accession, missing) - if not matched: - raise ValueError( - f"No matching files in project {accession} for: {sorted(requested)}" - ) - - provider.download_files( + """Shim — see :func:`pridepy.commands.by_list.download_files_by_list`.""" + from pridepy.commands import by_list + return by_list.download_files_by_list( accession=accession, - records=matched, + file_names=file_names, output_folder=output_folder, skip_if_downloaded_already=skip_if_downloaded_already, protocol=protocol, - parallel_files=parallel_files, - checksum_check=checksum_check, aspera_maximum_bandwidth=aspera_maximum_bandwidth, + checksum_check=checksum_check, + parallel_files=parallel_files, ) @staticmethod From ced7415e799ce196a5ea530815da993c5c15185f Mon Sep 17 00:00:00 2001 From: 
Yasset Perez-Riverol Date: Wed, 27 May 2026 18:13:49 +0100 Subject: [PATCH 19/21] refactor(commands): move download_files_by_url into commands/by_url.py Moved download_files_by_url and its 6 helpers (_extract_pride_accession, _validate_urls_checksums, _http_download_url, _ftp_download_url, _dispatch_url_scheme, _download_single_url) from Files into commands/by_url.py. Files keeps shim re-exports for each. Internal calls to patch-sensitive helpers (_http_download_url, _ftp_download_url, _dispatch_url_scheme, _download_single_url) go through Files.X (lazy import) so existing test patches like patch.object(Files, '_http_download_url') keep intercepting. files.py drops below 1000 LOC. No behaviour change. Test suite green at 68 passed, 4 skipped. --- pridepy/commands/by_url.py | 254 +++++++++++++++++++++++++++++++++++++ pridepy/files/files.py | 227 ++++----------------------------- 2 files changed, 282 insertions(+), 199 deletions(-) create mode 100644 pridepy/commands/by_url.py diff --git a/pridepy/commands/by_url.py b/pridepy/commands/by_url.py new file mode 100644 index 0000000..91d6fec --- /dev/null +++ b/pridepy/commands/by_url.py @@ -0,0 +1,254 @@ +"""Download a list of explicit URLs (ftp/http/https). + +Each URL is dispatched to the matching transport based on its scheme. +PRIDE checksum validation is supported when the accession can be +inferred from the URL path. +""" +import ftplib +import logging +import os +import re +from concurrent.futures import ThreadPoolExecutor, as_completed +from ftplib import FTP +from typing import Dict, List, Optional, Tuple +from urllib.parse import urlparse + +from tqdm import tqdm + +from pridepy.util.api_handling import Util + + +def _extract_pride_accession(url: str) -> Optional[str]: + """Extract a PRIDE accession (PXD/PRD followed by digits) from a URL path. + + PRIDE archive URLs follow the pattern + ``…/pride/data/archive/YYYY/MM/<accession>/filename``. + Returns ``None`` when no accession can be identified.
+ """ + match = re.search(r"((?:PXD|PRD)\d{4,})", url) + return match.group(1) if match else None + + +def _validate_urls_checksums(urls: List[str], output_folder: str) -> None: + """Validate downloaded files against PRIDE checksum API. + + Accessions are inferred from URL paths via + :func:`_extract_pride_accession`. URLs that do not contain a + recognisable PRIDE accession are skipped with a warning. + + :raises RuntimeError: if one or more files fail validation + """ + from pridepy.files.files import Files + + accession_urls: Dict[str, List[str]] = {} + for url in urls: + acc = _extract_pride_accession(url) + if acc: + accession_urls.setdefault(acc, []).append(url) + else: + logging.warning( + "Cannot infer PRIDE accession from URL, skipping checksum: %s", url + ) + + validation_failures: List[str] = [] + for acc, acc_urls in accession_urls.items(): + checksum_file_path = Files.save_checksum_file(acc, output_folder) + checksum_map = Files.read_checksum_file(checksum_file_path) + logging.info( + "Loaded checksums for %d files (project %s)", + len(checksum_map), acc, + ) + for url in acc_urls: + file_name = os.path.basename(urlparse(url).path) + target = os.path.join(output_folder, file_name) + expected = checksum_map.get(file_name) + logging.info("Validating %s", file_name) + valid, reason = Files.validate_download(target, expected) + if not valid: + logging.error("Validation failed for %s: %s", file_name, reason) + validation_failures.append(f"{file_name} ({reason})") + else: + logging.info("Checksum OK: %s", file_name) + + if validation_failures: + raise RuntimeError( + f"Checksum validation failed for {len(validation_failures)} file(s): " + + ", ".join(validation_failures) + ) + + +def _http_download_url(url: str, target: str) -> None: + """Stream an http/https URL into ``target`` with a progress bar.""" + session = Util.create_session_with_retries() + with session.get(url, stream=True, timeout=60) as response: + response.raise_for_status() + total = 
int(response.headers.get("Content-Length", 0)) + with open(target, "wb") as out, tqdm( + total=total, + unit="B", + unit_scale=True, + desc=os.path.basename(target), + ) as pbar: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + out.write(chunk) + pbar.update(len(chunk)) + + +def _ftp_download_url(parsed, target: str) -> None: + """Download a single file from an ftp:// URL with a progress bar.""" + host = parsed.hostname + if not host: + raise ValueError(f"FTP URL missing host: {parsed.geturl()}") + port = parsed.port or 21 + user = parsed.username or "anonymous" + pwd = parsed.password or "anonymous@" + remote_path = parsed.path + with FTP() as ftp: + ftp.connect(host, port, timeout=60) + ftp.login(user, pwd) + try: + total = ftp.size(remote_path) or 0 + except ftplib.error_perm: + total = 0 + with open(target, "wb") as out, tqdm( + total=total, + unit="B", + unit_scale=True, + desc=os.path.basename(target), + ) as pbar: + + def _callback(data: bytes) -> None: + out.write(data) + pbar.update(len(data)) + + ftp.retrbinary(f"RETR {remote_path}", _callback) + + +def _dispatch_url_scheme(parsed, target: str, protocol: str = "ftp", position: int = 0) -> None: + """Route a parsed URL to its protocol-specific downloader. + + ``protocol='globus'`` swaps the http/https single-connection streamer + for :func:`pridepy.files.files.Files._parallel_download` (single-connection with progress bar). + ftp:// URLs are unaffected. 
+ """ + from pridepy.files.files import Files + + scheme = (parsed.scheme or "").lower() + if scheme in ("http", "https"): + if protocol == "globus": + Files._parallel_download(parsed.geturl(), target, position=position) + else: + Files._http_download_url(parsed.geturl(), target) + elif scheme == "ftp": + Files._ftp_download_url(parsed, target) + else: + raise ValueError(f"Unsupported URL scheme: {scheme}") + + +def _download_single_url( + url: str, + output_folder: str, + skip_if_exists: bool = False, + protocol: str = "ftp", + position: int = 0, +) -> str: + """Download one URL, dispatched by scheme; return the local file path.""" + from pridepy.files.files import Files + + parsed = urlparse(url) + if not (parsed.scheme or "").lower(): + raise ValueError(f"URL missing scheme: {url}") + + file_name = os.path.basename(parsed.path) + if not file_name: + raise ValueError(f"Cannot derive filename from URL: {url}") + + target = os.path.join(output_folder, file_name) + if skip_if_exists and os.path.isfile(target) and os.path.getsize(target) > 0: + logging.info("Skipping %s: already downloaded", file_name) + return target + + Files._dispatch_url_scheme(parsed, target, protocol, position=position) + + ok, reason = Files.validate_download(target) + if not ok: + Files._remove_if_exists(target) + raise RuntimeError(f"Download invalid: {reason} ({target})") + return target + + +def download_files_by_url( + urls: List[str], + output_folder: str, + skip_if_downloaded_already: bool = False, + protocol: str = "ftp", + parallel_files: int = 1, + checksum_check: bool = False, +) -> None: + """Download files from a list of raw URLs, dispatched by URL scheme. + + Supported schemes: ``http``, ``https``, ``ftp``. Each URL is downloaded + independently; per-URL errors are logged, then aggregated and re-raised + as a single :class:`RuntimeError` so callers see a complete failure + summary. 
+ + :param urls: fully-qualified URLs (each contains its scheme) + :param output_folder: directory to write downloaded files into + :param skip_if_downloaded_already: skip URLs whose target file exists + :param protocol: ``ftp`` (default) for single-connection per URL scheme; + ``globus`` for resume-capable http/https downloads (single-connection stream) + (no effect on ftp:// URLs which always use single-connection FTP) + :param checksum_check: validate downloads against PRIDE checksum API; + accessions are inferred from URL paths (only PRIDE URLs supported) + :raises ValueError: if ``urls`` is empty + :raises RuntimeError: if one or more URLs failed + """ + if not urls: + raise ValueError("urls must contain at least one URL") + + os.makedirs(output_folder, exist_ok=True) + + parallel_files = min(parallel_files, 3, len(urls)) + failures: List[Tuple[str, str]] = [] + from pridepy.files.files import Files + + if parallel_files < 2: + for url in urls: + try: + Files._download_single_url( + url, output_folder, skip_if_downloaded_already, protocol, + ) + except Exception as exc: # pylint: disable=broad-except + logging.error("Failed to download %s: %s", url, exc) + failures.append((url, str(exc))) + else: + logging.info( + "Downloading %d URL(s) with %d parallel workers", + len(urls), parallel_files, + ) + with ThreadPoolExecutor(max_workers=parallel_files) as executor: + futures = { + executor.submit( + Files._download_single_url, + url, output_folder, skip_if_downloaded_already, protocol, + position=idx, + ): url + for idx, url in enumerate(urls) + } + for future in as_completed(futures): + url = futures[future] + try: + future.result() + except Exception as exc: # pylint: disable=broad-except + logging.error("Failed to download %s: %s", url, exc) + failures.append((url, str(exc))) + + if failures: + summary = ", ".join(f"{u} ({e})" for u, e in failures) + raise RuntimeError( + f"Failed to download {len(failures)} URL(s): {summary}" + ) + + if checksum_check: + 
_validate_urls_checksums(urls, output_folder) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 878b4c1..8393519 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -1,17 +1,12 @@ #!/usr/bin/env python -import ftplib import logging import os -import re import urllib import urllib.request -from concurrent.futures import ThreadPoolExecutor, as_completed from ftplib import FTP from typing import Dict, List, Optional, Tuple -from urllib.parse import urlparse import requests -from tqdm import tqdm from pridepy.util.api_handling import Util @@ -726,14 +721,9 @@ def download_files_by_list( @staticmethod def _extract_pride_accession(url: str) -> Optional[str]: - """Extract a PRIDE accession (PXD/PRD followed by digits) from a URL path. - - PRIDE archive URLs follow the pattern - ``…/pride/data/archive/YYYY/MM/<accession>/filename``. - Returns ``None`` when no accession can be identified. - """ - match = re.search(r"((?:PXD|PRD)\d{4,})", url) - return match.group(1) if match else None + """Shim — see :func:`pridepy.commands.by_url._extract_pride_accession`.""" + from pridepy.commands import by_url + return by_url._extract_pride_accession(url) @staticmethod def download_files_by_url( @@ -744,116 +734,22 @@ def download_files_by_url( parallel_files: int = 1, checksum_check: bool = False, ) -> None: - """Download files from a list of raw URLs, dispatched by URL scheme. - - Supported schemes: ``http``, ``https``, ``ftp``. Each URL is downloaded - independently; per-URL errors are logged, then aggregated and re-raised - as a single :class:`RuntimeError` so callers see a complete failure - summary.
- - :param urls: fully-qualified URLs (each contains its scheme) - :param output_folder: directory to write downloaded files into - :param skip_if_downloaded_already: skip URLs whose target file exists - :param protocol: ``ftp`` (default) for single-connection per URL scheme; - ``globus`` for resume-capable http/https downloads (single-connection stream) - (no effect on ftp:// URLs which always use single-connection FTP) - :param checksum_check: validate downloads against PRIDE checksum API; - accessions are inferred from URL paths (only PRIDE URLs supported) - :raises ValueError: if ``urls`` is empty - :raises RuntimeError: if one or more URLs failed - """ - if not urls: - raise ValueError("urls must contain at least one URL") - - os.makedirs(output_folder, exist_ok=True) - - parallel_files = min(parallel_files, 3, len(urls)) - failures: List[Tuple[str, str]] = [] - if parallel_files < 2: - for url in urls: - try: - Files._download_single_url( - url, output_folder, skip_if_downloaded_already, protocol, - ) - except Exception as exc: # pylint: disable=broad-except - logging.error("Failed to download %s: %s", url, exc) - failures.append((url, str(exc))) - else: - logging.info( - "Downloading %d URL(s) with %d parallel workers", - len(urls), parallel_files, - ) - with ThreadPoolExecutor(max_workers=parallel_files) as executor: - futures = { - executor.submit( - Files._download_single_url, - url, output_folder, skip_if_downloaded_already, protocol, - position=idx, - ): url - for idx, url in enumerate(urls) - } - for future in as_completed(futures): - url = futures[future] - try: - future.result() - except Exception as exc: # pylint: disable=broad-except - logging.error("Failed to download %s: %s", url, exc) - failures.append((url, str(exc))) - - if failures: - summary = ", ".join(f"{u} ({e})" for u, e in failures) - raise RuntimeError( - f"Failed to download {len(failures)} URL(s): {summary}" - ) - - if checksum_check: - Files._validate_urls_checksums(urls, 
output_folder) + """Shim — see :func:`pridepy.commands.by_url.download_files_by_url`.""" + from pridepy.commands import by_url + return by_url.download_files_by_url( + urls=urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, + parallel_files=parallel_files, + checksum_check=checksum_check, + ) @staticmethod def _validate_urls_checksums(urls: List[str], output_folder: str) -> None: - """Validate downloaded files against PRIDE checksum API. - - Accessions are inferred from URL paths via - :meth:`_extract_pride_accession`. URLs that do not contain a - recognisable PRIDE accession are skipped with a warning. - - :raises RuntimeError: if one or more files fail validation - """ - accession_urls: Dict[str, List[str]] = {} - for url in urls: - acc = Files._extract_pride_accession(url) - if acc: - accession_urls.setdefault(acc, []).append(url) - else: - logging.warning( - "Cannot infer PRIDE accession from URL, skipping checksum: %s", url - ) - - validation_failures: List[str] = [] - for acc, acc_urls in accession_urls.items(): - checksum_file_path = Files.save_checksum_file(acc, output_folder) - checksum_map = Files.read_checksum_file(checksum_file_path) - logging.info( - "Loaded checksums for %d files (project %s)", - len(checksum_map), acc, - ) - for url in acc_urls: - file_name = os.path.basename(urlparse(url).path) - target = os.path.join(output_folder, file_name) - expected = checksum_map.get(file_name) - logging.info("Validating %s", file_name) - valid, reason = Files.validate_download(target, expected) - if not valid: - logging.error("Validation failed for %s: %s", file_name, reason) - validation_failures.append(f"{file_name} ({reason})") - else: - logging.info("Checksum OK: %s", file_name) - - if validation_failures: - raise RuntimeError( - f"Checksum validation failed for {len(validation_failures)} file(s): " - + ", ".join(validation_failures) - ) + """Shim — see 
:func:`pridepy.commands.by_url._validate_urls_checksums`.""" + from pridepy.commands import by_url + return by_url._validate_urls_checksums(urls, output_folder) @staticmethod def _download_single_url( @@ -863,94 +759,27 @@ def _download_single_url( protocol: str = "ftp", position: int = 0, ) -> str: - """Download one URL, dispatched by scheme; return the local file path.""" - parsed = urlparse(url) - if not (parsed.scheme or "").lower(): - raise ValueError(f"URL missing scheme: {url}") - - file_name = os.path.basename(parsed.path) - if not file_name: - raise ValueError(f"Cannot derive filename from URL: {url}") - - target = os.path.join(output_folder, file_name) - if skip_if_exists and os.path.isfile(target) and os.path.getsize(target) > 0: - logging.info("Skipping %s: already downloaded", file_name) - return target - - Files._dispatch_url_scheme(parsed, target, protocol, position=position) - - ok, reason = Files.validate_download(target) - if not ok: - Files._remove_if_exists(target) - raise RuntimeError(f"Download invalid: {reason} ({target})") - return target + """Shim — see :func:`pridepy.commands.by_url._download_single_url`.""" + from pridepy.commands import by_url + return by_url._download_single_url(url, output_folder, skip_if_exists, protocol, position) @staticmethod def _dispatch_url_scheme(parsed, target: str, protocol: str = "ftp", position: int = 0) -> None: - """Route a parsed URL to its protocol-specific downloader. - - ``protocol='globus'`` swaps the http/https single-connection streamer - for :meth:`_parallel_download` (single-connection with progress bar). - ftp:// URLs are unaffected. 
- """ - scheme = (parsed.scheme or "").lower() - if scheme in ("http", "https"): - if protocol == "globus": - Files._parallel_download(parsed.geturl(), target, position=position) - else: - Files._http_download_url(parsed.geturl(), target) - elif scheme == "ftp": - Files._ftp_download_url(parsed, target) - else: - raise ValueError(f"Unsupported URL scheme: {scheme}") + """Shim — see :func:`pridepy.commands.by_url._dispatch_url_scheme`.""" + from pridepy.commands import by_url + return by_url._dispatch_url_scheme(parsed, target, protocol=protocol, position=position) @staticmethod def _http_download_url(url: str, target: str) -> None: - """Stream an http/https URL into ``target`` with a progress bar.""" - session = Util.create_session_with_retries() - with session.get(url, stream=True, timeout=60) as response: - response.raise_for_status() - total = int(response.headers.get("Content-Length", 0)) - with open(target, "wb") as out, tqdm( - total=total, - unit="B", - unit_scale=True, - desc=os.path.basename(target), - ) as pbar: - for chunk in response.iter_content(chunk_size=8192): - if chunk: - out.write(chunk) - pbar.update(len(chunk)) + """Shim — see :func:`pridepy.commands.by_url._http_download_url`.""" + from pridepy.commands import by_url + return by_url._http_download_url(url, target) @staticmethod def _ftp_download_url(parsed, target: str) -> None: - """Download a single file from an ftp:// URL with a progress bar.""" - host = parsed.hostname - if not host: - raise ValueError(f"FTP URL missing host: {parsed.geturl()}") - port = parsed.port or 21 - user = parsed.username or "anonymous" - pwd = parsed.password or "anonymous@" - remote_path = parsed.path - with FTP() as ftp: - ftp.connect(host, port, timeout=60) - ftp.login(user, pwd) - try: - total = ftp.size(remote_path) or 0 - except ftplib.error_perm: - total = 0 - with open(target, "wb") as out, tqdm( - total=total, - unit="B", - unit_scale=True, - desc=os.path.basename(target), - ) as pbar: - - def 
_callback(data: bytes) -> None: - out.write(data) - pbar.update(len(data)) - - ftp.retrbinary(f"RETR {remote_path}", _callback) + """Shim — see :func:`pridepy.commands.by_url._ftp_download_url`.""" + from pridepy.commands import by_url + return by_url._ftp_download_url(parsed, target) def download_all_category_files( self, From e3d694b17d925799d52bc96bc84daa3af225f831 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 18:26:56 +0100 Subject: [PATCH 20/21] refactor(providers): move ProteomeXchange from commands/ to providers/ as a class ProteomeXchange behaves more like a provider than a command: it takes a PXD/PRD accession (or a ProteomeCentral URL) and returns file records, the same shape as the four other providers. Moving it to providers/ aligns the architecture. Changes: - New: pridepy/providers/proteomexchange.py with ProteomeXchangeProvider(Provider). It implements the full Provider interface (matches, list_files, download_files) plus a convenience method download_from_accession_or_url() for the download-px-raw-files CLI command's existing behaviour (skip_if_downloaded_already defaults to True, no parallel workers). - Deleted: pridepy/commands/proteomexchange.py. Its three functions (_normalize_px_xml_url, _parse_px_xml_for_raw_file_urls, download_px_raw_files) are now static/instance methods on the provider class. - Updated: Files shims for _normalize_px_xml_url, _parse_px_xml_for_raw_file_urls, and download_px_raw_files now delegate to ProteomeXchangeProvider. - Updated: commands/__init__.py docstring notes the move. Important: ProteomeXchangeProvider is NOT auto-registered with pridepy.providers.registry. PXD/PRD accessions continue to route through PrideProvider's V3 API path by default. ProteomeXchangeProvider is the explicit gateway for the cross-repository XML view, invoked via the download-px-raw-files CLI command and Files.download_px_raw_files. Full suite green at 68 passed, 4 skipped. 
No behaviour change for existing PXD downloads; download-px-raw-files keeps its XML-based listing flow exactly as before. --- pridepy/commands/__init__.py | 10 +- pridepy/commands/proteomexchange.py | 94 ------------- pridepy/files/files.py | 20 +-- pridepy/providers/proteomexchange.py | 192 +++++++++++++++++++++++++++ 4 files changed, 212 insertions(+), 104 deletions(-) delete mode 100644 pridepy/commands/proteomexchange.py create mode 100644 pridepy/providers/proteomexchange.py diff --git a/pridepy/commands/__init__.py b/pridepy/commands/__init__.py index c94f89e..f1312b8 100644 --- a/pridepy/commands/__init__.py +++ b/pridepy/commands/__init__.py @@ -5,7 +5,15 @@ - ``by_url``: download a list of explicit URLs (ftp/http/https) - ``by_list``: download a subset of a project's files by filename -- ``proteomexchange``: download raw files from a ProteomeXchange XML + +ProteomeXchange used to live here too but moved to +:class:`pridepy.providers.proteomexchange.ProteomeXchangeProvider` because +it conforms to the ``Provider`` interface (takes an accession or URL and +returns file records). It is deliberately not auto-registered with the +provider registry — PXD/PRD accessions continue to route through +:class:`pridepy.providers.pride.PrideProvider`; ProteomeXchangeProvider is +the explicit gateway for the cross-repository XML view, invoked via the +``download-px-raw-files`` CLI command and ``Files.download_px_raw_files``. The ``pridepy.files.files.Files`` facade keeps shim methods that delegate here, so existing test patches on ``Files.X`` keep working. diff --git a/pridepy/commands/proteomexchange.py b/pridepy/commands/proteomexchange.py deleted file mode 100644 index d86cd24..0000000 --- a/pridepy/commands/proteomexchange.py +++ /dev/null @@ -1,94 +0,0 @@ -"""ProteomeXchange XML download command. 
- -Given a PXD accession or a ProteomeXchange XML URL, parse the XML for -``Associated raw file URI`` cvParams and download each one over its -native scheme (ftp:// via FTP, http(s):// via HTTPS). -""" -import logging -import os -import xml.etree.ElementTree as ET -from typing import List -from urllib.parse import urlparse - -from pridepy.util.api_handling import Util - - -def _normalize_px_xml_url(px_id_or_url: str) -> str: - """ - Build the ProteomeXchange XML endpoint from a dataset accession or a dataset web URL. - Examples accepted: - - PXD039236 - - https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236 - - https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236&anything - """ - if px_id_or_url.startswith("http://") or px_id_or_url.startswith("https://"): - parsed = urlparse(px_id_or_url) - # keep the ID param value if present; otherwise fallback to the path tail - query = parsed.query or "" - if "ID=" in query: - id_value = [q.split("=", 1)[1] for q in query.split("&") if q.startswith("ID=")] - if id_value: - return ( - f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={id_value[0]}&outputMode=XML&test=no" - ) - # If the input URL already requests XML, just ensure flags - if parsed.path.endswith("/cgi/GetDataset"): - return ( - f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?{query}&outputMode=XML&test=no" - ) - # Assume it's a plain accession if not a URL - return ( - f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={px_id_or_url}&outputMode=XML&test=no" - ) - - -def _parse_px_xml_for_raw_file_urls(px_xml_url: str) -> List[str]: - """ - Parse the PX XML and return a list of associated raw file URIs. - We extract cvParam with name "Associated raw file URI" under each DatasetFile. 
- """ - headers = {"Accept": "application/xml"} - response = Util.get_api_call(px_xml_url, headers) - response.raise_for_status() - root = ET.fromstring(response.content) - - urls: List[str] = [] - # The XML namespace is often absent in PX XML; access elements directly - for dataset_file in root.iter("DatasetFile"): - for cv in dataset_file.findall("cvParam"): - name = cv.attrib.get("name") - value = cv.attrib.get("value") - if name == "Associated raw file URI" and value: - urls.append(value) - return urls - - -def download_px_raw_files( - px_id_or_url: str, - output_folder: str, - skip_if_downloaded_already: bool = True, -) -> None: - """Download all raw files referenced by a ProteomeXchange dataset. - - Prefers FTP when the URL is ftp://, otherwise uses HTTP(S). Supports - resume and skip. - """ - from pridepy.files.files import Files # lazy: avoid module-load cycle - - if not os.path.isdir(output_folder): - os.makedirs(output_folder, exist_ok=True) - - px_xml_url = _normalize_px_xml_url(px_id_or_url) - logging.info(f"Fetching PX XML: {px_xml_url}") - urls = _parse_px_xml_for_raw_file_urls(px_xml_url) - if not urls: - logging.info("No Associated raw file URIs found in PX XML") - return - - ftp_urls = [u for u in urls if u.lower().startswith("ftp://")] - http_urls = [u for u in urls if u.lower().startswith(("http://", "https://"))] - - if ftp_urls: - Files.download_ftp_urls(ftp_urls, output_folder, skip_if_downloaded_already) - if http_urls: - Files.download_http_urls(http_urls, output_folder, skip_if_downloaded_already) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 8393519..b46812e 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -844,15 +844,15 @@ def get_all_category_file_list( @staticmethod def _normalize_px_xml_url(px_id_or_url: str) -> str: - """Shim — see :func:`pridepy.commands.proteomexchange._normalize_px_xml_url`.""" - from pridepy.commands import proteomexchange - return 
proteomexchange._normalize_px_xml_url(px_id_or_url) + """Shim — see :meth:`pridepy.providers.proteomexchange.ProteomeXchangeProvider._normalize_px_xml_url`.""" + from pridepy.providers.proteomexchange import ProteomeXchangeProvider + return ProteomeXchangeProvider._normalize_px_xml_url(px_id_or_url) @staticmethod def _parse_px_xml_for_raw_file_urls(px_xml_url: str): - """Shim — see :func:`pridepy.commands.proteomexchange._parse_px_xml_for_raw_file_urls`.""" - from pridepy.commands import proteomexchange - return proteomexchange._parse_px_xml_for_raw_file_urls(px_xml_url) + """Shim — see :meth:`pridepy.providers.proteomexchange.ProteomeXchangeProvider._parse_px_xml_for_raw_file_urls`.""" + from pridepy.providers.proteomexchange import ProteomeXchangeProvider + return ProteomeXchangeProvider._parse_px_xml_for_raw_file_urls(px_xml_url) def download_px_raw_files( self, @@ -860,9 +860,11 @@ def download_px_raw_files( output_folder: str, skip_if_downloaded_already: bool = True, ) -> None: - """Shim — see :func:`pridepy.commands.proteomexchange.download_px_raw_files`.""" - from pridepy.commands import proteomexchange - return proteomexchange.download_px_raw_files(px_id_or_url, output_folder, skip_if_downloaded_already) + """Shim — see :meth:`pridepy.providers.proteomexchange.ProteomeXchangeProvider.download_from_accession_or_url`.""" + from pridepy.providers.proteomexchange import ProteomeXchangeProvider + return ProteomeXchangeProvider().download_from_accession_or_url( + px_id_or_url, output_folder, skip_if_downloaded_already + ) @staticmethod def _local_path_for_url(download_url: str, output_folder: str) -> str: diff --git a/pridepy/providers/proteomexchange.py b/pridepy/providers/proteomexchange.py new file mode 100644 index 0000000..cef0524 --- /dev/null +++ b/pridepy/providers/proteomexchange.py @@ -0,0 +1,192 @@ +"""ProteomeXchange provider. 
+ +ProteomeXchange is a meta-repository: a PXD/PRD accession routes through +the cross-repository XML at ``proteomecentral.proteomexchange.org``, and +the XML's ``Associated raw file URI`` cvParams point at the actual hosting +repository (PRIDE / MassIVE / JPOST / iProX / etc.). + +Unlike the other providers in this package, ``ProteomeXchangeProvider`` is +NOT auto-registered with :mod:`pridepy.providers.registry`. PXD/PRD +accessions would otherwise be ambiguous between PRIDE's V3 API listing and +ProteomeXchange's XML listing; the registry continues to route PXD/PRD via +:class:`pridepy.providers.pride.PrideProvider`. ``ProteomeXchangeProvider`` +is the explicit gateway invoked by the ``download-px-raw-files`` CLI +command and by ``Files.download_px_raw_files`` — callers who specifically +want the cross-repository XML view. + +The class accepts either: + +- a plain accession (``PXD039236``) +- a ProteomeCentral dataset URL (``https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=...``) + +…and resolves it to the XML endpoint via :meth:`_normalize_px_xml_url`. +""" +import logging +import os +import re +import xml.etree.ElementTree as ET +from typing import ClassVar, Dict, List, Optional +from urllib.parse import urlparse + +from pridepy.providers.base import Provider +from pridepy.util.api_handling import Util + + +class ProteomeXchangeProvider(Provider): + name: ClassVar[str] = "proteomexchange" + + @staticmethod + def matches(accession: str) -> bool: + """Return True for PXD/PRD accessions or ProteomeCentral URLs. + + Not used by :mod:`pridepy.providers.registry` (this provider is + deliberately not auto-registered). Provided for parity with the + ``Provider`` interface and so direct callers can introspect whether + a given input looks like something ProteomeXchange knows how to + handle. 
+ """ + if not accession: + return False + if accession.lower().startswith(("http://", "https://")): + return "proteomexchange" in accession.lower() or "cgi/GetDataset" in accession + return bool(re.fullmatch(r"(?:PXD|PRD)\d+", accession.upper())) + + @staticmethod + def _normalize_px_xml_url(px_id_or_url: str) -> str: + """Build the ProteomeXchange XML endpoint URL from an accession or URL. + + Examples accepted: + - ``PXD039236`` + - ``https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236`` + - ``https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236&anything`` + """ + if px_id_or_url.startswith("http://") or px_id_or_url.startswith("https://"): + parsed = urlparse(px_id_or_url) + query = parsed.query or "" + if "ID=" in query: + id_value = [ + q.split("=", 1)[1] for q in query.split("&") if q.startswith("ID=") + ] + if id_value: + return ( + "https://proteomecentral.proteomexchange.org/cgi/GetDataset" + f"?ID={id_value[0]}&outputMode=XML&test=no" + ) + if parsed.path.endswith("/cgi/GetDataset"): + return ( + "https://proteomecentral.proteomexchange.org/cgi/GetDataset" + f"?{query}&outputMode=XML&test=no" + ) + return ( + "https://proteomecentral.proteomexchange.org/cgi/GetDataset" + f"?ID={px_id_or_url}&outputMode=XML&test=no" + ) + + @staticmethod + def _parse_px_xml_for_raw_file_urls(px_xml_url: str) -> List[str]: + """Fetch the PX XML and return every ``Associated raw file URI`` value.""" + headers = {"Accept": "application/xml"} + response = Util.get_api_call(px_xml_url, headers) + response.raise_for_status() + root = ET.fromstring(response.content) + + urls: List[str] = [] + for dataset_file in root.iter("DatasetFile"): + for cv in dataset_file.findall("cvParam"): + name = cv.attrib.get("name") + value = cv.attrib.get("value") + if name == "Associated raw file URI" and value: + urls.append(value) + return urls + + def list_files(self, accession: str) -> List[Dict]: + """Return the dataset's raw-file URIs as minimal file 
records. + + The PX XML doesn't expose checksums or rich category labels, so + each record carries just enough to drive the downloader. + """ + px_xml_url = self._normalize_px_xml_url(accession) + logging.info(f"Fetching PX XML: {px_xml_url}") + urls = self._parse_px_xml_for_raw_file_urls(px_xml_url) + records: List[Dict] = [] + for url in urls: + parsed = urlparse(url) + records.append( + { + "accession": accession, + "fileName": os.path.basename(parsed.path), + "fileCategory": {"value": "RAW"}, + "publicFileLocations": [ + {"name": "FTP Protocol", "value": url} + ], + "source": "ProteomeXchange", + } + ) + return records + + def download_files( + self, + accession: str, + records: List[Dict], + output_folder: str, + skip_if_downloaded_already: bool, + protocol: str, + parallel_files: int = 1, + checksum_check: bool = False, + aspera_maximum_bandwidth: str = "100M", + username: Optional[str] = None, + password: Optional[str] = None, + ) -> None: + """Partition record URLs by scheme and route to the matching transport. + + Routes ftp:// records to :meth:`Files.download_ftp_urls` and + http(s):// records to :meth:`Files.download_http_urls`, going + through the Files facade so test patches like + ``patch.object(Files, "download_ftp_urls")`` continue to intercept. 
+ """ + from pridepy.files.files import Files # lazy: avoid module-load cycle + + if not os.path.isdir(output_folder): + os.makedirs(output_folder, exist_ok=True) + + urls = [ + record["publicFileLocations"][0]["value"] + for record in records + if record.get("publicFileLocations") + ] + ftp_urls = [u for u in urls if u.lower().startswith("ftp://")] + http_urls = [u for u in urls if u.lower().startswith(("http://", "https://"))] + + if ftp_urls: + Files.download_ftp_urls( + ftp_urls, output_folder, skip_if_downloaded_already + ) + if http_urls: + Files.download_http_urls( + http_urls, output_folder, skip_if_downloaded_already + ) + + def download_from_accession_or_url( + self, + px_id_or_url: str, + output_folder: str, + skip_if_downloaded_already: bool = True, + ) -> None: + """End-to-end: resolve XML, list files, partition by scheme, download. + + Convenience for the ``download-px-raw-files`` CLI command — combines + :meth:`list_files` and :meth:`download_files` with the original + ``download_px_raw_files`` defaults (skip-if-downloaded-already + defaults to ``True``, no parallel workers). + """ + records = self.list_files(px_id_or_url) + if not records: + logging.info("No Associated raw file URIs found in PX XML") + return + self.download_files( + accession=px_id_or_url, + records=records, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol="ftp", + ) From 6bc8e5a61bca33fb79a57e8438b773c1defe9ab4 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 18:56:14 +0100 Subject: [PATCH 21/21] fix(files): restore missing imports and hoist lazy imports to module top MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related fixes: 1. CI lint failure: files.py used importlib.resources, subprocess, and time but never imported them. 
The Task 8 refactor removed these top-level imports along with the PRIDE-specific code that used them in PrideProvider, but two methods on Files (download_files_from_aspera and the now-dead _download_range) still referenced the undefined names. Flake8 with --select=F82 fails on 4 F821 undefined-name errors. Fix: re-add the importlib.resources and subprocess imports at the top of files.py, and delete the orphaned _download_range method (no callers), which contained the time.sleep call, so time does not need to be re-imported. 2. Hoist lazy provider/command imports out of Files method bodies. The shim pattern previously did 'from pridepy.providers import X' inside every method body — ~75 occurrences. Since providers do not import Files at module load (only inside method bodies), the reverse direction is safe to hoist: Files now imports {registry, transport, util, IproxProvider, JpostProvider, MassiveProvider, PrideProvider, ProteomeXchangeProvider, by_list, by_url} at module top, and the shims reference these names directly. Lazy imports inside provider/command method bodies that go back to Files (e.g. BaseDirectDownloadProvider.download_files doing 'from pridepy.files.files import Files') are kept lazy — they are genuinely cyclic and required for backward-compat test patching. Also: commands/by_list.py's 'from pridepy.providers import registry' hoisted to module top (no Files dependency, no cycle risk). Note: 'import requests' is kept in files.py (noqa: F401) because test suites patch 'pridepy.files.files.requests.get' directly. Tests: 68 passed, 4 skipped. flake8 --select=E9,F63,F7,F82 now clean.
--- pridepy/commands/by_list.py | 3 +- pridepy/files/files.py | 203 +++++++++++------------------------- 2 files changed, 61 insertions(+), 145 deletions(-) diff --git a/pridepy/commands/by_list.py b/pridepy/commands/by_list.py index e008d6e..d2d0244 100644 --- a/pridepy/commands/by_list.py +++ b/pridepy/commands/by_list.py @@ -2,6 +2,8 @@ import logging from typing import List, Optional +from pridepy.providers import registry + def download_files_by_list( accession: str, @@ -32,7 +34,6 @@ def download_files_by_list( if not file_names: raise ValueError("file_names must contain at least one filename") - from pridepy.providers import registry # lazy provider = registry.resolve(accession) all_files = provider.list_files(accession) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index b46812e..3567e78 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -1,58 +1,73 @@ #!/usr/bin/env python +import importlib.resources import logging import os +import subprocess import urllib import urllib.request from ftplib import FTP from typing import Dict, List, Optional, Tuple -import requests +import requests # noqa: F401 — kept as a patch target for tests from pridepy.util.api_handling import Util - -# Re-export from providers.util so external `from pridepy.files.files import Progress` +# Module-level imports of the modular architecture. Providers and commands +# do not import Files at module level (only lazily inside method bodies), +# so hoisting these to the top is safe and avoids cluttering every shim +# method body with a local import. 
+from pridepy.providers import registry, transport +from pridepy.providers import util as _provider_util +from pridepy.providers.iprox import IproxProvider +from pridepy.providers.jpost import JpostProvider +from pridepy.providers.massive import MASSIVE_CATEGORY_MAP, MassiveProvider +from pridepy.providers.pride import PrideProvider +from pridepy.providers.proteomexchange import ProteomeXchangeProvider +from pridepy.commands import by_list, by_url + +# Re-export Progress so external `from pridepy.files.files import Progress` # still works. from pridepy.providers.util import Progress # noqa: F401 class Files: """ - This class handles PRIDE API files endpoint. + This class handles PRIDE API files endpoint, and dispatches to the + per-repository provider classes in :mod:`pridepy.providers`. """ - # Re-exported from providers/pride.py — kept here for back-compat. - from pridepy.providers.pride import PrideProvider as _PrideProvider - V3_API_BASE_URL = _PrideProvider.V3_API_BASE_URL - API_BASE_URL = _PrideProvider.API_BASE_URL - API_PRIVATE_URL = _PrideProvider.API_PRIVATE_URL - PRIDE_ARCHIVE_FTP = _PrideProvider.ARCHIVE_FTP - PRIDE_ARCHIVE_FTP_URL_PREFIX = _PrideProvider.ARCHIVE_FTP_URL_PREFIX - PRIDE_ARCHIVE_HTTPS_URL_PREFIX = _PrideProvider.ARCHIVE_HTTPS_URL_PREFIX - S3_URL = _PrideProvider.S3_URL - S3_BUCKET = _PrideProvider.S3_BUCKET - PROTOCOL_ORDER = _PrideProvider.PROTOCOL_ORDER - del _PrideProvider - # Re-exported from providers/massive.py — kept here for back-compat. 
- from pridepy.providers.massive import ( # noqa: E402 - MASSIVE_CATEGORY_MAP as _MASSIVE_CATEGORY_MAP, - MassiveProvider as _MassiveProvider, - ) - MASSIVE_CATEGORY_MAP = _MASSIVE_CATEGORY_MAP - MASSIVE_ARCHIVE_FTP = _MassiveProvider.ARCHIVE_FTP - MASSIVE_ARCHIVE_FTP_URL_PREFIX = _MassiveProvider.ARCHIVE_FTP_URL_PREFIX - del _MASSIVE_CATEGORY_MAP, _MassiveProvider - from pridepy.providers.jpost import JpostProvider as _JpostProvider - JPOST_ARCHIVE_FTP = _JpostProvider.ARCHIVE_FTP - JPOST_ARCHIVE_FTP_URL_PREFIX = _JpostProvider.ARCHIVE_FTP_URL_PREFIX - JPOST_PROXI_BASE_URL = _JpostProvider.PROXI_BASE_URL - JPOST_PROXI_CATEGORY_MAP = _JpostProvider.PROXI_CATEGORY_MAP - del _JpostProvider - from pridepy.providers.iprox import IproxProvider as _IproxProvider - IPROX_DOWNLOAD_BASE_URL = _IproxProvider.DOWNLOAD_BASE_URL - IPROX_PX_XML_URL_TEMPLATE = _IproxProvider.PX_XML_URL_TEMPLATE - IPROX_PX_CATEGORY_MAP = _IproxProvider.PX_CATEGORY_MAP - del _IproxProvider + # PRIDE class-attribute re-exports (kept here for back-compat). + V3_API_BASE_URL = PrideProvider.V3_API_BASE_URL + API_BASE_URL = PrideProvider.API_BASE_URL + API_PRIVATE_URL = PrideProvider.API_PRIVATE_URL + PRIDE_ARCHIVE_FTP = PrideProvider.ARCHIVE_FTP + PRIDE_ARCHIVE_FTP_URL_PREFIX = PrideProvider.ARCHIVE_FTP_URL_PREFIX + PRIDE_ARCHIVE_HTTPS_URL_PREFIX = PrideProvider.ARCHIVE_HTTPS_URL_PREFIX + S3_URL = PrideProvider.S3_URL + S3_BUCKET = PrideProvider.S3_BUCKET + PROTOCOL_ORDER = PrideProvider.PROTOCOL_ORDER + + # MassIVE class-attribute re-exports. + MASSIVE_ARCHIVE_FTP = MassiveProvider.ARCHIVE_FTP + MASSIVE_ARCHIVE_FTP_URL_PREFIX = MassiveProvider.ARCHIVE_FTP_URL_PREFIX + # Note: MASSIVE_CATEGORY_MAP is the module-level constant in providers/massive.py, + # re-exported on Files as a class attribute via the module-level import above. + + # JPOST class-attribute re-exports. 
+ JPOST_ARCHIVE_FTP = JpostProvider.ARCHIVE_FTP + JPOST_ARCHIVE_FTP_URL_PREFIX = JpostProvider.ARCHIVE_FTP_URL_PREFIX + JPOST_PROXI_BASE_URL = JpostProvider.PROXI_BASE_URL + JPOST_PROXI_CATEGORY_MAP = JpostProvider.PROXI_CATEGORY_MAP + + # iProX class-attribute re-exports. + IPROX_DOWNLOAD_BASE_URL = IproxProvider.DOWNLOAD_BASE_URL + IPROX_PX_XML_URL_TEMPLATE = IproxProvider.PX_XML_URL_TEMPLATE + IPROX_PX_CATEGORY_MAP = IproxProvider.PX_CATEGORY_MAP + + # MassIVE category map re-exported. Class attribute shadowing the module-level + # constant of the same name happens cleanly in class scope. + MASSIVE_CATEGORY_MAP = MASSIVE_CATEGORY_MAP + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") def __init__(self): @@ -61,118 +76,97 @@ def __init__(self): @staticmethod def _find_tsv_columns(header: str) -> Optional[Tuple[int, int]]: """Shim — see :func:`pridepy.providers.util._find_tsv_columns`.""" - from pridepy.providers import util - return util._find_tsv_columns(header) + return _provider_util._find_tsv_columns(header) @staticmethod def _is_md5_checksum(value: str) -> bool: """Shim — see :func:`pridepy.providers.util._is_md5_checksum`.""" - from pridepy.providers import util - return util._is_md5_checksum(value) + return _provider_util._is_md5_checksum(value) @staticmethod def read_checksum_file(checksum_file_path: str) -> Dict[str, str]: """Shim — see :func:`pridepy.providers.util.read_checksum_file`.""" - from pridepy.providers import util - return util.read_checksum_file(checksum_file_path) + return _provider_util.read_checksum_file(checksum_file_path) @staticmethod def compute_md5(file_path: str, chunk_size: int = 4 * 1024 * 1024) -> str: """Shim — see :func:`pridepy.providers.util.compute_md5`.""" - from pridepy.providers import util - return util.compute_md5(file_path, chunk_size) + return _provider_util.compute_md5(file_path, chunk_size) @staticmethod def validate_download(file_path: str, expected_checksum: Optional[str] 
= None) -> Tuple[bool, str]: """Shim — see :func:`pridepy.providers.util.validate_download`.""" - from pridepy.providers import util - return util.validate_download(file_path, expected_checksum) + return _provider_util.validate_download(file_path, expected_checksum) @staticmethod def _remove_if_exists(file_path: str) -> None: """Shim — see :func:`pridepy.providers.util._remove_if_exists`.""" - from pridepy.providers import util - return util._remove_if_exists(file_path) + return _provider_util._remove_if_exists(file_path) @staticmethod def _get_download_url(file_record: Dict, protocol: str) -> str: """Shim — see :func:`pridepy.providers.util._get_download_url`.""" - from pridepy.providers import util - return util._get_download_url(file_record, protocol) + return _provider_util._get_download_url(file_record, protocol) @staticmethod def _resolve_local_path(file_record: Dict, output_folder: str) -> str: """Shim — see :func:`pridepy.providers.util._resolve_local_path`.""" - from pridepy.providers import util - return util._resolve_local_path(file_record, output_folder) + return _provider_util._resolve_local_path(file_record, output_folder) @staticmethod def _protocol_sequence(protocol: str) -> List[str]: """Shim — see :meth:`pridepy.providers.pride.PrideProvider._protocol_sequence`.""" - from pridepy.providers.pride import PrideProvider return PrideProvider._protocol_sequence(protocol) @staticmethod def is_massive_accession(accession: str) -> bool: """Shim — see :meth:`pridepy.providers.massive.MassiveProvider.matches`.""" - from pridepy.providers.massive import MassiveProvider return MassiveProvider.matches(accession) @staticmethod def _get_massive_public_root(accession: str) -> str: - from pridepy.providers.massive import MassiveProvider return MassiveProvider._get_public_root(accession) @staticmethod def _get_massive_public_ftp_url(accession: str, remote_path: str) -> str: - from pridepy.providers.massive import MassiveProvider return 
MassiveProvider._get_public_ftp_url(accession, remote_path) @staticmethod def _map_massive_collection_to_category(collection: str) -> str: - from pridepy.providers.massive import MassiveProvider return MassiveProvider._map_collection_to_category(collection) @staticmethod def _build_massive_file_record(accession: str, ftp_url: str) -> Dict: - from pridepy.providers.massive import MassiveProvider return MassiveProvider._build_file_record(accession, ftp_url) @staticmethod def is_jpost_accession(accession: str) -> bool: """Shim — see :meth:`pridepy.providers.jpost.JpostProvider.matches`.""" - from pridepy.providers.jpost import JpostProvider return JpostProvider.matches(accession) @staticmethod def _get_jpost_public_root(accession: str) -> str: - from pridepy.providers.jpost import JpostProvider return JpostProvider._get_public_root(accession) @staticmethod def _get_jpost_public_ftp_url(accession: str, remote_path: str) -> str: - from pridepy.providers.jpost import JpostProvider return JpostProvider._get_public_ftp_url(accession, remote_path) @staticmethod def _build_jpost_file_record(accession, ftp_url, category_from_proxi=None): - from pridepy.providers.jpost import JpostProvider return JpostProvider._build_file_record(accession, ftp_url, category_from_proxi) @staticmethod def _build_iprox_file_record(accession, https_url, category_from_px=None): """Shim — see :meth:`pridepy.providers.iprox.IproxProvider._build_file_record`.""" - from pridepy.providers.iprox import IproxProvider return IproxProvider._build_file_record(accession, https_url, category_from_px) @staticmethod def _get_iprox_public_root(accession: str) -> str: - from pridepy.providers.iprox import IproxProvider return IproxProvider._get_public_root(accession) @staticmethod def _get_iprox_public_ftp_url(accession: str, remote_path: str) -> str: - from pridepy.providers.iprox import IproxProvider return IproxProvider._get_public_ftp_url(accession, remote_path) @staticmethod @@ -184,7 +178,6 @@ def 
is_direct_download_accession(accession: str) -> bool: validation and fallback), not the direct-download partitioned-by-URL- scheme path. So we filter PRIDE out here. """ - from pridepy.providers import registry try: provider = registry.resolve(accession) except ValueError: @@ -194,13 +187,11 @@ def is_direct_download_accession(accession: str) -> bool: @staticmethod def is_iprox_accession(accession: str) -> bool: """Shim — see :meth:`pridepy.providers.iprox.IproxProvider.matches`.""" - from pridepy.providers.iprox import IproxProvider return IproxProvider.matches(accession) @staticmethod def _repo_uses_tls(accession: str) -> bool: """Shim — returns the resolved provider's use_tls flag (False if unknown).""" - from pridepy.providers import registry try: provider = registry.resolve(accession) except ValueError: @@ -210,24 +201,20 @@ def _repo_uses_tls(accession: str) -> bool: @staticmethod def _walk_ftp_tree(ftp: FTP, remote_dir: str) -> List[str]: """Shim — see :func:`pridepy.providers.transport._walk_ftp_tree`.""" - from pridepy.providers import transport return transport._walk_ftp_tree(ftp=ftp, remote_dir=remote_dir) @staticmethod def _open_ftp_connection(host: str, use_tls: bool, timeout: int = 30) -> FTP: """Shim — see :func:`pridepy.providers.transport._open_ftp_connection`.""" - from pridepy.providers import transport return transport._open_ftp_connection(host=host, use_tls=use_tls, timeout=timeout) @staticmethod def _list_ftp_repo_files(host, remote_root, error_label, use_tls=False): """Shim — see :func:`pridepy.providers.transport._list_ftp_repo_files`.""" - from pridepy.providers import transport return transport._list_ftp_repo_files(host=host, remote_root=remote_root, error_label=error_label, use_tls=use_tls) def _list_massive_public_files(self, accession: str) -> List[Dict]: """Shim — see :meth:`pridepy.providers.massive.MassiveProvider.list_files`.""" - from pridepy.providers.massive import MassiveProvider return MassiveProvider().list_files(accession) 
def _download_massive_file_records( @@ -243,7 +230,6 @@ def _download_massive_file_records( Download public MassIVE files via anonymous FTP (now FTPS). Backward-compat shim — dispatches via the provider registry. """ - from pridepy.providers import registry registry.resolve(accession).download_files( accession=accession, records=file_records, @@ -261,7 +247,6 @@ def _list_jpost_public_files(self, accession: str) -> List[Dict]: test patches on ``_list_jpost_public_files_via_proxi`` and ``_list_ftp_repo_files`` continue to intercept. """ - from pridepy.providers.jpost import JpostProvider normalized_accession = accession.upper() try: return self._list_jpost_public_files_via_proxi(normalized_accession) @@ -286,23 +271,18 @@ def _list_jpost_public_files(self, accession: str) -> List[Dict]: def _list_jpost_public_files_via_proxi(self, accession: str) -> List[Dict]: """Shim — see :meth:`pridepy.providers.jpost.JpostProvider._list_via_proxi`.""" - from pridepy.providers.jpost import JpostProvider return JpostProvider()._list_via_proxi(accession) def _list_iprox_public_files(self, accession: str) -> List[Dict]: """Shim — see :meth:`pridepy.providers.iprox.IproxProvider.list_files`.""" - from pridepy.providers.iprox import IproxProvider return IproxProvider().list_files(accession) - async def stream_all_files_metadata(self, output_file, accession=None): """Shim — see :meth:`pridepy.providers.pride.PrideProvider.stream_all_files_metadata`.""" - from pridepy.providers.pride import PrideProvider return await PrideProvider().stream_all_files_metadata(output_file, accession) def stream_all_files_by_project(self, accession) -> List[Dict]: """Shim — see :meth:`pridepy.providers.pride.PrideProvider.stream_all_files_by_project`.""" - from pridepy.providers.pride import PrideProvider return PrideProvider().stream_all_files_by_project(accession) def get_all_raw_file_list(self, project_accession): @@ -310,7 +290,6 @@ def get_all_raw_file_list(self, project_accession): Returns the 
dataset's file records filtered to fileCategory == "RAW". """ - from pridepy.providers import registry provider = registry.resolve(project_accession) records = provider.list_files(project_accession) return [r for r in records if r["fileCategory"]["value"] == "RAW"] @@ -328,7 +307,6 @@ def download_all_raw_files( """Download all RAW files for any registered provider.""" if not os.path.isdir(output_folder): os.mkdir(output_folder) - from pridepy.providers import registry provider = registry.resolve(accession) records = self.get_all_raw_file_list(accession) provider.download_files( @@ -351,7 +329,6 @@ def download_files_from_ftp( max_download_retries=3, ): """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files_from_ftp`.""" - from pridepy.providers.pride import PrideProvider return PrideProvider.download_files_from_ftp( file_list_json, output_folder, @@ -422,45 +399,14 @@ def download_files_from_aspera( except subprocess.CalledProcessError as e: logging.error(f"Aspera download failed for {new_file_path}: {str(e)}") - @staticmethod - def _download_range(url, file_path, start, end, pbar, max_retries=3): - """Download a byte range directly into the target file using seek.""" - for attempt in range(1, max_retries + 1): - try: - session = Util.create_session_with_retries() - headers = {"Range": f"bytes={start}-{end}"} - with session.get(url, headers=headers, stream=True, timeout=(15, 15)) as r: - r.raise_for_status() - if r.status_code != 206: - raise RuntimeError(f"Server did not honor Range request: {r.status_code}") - content_range = r.headers.get("Content-Range", "") - if not content_range.lower().startswith(f"bytes {start}-{end}/"): - raise RuntimeError(f"Unexpected Content-Range header: {content_range}") - with open(file_path, "r+b") as f: - f.seek(start) - for chunk in r.iter_content(chunk_size=8 * 1024 * 1024): - if chunk: - f.write(chunk) - pbar.update(len(chunk)) - return - except (requests.RequestException, RuntimeError, OSError) as exc: - 
logging.warning( - f"Range {start}-{end} attempt {attempt}/{max_retries} failed: {exc}" - ) - if attempt >= max_retries: - raise - time.sleep(2 * attempt) - @staticmethod def _parallel_download(url, file_path, position=0): """Shim — see :func:`pridepy.providers.transport._parallel_download`.""" - from pridepy.providers import transport return transport._parallel_download(url=url, file_path=file_path, position=position) @staticmethod def _globus_download_one(file, output_folder, skip_if_downloaded_already, max_retries=6, position=0): """Shim — see :meth:`pridepy.providers.pride.PrideProvider._globus_download_one`.""" - from pridepy.providers.pride import PrideProvider return PrideProvider._globus_download_one( file, output_folder, skip_if_downloaded_already, max_retries=max_retries, position=position, @@ -473,7 +419,6 @@ def download_files_from_globus( checksum_map: Optional[Dict[str, str]] = None, ): """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files_from_globus`.""" - from pridepy.providers.pride import PrideProvider return PrideProvider.download_files_from_globus( file_list_json, output_folder, skip_if_downloaded_already, parallel_files=parallel_files, @@ -485,14 +430,12 @@ def download_files_from_s3( file_list_json: List[Dict], output_folder: str, skip_if_downloaded_already ): """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_files_from_s3`.""" - from pridepy.providers.pride import PrideProvider return PrideProvider.download_files_from_s3( file_list_json, output_folder, skip_if_downloaded_already, ) def get_submitted_file_path_prefix(self, accession): """Shim — see :meth:`pridepy.providers.pride.PrideProvider.get_submitted_file_path_prefix`.""" - from pridepy.providers.pride import PrideProvider return PrideProvider().get_submitted_file_path_prefix(accession) def download_file_by_name( @@ -523,7 +466,6 @@ def download_file_by_name( if not os.path.isdir(output_folder): os.mkdir(output_folder) - from pridepy.providers 
import registry provider = registry.resolve(accession) ## Check type of project @@ -596,7 +538,6 @@ def get_file_from_api(self, accession, file_name) -> List[Dict]: :param file_name: file name :return: file in json format """ - from pridepy.providers import registry try: records = registry.resolve(accession).list_files(accession) return [r for r in records if r["fileName"] == file_name] @@ -605,7 +546,6 @@ def get_file_from_api(self, accession, file_name) -> List[Dict]: def download_private_file_name(self, accession, file_name, output_folder, username, password): """Shim — see :meth:`pridepy.providers.pride.PrideProvider.download_private_file_name`.""" - from pridepy.providers.pride import PrideProvider return PrideProvider().download_private_file_name( accession, file_name, output_folder, username, password, ) @@ -613,13 +553,11 @@ def download_private_file_name(self, accession, file_name, output_folder, userna @staticmethod def get_ascp_binary(): """Shim — see :meth:`pridepy.providers.pride.PrideProvider.get_ascp_binary`.""" - from pridepy.providers.pride import PrideProvider return PrideProvider.get_ascp_binary() @staticmethod def save_checksum_file(accession, output_folder): """Shim — see :meth:`pridepy.providers.pride.PrideProvider.save_checksum_file`.""" - from pridepy.providers.pride import PrideProvider return PrideProvider.save_checksum_file(accession, output_folder) @staticmethod @@ -638,7 +576,6 @@ def _batch_download_by_protocol( :class:`PrideProvider` calls back through ``Files.X`` so those patches keep intercepting. 
""" - from pridepy.providers.pride import PrideProvider return PrideProvider._batch_download_by_protocol( file_list, output_folder, @@ -660,7 +597,6 @@ def _download_with_fallback( parallel_files: int = 1, ) -> bool: """Shim — see :meth:`pridepy.providers.pride.PrideProvider._download_with_fallback`.""" - from pridepy.providers.pride import PrideProvider return PrideProvider._download_with_fallback( file_record, output_folder, @@ -683,7 +619,6 @@ def download_files( parallel_files: int = 1, ): """Shim — see :meth:`pridepy.providers.pride.PrideProvider._download_files_batch`.""" - from pridepy.providers.pride import PrideProvider return PrideProvider._download_files_batch( file_list_json, accession, @@ -707,7 +642,6 @@ def download_files_by_list( parallel_files: int = 1, ) -> None: """Shim — see :func:`pridepy.commands.by_list.download_files_by_list`.""" - from pridepy.commands import by_list return by_list.download_files_by_list( accession=accession, file_names=file_names, @@ -722,7 +656,6 @@ def download_files_by_list( @staticmethod def _extract_pride_accession(url: str) -> Optional[str]: """Shim — see :func:`pridepy.commands.by_url._extract_pride_accession`.""" - from pridepy.commands import by_url return by_url._extract_pride_accession(url) @staticmethod @@ -735,7 +668,6 @@ def download_files_by_url( checksum_check: bool = False, ) -> None: """Shim — see :func:`pridepy.commands.by_url.download_files_by_url`.""" - from pridepy.commands import by_url return by_url.download_files_by_url( urls=urls, output_folder=output_folder, @@ -748,7 +680,6 @@ def download_files_by_url( @staticmethod def _validate_urls_checksums(urls: List[str], output_folder: str) -> None: """Shim — see :func:`pridepy.commands.by_url._validate_urls_checksums`.""" - from pridepy.commands import by_url return by_url._validate_urls_checksums(urls, output_folder) @staticmethod @@ -760,25 +691,21 @@ def _download_single_url( position: int = 0, ) -> str: """Shim — see 
:func:`pridepy.commands.by_url._download_single_url`.""" - from pridepy.commands import by_url return by_url._download_single_url(url, output_folder, skip_if_exists, protocol, position) @staticmethod def _dispatch_url_scheme(parsed, target: str, protocol: str = "ftp", position: int = 0) -> None: """Shim — see :func:`pridepy.commands.by_url._dispatch_url_scheme`.""" - from pridepy.commands import by_url return by_url._dispatch_url_scheme(parsed, target, protocol=protocol, position=position) @staticmethod def _http_download_url(url: str, target: str) -> None: """Shim — see :func:`pridepy.commands.by_url._http_download_url`.""" - from pridepy.commands import by_url return by_url._http_download_url(url, target) @staticmethod def _ftp_download_url(parsed, target: str) -> None: """Shim — see :func:`pridepy.commands.by_url._ftp_download_url`.""" - from pridepy.commands import by_url return by_url._ftp_download_url(parsed, target) def download_all_category_files( @@ -808,7 +735,6 @@ def download_all_category_files( if categories is None: categories = [category] if category else ["RAW"] records = self.get_all_category_file_list(accession, categories) - from pridepy.providers import registry provider = registry.resolve(accession) provider.download_files( accession=accession, @@ -834,7 +760,6 @@ def get_all_category_file_list( if isinstance(categories, str): categories = [categories] category_set = {c.upper() for c in categories} - from pridepy.providers import registry records = registry.resolve(accession).list_files(accession) return [r for r in records if r["fileCategory"]["value"] in category_set] @@ -845,13 +770,11 @@ def get_all_category_file_list( @staticmethod def _normalize_px_xml_url(px_id_or_url: str) -> str: """Shim — see :meth:`pridepy.providers.proteomexchange.ProteomeXchangeProvider._normalize_px_xml_url`.""" - from pridepy.providers.proteomexchange import ProteomeXchangeProvider return ProteomeXchangeProvider._normalize_px_xml_url(px_id_or_url) @staticmethod 
def _parse_px_xml_for_raw_file_urls(px_xml_url: str): """Shim — see :meth:`pridepy.providers.proteomexchange.ProteomeXchangeProvider._parse_px_xml_for_raw_file_urls`.""" - from pridepy.providers.proteomexchange import ProteomeXchangeProvider return ProteomeXchangeProvider._parse_px_xml_for_raw_file_urls(px_xml_url) def download_px_raw_files( @@ -861,7 +784,6 @@ def download_px_raw_files( skip_if_downloaded_already: bool = True, ) -> None: """Shim — see :meth:`pridepy.providers.proteomexchange.ProteomeXchangeProvider.download_from_accession_or_url`.""" - from pridepy.providers.proteomexchange import ProteomeXchangeProvider return ProteomeXchangeProvider().download_from_accession_or_url( px_id_or_url, output_folder, skip_if_downloaded_already ) @@ -869,7 +791,6 @@ def download_px_raw_files( @staticmethod def _local_path_for_url(download_url: str, output_folder: str) -> str: """Shim — see :func:`pridepy.providers.transport._local_path_for_url`.""" - from pridepy.providers import transport return transport._local_path_for_url(download_url=download_url, output_folder=output_folder) @staticmethod @@ -882,7 +803,6 @@ def _download_one_ftp_path( position: int = 0, ) -> None: """Shim — see :func:`pridepy.providers.transport._download_one_ftp_path`.""" - from pridepy.providers import transport return transport._download_one_ftp_path( ftp=ftp, ftp_path=ftp_path, @@ -903,7 +823,6 @@ def _download_ftp_paths_serial( max_download_retries: int, ) -> None: """Shim — see :func:`pridepy.providers.transport._download_ftp_paths_serial`.""" - from pridepy.providers import transport return transport._download_ftp_paths_serial( host=host, paths=paths, @@ -926,7 +845,6 @@ def _download_ftp_paths_parallel( parallel_files: int, ) -> None: """Shim — see :func:`pridepy.providers.transport._download_ftp_paths_parallel`.""" - from pridepy.providers import transport return transport._download_ftp_paths_parallel( host=host, paths=paths, @@ -949,7 +867,6 @@ def download_ftp_urls( parallel_files: 
int = 1, ) -> None: """Shim — see :func:`pridepy.providers.transport.download_ftp_urls`.""" - from pridepy.providers import transport return transport.download_ftp_urls( ftp_urls=ftp_urls, output_folder=output_folder, @@ -969,7 +886,6 @@ def _http_download_one( position: int = 0, ) -> None: """Shim — see :func:`pridepy.providers.transport._http_download_one`.""" - from pridepy.providers import transport return transport._http_download_one( url=url, output_folder=output_folder, @@ -987,7 +903,6 @@ def download_http_urls( max_retries: int = 3, ) -> None: """Shim — see :func:`pridepy.providers.transport.download_http_urls`.""" - from pridepy.providers import transport return transport.download_http_urls( http_urls=http_urls, output_folder=output_folder,