From 02de4017482e305af47723f9910b919e86aaf8bc Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 18:03:53 +0100 Subject: [PATCH 1/4] refactor(commands): scaffold commands/ package Empty scaffold for the follow-up refactor that extracts cross-cutting commands (download_files_by_url, download_files_by_list, download_px_raw_files) from Files into their own modules. No code moved yet. No behaviour change. Test suite green at 68 passed, 4 skipped. --- pridepy/commands/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 pridepy/commands/__init__.py diff --git a/pridepy/commands/__init__.py b/pridepy/commands/__init__.py new file mode 100644 index 0000000..c94f89e --- /dev/null +++ b/pridepy/commands/__init__.py @@ -0,0 +1,12 @@ +"""Cross-cutting download commands. + +Each module under this package owns one user-facing command that doesn't +fit any single provider: + +- ``by_url``: download a list of explicit URLs (ftp/http/https) +- ``by_list``: download a subset of a project's files by filename +- ``proteomexchange``: download raw files from a ProteomeXchange XML + +The ``pridepy.files.files.Files`` facade keeps shim methods that +delegate here, so existing test patches on ``Files.X`` keep working. +""" From c165345783ece06fbdba98d43a94b416bd055f65 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 18:06:03 +0100 Subject: [PATCH 2/4] refactor(commands): move ProteomeXchange XML download into commands/proteomexchange.py Moved download_px_raw_files, _normalize_px_xml_url, _parse_px_xml_for_raw_file_urls from Files into commands/proteomexchange.py. Files keeps shim re-exports. Also removed now-unused xml.etree.ElementTree import from files.py. No behaviour change. Test suite green. 
--- pridepy/commands/proteomexchange.py | 94 +++++++++++++++++++++++++++++ pridepy/files/files.py | 77 +++-------------------- 2 files changed, 104 insertions(+), 67 deletions(-) create mode 100644 pridepy/commands/proteomexchange.py diff --git a/pridepy/commands/proteomexchange.py b/pridepy/commands/proteomexchange.py new file mode 100644 index 0000000..d86cd24 --- /dev/null +++ b/pridepy/commands/proteomexchange.py @@ -0,0 +1,94 @@ +"""ProteomeXchange XML download command. + +Given a PXD accession or a ProteomeXchange XML URL, parse the XML for +``Associated raw file URI`` cvParams and download each one over its +native scheme (ftp:// via FTP, http(s):// via HTTPS). +""" +import logging +import os +import xml.etree.ElementTree as ET +from typing import List +from urllib.parse import urlparse + +from pridepy.util.api_handling import Util + + +def _normalize_px_xml_url(px_id_or_url: str) -> str: + """ + Build the ProteomeXchange XML endpoint from a dataset accession or a dataset web URL. 
+ Examples accepted: + - PXD039236 + - https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236 + - https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236&anything + """ + if px_id_or_url.startswith("http://") or px_id_or_url.startswith("https://"): + parsed = urlparse(px_id_or_url) + # keep the ID param value if present; otherwise fallback to the path tail + query = parsed.query or "" + if "ID=" in query: + id_value = [q.split("=", 1)[1] for q in query.split("&") if q.startswith("ID=")] + if id_value: + return ( + f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={id_value[0]}&outputMode=XML&test=no" + ) + # If the input URL already requests XML, just ensure flags + if parsed.path.endswith("/cgi/GetDataset"): + return ( + f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?{query}&outputMode=XML&test=no" + ) + # Assume it's a plain accession if not a URL + return ( + f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={px_id_or_url}&outputMode=XML&test=no" + ) + + +def _parse_px_xml_for_raw_file_urls(px_xml_url: str) -> List[str]: + """ + Parse the PX XML and return a list of associated raw file URIs. + We extract cvParam with name "Associated raw file URI" under each DatasetFile. + """ + headers = {"Accept": "application/xml"} + response = Util.get_api_call(px_xml_url, headers) + response.raise_for_status() + root = ET.fromstring(response.content) + + urls: List[str] = [] + # The XML namespace is often absent in PX XML; access elements directly + for dataset_file in root.iter("DatasetFile"): + for cv in dataset_file.findall("cvParam"): + name = cv.attrib.get("name") + value = cv.attrib.get("value") + if name == "Associated raw file URI" and value: + urls.append(value) + return urls + + +def download_px_raw_files( + px_id_or_url: str, + output_folder: str, + skip_if_downloaded_already: bool = True, +) -> None: + """Download all raw files referenced by a ProteomeXchange dataset. 
+ + Prefers FTP when the URL is ftp://, otherwise uses HTTP(S). Supports + resume and skip. + """ + from pridepy.files.files import Files # lazy: avoid module-load cycle + + if not os.path.isdir(output_folder): + os.makedirs(output_folder, exist_ok=True) + + px_xml_url = _normalize_px_xml_url(px_id_or_url) + logging.info(f"Fetching PX XML: {px_xml_url}") + urls = _parse_px_xml_for_raw_file_urls(px_xml_url) + if not urls: + logging.info("No Associated raw file URIs found in PX XML") + return + + ftp_urls = [u for u in urls if u.lower().startswith("ftp://")] + http_urls = [u for u in urls if u.lower().startswith(("http://", "https://"))] + + if ftp_urls: + Files.download_ftp_urls(ftp_urls, output_folder, skip_if_downloaded_already) + if http_urls: + Files.download_http_urls(http_urls, output_folder, skip_if_downloaded_already) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index e9cde55..612152b 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -9,7 +9,6 @@ from ftplib import FTP from typing import Dict, List, Optional, Tuple from urllib.parse import urlparse -import xml.etree.ElementTree as ET import requests from tqdm import tqdm @@ -1047,53 +1046,15 @@ def get_all_category_file_list( @staticmethod def _normalize_px_xml_url(px_id_or_url: str) -> str: - """ - Build the ProteomeXchange XML endpoint from a dataset accession or a dataset web URL. 
- Examples accepted: - - PXD039236 - - https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236 - - https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236&anything - """ - if px_id_or_url.startswith("http://") or px_id_or_url.startswith("https://"): - parsed = urlparse(px_id_or_url) - # keep the ID param value if present; otherwise fallback to the path tail - query = parsed.query or "" - if "ID=" in query: - id_value = [q.split("=", 1)[1] for q in query.split("&") if q.startswith("ID=")] - if id_value: - return ( - f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={id_value[0]}&outputMode=XML&test=no" - ) - # If the input URL already requests XML, just ensure flags - if parsed.path.endswith("/cgi/GetDataset"): - return ( - f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?{query}&outputMode=XML&test=no" - ) - # Assume it's a plain accession if not a URL - return ( - f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={px_id_or_url}&outputMode=XML&test=no" - ) + """Shim — see :func:`pridepy.commands.proteomexchange._normalize_px_xml_url`.""" + from pridepy.commands import proteomexchange + return proteomexchange._normalize_px_xml_url(px_id_or_url) @staticmethod - def _parse_px_xml_for_raw_file_urls(px_xml_url: str) -> List[str]: - """ - Parse the PX XML and return a list of associated raw file URIs. - We extract cvParam with name "Associated raw file URI" under each DatasetFile. 
- """ - headers = {"Accept": "application/xml"} - response = Util.get_api_call(px_xml_url, headers) - response.raise_for_status() - root = ET.fromstring(response.content) - - urls: List[str] = [] - # The XML namespace is often absent in PX XML; access elements directly - for dataset_file in root.iter("DatasetFile"): - for cv in dataset_file.findall("cvParam"): - name = cv.attrib.get("name") - value = cv.attrib.get("value") - if name == "Associated raw file URI" and value: - urls.append(value) - return urls + def _parse_px_xml_for_raw_file_urls(px_xml_url: str): + """Shim — see :func:`pridepy.commands.proteomexchange._parse_px_xml_for_raw_file_urls`.""" + from pridepy.commands import proteomexchange + return proteomexchange._parse_px_xml_for_raw_file_urls(px_xml_url) def download_px_raw_files( self, @@ -1101,27 +1062,9 @@ def download_px_raw_files( output_folder: str, skip_if_downloaded_already: bool = True, ) -> None: - """ - Download all raw files referenced by a ProteomeXchange dataset. - Prefer FTP when the URL is ftp://, otherwise use HTTP(S). Supports resume and skip. 
- """ - if not os.path.isdir(output_folder): - os.makedirs(output_folder, exist_ok=True) - - px_xml_url = self._normalize_px_xml_url(px_id_or_url) - logging.info(f"Fetching PX XML: {px_xml_url}") - urls = self._parse_px_xml_for_raw_file_urls(px_xml_url) - if not urls: - logging.info("No Associated raw file URIs found in PX XML") - return - - ftp_urls = [u for u in urls if u.lower().startswith("ftp://")] - http_urls = [u for u in urls if u.lower().startswith("http://") or u.lower().startswith("https://")] - - if ftp_urls: - self.download_ftp_urls(ftp_urls, output_folder, skip_if_downloaded_already) - if http_urls: - self.download_http_urls(http_urls, output_folder, skip_if_downloaded_already) + """Shim — see :func:`pridepy.commands.proteomexchange.download_px_raw_files`.""" + from pridepy.commands import proteomexchange + return proteomexchange.download_px_raw_files(px_id_or_url, output_folder, skip_if_downloaded_already) @staticmethod def _local_path_for_url(download_url: str, output_folder: str) -> str: From 1a0a9b8d540c34e2ba6e49b21655d99d50546c26 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 18:07:35 +0100 Subject: [PATCH 3/4] refactor(commands): move download_files_by_list into commands/by_list.py Moved download_files_by_list from Files into commands/by_list.py. Files keeps a shim re-export. No behaviour change. Test suite green. 
--- pridepy/commands/by_list.py | 58 +++++++++++++++++++++++++++++++++++++ pridepy/files/files.py | 43 ++++----------------------- 2 files changed, 64 insertions(+), 37 deletions(-) create mode 100644 pridepy/commands/by_list.py diff --git a/pridepy/commands/by_list.py b/pridepy/commands/by_list.py new file mode 100644 index 0000000..e008d6e --- /dev/null +++ b/pridepy/commands/by_list.py @@ -0,0 +1,58 @@ +"""Download a subset of project files identified by a filename list.""" +import logging +from typing import List, Optional + + +def download_files_by_list( + accession: str, + file_names: List[str], + output_folder: str, + skip_if_downloaded_already: bool, + protocol: str = "ftp", + aspera_maximum_bandwidth: str = "100M", + checksum_check: bool = False, + parallel_files: int = 1, +) -> None: + """Download a subset of project files identified by a filename list. + + Resolves each requested filename via the project metadata API and + delegates to the provider's ``download_files`` so the existing batch + + protocol fallback engine is reused. 
+ + :param accession: PRIDE or MassIVE project accession (public) + :param file_names: filenames to download + :param output_folder: directory to write downloaded files into + :param skip_if_downloaded_already: skip files already present locally + :param protocol: preferred protocol; falls back across others on failure + :param aspera_maximum_bandwidth: aspera ascp bandwidth cap + :param checksum_check: download project checksums and validate + :param parallel_files: number of files to download simultaneously for globus + :raises ValueError: if ``file_names`` is empty or none match the project + """ + if not file_names: + raise ValueError("file_names must contain at least one filename") + + from pridepy.providers import registry # lazy + provider = registry.resolve(accession) + all_files = provider.list_files(accession) + + requested = set(file_names) + matched = [f for f in all_files if f.get("fileName") in requested] + missing = sorted(requested - {f.get("fileName") for f in matched}) + if missing: + logging.warning("Files not found in project %s: %s", accession, missing) + if not matched: + raise ValueError( + f"No matching files in project {accession} for: {sorted(requested)}" + ) + + provider.download_files( + accession=accession, + records=matched, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, + parallel_files=parallel_files, + checksum_check=checksum_check, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, + ) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 612152b..878b4c1 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -711,48 +711,17 @@ def download_files_by_list( checksum_check: bool = False, parallel_files: int = 1, ) -> None: - """Download a subset of project files identified by a filename list. 
- - Resolves each requested filename via the project metadata API and - delegates to :meth:`download_files` so the existing batch + protocol - fallback engine is reused. - - :param accession: PRIDE or MassIVE project accession (public) - :param file_names: filenames to download - :param output_folder: directory to write downloaded files into - :param skip_if_downloaded_already: skip files already present locally - :param protocol: preferred protocol; falls back across others on failure - :param aspera_maximum_bandwidth: aspera ascp bandwidth cap - :param checksum_check: download project checksums and validate - :param parallel_files: number of files to download simultaneously for globus - :raises ValueError: if ``file_names`` is empty or none match the project - """ - if not file_names: - raise ValueError("file_names must contain at least one filename") - - from pridepy.providers import registry - provider = registry.resolve(accession) - all_files = provider.list_files(accession) - - requested = set(file_names) - matched = [f for f in all_files if f.get("fileName") in requested] - missing = sorted(requested - {f.get("fileName") for f in matched}) - if missing: - logging.warning("Files not found in project %s: %s", accession, missing) - if not matched: - raise ValueError( - f"No matching files in project {accession} for: {sorted(requested)}" - ) - - provider.download_files( + """Shim — see :func:`pridepy.commands.by_list.download_files_by_list`.""" + from pridepy.commands import by_list + return by_list.download_files_by_list( accession=accession, - records=matched, + file_names=file_names, output_folder=output_folder, skip_if_downloaded_already=skip_if_downloaded_already, protocol=protocol, - parallel_files=parallel_files, - checksum_check=checksum_check, aspera_maximum_bandwidth=aspera_maximum_bandwidth, + checksum_check=checksum_check, + parallel_files=parallel_files, ) @staticmethod From ced7415e799ce196a5ea530815da993c5c15185f Mon Sep 17 00:00:00 2001 From: 
Yasset Perez-Riverol Date: Wed, 27 May 2026 18:13:49 +0100 Subject: [PATCH 4/4] refactor(commands): move download_files_by_url into commands/by_url.py Moved download_files_by_url and its 6 helpers (_extract_pride_accession, _validate_urls_checksums, _http_download_url, _ftp_download_url, _dispatch_url_scheme, _download_single_url) from Files into commands/by_url.py. Files keeps shim re-exports for each. Internal calls to patch-sensitive helpers (_http_download_url, _ftp_download_url, _dispatch_url_scheme, _download_single_url) go through Files.X (lazy import) so existing test patches like patch.object(Files, '_http_download_url') keep intercepting. files.py drops below 1000 LOC. No behaviour change. Test suite green at 68 passed, 4 skipped. --- pridepy/commands/by_url.py | 254 +++++++++++++++++++++++++++++++++++++ pridepy/files/files.py | 227 ++++----------------------------- 2 files changed, 282 insertions(+), 199 deletions(-) create mode 100644 pridepy/commands/by_url.py diff --git a/pridepy/commands/by_url.py b/pridepy/commands/by_url.py new file mode 100644 index 0000000..91d6fec --- /dev/null +++ b/pridepy/commands/by_url.py @@ -0,0 +1,254 @@ +"""Download a list of explicit URLs (ftp/http/https). + +Each URL is dispatched to the matching transport based on its scheme. +PRIDE checksum validation is supported when the accession can be +inferred from the URL path. +""" +import ftplib +import logging +import os +import re +from concurrent.futures import ThreadPoolExecutor, as_completed +from ftplib import FTP +from typing import Dict, List, Optional, Tuple +from urllib.parse import urlparse + +from tqdm import tqdm + +from pridepy.util.api_handling import Util + + +def _extract_pride_accession(url: str) -> Optional[str]: + """Extract a PRIDE accession (PXD/PRD followed by digits) from a URL path. + + PRIDE archive URLs follow the pattern + ``…/pride/data/archive/YYYY/MM/<accession>/filename``. + Returns ``None`` when no accession can be identified. 
+ """ + match = re.search(r"((?:PXD|PRD)\d{4,})", url) + return match.group(1) if match else None + + +def _validate_urls_checksums(urls: List[str], output_folder: str) -> None: + """Validate downloaded files against PRIDE checksum API. + + Accessions are inferred from URL paths via + :func:`_extract_pride_accession`. URLs that do not contain a + recognisable PRIDE accession are skipped with a warning. + + :raises RuntimeError: if one or more files fail validation + """ + from pridepy.files.files import Files + + accession_urls: Dict[str, List[str]] = {} + for url in urls: + acc = _extract_pride_accession(url) + if acc: + accession_urls.setdefault(acc, []).append(url) + else: + logging.warning( + "Cannot infer PRIDE accession from URL, skipping checksum: %s", url + ) + + validation_failures: List[str] = [] + for acc, acc_urls in accession_urls.items(): + checksum_file_path = Files.save_checksum_file(acc, output_folder) + checksum_map = Files.read_checksum_file(checksum_file_path) + logging.info( + "Loaded checksums for %d files (project %s)", + len(checksum_map), acc, + ) + for url in acc_urls: + file_name = os.path.basename(urlparse(url).path) + target = os.path.join(output_folder, file_name) + expected = checksum_map.get(file_name) + logging.info("Validating %s", file_name) + valid, reason = Files.validate_download(target, expected) + if not valid: + logging.error("Validation failed for %s: %s", file_name, reason) + validation_failures.append(f"{file_name} ({reason})") + else: + logging.info("Checksum OK: %s", file_name) + + if validation_failures: + raise RuntimeError( + f"Checksum validation failed for {len(validation_failures)} file(s): " + + ", ".join(validation_failures) + ) + + +def _http_download_url(url: str, target: str) -> None: + """Stream an http/https URL into ``target`` with a progress bar.""" + session = Util.create_session_with_retries() + with session.get(url, stream=True, timeout=60) as response: + response.raise_for_status() + total = 
int(response.headers.get("Content-Length", 0)) + with open(target, "wb") as out, tqdm( + total=total, + unit="B", + unit_scale=True, + desc=os.path.basename(target), + ) as pbar: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + out.write(chunk) + pbar.update(len(chunk)) + + +def _ftp_download_url(parsed, target: str) -> None: + """Download a single file from an ftp:// URL with a progress bar.""" + host = parsed.hostname + if not host: + raise ValueError(f"FTP URL missing host: {parsed.geturl()}") + port = parsed.port or 21 + user = parsed.username or "anonymous" + pwd = parsed.password or "anonymous@" + remote_path = parsed.path + with FTP() as ftp: + ftp.connect(host, port, timeout=60) + ftp.login(user, pwd) + try: + total = ftp.size(remote_path) or 0 + except ftplib.error_perm: + total = 0 + with open(target, "wb") as out, tqdm( + total=total, + unit="B", + unit_scale=True, + desc=os.path.basename(target), + ) as pbar: + + def _callback(data: bytes) -> None: + out.write(data) + pbar.update(len(data)) + + ftp.retrbinary(f"RETR {remote_path}", _callback) + + +def _dispatch_url_scheme(parsed, target: str, protocol: str = "ftp", position: int = 0) -> None: + """Route a parsed URL to its protocol-specific downloader. + + ``protocol='globus'`` swaps the http/https single-connection streamer + for :func:`pridepy.files.files.Files._parallel_download` (single-connection with progress bar). + ftp:// URLs are unaffected. 
+ """ + from pridepy.files.files import Files + + scheme = (parsed.scheme or "").lower() + if scheme in ("http", "https"): + if protocol == "globus": + Files._parallel_download(parsed.geturl(), target, position=position) + else: + Files._http_download_url(parsed.geturl(), target) + elif scheme == "ftp": + Files._ftp_download_url(parsed, target) + else: + raise ValueError(f"Unsupported URL scheme: {scheme}") + + +def _download_single_url( + url: str, + output_folder: str, + skip_if_exists: bool = False, + protocol: str = "ftp", + position: int = 0, +) -> str: + """Download one URL, dispatched by scheme; return the local file path.""" + from pridepy.files.files import Files + + parsed = urlparse(url) + if not (parsed.scheme or "").lower(): + raise ValueError(f"URL missing scheme: {url}") + + file_name = os.path.basename(parsed.path) + if not file_name: + raise ValueError(f"Cannot derive filename from URL: {url}") + + target = os.path.join(output_folder, file_name) + if skip_if_exists and os.path.isfile(target) and os.path.getsize(target) > 0: + logging.info("Skipping %s: already downloaded", file_name) + return target + + Files._dispatch_url_scheme(parsed, target, protocol, position=position) + + ok, reason = Files.validate_download(target) + if not ok: + Files._remove_if_exists(target) + raise RuntimeError(f"Download invalid: {reason} ({target})") + return target + + +def download_files_by_url( + urls: List[str], + output_folder: str, + skip_if_downloaded_already: bool = False, + protocol: str = "ftp", + parallel_files: int = 1, + checksum_check: bool = False, +) -> None: + """Download files from a list of raw URLs, dispatched by URL scheme. + + Supported schemes: ``http``, ``https``, ``ftp``. Each URL is downloaded + independently; per-URL errors are logged, then aggregated and re-raised + as a single :class:`RuntimeError` so callers see a complete failure + summary. 
+ + :param urls: fully-qualified URLs (each contains its scheme) + :param output_folder: directory to write downloaded files into + :param skip_if_downloaded_already: skip URLs whose target file exists + :param protocol: ``ftp`` (default) for single-connection per URL scheme; + ``globus`` for resume-capable http/https downloads (single-connection stream) + (no effect on ftp:// URLs which always use single-connection FTP) + :param checksum_check: validate downloads against PRIDE checksum API; + accessions are inferred from URL paths (only PRIDE URLs supported) + :raises ValueError: if ``urls`` is empty + :raises RuntimeError: if one or more URLs failed + """ + if not urls: + raise ValueError("urls must contain at least one URL") + + os.makedirs(output_folder, exist_ok=True) + + parallel_files = min(parallel_files, 3, len(urls)) + failures: List[Tuple[str, str]] = [] + from pridepy.files.files import Files + + if parallel_files < 2: + for url in urls: + try: + Files._download_single_url( + url, output_folder, skip_if_downloaded_already, protocol, + ) + except Exception as exc: # pylint: disable=broad-except + logging.error("Failed to download %s: %s", url, exc) + failures.append((url, str(exc))) + else: + logging.info( + "Downloading %d URL(s) with %d parallel workers", + len(urls), parallel_files, + ) + with ThreadPoolExecutor(max_workers=parallel_files) as executor: + futures = { + executor.submit( + Files._download_single_url, + url, output_folder, skip_if_downloaded_already, protocol, + position=idx, + ): url + for idx, url in enumerate(urls) + } + for future in as_completed(futures): + url = futures[future] + try: + future.result() + except Exception as exc: # pylint: disable=broad-except + logging.error("Failed to download %s: %s", url, exc) + failures.append((url, str(exc))) + + if failures: + summary = ", ".join(f"{u} ({e})" for u, e in failures) + raise RuntimeError( + f"Failed to download {len(failures)} URL(s): {summary}" + ) + + if checksum_check: + 
_validate_urls_checksums(urls, output_folder) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 878b4c1..8393519 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -1,17 +1,12 @@ #!/usr/bin/env python -import ftplib import logging import os -import re import urllib import urllib.request -from concurrent.futures import ThreadPoolExecutor, as_completed from ftplib import FTP from typing import Dict, List, Optional, Tuple -from urllib.parse import urlparse import requests -from tqdm import tqdm from pridepy.util.api_handling import Util @@ -726,14 +721,9 @@ def download_files_by_list( @staticmethod def _extract_pride_accession(url: str) -> Optional[str]: - """Extract a PRIDE accession (PXD/PRD followed by digits) from a URL path. - - PRIDE archive URLs follow the pattern - ``…/pride/data/archive/YYYY/MM//filename``. - Returns ``None`` when no accession can be identified. - """ - match = re.search(r"((?:PXD|PRD)\d{4,})", url) - return match.group(1) if match else None + """Shim — see :func:`pridepy.commands.by_url._extract_pride_accession`.""" + from pridepy.commands import by_url + return by_url._extract_pride_accession(url) @staticmethod def download_files_by_url( @@ -744,116 +734,22 @@ def download_files_by_url( parallel_files: int = 1, checksum_check: bool = False, ) -> None: - """Download files from a list of raw URLs, dispatched by URL scheme. - - Supported schemes: ``http``, ``https``, ``ftp``. Each URL is downloaded - independently; per-URL errors are logged, then aggregated and re-raised - as a single :class:`RuntimeError` so callers see a complete failure - summary. 
- - :param urls: fully-qualified URLs (each contains its scheme) - :param output_folder: directory to write downloaded files into - :param skip_if_downloaded_already: skip URLs whose target file exists - :param protocol: ``ftp`` (default) for single-connection per URL scheme; - ``globus`` for resume-capable http/https downloads (single-connection stream) - (no effect on ftp:// URLs which always use single-connection FTP) - :param checksum_check: validate downloads against PRIDE checksum API; - accessions are inferred from URL paths (only PRIDE URLs supported) - :raises ValueError: if ``urls`` is empty - :raises RuntimeError: if one or more URLs failed - """ - if not urls: - raise ValueError("urls must contain at least one URL") - - os.makedirs(output_folder, exist_ok=True) - - parallel_files = min(parallel_files, 3, len(urls)) - failures: List[Tuple[str, str]] = [] - if parallel_files < 2: - for url in urls: - try: - Files._download_single_url( - url, output_folder, skip_if_downloaded_already, protocol, - ) - except Exception as exc: # pylint: disable=broad-except - logging.error("Failed to download %s: %s", url, exc) - failures.append((url, str(exc))) - else: - logging.info( - "Downloading %d URL(s) with %d parallel workers", - len(urls), parallel_files, - ) - with ThreadPoolExecutor(max_workers=parallel_files) as executor: - futures = { - executor.submit( - Files._download_single_url, - url, output_folder, skip_if_downloaded_already, protocol, - position=idx, - ): url - for idx, url in enumerate(urls) - } - for future in as_completed(futures): - url = futures[future] - try: - future.result() - except Exception as exc: # pylint: disable=broad-except - logging.error("Failed to download %s: %s", url, exc) - failures.append((url, str(exc))) - - if failures: - summary = ", ".join(f"{u} ({e})" for u, e in failures) - raise RuntimeError( - f"Failed to download {len(failures)} URL(s): {summary}" - ) - - if checksum_check: - Files._validate_urls_checksums(urls, 
output_folder) + """Shim — see :func:`pridepy.commands.by_url.download_files_by_url`.""" + from pridepy.commands import by_url + return by_url.download_files_by_url( + urls=urls, + output_folder=output_folder, + skip_if_downloaded_already=skip_if_downloaded_already, + protocol=protocol, + parallel_files=parallel_files, + checksum_check=checksum_check, + ) @staticmethod def _validate_urls_checksums(urls: List[str], output_folder: str) -> None: - """Validate downloaded files against PRIDE checksum API. - - Accessions are inferred from URL paths via - :meth:`_extract_pride_accession`. URLs that do not contain a - recognisable PRIDE accession are skipped with a warning. - - :raises RuntimeError: if one or more files fail validation - """ - accession_urls: Dict[str, List[str]] = {} - for url in urls: - acc = Files._extract_pride_accession(url) - if acc: - accession_urls.setdefault(acc, []).append(url) - else: - logging.warning( - "Cannot infer PRIDE accession from URL, skipping checksum: %s", url - ) - - validation_failures: List[str] = [] - for acc, acc_urls in accession_urls.items(): - checksum_file_path = Files.save_checksum_file(acc, output_folder) - checksum_map = Files.read_checksum_file(checksum_file_path) - logging.info( - "Loaded checksums for %d files (project %s)", - len(checksum_map), acc, - ) - for url in acc_urls: - file_name = os.path.basename(urlparse(url).path) - target = os.path.join(output_folder, file_name) - expected = checksum_map.get(file_name) - logging.info("Validating %s", file_name) - valid, reason = Files.validate_download(target, expected) - if not valid: - logging.error("Validation failed for %s: %s", file_name, reason) - validation_failures.append(f"{file_name} ({reason})") - else: - logging.info("Checksum OK: %s", file_name) - - if validation_failures: - raise RuntimeError( - f"Checksum validation failed for {len(validation_failures)} file(s): " - + ", ".join(validation_failures) - ) + """Shim — see 
:func:`pridepy.commands.by_url._validate_urls_checksums`.""" + from pridepy.commands import by_url + return by_url._validate_urls_checksums(urls, output_folder) @staticmethod def _download_single_url( @@ -863,94 +759,27 @@ def _download_single_url( protocol: str = "ftp", position: int = 0, ) -> str: - """Download one URL, dispatched by scheme; return the local file path.""" - parsed = urlparse(url) - if not (parsed.scheme or "").lower(): - raise ValueError(f"URL missing scheme: {url}") - - file_name = os.path.basename(parsed.path) - if not file_name: - raise ValueError(f"Cannot derive filename from URL: {url}") - - target = os.path.join(output_folder, file_name) - if skip_if_exists and os.path.isfile(target) and os.path.getsize(target) > 0: - logging.info("Skipping %s: already downloaded", file_name) - return target - - Files._dispatch_url_scheme(parsed, target, protocol, position=position) - - ok, reason = Files.validate_download(target) - if not ok: - Files._remove_if_exists(target) - raise RuntimeError(f"Download invalid: {reason} ({target})") - return target + """Shim — see :func:`pridepy.commands.by_url._download_single_url`.""" + from pridepy.commands import by_url + return by_url._download_single_url(url, output_folder, skip_if_exists, protocol, position) @staticmethod def _dispatch_url_scheme(parsed, target: str, protocol: str = "ftp", position: int = 0) -> None: - """Route a parsed URL to its protocol-specific downloader. - - ``protocol='globus'`` swaps the http/https single-connection streamer - for :meth:`_parallel_download` (single-connection with progress bar). - ftp:// URLs are unaffected. 
- """ - scheme = (parsed.scheme or "").lower() - if scheme in ("http", "https"): - if protocol == "globus": - Files._parallel_download(parsed.geturl(), target, position=position) - else: - Files._http_download_url(parsed.geturl(), target) - elif scheme == "ftp": - Files._ftp_download_url(parsed, target) - else: - raise ValueError(f"Unsupported URL scheme: {scheme}") + """Shim — see :func:`pridepy.commands.by_url._dispatch_url_scheme`.""" + from pridepy.commands import by_url + return by_url._dispatch_url_scheme(parsed, target, protocol=protocol, position=position) @staticmethod def _http_download_url(url: str, target: str) -> None: - """Stream an http/https URL into ``target`` with a progress bar.""" - session = Util.create_session_with_retries() - with session.get(url, stream=True, timeout=60) as response: - response.raise_for_status() - total = int(response.headers.get("Content-Length", 0)) - with open(target, "wb") as out, tqdm( - total=total, - unit="B", - unit_scale=True, - desc=os.path.basename(target), - ) as pbar: - for chunk in response.iter_content(chunk_size=8192): - if chunk: - out.write(chunk) - pbar.update(len(chunk)) + """Shim — see :func:`pridepy.commands.by_url._http_download_url`.""" + from pridepy.commands import by_url + return by_url._http_download_url(url, target) @staticmethod def _ftp_download_url(parsed, target: str) -> None: - """Download a single file from an ftp:// URL with a progress bar.""" - host = parsed.hostname - if not host: - raise ValueError(f"FTP URL missing host: {parsed.geturl()}") - port = parsed.port or 21 - user = parsed.username or "anonymous" - pwd = parsed.password or "anonymous@" - remote_path = parsed.path - with FTP() as ftp: - ftp.connect(host, port, timeout=60) - ftp.login(user, pwd) - try: - total = ftp.size(remote_path) or 0 - except ftplib.error_perm: - total = 0 - with open(target, "wb") as out, tqdm( - total=total, - unit="B", - unit_scale=True, - desc=os.path.basename(target), - ) as pbar: - - def 
_callback(data: bytes) -> None: - out.write(data) - pbar.update(len(data)) - - ftp.retrbinary(f"RETR {remote_path}", _callback) + """Shim — see :func:`pridepy.commands.by_url._ftp_download_url`.""" + from pridepy.commands import by_url + return by_url._ftp_download_url(parsed, target) def download_all_category_files( self,