PRIDE-Archive · ypriverol · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/pridepy/files/files.py b/pridepy/files/files.py
diff --git a/pridepy/providers/__init__.py b/pridepy/providers/__init__.py
@@ -0,0 +1,7 @@
+"""Per-repository provider classes used by :class:`pridepy.files.files.Files`.
+
+Each module under this package owns the listing, transport choice, and
+record-construction logic for one repository: PRIDE, MassIVE, JPOST, iProX.
+The :mod:`registry` module maps an accession to the right provider; the
+:mod:`transport` module hosts the shared FTP/FTPS/HTTPS download plumbing.
+"""
diff --git a/pridepy/providers/base.py b/pridepy/providers/base.py
@@ -0,0 +1,137 @@
+"""Abstract base classes for pridepy providers."""
+import logging
+from abc import ABC, abstractmethod
+from typing import ClassVar, Dict, List, Optional
+
+
+class Provider(ABC):
+    """Abstract base for every repository pridepy can list and download from."""
+
+    name: ClassVar[str]  # "pride", "massive", "jpost", "iprox"
+
+    @staticmethod
+    @abstractmethod
+    def matches(accession: str) -> bool:
+        """Return True if this provider should handle ``accession``."""
+
+    @abstractmethod
+    def list_files(self, accession: str) -> List[Dict]:
+        """Return pridepy file records for the dataset.
+
+        Each record is a dict shaped like the PRIDE V3 API file response,
+        with at minimum: ``accession``, ``fileName``, ``fileCategory``
+        (with nested ``value``), ``publicFileLocations`` (list of
+        ``{"name": ..., "value": <URL>}``).
+        """
+
+    @abstractmethod
+    def download_files(
+        self,
+        accession: str,
+        records: List[Dict],
+        output_folder: str,
+        skip_if_downloaded_already: bool,
+        protocol: str,
+        parallel_files: int = 1,
+        checksum_check: bool = False,
+        aspera_maximum_bandwidth: str = "100M",
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+    ) -> None:
+        """Download the given records into ``output_folder``.
+
+        ``records`` are file records shaped as described in
+        :meth:`list_files`. Which of the optional arguments are honoured
+        depends on the concrete provider.
+        """
+
+
+class BaseDirectDownloadProvider(Provider):
+    """Shared ``download_files`` for MassIVE / JPOST / iProX.
+
+    Subclasses set the ``use_tls`` class var (True for MassIVE FTPS, False for
+    JPOST plain FTP) and override :meth:`list_files`. The shared
+    ``download_files`` implementation partitions record URLs by scheme:
+    ``ftp://`` URLs are handed to :meth:`Files.download_ftp_urls`; ``http(s)://``
+    URLs go to :meth:`Files.download_http_urls`. It calls **back** into
+    ``Files`` so that test patches on ``Files.download_ftp_urls`` /
+    ``Files.download_http_urls`` continue to intercept the calls.
+    """
+
+    use_tls: ClassVar[bool] = False
+
+    def download_files(
+        self,
+        accession: str,
+        records: List[Dict],
+        output_folder: str,
+        skip_if_downloaded_already: bool,
+        protocol: str,
+        parallel_files: int = 1,
+        checksum_check: bool = False,
+        aspera_maximum_bandwidth: str = "100M",
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+    ) -> None:
+        """Download ``records`` by URL scheme via the ``Files`` facade.
+
+        Every record's URL is resolved with ``Files._get_download_url`` and
+        routed by scheme: ``ftp://`` URLs go to ``Files.download_ftp_urls``
+        (using this provider's ``use_tls``), ``http(s)://`` URLs go to
+        ``Files.download_http_urls``. Records without such a URL are
+        skipped with a warning.
+
+        ``protocol`` is only validated: any value other than ``ftp``,
+        ``https`` or ``http`` is logged and ignored, because routing always
+        follows the URL scheme. ``checksum_check``,
+        ``aspera_maximum_bandwidth``, ``username`` and ``password`` are
+        accepted for interface compatibility and are not used here.
+        """
+        # Lazy import: providers know about Files (the facade) only via the
+        # public attributes that tests may patch; avoid module-load cycle.
+        from pridepy.files.files import Files
+
+        if protocol not in ("ftp", "https", "http"):
+            logging.warning(
+                "Direct downloads currently use ftp / https only. "
+                f"Ignoring requested protocol '{protocol}' for {accession}."
+            )
+
+        all_urls = [Files._get_download_url(record, "ftp") for record in records]
+        # ``_get_download_url`` lives in ``Files``; guard against it yielding
+        # no URL for a record instead of failing on ``None.lower()``.
+        ftp_urls = [u for u in all_urls if u and u.lower().startswith("ftp://")]
+        http_urls = [
+            u
+            for u in all_urls
+            if u and u.lower().startswith(("http://", "https://"))
+        ]
+        skipped = len(all_urls) - len(ftp_urls) - len(http_urls)
+        if skipped:
+            # Surface records that match neither scheme; they would otherwise
+            # be dropped without a trace.
+            logging.warning(
+                f"Skipping {skipped} file(s) of {accession} without an "
+                "ftp:// or http(s):// download URL."
+            )
+        if not ftp_urls and not http_urls:
+            logging.info(
+                f"No files matched for direct-download dataset {accession}"
+            )
+            return
+
+        if ftp_urls:
+            Files.download_ftp_urls(
+                ftp_urls=ftp_urls,
+                output_folder=output_folder,
+                skip_if_downloaded_already=skip_if_downloaded_already,
+                use_tls=self.use_tls,
+                parallel_files=parallel_files,
+            )
+        if http_urls:
+            Files.download_http_urls(
+                http_urls=http_urls,
+                output_folder=output_folder,
+                skip_if_downloaded_already=skip_if_downloaded_already,
+                parallel_files=parallel_files,
+            )
diff --git a/pridepy/providers/iprox.py b/pridepy/providers/iprox.py
@@ -0,0 +1,161 @@
+"""iProX direct-download provider.
+
+iProX publishes the ProteomeXchange XML for each dataset at a
+deterministic path on its anonymous HTTP download server::
+
+    http://download.iprox.org/<accession>/PX_<accession>.xml
+
+We fetch the XML, walk every ``<DatasetFile>``'s ``cvParam`` entries, and
+turn each ``Associated raw file URI`` (and sibling URIs for search-engine
+output, result files, etc.) into a pridepy file record. File downloads
+themselves go through plain HTTP(S) on the same host, which supports
+``Range`` requests for resume.
+"""
+import logging
+import os
+import re
+import xml.etree.ElementTree as ET
+from typing import ClassVar, Dict, List, Optional
+from urllib.parse import urlparse
+
+import requests
+
+from pridepy.providers import registry
+from pridepy.providers.base import BaseDirectDownloadProvider
+from pridepy.providers.jpost import JpostProvider
+
+
+@registry.register
+class IproxProvider(BaseDirectDownloadProvider):
+    """List iProX datasets from their PX XML and download them over HTTP(S)."""
+
+    name: ClassVar[str] = "iprox"
+    use_tls: ClassVar[bool] = False  # download.iprox.org serves over plain HTTP
+
+    DOWNLOAD_BASE_URL: ClassVar[str] = "http://download.iprox.org/"
+    PX_XML_URL_TEMPLATE: ClassVar[str] = (
+        "http://download.iprox.org/{accession}/PX_{accession}.xml"
+    )
+    # iProX PX XML uses the same PSI-MS cvParam "name" values as JPOST PROXI,
+    # so we reuse JpostProvider's category map.
+    PX_CATEGORY_MAP: ClassVar[Dict[str, str]] = JpostProvider.PROXI_CATEGORY_MAP
+
+    @staticmethod
+    def matches(accession: str) -> bool:
+        """Return True when ``accession`` looks like an iProX dataset accession.
+
+        That is ``IPX`` followed by 7 to 10 digits, compared case-insensitively.
+        """
+        if not accession:
+            return False
+        return bool(re.fullmatch(r"IPX\d{7,10}", accession.upper()))
+
+    @staticmethod
+    def _get_public_root(accession: str) -> str:
+        """Return the dataset's root path on the download server."""
+        return f"/{accession.upper()}"
+
+    @classmethod
+    def _get_public_ftp_url(cls, accession: str, remote_path: str) -> str:
+        """Return the download URL for ``remote_path`` within the dataset.
+
+        ``remote_path`` may be given relative to the dataset or prefixed with
+        the dataset root (``/<ACCESSION>``); the root is stripped only when it
+        is a whole leading path segment.
+        """
+        # NOTE: name kept as `_get_public_ftp_url` for parity with other providers,
+        # but iProX URLs are http(s) not ftp. The dispatcher routes by scheme.
+        root_path = cls._get_public_root(accession).rstrip("/")
+        relative_path = remote_path
+        # Compare on a segment boundary: accessions vary in length, so a bare
+        # ``startswith(root_path)`` would also match a longer sibling accession.
+        if remote_path == root_path or remote_path.startswith(f"{root_path}/"):
+            relative_path = remote_path[len(root_path):]
+        # Always drop leading slashes so the URL never contains ``//``.
+        relative_path = relative_path.lstrip("/")
+        return f"{cls.DOWNLOAD_BASE_URL}{accession.upper()}/{relative_path}"
+
+    @classmethod
+    def _build_file_record(
+        cls, accession: str, https_url: str, category_from_px: Optional[str] = None
+    ) -> Dict:
+        """Build a pridepy file record for an iProX file.
+
+        ``https_url`` is the file's ``http(s)://`` download URL.
+        ``category_from_px`` is the ``cvParam`` ``name`` from the dataset's
+        ProteomeXchange XML (e.g. ``"Associated raw file URI"``). When it is
+        a key of ``PX_CATEGORY_MAP`` it decides the file category; otherwise
+        the category is derived from the first path segment below the dataset
+        root via ``MassiveProvider._map_collection_to_category``.
+        """
+        from pridepy.providers.massive import MassiveProvider
+        parsed = urlparse(https_url)
+        root_prefix = f"/{accession.upper()}/"
+        relative_path = parsed.path
+        if relative_path.startswith(root_prefix):
+            relative_path = relative_path[len(root_prefix):]
+        relative_path = relative_path.lstrip("/")
+        collection = relative_path.split("/", 1)[0] if relative_path else ""
+        if category_from_px and category_from_px in cls.PX_CATEGORY_MAP:
+            category = cls.PX_CATEGORY_MAP[category_from_px]
+        else:
+            category = MassiveProvider._map_collection_to_category(collection)
+        return {
+            "accession": accession.upper(),
+            "fileName": os.path.basename(parsed.path),
+            "fileCategory": {"value": category},
+            # "FTP Protocol" is the existing label the download dispatcher uses
+            # to locate a file URL; here it actually points at HTTP(S).
+            # BaseDirectDownloadProvider.download_files routes by URL scheme.
+            "publicFileLocations": [{"name": "FTP Protocol", "value": https_url}],
+            "relativePath": relative_path,
+            "collection": collection,
+            "source": "iProX",
+        }
+
+    def list_files(self, accession: str) -> List[Dict]:
+        """Return one file record per downloadable URI in the dataset's PX XML.
+
+        Every ``cvParam`` of every ``<DatasetFile>`` whose ``name`` ends in
+        ``URI`` and whose ``value`` is an ``http(s)://`` URL becomes a record.
+
+        Raises:
+            requests.RequestException: if the request fails or returns an
+                error status.
+            RuntimeError: if the XML cannot be parsed or lists no such URI.
+        """
+        normalized = accession.upper()
+        xml_url = self.PX_XML_URL_TEMPLATE.format(accession=normalized)
+        logging.info(f"Fetching iProX PX XML: {xml_url}")
+        response = requests.get(xml_url, timeout=30)
+        response.raise_for_status()
+        try:
+            # NOTE(review): ElementTree is not hardened against maliciously
+            # crafted XML; acceptable only while this endpoint is trusted.
+            root = ET.fromstring(response.content)
+        except ET.ParseError as parse_error:
+            raise RuntimeError(
+                f"Unable to parse iProX PX XML for {normalized}: {parse_error}"
+            ) from parse_error
+
+        records: List[Dict] = []
+        for dataset_file in root.iter("DatasetFile"):
+            for cv in dataset_file.findall("cvParam"):
+                name = cv.attrib.get("name")
+                value = cv.attrib.get("value")
+                if not value or not name or not name.endswith("URI"):
+                    continue
+                if not value.lower().startswith(("http://", "https://")):
+                    continue
+                records.append(
+                    self._build_file_record(
+                        normalized,
+                        value,
+                        category_from_px=name,
+                    )
+                )
+        if not records:
+            raise RuntimeError(
+                f"iProX PX XML for {normalized} contained no downloadable HTTPS URIs"
+            )
+        return records
diff --git a/pridepy/providers/jpost.py b/pridepy/providers/jpost.py
@@ -0,0 +1,182 @@
+"""JPOST direct-download provider.
+
+PRIMARY listing: PROXI JSON at repository.jpostdb.org. The PROXI endpoint
+returns ``datasetFiles[*].value`` as ``ftp://`` URLs alongside CV labels
+(Associated raw file URI, Search engine output file URI, etc.) which map
+cleanly to PRIDE file categories.
+
+FALLBACK listing: when PROXI fails, walk the FTP tree at ftp.jpostdb.org.
+PROXI is tried first because JPOST's FTP server rate-limits aggressively
+per source IP (sticky 421-too-many-connections); the PROXI path lets us
+avoid walking the FTP tree just for a listing.
+"""
+import json
+import logging
+import os
+import re
+from typing import ClassVar, Dict, List, Optional
+from urllib.parse import urlparse
+
+import requests
+
+from pridepy.providers import registry
+from pridepy.providers.base import BaseDirectDownloadProvider
+
+
+@registry.register
+class JpostProvider(BaseDirectDownloadProvider):
+    """List JPOST datasets via PROXI (FTP walk as fallback) and download by FTP."""
+
+    name: ClassVar[str] = "jpost"
+    use_tls: ClassVar[bool] = False
+
+    ARCHIVE_FTP: ClassVar[str] = "ftp.jpostdb.org"
+    ARCHIVE_FTP_URL_PREFIX: ClassVar[str] = "ftp://ftp.jpostdb.org/"
+    PROXI_BASE_URL: ClassVar[str] = "https://repository.jpostdb.org/proxi/datasets/"
+
+    PROXI_CATEGORY_MAP: ClassVar[Dict[str, str]] = {
+        "Associated raw file URI": "RAW",
+        "Result file URI": "RESULT",
+        "Search engine output file URI": "SEARCH",
+        "Peak list file URI": "PEAK",
+        "Spectrum library file URI": "SPECTRUM_LIBRARY",
+        "Sequence database URI": "FASTA",
+        "Quantification file URI": "RESULT",
+    }
+
+    @staticmethod
+    def matches(accession: str) -> bool:
+        """Return True when ``accession`` is ``JPST`` plus six digits (any case)."""
+        if not accession:
+            return False
+        return bool(re.fullmatch(r"JPST\d{6}", accession.upper()))
+
+    @staticmethod
+    def _get_public_root(accession: str) -> str:
+        """Return the dataset's root path on the JPOST FTP server."""
+        return f"/{accession.upper()}"
+
+    @classmethod
+    def _get_public_ftp_url(cls, accession: str, remote_path: str) -> str:
+        """Return the ``ftp://`` URL for ``remote_path`` within the dataset.
+
+        ``remote_path`` may be relative to the dataset or prefixed with the
+        dataset root (``/<ACCESSION>``); the root is stripped only when it is
+        a whole leading path segment.
+        """
+        root_path = cls._get_public_root(accession).rstrip("/")
+        relative_path = remote_path
+        # Match on a segment boundary so a path that merely begins with the
+        # same characters is not truncated.
+        if remote_path == root_path or remote_path.startswith(f"{root_path}/"):
+            relative_path = remote_path[len(root_path):]
+        # Always drop leading slashes so the URL never contains ``//``.
+        relative_path = relative_path.lstrip("/")
+        return f"{cls.ARCHIVE_FTP_URL_PREFIX}{accession.upper()}/{relative_path}"
+
+    @classmethod
+    def _build_file_record(
+        cls, accession: str, ftp_url: str, category_from_proxi: Optional[str] = None
+    ) -> Dict:
+        """Build a pridepy file record from an FTP URL.
+
+        When ``category_from_proxi`` is provided (e.g. ``"Associated raw file URI"``),
+        the PROXI CV name takes precedence over the heuristic collection-from-path
+        mapping. Falls back to the same path-segment heuristic used for MassIVE
+        when the category isn't known.
+        """
+        # Import the MassIVE collection->category map for the fallback heuristic.
+        from pridepy.providers.massive import MassiveProvider
+        parsed = urlparse(ftp_url)
+        root_prefix = f"/{accession.upper()}/"
+        relative_path = parsed.path
+        if relative_path.startswith(root_prefix):
+            relative_path = relative_path[len(root_prefix):]
+        relative_path = relative_path.lstrip("/")
+        collection = relative_path.split("/", 1)[0] if relative_path else ""
+        if category_from_proxi and category_from_proxi in cls.PROXI_CATEGORY_MAP:
+            category = cls.PROXI_CATEGORY_MAP[category_from_proxi]
+        else:
+            category = MassiveProvider._map_collection_to_category(collection)
+        return {
+            "accession": accession.upper(),
+            "fileName": os.path.basename(parsed.path),
+            "fileCategory": {"value": category},
+            "publicFileLocations": [{"name": "FTP Protocol", "value": ftp_url}],
+            "relativePath": relative_path,
+            "collection": collection,
+            "source": "JPOST",
+        }
+
+    def list_files(self, accession: str) -> List[Dict]:
+        """Return file records for the dataset.
+
+        PRIMARY: PROXI JSON. FALLBACK: FTP tree walk, used when the PROXI
+        listing raises for any reason.
+        """
+        normalized = accession.upper()
+        try:
+            return self._list_via_proxi(normalized)
+        except Exception as proxi_error:
+            logging.warning(
+                f"JPOST PROXI listing failed for {normalized} "
+                f"({proxi_error}); falling back to FTP tree walk."
+            )
+            from pridepy.providers import transport
+            remote_root = self._get_public_root(normalized)
+            remote_files = transport._list_ftp_repo_files(
+                host=self.ARCHIVE_FTP,
+                remote_root=remote_root,
+                error_label=f"JPOST dataset {normalized}",
+            )
+            return [
+                self._build_file_record(
+                    normalized,
+                    self._get_public_ftp_url(normalized, remote_file),
+                )
+                for remote_file in remote_files
+            ]
+
+    def _list_via_proxi(self, accession: str) -> List[Dict]:
+        """Turn each PROXI ``datasetFiles`` entry with an FTP URL into a record.
+
+        Entries that are not objects, or whose ``value`` is not an ``ftp://``
+        URL (scheme compared case-insensitively), are skipped.
+
+        Raises:
+            requests.RequestException: if the request fails or returns an
+                error status.
+            ValueError: if the response body is not valid JSON.
+            RuntimeError: if no entry yields an FTP URL.
+        """
+        proxi_url = f"{self.PROXI_BASE_URL}{accession}"
+        logging.info(f"Fetching JPOST PROXI metadata: {proxi_url}")
+        response = requests.get(
+            proxi_url,
+            headers={"Accept": "application/json"},
+            timeout=30,
+        )
+        response.raise_for_status()
+        data = json.loads(response.content)
+        dataset_files = data.get("datasetFiles") or []
+        records: List[Dict] = []
+        for entry in dataset_files:
+            # One malformed entry must not discard the whole PROXI listing
+            # (the caller would then fall back to the rate-limited FTP walk).
+            if not isinstance(entry, dict):
+                continue
+            value = entry.get("value")
+            if not isinstance(value, str) or not value.lower().startswith("ftp://"):
+                continue
+            records.append(
+                self._build_file_record(
+                    accession,
+                    value,
+                    category_from_proxi=entry.get("name"),
+                )
+            )
+        if not records:
+            raise RuntimeError(
+                f"JPOST PROXI returned no FTP file URIs for {accession}"
+            )
+        return records