Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,970 changes: 355 additions & 1,615 deletions pridepy/files/files.py

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions pridepy/providers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Per-repository provider classes used by :class:`pridepy.files.files.Files`.

Each module under this package owns the listing, transport choice, and
record-construction logic for one repository: PRIDE, MassIVE, JPOST, iProX.
The :mod:`registry` module maps an accession to the right provider; the
:mod:`transport` module hosts the shared FTP/FTPS/HTTPS download plumbing.
"""
107 changes: 107 additions & 0 deletions pridepy/providers/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""Abstract base classes for pridepy providers."""
from abc import ABC, abstractmethod
from typing import ClassVar, Dict, List, Optional


class Provider(ABC):
    """Common interface for each repository pridepy can list and download from.

    A concrete provider answers three questions: does an accession belong to
    it, which files does a dataset contain, and how are those files fetched.
    """

    # Short repository identifier: "pride", "massive", "jpost", "iprox".
    name: ClassVar[str]

    @staticmethod
    @abstractmethod
    def matches(accession: str) -> bool:
        """Tell whether this provider is the one that handles ``accession``."""

    @abstractmethod
    def list_files(self, accession: str) -> List[Dict]:
        """List the dataset's files as pridepy file records.

        Records are dicts laid out like a PRIDE V3 API file response. The
        minimum set of keys is:

        * ``accession``
        * ``fileName``
        * ``fileCategory`` (carrying a nested ``value``)
        * ``publicFileLocations`` (a list of ``{"name": ..., "value": <URL>}``)
        """

    @abstractmethod
    def download_files(
        self,
        accession: str,
        records: List[Dict],
        output_folder: str,
        skip_if_downloaded_already: bool,
        protocol: str,
        parallel_files: int = 1,
        checksum_check: bool = False,
        aspera_maximum_bandwidth: str = "100M",
        username: Optional[str] = None,
        password: Optional[str] = None,
    ) -> None:
        """Fetch ``records`` and store them under ``output_folder``."""


class BaseDirectDownloadProvider(Provider):
    """Shared ``download_files`` for MassIVE / JPOST / iProX.

    Subclasses set the ``use_tls`` class var (True for MassIVE FTPS, False for
    JPOST plain FTP) and override :meth:`list_files`. The shared
    ``download_files`` implementation partitions record URLs by scheme:
    ``ftp://`` URLs are handed to :meth:`Files.download_ftp_urls`; ``http(s)://``
    URLs go to :meth:`Files.download_http_urls`. It calls **back** into
    ``Files`` so that test patches on ``Files.download_ftp_urls`` /
    ``Files.download_http_urls`` continue to intercept the calls.
    """

    # Forwarded as ``use_tls`` to ``Files.download_ftp_urls``.
    use_tls: ClassVar[bool] = False

    def download_files(
        self,
        accession: str,
        records: List[Dict],
        output_folder: str,
        skip_if_downloaded_already: bool,
        protocol: str,
        parallel_files: int = 1,
        checksum_check: bool = False,
        aspera_maximum_bandwidth: str = "100M",
        username: Optional[str] = None,
        password: Optional[str] = None,
    ) -> None:
        """Download ``records`` into ``output_folder``, routing by URL scheme.

        Args:
            accession: Dataset accession; only used in log messages here.
            records: pridepy file records to download.
            output_folder: Destination directory, forwarded to ``Files``.
            skip_if_downloaded_already: Forwarded to ``Files`` unchanged.
            protocol: Requested protocol. Anything other than ``ftp`` /
                ``http`` / ``https`` is logged as ignored; the download
                still proceeds using each record's own URL scheme.
            parallel_files: Forwarded to ``Files`` unchanged.
            checksum_check: Accepted for interface parity; unused here.
            aspera_maximum_bandwidth: Accepted for interface parity; unused.
            username: Accepted for interface parity; unused here.
            password: Accepted for interface parity; unused here.
        """
        import logging

        # Lazy import: providers know about Files (the facade) only via the
        # public attributes that tests may patch; avoid module-load cycle.
        from pridepy.files.files import Files

        if protocol not in ("ftp", "https", "http"):
            logging.warning(
                "Direct downloads currently use ftp / https only. "
                f"Ignoring requested protocol '{protocol}' for {accession}."
            )

        ftp_urls: List[str] = []
        http_urls: List[str] = []
        skipped_count = 0
        for record in records:
            url = Files._get_download_url(record, "ftp")
            # NOTE(review): _get_download_url is not visible from here; guard
            # against a non-string result instead of crashing on ``.lower()``.
            lowered = url.lower() if isinstance(url, str) else ""
            if lowered.startswith("ftp://"):
                ftp_urls.append(url)
            elif lowered.startswith(("http://", "https://")):
                http_urls.append(url)
            else:
                skipped_count += 1

        if skipped_count:
            # Previously these records vanished without any trace in the log.
            logging.warning(
                f"Skipping {skipped_count} record(s) for {accession}: download URL "
                "is missing or is not an ftp:// or http(s):// URL."
            )

        if not ftp_urls and not http_urls:
            logging.info(
                f"No files matched for direct-download dataset {accession}"
            )
            return

        if ftp_urls:
            Files.download_ftp_urls(
                ftp_urls=ftp_urls,
                output_folder=output_folder,
                skip_if_downloaded_already=skip_if_downloaded_already,
                use_tls=self.use_tls,
                parallel_files=parallel_files,
            )
        if http_urls:
            Files.download_http_urls(
                http_urls=http_urls,
                output_folder=output_folder,
                skip_if_downloaded_already=skip_if_downloaded_already,
                parallel_files=parallel_files,
            )
129 changes: 129 additions & 0 deletions pridepy/providers/iprox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""iProX direct-download provider.

iProX publishes the ProteomeXchange XML for each dataset at a
deterministic path on its anonymous HTTP download server::

http://download.iprox.org/<accession>/PX_<accession>.xml

We fetch the XML, walk every ``<DatasetFile>``'s ``cvParam`` entries, and
turn each ``Associated raw file URI`` (and sibling URIs for search-engine
output, result files, etc.) into a pridepy file record. File downloads
themselves use the HTTP(S) URIs listed in that XML, served by the same
host, which supports ``Range`` requests for resume.
"""
import logging
import os
import re
import xml.etree.ElementTree as ET
from typing import ClassVar, Dict, List, Optional
from urllib.parse import urlparse

import requests

from pridepy.providers import registry
from pridepy.providers.base import BaseDirectDownloadProvider
from pridepy.providers.jpost import JpostProvider


@registry.register
class IproxProvider(BaseDirectDownloadProvider):
    """Direct-download provider for iProX datasets.

    Files are discovered from the dataset's ProteomeXchange XML and recorded
    with their HTTP(S) URL; the inherited ``download_files`` routes them by
    URL scheme.
    """

    name: ClassVar[str] = "iprox"
    use_tls: ClassVar[bool] = False  # download.iprox.org serves over plain HTTP

    DOWNLOAD_BASE_URL: ClassVar[str] = "http://download.iprox.org/"
    PX_XML_URL_TEMPLATE: ClassVar[str] = (
        "http://download.iprox.org/{accession}/PX_{accession}.xml"
    )
    # iProX PX XML uses the same PSI-MS cvParam "name" values as JPOST PROXI,
    # so we reuse JpostProvider's category map.
    PX_CATEGORY_MAP: ClassVar[Dict[str, str]] = JpostProvider.PROXI_CATEGORY_MAP

    @staticmethod
    def matches(accession: str) -> bool:
        """Return True when ``accession`` looks like an iProX dataset accession."""
        if not accession:
            return False
        return bool(re.fullmatch(r"IPX\d{7,10}", accession.upper()))

    @staticmethod
    def _get_public_root(accession: str) -> str:
        """Return the dataset's root path on the download host."""
        return f"/{accession.upper()}"

    @classmethod
    def _get_public_ftp_url(cls, accession: str, remote_path: str) -> str:
        """Build the public download URL for ``remote_path`` of ``accession``.

        A leading dataset root (``/<ACCESSION>``) is removed from
        ``remote_path`` before it is appended to the dataset's base URL.
        """
        # NOTE: name kept as `_get_public_ftp_url` for parity with other providers,
        # but iProX URLs are http(s) not ftp. The dispatcher routes by scheme.
        root_path = cls._get_public_root(accession).rstrip("/")
        relative_path = remote_path
        # Match on a path-segment boundary: iProX accessions vary in length, so
        # "/IPX0001234" is also a string prefix of "/IPX00012345/...", which a
        # bare startswith() would wrongly treat as being under this dataset.
        if remote_path == root_path or remote_path.startswith(f"{root_path}/"):
            relative_path = remote_path[len(root_path):].lstrip("/")
        return f"{cls.DOWNLOAD_BASE_URL}{accession.upper()}/{relative_path}"

    @classmethod
    def _build_file_record(
        cls, accession: str, https_url: str, category_from_px: Optional[str] = None
    ) -> Dict:
        """Build a pridepy file record for an iProX file.

        ``category_from_px`` is the ``cvParam`` ``name`` from the dataset's
        ProteomeXchange XML (e.g. ``"Associated raw file URI"``). When it is
        missing or not in ``PX_CATEGORY_MAP``, the category is derived from
        the first path segment via the MassIVE collection mapping.
        """
        from pridepy.providers.massive import MassiveProvider

        parsed = urlparse(https_url)
        root_prefix = f"/{accession.upper()}/"
        relative_path = parsed.path
        if relative_path.startswith(root_prefix):
            relative_path = relative_path[len(root_prefix):]
        relative_path = relative_path.lstrip("/")
        # The first path segment below the dataset root is the "collection".
        collection = relative_path.split("/", 1)[0] if relative_path else ""
        if category_from_px and category_from_px in cls.PX_CATEGORY_MAP:
            category = cls.PX_CATEGORY_MAP[category_from_px]
        else:
            category = MassiveProvider._map_collection_to_category(collection)
        return {
            "accession": accession.upper(),
            "fileName": os.path.basename(parsed.path),
            "fileCategory": {"value": category},
            # "FTP Protocol" is the existing label the download dispatcher uses
            # to locate a file URL; here it actually points at HTTPS.
            # BaseDirectDownloadProvider.download_files routes by URL scheme.
            "publicFileLocations": [{"name": "FTP Protocol", "value": https_url}],
            "relativePath": relative_path,
            "collection": collection,
            "source": "iProX",
        }

    def list_files(self, accession: str) -> List[Dict]:
        """Return file records parsed from the dataset's ProteomeXchange XML.

        Every ``cvParam`` under a ``DatasetFile`` element whose ``name`` ends
        in ``URI`` and whose ``value`` is an ``http(s)://`` URL becomes one
        record.

        Raises:
            requests.HTTPError: The XML request returned an error status.
            RuntimeError: The XML cannot be parsed, or it yields no records.
        """
        normalized = accession.upper()
        xml_url = self.PX_XML_URL_TEMPLATE.format(accession=normalized)
        logging.info(f"Fetching iProX PX XML: {xml_url}")
        response = requests.get(xml_url, timeout=30)
        response.raise_for_status()
        try:
            root = ET.fromstring(response.content)
        except ET.ParseError as parse_error:
            raise RuntimeError(
                f"Unable to parse iProX PX XML for {normalized}: {parse_error}"
            ) from parse_error

        records: List[Dict] = []
        for dataset_file in root.iter("DatasetFile"):
            for cv in dataset_file.findall("cvParam"):
                name = cv.attrib.get("name")
                value = cv.attrib.get("value")
                if not value or not name or not name.endswith("URI"):
                    continue
                if not value.lower().startswith(("http://", "https://")):
                    continue
                records.append(
                    self._build_file_record(
                        normalized,
                        value,
                        category_from_px=name,
                    )
                )
        if not records:
            raise RuntimeError(
                f"iProX PX XML for {normalized} contained no downloadable HTTPS URIs"
            )
        return records
150 changes: 150 additions & 0 deletions pridepy/providers/jpost.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
"""JPOST direct-download provider.

PRIMARY listing: PROXI JSON at repository.jpostdb.org. The PROXI endpoint
returns ``datasetFiles[*].value`` as ``ftp://`` URLs alongside CV labels
(Associated raw file URI, Search engine output file URI, etc.) which map
cleanly to PRIDE file categories.

FALLBACK listing: when PROXI fails, walk the FTP tree at ftp.jpostdb.org.
PROXI is tried first because JPOST's FTP server rate-limits aggressively
per source IP (sticky 421-too-many-connections); the PROXI path lets us
avoid walking the FTP tree just for a listing.
"""
import logging
import os
import re
from typing import ClassVar, Dict, List, Optional
from urllib.parse import urlparse

import requests

from pridepy.providers import registry
from pridepy.providers.base import BaseDirectDownloadProvider


@registry.register
class JpostProvider(BaseDirectDownloadProvider):
    """Direct-download provider for JPOST datasets.

    Listing prefers the PROXI JSON endpoint and falls back to walking the
    FTP tree; downloads use the inherited scheme-based ``download_files``.
    """

    name: ClassVar[str] = "jpost"
    use_tls: ClassVar[bool] = False

    ARCHIVE_FTP: ClassVar[str] = "ftp.jpostdb.org"
    ARCHIVE_FTP_URL_PREFIX: ClassVar[str] = "ftp://ftp.jpostdb.org/"
    PROXI_BASE_URL: ClassVar[str] = "https://repository.jpostdb.org/proxi/datasets/"

    # PROXI dataset-file CV name -> pridepy file category value.
    PROXI_CATEGORY_MAP: ClassVar[Dict[str, str]] = {
        "Associated raw file URI": "RAW",
        "Result file URI": "RESULT",
        "Search engine output file URI": "SEARCH",
        "Peak list file URI": "PEAK",
        "Spectrum library file URI": "SPECTRUM_LIBRARY",
        "Sequence database URI": "FASTA",
        "Quantification file URI": "RESULT",
    }

    @staticmethod
    def matches(accession: str) -> bool:
        """Return True when ``accession`` looks like a JPOST dataset accession."""
        if not accession:
            return False
        return bool(re.fullmatch(r"JPST\d{6}", accession.upper()))

    @staticmethod
    def _get_public_root(accession: str) -> str:
        """Return the dataset's root path on the JPOST FTP server."""
        return f"/{accession.upper()}"

    @classmethod
    def _get_public_ftp_url(cls, accession: str, remote_path: str) -> str:
        """Build the public ``ftp://`` URL for ``remote_path`` of ``accession``.

        A leading dataset root (``/<ACCESSION>``) is removed from
        ``remote_path`` before it is appended to the dataset's FTP prefix.
        """
        root_path = cls._get_public_root(accession).rstrip("/")
        relative_path = remote_path
        # Match on a path-segment boundary so that a sibling path that merely
        # starts with the same characters (e.g. "/JPST000001_x/...") is not
        # mistaken for a path under this dataset's root.
        if remote_path == root_path or remote_path.startswith(f"{root_path}/"):
            relative_path = remote_path[len(root_path):].lstrip("/")
        return f"{cls.ARCHIVE_FTP_URL_PREFIX}{accession.upper()}/{relative_path}"

    @classmethod
    def _build_file_record(
        cls, accession: str, ftp_url: str, category_from_proxi: Optional[str] = None
    ) -> Dict:
        """Build a pridepy file record from an FTP URL.

        When ``category_from_proxi`` is provided (e.g. ``"Associated raw file URI"``),
        the PROXI CV name takes precedence over the heuristic collection-from-path
        mapping. Falls back to the same path-segment heuristic used for MassIVE
        when the category isn't known.
        """
        # Import the MassIVE collection->category map for the fallback heuristic.
        from pridepy.providers.massive import MassiveProvider

        parsed = urlparse(ftp_url)
        root_prefix = f"/{accession.upper()}/"
        relative_path = parsed.path
        if relative_path.startswith(root_prefix):
            relative_path = relative_path[len(root_prefix):]
        relative_path = relative_path.lstrip("/")
        # The first path segment below the dataset root is the "collection".
        collection = relative_path.split("/", 1)[0] if relative_path else ""
        if category_from_proxi and category_from_proxi in cls.PROXI_CATEGORY_MAP:
            category = cls.PROXI_CATEGORY_MAP[category_from_proxi]
        else:
            category = MassiveProvider._map_collection_to_category(collection)
        return {
            "accession": accession.upper(),
            "fileName": os.path.basename(parsed.path),
            "fileCategory": {"value": category},
            "publicFileLocations": [{"name": "FTP Protocol", "value": ftp_url}],
            "relativePath": relative_path,
            "collection": collection,
            "source": "JPOST",
        }

    def list_files(self, accession: str) -> List[Dict]:
        """List the dataset's files. PRIMARY: PROXI JSON. FALLBACK: FTP tree walk.

        Any exception raised by the PROXI path is logged as a warning and
        triggers the FTP fallback, so one failing source does not abort the
        listing.
        """
        normalized = accession.upper()
        try:
            return self._list_via_proxi(normalized)
        except Exception as proxi_error:
            # Deliberately broad: whatever goes wrong with PROXI, try FTP next.
            logging.warning(
                f"JPOST PROXI listing failed for {normalized} "
                f"({proxi_error}); falling back to FTP tree walk."
            )
        from pridepy.providers import transport

        remote_root = self._get_public_root(normalized)
        remote_files = transport._list_ftp_repo_files(
            host=self.ARCHIVE_FTP,
            remote_root=remote_root,
            error_label=f"JPOST dataset {normalized}",
        )
        return [
            self._build_file_record(
                normalized,
                self._get_public_ftp_url(normalized, remote_file),
            )
            for remote_file in remote_files
        ]

    def _list_via_proxi(self, accession: str) -> List[Dict]:
        """Fetch JPOST PROXI dataset metadata and turn each datasetFiles entry into a file record.

        Only entries whose ``value`` is an ``ftp://`` URL are kept; the
        entry's ``name`` is passed on as the PROXI category label.

        Raises:
            requests.HTTPError: The PROXI request returned an error status.
            RuntimeError: The response contained no FTP file URIs.
        """
        import json as _json

        proxi_url = f"{self.PROXI_BASE_URL}{accession}"
        logging.info(f"Fetching JPOST PROXI metadata: {proxi_url}")
        response = requests.get(
            proxi_url,
            headers={"Accept": "application/json"},
            timeout=30,
        )
        response.raise_for_status()
        data = _json.loads(response.content)
        dataset_files = data.get("datasetFiles") or []
        records: List[Dict] = []
        for entry in dataset_files:
            value = (entry or {}).get("value")
            # Compare the scheme case-insensitively, matching how the shared
            # download dispatcher and the iProX provider classify URLs.
            if not value or not value.lower().startswith("ftp://"):
                continue
            records.append(
                self._build_file_record(
                    accession,
                    value,
                    category_from_proxi=(entry or {}).get("name"),
                )
            )
        if not records:
            raise RuntimeError(
                f"JPOST PROXI returned no FTP file URIs for {accession}"
            )
        return records
Loading
Loading