Skip to content

Commit 4ba8ab9

Browse files
authored
Merge pull request #1 from seqcode/sphinx
Sphinx document integration
2 parents b404786 + 76ed44d commit 4ba8ab9

8 files changed

Lines changed: 315 additions & 2 deletions

File tree

docs/Makefile

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Minimal makefile for Sphinx documentation
2+
#
3+
4+
# You can set these variables from the command line, and also
5+
# from the environment for the first two.
6+
SPHINXOPTS ?=
7+
SPHINXBUILD ?= sphinx-build
8+
SOURCEDIR = source
9+
BUILDDIR = build
10+
11+
# Put it first so that "make" without argument is like "make help".
12+
help:
13+
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14+
15+
.PHONY: help Makefile
16+
17+
# Catch-all target: route all unknown targets to Sphinx using the new
18+
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19+
%: Makefile
20+
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

docs/make.bat

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
@ECHO OFF
2+
3+
pushd %~dp0
4+
5+
REM Command file for Sphinx documentation
6+
7+
if "%SPHINXBUILD%" == "" (
8+
set SPHINXBUILD=sphinx-build
9+
)
10+
set SOURCEDIR=source
11+
set BUILDDIR=build
12+
13+
if "%1" == "" goto help
14+
15+
%SPHINXBUILD% >NUL 2>NUL
16+
if errorlevel 9009 (
17+
echo.
18+
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19+
echo.installed, then set the SPHINXBUILD environment variable to point
20+
echo.to the full path of the 'sphinx-build' executable. Alternatively you
21+
echo.may add the Sphinx directory to PATH.
22+
echo.
23+
echo.If you don't have Sphinx installed, grab it from
24+
echo.http://sphinx-doc.org/
25+
exit /b 1
26+
)
27+
28+
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29+
goto end
30+
31+
:help
32+
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33+
34+
:end
35+
popd

docs/source/conf.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Configuration file for the Sphinx documentation builder.
2+
#
3+
# This file only contains a selection of the most common options. For a full
4+
# list see the documentation:
5+
# https://www.sphinx-doc.org/en/master/usage/configuration.html
6+
7+
# -- Path setup --------------------------------------------------------------
8+
9+
# If extensions (or modules to document with autodoc) are in another directory,
10+
# add these directories to sys.path here. If the directory is relative to the
11+
# documentation root, use os.path.abspath to make it absolute, like shown here.
12+
#
13+
# import os
14+
# import sys
15+
# sys.path.insert(0, os.path.abspath('.'))
16+
import pathlib
17+
import sys
18+
sys.path.insert(0, pathlib.Path(__file__).parents[2].resolve().as_posix())
19+
20+
# import MOCK
21+
from unittest import mock
22+
23+
# Mock imports
24+
autodoc_mock_imports = ["numpy",
25+
"pandas",
26+
"torch",
27+
"torch.utils.data",
28+
"pysam",
29+
"pybedtools",
30+
"pyfasta",
31+
"pyBigWig",
32+
"pytorch_lightning",
33+
"webdataset"]
34+
35+
# -- Project information -----------------------------------------------------
36+
37+
project = 'seqchromloader'
38+
copyright = '2023, Jianyu Yang'
39+
author = 'Jianyu Yang'
40+
41+
# The full version, including alpha/beta/rc tags
42+
release = '0.2.4'
43+
44+
45+
# -- General configuration ---------------------------------------------------
46+
47+
# Add any Sphinx extension module names here, as strings. They can be
48+
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
49+
# ones.
50+
extensions = [
51+
'sphinx.ext.duration',
52+
'sphinx.ext.autodoc'
53+
]
54+
55+
# Add any paths that contain templates here, relative to this directory.
56+
templates_path = ['_templates']
57+
58+
# List of patterns, relative to source directory, that match files and
59+
# directories to ignore when looking for source files.
60+
# This pattern also affects html_static_path and html_extra_path.
61+
exclude_patterns = []
62+
63+
intersphinx_mapping = {
64+
'python': ('https://docs.python.org/3/', None),
65+
'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
66+
}
67+
intersphinx_disabled_domains = ['std']
68+
69+
templates_path = ['_templates']
70+
71+
# -- Options for HTML output
72+
73+
html_theme = 'sphinx_rtd_theme'
74+
75+
# Add any paths that contain custom static files (such as style sheets) here,
76+
# relative to this directory. They are copied after the builtin static files,
77+
# so a file named "default.css" will overwrite the builtin "default.css".
78+
# html_static_path = ['_static']

docs/source/index.rst

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
.. seqchromloader documentation master file, created by
2+
sphinx-quickstart on Mon Mar 20 20:12:15 2023.
3+
You can adapt this file completely to your liking, but it should at least
4+
contain the root `toctree` directive.
5+
6+
Welcome to seqchromloader's documentation!
7+
==========================================
8+
9+
seqchromloader aims to provide versatile and ready-to-use writers/loaders for applying deep learning to bioinformatics studies.
10+
11+
Plan to support dataset formats including:
12+
13+
* webdataset (done)
14+
* tfrecord (x)
15+
16+
Training framework support:
17+
18+
* pytorch dataloader (done)
19+
* pytorch-lightning datamodule (done)
20+
* NVIDIA-DALI (x)
21+
22+
Check out the :doc:`usage` section for further information, including how to
23+
:doc:`install` the project.
24+
25+
.. toctree::
26+
:maxdepth: 2
27+
:caption: Contents:
28+
install
29+
usage
30+
31+
32+
Indices and tables
33+
==================
34+
35+
* :ref:`genindex`
36+
* :ref:`modindex`
37+
* :ref:`search`

docs/source/install.rst

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
Installation
2+
============
3+
4+
conda (suggested):
5+
6+
.. code-block:: console
7+
8+
mamba install -c bioconda -c conda-forge seqchromloader
9+
10+
or
11+
12+
.. code-block:: console
13+
14+
conda install -c bioconda -c conda-forge seqchromloader
15+
16+
pip
17+
18+
.. code-block:: console
19+
20+
pip install seqchromloader

docs/source/usage.rst

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
Usage
2+
=====
3+
4+
``seqchromloader`` is composed of two types of functions: ``writer`` and ``loader``. You can use ``writer`` to dump dataset into webdataset format file for future use, or directly call ``loader`` to get tensors immediately.
5+
6+
Generally ``seqchromloader`` would produce four kinds of tensors: **[seq, chrom, target, label]**
7+
8+
* **seq** is one-hot coded DNA sequence tensor of shape *[batch_size, 4, len]* using the DNA mapping order of "ACGT" (which means, A = [1,0,0,0], C = [0,1,0,0], ...)
9+
* **chrom** is chromatin track tensor of shape *[batch_size, # tracks, len]*, chromatin track bigwig files are usually provided by ``bigwig_filelist`` parameter
10+
* **target** is the tensor representing the number of sequencing reads in the region, this is from the bam file given by ``target_bam`` parameter
11+
* **label** is the integer label of each sample, when given bed file input, this info would be from the fourth column. While given a pandas DataFrame, it should have a column named *label*
12+
13+
Writer
14+
------
15+
16+
Currently only webdataset format is supported, you can write tensors into webdataset in this way:
17+
18+
.. code-block:: python3
19+
20+
import pandas as pd
21+
from seqchromloader import dump_data_webdataset
22+
23+
coords = pd.DataFrame({
24+
"chrom": ["chr1", "chr10"],
25+
"start": [1000, 5000],
26+
"end": [1200, 5200],
27+
"label": [0, 1]
28+
})
29+
wds_file_lists = dump_data_webdataset(coords,
30+
genome_fasta="mm10.fa",
31+
bigwig_filelist=["h3k4me3.bw", "atacseq.bw"],
32+
outdir="dataset/",
33+
outprefix="test",
34+
compress=True,
35+
numProcessors=4,
36+
transforms={"chrom": lambda x: x+1})
37+
38+
.. note::
39+
Each region should be of the same length! As in this example, every region is 200bp long.
40+
41+
The returned ``wds_file_lists`` contain the output file paths, every file has ~7000 samples.
42+
43+
One thing worth noting is the ``transforms`` parameter: ``transforms`` accepts a dictionary of functions, and each function is called on the output its key refers to. In this example, the add-1 lambda function is called on each ``chrom`` tensor; you can do more complicated transformations this way, e.g., standardizing the tensor.
44+
45+
Loader
46+
------
47+
48+
You can easily load the webdataset files generated by ``seqchromloader.dump_data_webdataset`` above by:
49+
50+
.. code-block:: python3
51+
52+
from seqchromloader import SeqChromDatasetByWds
53+
54+
dataloader = SeqChromDatasetByWds(wds_file_lists, transforms=None, rank=0, world_size=1)
55+
seq, chrom, target, label = next(iter(dataloader))
56+
57+
If you are using multiple GPUs, you can use ``rank`` and ``world_size`` to shard the dataset, ensuring each GPU gets a non-overlapping piece of the dataset.
58+
59+
A more straightforward way is using ``seqchromloader.SeqChromDatasetByBed``, which can output tensors given a bed file and other required files.
60+
61+
.. code-block:: python3
62+
63+
from seqchromloader import SeqChromDatasetByBed
64+
65+
dataloader = SeqChromDatasetByBed(bed="regions.bed",
66+
genome_fasta="mm10.fa",
67+
bigwig_filelist=["h3k4me3.bw", "atacseq.bw"],
68+
target_bam="foxa1.bam",
69+
transforms={"label": lambda x: x-1},
70+
dataloader_kws={"num_workers": 4})
71+
seq, chrom, target, label = next(iter(dataloader))
72+
73+
Here I pass a dictionary of keyword arguments that will be further passed to ``torch.utils.data.DataLoader`` to increase the number of workers (default is 1); you can refer to the `PyTorch DataLoader documentation <https://pytorch.org/docs/stable/data.html>`_ to explore more ways to control DataLoader behavior.
74+
75+
API
76+
---
77+
78+
.. autofunction:: seqchromloader.dump_data_webdataset
79+
80+
.. autofunction:: seqchromloader.SeqChromDatasetByBed
81+
82+
.. autofunction:: seqchromloader.SeqChromDatasetByWds

seqchromloader/loader.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,13 @@ def worker_init_fn(worker_id):
2626
dataset.initialize()
2727

2828
class SeqChromLoader():
29+
"""
30+
:param dataloader_kws: keyword arguments passed to ``torch.utils.data.DataLoader``
31+
:type dataloader_kws: dict of kwargs
32+
"""
2933
def __init__(self, SeqChromDataset):
3034
self.SeqChromDataset = SeqChromDataset
35+
self.__doc__ = self.__doc__ + self.SeqChromDataset.__doc__
3136

3237
def __call__(self, *args, dataloader_kws:dict={}, **kwargs):
3338
# default dataloader kws
@@ -46,6 +51,12 @@ def seqChromLoaderCurry(SeqChromDataset):
4651
return SeqChromLoader(SeqChromDataset)
4752

4853
class _SeqChromDatasetByWds(IterableDataset):
54+
"""
55+
:param wds: list of webdataset files to get samples from
56+
:type wds: list of str
57+
:param transforms: A dictionary of functions to transform the output data, accepted keys are **["seq", "chrom", "target", "label"]**
58+
:type transforms: dict of functions
59+
"""
4960
def __init__(self, wds, transforms:dict=None, rank=0, world_size=1):
5061
self.wds = wds
5162
self.transforms = transforms
@@ -83,6 +94,18 @@ def __iter__(self):
8394
SeqChromDatasetByWds = seqChromLoaderCurry(_SeqChromDatasetByWds)
8495

8596
class _SeqChromDatasetByBed(Dataset):
97+
"""
98+
:param bed: Bed file describing genomics regions to extract info from, every region has to be of the same length.
99+
:type bed: str
100+
:param genome_fasta: Genome fasta file.
101+
:type genome_fasta: str
102+
:param bigwig_filelist: A list of bigwig files containing track information (e.g., histone modifications)
103+
:type bigwig_filelist: list of str or None
104+
:param target_bam: bam file to get # reads in each region
105+
:type target_bam: str or None
106+
:param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
107+
:type transforms: dict of functions
108+
"""
86109
def __init__(self, bed, genome_fasta, bigwig_filelist:list, target_bam=None, transforms:dict=None, initialize_first=False):
87110
self.bed = pd.read_table(bed, header=None, names=['chrom', 'start', 'end', 'label', 'score', 'strand' ])
88111

seqchromloader/writer.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,26 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
2626
numProcessors=1,
2727
transforms=None):
2828
"""
29-
Given coordinates dataframe, extract the sequence and chromatin signal,
30-
Then save in **TFReocrd** format
29+
Given coordinates dataframe, extract the sequence and chromatin signal, save in webdataset format
30+
31+
:param coords: pandas DataFrame containing genomic coordinates with columns **[chrom, start, end, label]**
32+
:type coords: pandas DataFrame
33+
:param genome_fasta: Genome fasta file.
34+
:type genome_fasta: str
35+
:param bigwig_filelist: A list of bigwig files containing track information (e.g., histone modifications)
36+
:type bigwig_filelist: list of str or None
37+
:param target_bam: bam file to get # reads in each region
38+
:type target_bam: str or None
39+
:param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
40+
:type transforms: dict of functions
41+
:param outdir: output directory to save files in
42+
:type outdir: str
43+
:param outprefix: prefix of output files
44+
:type outprefix: str
45+
:param compress: whether to compress the output files
46+
:type compress: boolean
47+
:param numProcessors: number of processors
48+
:type numProcessors: int
3149
"""
3250

3351
# split coordinates and assign chunks to workers

0 commit comments

Comments
 (0)