Skip to content

Commit d6293bc

Browse files
committed
Revisions (#59)
Added QC flags; added per-read GC content histograms; added a % read length vs. base modification probability scatter plot; added a base modification probability histogram.
1 parent 4a610e4 commit d6293bc

27 files changed

Lines changed: 1610 additions & 890 deletions

.github/workflows/build-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ jobs:
3535

3636
- name: Build LongReadSum
3737
shell: bash --login {0} # --login enables PATH variable access
38-
run: make
38+
run: make -d
3939

4040
- name: Run tests
4141
shell: bash --login {0}

Dockerfile

Lines changed: 12 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,19 @@
11
# Use the miniconda container
2-
FROM continuumio/miniconda3
2+
FROM continuumio/miniconda3:main
33

4-
# Copy the project directory
5-
COPY . /app/longreadsum
6-
WORKDIR /app/longreadsum
4+
WORKDIR /app
75

8-
# Install build tools
9-
RUN apt-get update && apt-get install build-essential -y
6+
RUN apt-get update
7+
RUN conda update conda
108

11-
# Install VBZ compression
12-
RUN wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
13-
RUN tar -xf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
14-
15-
# Create the environment
16-
RUN conda env create -f environment.yml
17-
18-
# Activate the environment
9+
# Install LongReadSum
10+
RUN conda config --add channels wglab
11+
RUN conda config --add channels conda-forge
12+
RUN conda config --add channels bioconda
13+
RUN conda config --add channels jannessp
14+
RUN conda create -n longreadsum python=3.9
1915
RUN echo "conda activate longreadsum" >> ~/.bashrc
2016
SHELL ["/bin/bash", "--login", "-c"]
17+
RUN conda install -n longreadsum -c wglab -c conda-forge -c jannessp -c bioconda longreadsum=1.5.0 && conda clean -afy
2118

22-
# Ensure the correct environment is being used
23-
RUN export PATH="/opt/conda/envs/longreadsum/bin/python"
24-
RUN which python
25-
26-
# Build LongReadSum
27-
RUN make
28-
29-
# Set up the HDF5 plugin path
30-
ENV HDF5_PLUGIN_PATH="/longreadsum/lib/"
31-
32-
# The code to run when container is started:
33-
ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "longreadsum", "python", "/app/longreadsum"]
19+
ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "longreadsum", "longreadsum"]

Makefile

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,28 @@ SRC_DIR := $(CURDIR)/src
33
LIB_DIR := $(CURDIR)/lib
44

55
# Set the library paths for the compiler
6-
LIBRARY_PATHS := -L$(LIB_DIR) -L/usr/share/miniconda/envs/longreadsum/lib
7-
INCLUDE_PATHS := -I$(INCL_DIR) -I/usr/share/miniconda/envs/longreadsum/include
6+
CONDA_PREFIX ?= $(shell echo $$CONDA_PREFIX)
7+
LIBRARY_PATHS := -L$(LIB_DIR) -L$(CONDA_PREFIX)/lib
8+
INCLUDE_PATHS := -I$(INCL_DIR) -I$(CONDA_PREFIX)/include
89

910
# All targets
1011
all: swig_build compile
1112

1213
# Generate the SWIG Python/C++ wrappers
1314
swig_build:
15+
mkdir -p $(LIB_DIR)
1416
swig -c++ -python -outdir $(LIB_DIR) -I$(INCL_DIR) -o $(SRC_DIR)/lrst_wrap.cpp $(SRC_DIR)/lrst.i
1517

18+
# Create the lib directory if it doesn't exist
19+
mkdir -p $(LIB_DIR)
20+
1621
# Compile the C++ shared libraries into lib/
1722
compile:
18-
LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):/usr/share/miniconda/envs/longreadsum/lib \
23+
LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(LIB_DIR) \
1924
CXXFLAGS="$(INCLUDE_PATHS)" LDFLAGS="$(LIBRARY_PATHS)" python3 setup.py build_ext --build-lib $(LIB_DIR)
25+
26+
# Clean the build directory
27+
clean:
28+
$(RM) -r $(LIB_DIR)/*.so $(LIB_DIR)/*.py $(SRC_DIR)/lrst_wrap.cpp build/
29+
30+
# LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(CONDA_PREFIX)/lib \

README.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ conda activate longreadsum
4141
LongReadSum and its dependencies can then be installed using the following command:
4242

4343
```
44-
conda install -c wglab -c conda-forge -c jannessp -c bioconda longreadsum=1.4.0
44+
conda install -c wglab -c conda-forge -c jannessp -c bioconda longreadsum=1.5.0
4545
```
4646

4747
# Installation using Docker
@@ -148,7 +148,7 @@ MinION R9.4.1 from https://labs.epi2me.io/gm24385-5mc/)
148148

149149
## General usage
150150
```
151-
longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY --ref $REF_GENOME --modprob 0.8
151+
longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY --mod --modprob 0.8 --ref $REF_GENOME
152152
```
153153

154154
# RRMS BAM
@@ -258,7 +258,12 @@ longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY
258258
# ONT POD5
259259

260260
This section describes how to generate QC reports for ONT POD5 (signal) files and their corresponding basecalled BAM files (data shown is HG002 using ONT
261-
R10.4.1 and LSK114 downloaded from the tutorial https://github.com/epi2me-labs/wf-basecalling).
261+
R10.4.1 and LSK114 downloaded from the tutorial
262+
https://github.com/epi2me-labs/wf-basecalling).
263+
264+
> [!NOTE]
265+
> This requires generating basecalled BAM files with the move table output. For
266+
> example, for [dorado](https://github.com/nanoporetech/dorado), the parameter is `--emit-moves`
262267
263268
![image](https://github.com/user-attachments/assets/62c3c810-5c1a-4124-816b-74245af8b57c)
264269

conda/build.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,27 @@
33
# Add the library path to the LD_LIBRARY_PATH
44
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PREFIX}/lib
55

6+
# Ensure the lib directory exists
7+
mkdir -p "${SRC_DIR}"/lib
8+
69
# Generate the SWIG files
10+
echo "Generating SWIG files..."
711
swig -c++ -python -outdir "${SRC_DIR}"/lib -I"${SRC_DIR}"/include -I"${PREFIX}"/include -o "${SRC_DIR}"/src/lrst_wrap.cpp "${SRC_DIR}"/src/lrst.i
812

913
# Generate the shared library
14+
echo "Building the shared library..."
1015
$PYTHON setup.py -I"${PREFIX}"/include -L"${PREFIX}"/lib install
1116

1217
# Create the src directory
1318
mkdir -p "${PREFIX}"/src
1419

1520
# Copy source files to the bin directory
21+
echo "Copying source files..."
1622
cp -r "${SRC_DIR}"/src/*.py "${PREFIX}"/bin
1723

1824
# Copy the SWIG generated library to the lib directory
25+
echo "Copying SWIG generated library..."
1926
cp -r "${SRC_DIR}"/lib/*.py "${PREFIX}"/lib
2027
cp -r "${SRC_DIR}"/lib/*.so "${PREFIX}"/lib
28+
29+
echo "Build complete."

conda/meta.yaml

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
1-
{% set version = "1.4.0" %}
2-
# {% set revision = "b06670513616fd6342233c1c77e6d0bcf138b3bc" %}
1+
{% set version = "1.5.0" %}
2+
{% set revision = "11961b2cc93534057df2bd668c2277267222916b" %}
33

44
package:
55
name: longreadsum
66
version: {{ version }}
77

88
source:
9-
path: ../
10-
# git_url: https://github.com/WGLab/LongReadSum.git
11-
# git_rev: {{ revision }}
9+
git_url: https://github.com/WGLab/LongReadSum.git
10+
git_rev: {{ revision }}
11+
# path: ../
1212

1313
channels:
1414
- conda-forge
@@ -29,20 +29,16 @@ requirements:
2929
host:
3030
- python=3.9
3131
- swig
32-
- hdf5
3332
- htslib=1.20
34-
# - jannessp::pod5
35-
# - jannessp::lib-pod5
33+
- ont_vbz_hdf_plugin # Contains HDF5 as a dependency as well
3634
run:
3735
- python=3.9
3836
- numpy
39-
- hdf5
4037
- ont_vbz_hdf_plugin
41-
- htslib=1.20
38+
- bioconda::htslib=1.20
4239
- plotly
43-
- janessp::pod5
40+
- jannessp::pod5
4441
- pyarrow
45-
# - janessp::lib-pod5
4642

4743
test:
4844
commands:

environment.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
name: longreadsum
22
channels:
33
- conda-forge
4+
- jannessp # for pod5
45
- bioconda
56
- defaults
6-
- jannessp # for pod5
7+
78
dependencies:
89
- python=3.9
910
- numpy
10-
- hdf5
1111
- ont_vbz_hdf_plugin
12-
- htslib=1.20
12+
- bioconda::htslib=1.20
1313
- swig
1414
- plotly
1515
- pytest
16-
- pod5
17-
- pyarrow
16+
- jannessp::pod5
17+
- pyarrow

include/hts_reader.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ class HTSReader {
3838
bool reading_complete = false;
3939

4040
// Update read and base counts
41-
int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics* basic_qc, uint64_t *base_quality_distribution);
41+
int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, Basic_Seq_Quality_Statistics& seq_quality_info, bool is_primary);
4242

4343
// Read the next batch of records from the BAM file
4444
int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set<std::string>& read_ids, double base_mod_threshold);
@@ -47,9 +47,12 @@ class HTSReader {
4747
bool hasNextRecord();
4848

4949
// Return the number of records in the BAM file using the BAM index
50-
int64_t getNumRecords(const std::string &bam_file_name, Output_BAM &final_output, bool mod_analysis, double base_mod_threshold);
50+
int getNumRecords(const std::string &bam_file_name, int thread_count);
5151

52-
std::map<int, int> getQueryToRefMap(bam1_t *record);
52+
// Run base modification analysis
53+
void runBaseModificationAnalysis(const std::string &bam_filename, Output_BAM& final_output, double base_mod_threshold, int read_count, int sample_count, int thread_count);
54+
55+
std::map<int, int> getQueryToRefMap(bam1_t* record);
5356

5457
// Add a modification to the base modification map
5558
void addModificationToQueryMap(std::map<int32_t, std::tuple<char, char, double, int>> &base_modifications, int32_t pos, char mod_type, char canonical_base, double likelihood, int strand);

0 commit comments

Comments (0)