Skip to content

Commit dd561d9

Browse files
committed
Merge branch 'dev' into docs_v2
2 parents e939a24 + 085a3f1 commit dd561d9

32 files changed

Lines changed: 1751 additions & 801485 deletions

bin/build_coord_files

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/usr/bin/env python3
2-
# Copyright 2022 Vicky Hunt Lab Members
2+
# Copyright 2022 - 2025 Vicky Hunt Lab Members
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License");
55
# you may not use this file except in compliance with the License.
Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
# Copyright 2022 Vicky Hunt Lab Members
1+
#!/usr/bin/env python3
2+
# Copyright 2022 - 2025 Vicky Hunt Lab Members
23
#
34
# Licensed under the Apache License, Version 2.0 (the "License");
45
# you may not use this file except in compliance with the License.
@@ -16,8 +17,7 @@
1617

1718
from sys import argv
1819
from collections import defaultdict
19-
20-
from .config import do_log
20+
from argparse import ArgumentParser
2121

2222
def get_pairs(array):
2323
'''
@@ -115,14 +115,14 @@ def merge_cds(coding_reigon):
115115

116116
return n_merged
117117

118-
def extract_noncoding(genome, gff_path, quiet=0, output='result.fasta'):
118+
def extract_noncoding(genome, gff_path, output='result.fasta'):
119119
'''
120120
Extract the noncoding region from the genome, based on a GFF file
121121
'''
122122
gff_iter = DataIterator(gff_path)
123123
genome_data = SeqIO.parse(genome, 'fasta')
124124

125-
do_log(quiet, '====> Calculating coordinates')
125+
print('====> Calculating coordinates')
126126

127127
coordinates = defaultdict(lambda: [])
128128
mRNAs = defaultdict(lambda: [])
@@ -141,13 +141,13 @@ def extract_noncoding(genome, gff_path, quiet=0, output='result.fasta'):
141141
if item[2] == 'CDS':
142142
coding_reigon[item[0] + item[6]].append([int(item[3]), int(item[4])])
143143

144-
do_log(quiet, '====> Merging and validating coordinates')
144+
print('====> Merging and validating coordinates')
145145

146146
cds_merged = merge_cds(coding_reigon)
147147
mRNA_merged = merge_cds(mRNAs)
148148
validate_gff(mRNAs, coding_reigon)
149149

150-
do_log(quiet, f'Merged {cds_merged} coding reigons and {mRNA_merged} mRNAs')
150+
print(f'Merged {cds_merged} coding reigons and {mRNA_merged} mRNAs')
151151

152152
for key in mRNAs.keys():
153153
for item in mRNAs[key]:
@@ -159,12 +159,21 @@ def extract_noncoding(genome, gff_path, quiet=0, output='result.fasta'):
159159
coordinates[key].append(item[0])
160160
coordinates[key].append(item[1])
161161

162-
do_log(quiet, '====> Extarcting fragments')
162+
print('====> Extarcting fragments')
163163

164164
for key in coordinates.keys():
165165
coordinates[key].sort()
166166

167167
SeqIO.write(extract_fragments(genome_data, coordinates, mRNA_start, mRNA_end), output, 'fasta')
168168

169169
if __name__ == '__main__':
170-
extract_noncoding(argv[1], argv[2])
170+
parser = ArgumentParser('')
171+
172+
parser.add_argument('genome', help='FASTA containing the genome to extract from')
173+
parser.add_argument('gff_file', help='GFF file containing annotations of CDS and mRNA regions')
174+
175+
parser.add_argument('-o', '--output', help='FASTA file to write output to', default='result.fasta')
176+
177+
args = parser.parse_args()
178+
179+
extract_noncoding(args.genome, args.gff_file, output=args.output)

bin/overlap_ss.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/bin/sh
2-
# Copyright 2022 Vicky Hunt Lab Members
2+
# Copyright 2022 - 2025 Vicky Hunt Lab Members
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License");
55
# you may not use this file except in compliance with the License.
@@ -177,4 +177,4 @@ tar -cf Ouput_zip.tar unique.txt genome* Main* Pass* beg* end* length* overhang*
177177
rm unique.txt genome* Main* Pass* beg* end* length* overhang* cleanup.txt No* ps* Rev* ms*
178178

179179
echo "Distribution of sequences counted"
180-
echo "Complete"
180+
echo "Complete"

bin/revcomp_rna

100644100755
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/usr/bin/env python3
2-
# Copyright 2022 Vicky Hunt Lab Members
2+
# Copyright 2022 - 2025 Vicky Hunt Lab Members
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License");
55
# you may not use this file except in compliance with the License.
@@ -37,4 +37,4 @@ def reverse_complement(seq):
3737
seqs = SeqIO.parse(args.input, 'fasta')
3838
seqs = map(reverse_complement, seqs)
3939

40-
SeqIO.write(seqs, args.output, 'fasta')
40+
SeqIO.write(seqs, args.output, 'fasta')

config.toml

Lines changed: 0 additions & 28 deletions
This file was deleted.

environment.yml

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,23 @@ channels:
33
- bioconda
44
- conda-forge
55
dependencies:
6-
- python=3.8
7-
- toml=0.10.*
8-
- pysam=0.19.*
9-
- biopython=1.*
10-
- gffutils=0.11.*
11-
- matplotlib=3.5.*
12-
- cutadapt=4.*
13-
- fastqc=0.11.*
14-
- bbmap=38.*
15-
- unitas=1.6.1
16-
- perl-archive-extract=0.88
17-
- bioconda/label/main::perl-lwp-simple=6.39
18-
- bowtie2=2.4.*
19-
- samtools=1.*
20-
- bedtools=2.*
6+
- python
7+
- pyyaml
8+
- pysam
9+
- biopython
10+
- gffutils
11+
- matplotlib
12+
- cutadapt
13+
- unitas
14+
- perl-archive-extract
15+
- bioconda/label/main::perl-lwp-simple
16+
- bowtie2
17+
- samtools
18+
- bedtools
19+
- eggnog-mapper
20+
- scipy
21+
- r
22+
- bioconductor-topgo
2123
- pip
2224
- pip:
2325
- .
24-

example_config.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
smallRNA_fastq: smallrna.fastq
2+
# size_sorted_fastqs: binned_reads
3+
compress_output: true
4+
keep_intermediate_files: true
5+
cds: cds.fasta
6+
unspliced_transcriptome: unspliced.fasta
7+
trim:
8+
# input: smallrna.fastq
9+
kit: qiagen
10+
# 5_prime: ACGTTTAG
11+
# 3_prime: CGTAGGAT
12+
min_quality: 20
13+
# new behaviour: output by first base as well e.g. file of 26G
14+
sort:
15+
# input: output/trimmed_reads.fq
16+
genome: genome.fasta
17+
align_to_cds: True
18+
min_length: 5
19+
max_length: 50
20+
mismatches: 0
21+
unitas:
22+
# input: output/binned_rna
23+
refseq:
24+
- gene
25+
- miRNA: test/miRNA.fasta
26+
- piRNA: test/piRNA.fasta
27+
- tRNA: test/tRNA.fasta
28+
- TE: test/transposable_elements.fasta
29+
species: x
30+
targetid:
31+
min_seq_length: 5
32+
target_files:
33+
- test/file1.fasta
34+
- test/file2.fasta
35+
mismatches: 0
36+
enrich:
37+
eggnog_data_dir: /home/user/eggnog-mapper-data
38+
exclude_files:
39+
- test/file2.fasta

hlsmallrna/__init__.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2022 Vicky Hunt Lab Members
1+
# Copyright 2022 - 2025 Vicky Hunt Lab Members
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -11,10 +11,15 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
from .__main__ import main, main_ssoverlap
14+
from .__main__ import main
15+
from .ss_overlap import main_ssoverlap
16+
from .label_for_unitas import label_for_unitas_cli
1517

1618
def climain():
1719
main()
1820

1921
def ssoverlap_main():
20-
main_ssoverlap()
22+
main_ssoverlap()
23+
24+
def labelforunitas_main():
25+
label_for_unitas_cli()

0 commit comments

Comments
 (0)