mutation_analysis/median_coverage.py at main · markhilt/mutation_analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
median_coverage.py
Author: Markus Hiltunen
E-mail: markus.hiltunen@su.se

This script is used to calculate the median coverage per sample from
a file containing the sequencing coverage for each base in a reference
genome.

Copyright (c) 2023, Johannesson lab
Licensed under the MIT license. See LICENSE file.
"""

import argparse
import numpy as np
import gzip

__version__ = "0.1"

parser = argparse.ArgumentParser(description='	Calculates the median coverage both per sample \
												and per base pair, from a file containing the \
												sequencing coverage for each base in a reference \
												genome. The coverage file is gzipped and can be \
												the output from e.g. samtools depth. WARNING: \
												script may use a lot of memory, depending on \
												genome size and number of samples.')
parser.add_argument("input", \
                    help="Input gzipped tab delimited file of depth of coverage.\n \
                    	  Format is: 'scaffold\tcoordinate\tdepth_sample_1\t...\tdepth_sample_n' .\n \
                    	  The first line is a header in the format: '# CHROM\tPOS\tname_sample_1\t...\tname_sample_n'", \
                    type = str)
parser.add_argument("-o","--output", \
                    help="Output prefix.", \
                    default="median_coverage_out", \
                    type = str)
parser.add_argument("-v","--version", \
                    help="Print version and quit.", \
                    action = "version", \
                    version = "median_coverage v.{}".format(__version__))
args = parser.parse_args()

def main():
	with gzip.open(args.input, "rt") as infile:
		with open(args.output+".txt", "w") as outfile:
			for line in infile:
				line = line.strip()
				fields = line.split("\t")

				# First line is the header. Here we define the samples
				# Order is important, since the columns are in the same order
				if line.startswith("#"):
					# Keep the samples in a list, so we know the order
					samples = [ f for f in fields[2:] ]
					# Also define a dict, which is unordered, but we will use the list to fill it.
					cov_per_sample = { f:[] for f in fields[2:] }

				else:
					# For each data line we want to do two things
					# 1. Calculate the median for the line = genomic position
					dat = np.asarray(fields[2:], dtype = int)
					median = np.median(dat)
					sd = np.std(dat)
					outfile.write("{}\t{}\t{}\t{}\n".format(fields[0], fields[1], str(median), str(sd)))
					# 2. Save each value to its respective dict entry. Use enumerate to get the index for the sample
					for idx, val in enumerate(fields[2:]):
						cov_per_sample[samples[idx]].append(int(val))

	# After done looping through the file, convert lists in the dict to arrays and calculate median, sd
	print("# Median coverage per sample")
	for k,v in cov_per_sample.items():
		dat = np.asarray(v)
		median, sd = np.median(dat), np.std(dat)
		print("\t".join([k, str(median), str(sd)]))

if __name__ == "__main__":
    main()