Skip to content

Commit a79cb02

Browse files
committed
adding data processing scripts
1 parent ed5a86d commit a79cb02

8 files changed

Lines changed: 310 additions & 0 deletions

bin/addCI.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env python

"""
Add the 95% age confidence-interval columns from an atlas file to our
dataframe, matching rows on the 'Position' column.

Usage: addCI.py <input_dataframe.tsv> <atlas_file> <output_prefix>
"""

import numpy as np
import pandas as pd
import sys

# Command-line arguments.
in_df = sys.argv[1]     # input dataframe (tab-separated)
in_atlas = sys.argv[2]  # atlas file carrying the CI columns
out_df = sys.argv[3]    # output prefix ('.tsv' is appended)

frame = pd.read_csv(in_df, sep='\t', index_col=False, skipinitialspace=True)
ci_cols = pd.read_csv(
    in_atlas,
    comment='#',
    skipinitialspace=True,
    usecols=['Position', 'AgeCI95Lower_Jnt', 'AgeCI95Upper_Jnt'],
    index_col=False,
)

# Keep only positions present in both tables, then drop duplicated positions.
merged = frame.merge(ci_cols, on='Position', how='inner')
deduped = merged.drop_duplicates(subset='Position', keep='first', ignore_index=True)

deduped.to_csv(out_df + '.tsv', sep='\t', index=False)

bin/addMaskedSites.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env python3

"""
Read a dataframe and a bed file with introgressed-site coordinates, and
add a boolean 'Masked' column marking the positions found in the bed
file.

Usage: addMaskedSites.py <input_dataframe.tsv> <mask.bed> <output_prefix>
"""

import numpy as np
import pandas as pd
import sys

in_df = sys.argv[1]    # input dataframe (tab-separated, no header)
in_mask = sys.argv[2]  # bed file with masked positions
out_df = sys.argv[3]   # output prefix ('.tsv' is appended)

# Column names for the headerless input dataframe.
header_names = ['Chromosome', 'Position', 'AlleleRef', 'AlleleAlt', 'AlleleAnc', 'DataSource', 'AgeMedian_Jnt', 'Ancestral', 'Derived', 'RecRate', 'ID', 'rs', 'Ref', 'Alt', 'AFR', 'AMR', 'EAS', 'EUR', 'SAS', 'sameAlt']

sites = pd.read_csv(in_df, sep='\t', header=None, names=header_names)
masked = pd.read_csv(in_mask, sep='\t', header=None, names=['chr', 'pos'])

# True for every site whose position appears in the mask file.
sites['Masked'] = sites['Position'].isin(masked['pos'])

sites.to_csv(out_df + '.tsv', sep='\t', index=False)

bin/addRecombinRate.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/usr/bin/env python

"""
Read a recombination map file and add a per-site recombination-rate
column to our dataframe.

Arguments:
    1: recombination map file (tab-separated; must contain a
       'Position(bp)' column)
    2: dataframe file (tab-separated; must contain a 'Position' column)

Output is written to the hard-coded file 'chr1-rec.txt'.
"""

import numpy as np
import pandas as pd
import sys

recmap = sys.argv[1] # recombination map file
in_df = sys.argv[2] # dataframe

genmap = pd.read_csv(recmap, sep='\t', index_col=False)

data = pd.read_csv(in_df, sep='\t', index_col=False)

# Build the start coordinate of each map interval: the first interval
# starts at 1; each later interval starts one bp after the previous
# interval's end position.
frompos = []
prev = 1
for i, pos in enumerate(genmap['Position(bp)']):
    if i == 0:
        frompos.append(prev)
        prev = pos
    else:
        frompos.append(prev+1)
        prev = pos

genmap['FromPos'] = frompos

# Assign each site in `data` the rate of the map interval containing it.
# NOTE(review): row[4] is the 'FromPos' column added above, and row[1] /
# row[2] are assumed to be the interval end ('Position(bp)') and rate
# columns -- this positional indexing depends on the exact column order
# of the map file; verify against the input format.
# NOTE(review): the inner loop variable `i` shadows the outer iterrows
# index `i`; harmless because the outer `i` is never used, but fragile.
# NOTE(review): `idx = rcount+1` appears to skip one extra site at each
# interval boundary (rcount was already advanced past the failing site)
# -- looks like an off-by-one; confirm len(Rec_Rate) == len(data) before
# trusting the concat below.
Rec_Rate = []
idx = 0
rcount = 0
for i, row in genmap.iterrows():
    for i, pos in enumerate(data['Position'][idx:]):
        rcount += 1
        if pos >= row[4] and pos <= row[1]:
            Rec_Rate.append(row[2])
        else:
            idx = rcount+1
            break

rate = pd.DataFrame(data=np.array(Rec_Rate), columns=['Rate'])
# NOTE(review): concat with axis=1 and ignore_index=True discards all
# column names, leaving integer-labelled columns in the output file --
# confirm this is intended (ignore_index on axis=1 renumbers columns).
df = pd.concat([data, rate], ignore_index=True, axis=1)

df.to_csv('chr1-rec.txt', sep='\t', index=False, header=True)
47+
48+

bin/extractGVFInfo.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
This script extract population-specific annotations from a genome
5+
variation format (GVF) file.
6+
"""
7+
8+
from argparse import ArgumentParser
9+
import numpy as np
10+
import pandas as pd
11+
import urllib.request, urllib.parse, urllib.error
12+
13+
14+
def parseGFFAttributes(attributeString):
    """Parse the GFF3 attribute column and return a dict.

    The column is a ';'-separated list of 'key=value' pairs whose keys
    and values are percent-encoded; a bare '.' means "no attributes".

    Parameters
    ----------
    attributeString : str
        The ninth (attributes) column of a GFF3/GVF record.

    Returns
    -------
    dict
        Percent-decoded keys mapped to percent-decoded values.
    """
    if attributeString == ".":
        return {}
    ret = {}
    for attribute in attributeString.split(';'):
        # Skip empty fragments (e.g. produced by a trailing ';'),
        # which would otherwise crash the unpacking below.
        if not attribute:
            continue
        # Split on the first '=' only, so values that themselves
        # contain '=' are preserved instead of raising ValueError.
        key, _, value = attribute.partition('=')
        ret[urllib.parse.unquote(key)] = urllib.parse.unquote(value)
    return ret
22+
23+
24+
def parseGVF(row):
    """Parse one GVF data line into a flat record dict.

    A simple GVF parser, modified from a GFF3 parser.  Extracts the
    chromosome, position, variant IDs, reference/alternate alleles and
    the per-population frequency attributes (AFR, AMR, EAS, EUR, SAS);
    populations absent from the record default to 0.0.
    """
    fields = row.strip().split('\t')
    attributes = parseGFFAttributes(fields[8])

    record = {
        'Chr': fields[0],
        'Pos': fields[3],
        'ID': attributes['ID'],
        # Dbxref looks like 'dbSNP_xxx:rs12345' -- keep the rs accession.
        'rs': attributes['Dbxref'].split(':')[1],
        'Ref': attributes['Reference_seq'],
        'Alt': attributes['Variant_seq'],
    }

    # Population frequency columns; 0.0 when the GVF record lacks one.
    for population in ('AFR', 'AMR', 'EAS', 'EUR', 'SAS'):
        record[population] = attributes.get(population, 0.0)

    return record
54+
55+
56+
if __name__ == '__main__':
    # Command-line interface: -i/--infile GVF input, -o/--outfile TSV output.
    parser = ArgumentParser(description='Parse a file in GVF format and create a dataframe')
    parser.add_argument('-i', '--infile', help='Input a file in GVF format')
    parser.add_argument('-o', '--outfile', help='Outputfile name and path')
    args = parser.parse_args()

    # Parse every non-comment line of the GVF file into a record dict.
    with open(args.infile) as gvf:
        records = [parseGVF(line) for line in gvf if not line.startswith('#')]

    columns = ['Chr', 'ID', 'rs', 'Pos', 'Ref', 'Alt', 'AFR', 'AMR', 'EAS', 'EUR', 'SAS']
    df = pd.DataFrame(records, columns=columns)

    df.to_csv(args.outfile, sep='\t', index=False)
77+
78+

bin/getTriNucSeq.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
#!/usr/bin/env python3

"""
Read an atlas file and a reference chromosome FASTA and add
tri-nucleotide mutation-spectrum context columns ('Ancestral' and
'Derived') to the output.

Arguments:
    1: atlas file (CSV with '#' comment lines)
    2: reference chromosome FASTA file (read via pyfaidx)
    3: output file name (tab-separated)
"""

import pandas as pd
from pyfaidx import Fasta
import sys

fn = sys.argv[1] # atlas file
ref = sys.argv[2] # refseq file
out = sys.argv[3] # name for output file

# NOTE(review): usecols picks columns by position; they are assumed to
# include 'Position', the ref/alt alleles and 'AlleleAnc' -- verify
# against the atlas column layout.
df = pd.read_csv(fn, usecols=[1,2,3,4,5,6,23], comment='#', skipinitialspace=True, index_col=False)
# Drop sites with an unknown ('.') ancestral allele.
df = df[(df['AlleleAnc'] != '.')]
#df.drop_duplicates(subset=['Position'], keep='last', inplace=True)

chrom = Fasta(ref)
name = [i for i in chrom.keys()]  # sequence names; only the first is used


# Build the ancestral and derived tri-nucleotide contexts per site: the
# upper-cased flanking reference bases around the ancestral (resp.
# derived) allele.
Ancestral, Derived = [], []
for i, row in df.iterrows():
    pos = row[1]-1  # 1-based atlas position -> 0-based FASTA index
    prev_nuc = chrom[name[0]][pos-1].seq
    next_nuc = chrom[name[0]][pos+1].seq
    #nuc = chrom[name[0]][pos].seq
    #Ancestral.append(prev_nuc + nuc + next_nuc)
    # NOTE(review): row[2]/row[3]/row[4] are positional; assumed to be
    # the ref, alt and ancestral alleles respectively -- confirm order.
    Ancestral.append(prev_nuc.upper() + row[4] + next_nuc.upper())
    if row[4] == row[2]:
        # Ancestral equals the reference allele, so alt is derived.
        Derived.append(prev_nuc.upper() + row[3] + next_nuc.upper())
    # elif row[4] != row[2]:
    else:
        # Otherwise the reference allele is treated as derived.
        Derived.append(prev_nuc.upper() + row[2] + next_nuc.upper())
    # else:
    #     raise Exception('I do not know which base in anc/der')


df['Ancestral'] = Ancestral
df['Derived'] = Derived

# Drop TCC->TTC sites and their reverse-complement AGG->AAG sites.
# NOTE(review): presumably to exclude this known hypermutable context
# from the spectrum -- confirm the filtering rationale.
noTCC_index = df[(df['Ancestral'] == 'TCC') & (df['Derived'] == 'TTC')].index
df.drop(noTCC_index, inplace = True)

noAGG_index = df[(df['Ancestral'] == 'AGG') & (df['Derived'] == 'AAG')].index
df.drop(noAGG_index, inplace = True)

df.to_csv(out, sep='\t', index=False)
51+

bin/introgressedSites.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/usr/bin/env python

"""
Read a dataframe and a bed file with introgression coordinates, and
output a new bed file containing the introgressed sites to be removed
later.

Arguments:
    1: chromosome number (int)
    2: dataframe file (tab-separated; first two columns are
       'Chromosome' and 'Position')
    3: bed-like coordinate file (tab-separated, with 'chrom', 'start'
       and 'end' columns)

One 'chrom<TAB>position' line per introgressed site is printed to
stdout.
"""

import numpy as np
import pandas as pd
import sys

chrom = int(sys.argv[1])  # chromosome number
in_df = sys.argv[2]       # dataframe
in_bed = sys.argv[3]      # coordinate file

data = pd.read_csv(in_df, sep='\t', usecols=[0, 1])
mask = pd.read_csv(in_bed, sep='\t')

# Restrict both tables to the requested chromosome once, instead of
# re-filtering `data` on every region iteration.
sites = data[data['Chromosome'] == chrom]
regions = mask[mask['chrom'] == chrom]

for _, region in regions.iterrows():
    for _, site in sites.iterrows():
        # Chained comparison replaces np.logical_and on scalars.
        if region['start'] <= site['Position'] <= region['end']:
            print('%s\t%d' % (chrom, site['Position']))
        elif site['Position'] > region['end']:
            # Positions past the region end cannot match this region.
            # NOTE(review): assumes the dataframe is sorted by position.
            break
32+
33+

bin/makeIntrogressMap.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#!/usr/bin/env python

"""
Summarise introgression annotations across all individuals and all
populations found in the Steinrucken et al, 2018 paper.

The input is a directory containing one bed-like file (chr, start, end)
per individual.  For each file, the total introgressed length (sum of
end - start over all regions) is printed to stdout as
'<file-prefix><TAB><total-length>'.
"""

import os
import sys
import pandas as pd

indir = sys.argv[1]  # directory containing the per-individual bed files

for fname in os.listdir(indir):
    path = os.path.join(indir, fname)
    if not os.path.isfile(path):
        continue  # skip subdirectories and other non-file entries
    df = pd.read_csv(path, sep='\t', names=['chr', 'start', 'end'])
    # Total introgressed sequence length for this individual.
    df['length'] = df['end'] - df['start']
    sum_diff = df['length'].sum()
    # The first 7 characters of the file name identify the individual.
    print(f"{fname[:7]}\t{sum_diff}")
27+
28+

bin/splitfasta.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env python3

"""
Extract the per-chromosome (autosome) sequences from the GRCh37
reference genome downloaded from:
https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.25/

It takes the ref genome file as its only argument; the selected
sequences are written in FASTA format to stdout.
"""

from Bio import SeqIO
import sys

# RefSeq accessions of the 22 GRCh37 autosomes.
# BUG FIX: the original list was missing the comma after 'NC_000002.11',
# so implicit string concatenation fused chromosomes 2 and 3 into one
# bogus entry ('NC_000002.11NC_000003.11').
RefSeq = ['NC_000001.10', 'NC_000002.11',
          'NC_000003.11', 'NC_000004.11', 'NC_000005.9', 'NC_000006.11',
          'NC_000007.13', 'NC_000008.10', 'NC_000009.11', 'NC_000010.10',
          'NC_000011.9', 'NC_000012.11', 'NC_000013.10', 'NC_000014.8',
          'NC_000015.9', 'NC_000016.9', 'NC_000017.10', 'NC_000018.9',
          'NC_000019.9', 'NC_000020.10', 'NC_000021.8', 'NC_000022.10']

# Exact-match lookup set (the original substring test `seq.id in name`
# was accidental and fragile).
wanted = set(RefSeq)

# BUG FIX: SeqIO.parse returns a one-shot iterator; the original code
# looped over RefSeq and re-filtered the same, already-exhausted
# iterator, so only sequences matching the first accession were ever
# written.  A single pass emits every wanted sequence (in file order).
seqs = SeqIO.parse(sys.argv[1], 'fasta')
SeqIO.write((seq for seq in seqs if seq.id in wanted), sys.stdout, 'fasta')
25+

0 commit comments

Comments
 (0)