Skip to content

Commit a79cb02

Browse files
committed
adding data processing scripts
1 parent ed5a86d commit a79cb02

8 files changed

Lines changed: 310 additions & 0 deletions

bin/addCI.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env python

"""
Add the 95% age confidence-interval columns from an atlas file to our
dataframe, matching rows on the 'Position' column.

Usage: addCI.py <input_dataframe.tsv> <atlas_file> <output_prefix>
"""

import numpy as np
import pandas as pd
import sys

# Command-line arguments.
in_df = sys.argv[1]     # input dataframe (tab-separated)
in_atlas = sys.argv[2]  # atlas file carrying the CI columns
out_df = sys.argv[3]    # output prefix ('.tsv' is appended)

frame = pd.read_csv(in_df, sep='\t', index_col=False, skipinitialspace=True)
ci_cols = pd.read_csv(
    in_atlas,
    comment='#',
    skipinitialspace=True,
    usecols=['Position', 'AgeCI95Lower_Jnt', 'AgeCI95Upper_Jnt'],
    index_col=False,
)

# Keep only positions present in both tables, then drop duplicated positions.
merged = frame.merge(ci_cols, on='Position', how='inner')
deduped = merged.drop_duplicates(subset='Position', keep='first', ignore_index=True)

deduped.to_csv(out_df + '.tsv', sep='\t', index=False)

bin/addMaskedSites.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env python3

"""
Read a dataframe and a bed file with introgressed-site coordinates, and
add a boolean 'Masked' column marking the positions found in the bed
file.

Usage: addMaskedSites.py <input_dataframe.tsv> <mask.bed> <output_prefix>
"""

import numpy as np
import pandas as pd
import sys

in_df = sys.argv[1]    # input dataframe (tab-separated, no header)
in_mask = sys.argv[2]  # bed file with masked positions
out_df = sys.argv[3]   # output prefix ('.tsv' is appended)

# Column names for the headerless input dataframe.
header_names = ['Chromosome', 'Position', 'AlleleRef', 'AlleleAlt', 'AlleleAnc', 'DataSource', 'AgeMedian_Jnt', 'Ancestral', 'Derived', 'RecRate', 'ID', 'rs', 'Ref', 'Alt', 'AFR', 'AMR', 'EAS', 'EUR', 'SAS', 'sameAlt']

sites = pd.read_csv(in_df, sep='\t', header=None, names=header_names)
masked = pd.read_csv(in_mask, sep='\t', header=None, names=['chr', 'pos'])

# True for every site whose position appears in the mask file.
sites['Masked'] = sites['Position'].isin(masked['pos'])

sites.to_csv(out_df + '.tsv', sep='\t', index=False)

bin/addRecombinRate.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/usr/bin/env python

"""
Read a recombination map file and add a per-site recombination-rate
column to our dataframe.

Arguments:
    1: recombination map file (tab-separated; must contain a
       'Position(bp)' column)
    2: dataframe file (tab-separated; must contain a 'Position' column)

Output is written to the hard-coded file 'chr1-rec.txt'.
"""

import numpy as np
import pandas as pd
import sys

recmap = sys.argv[1] # recombination map file
in_df = sys.argv[2] # dataframe

genmap = pd.read_csv(recmap, sep='\t', index_col=False)

data = pd.read_csv(in_df, sep='\t', index_col=False)

# Build the start coordinate of each map interval: the first interval
# starts at 1; each later interval starts one bp after the previous
# interval's end position.
frompos = []
prev = 1
for i, pos in enumerate(genmap['Position(bp)']):
    if i == 0:
        frompos.append(prev)
        prev = pos
    else:
        frompos.append(prev+1)
        prev = pos

genmap['FromPos'] = frompos

# Assign each site in `data` the rate of the map interval containing it.
# NOTE(review): row[4] is the 'FromPos' column added above, and row[1] /
# row[2] are assumed to be the interval end ('Position(bp)') and rate
# columns -- this positional indexing depends on the exact column order
# of the map file; verify against the input format.
# NOTE(review): the inner loop variable `i` shadows the outer iterrows
# index `i`; harmless because the outer `i` is never used, but fragile.
# NOTE(review): `idx = rcount+1` appears to skip one extra site at each
# interval boundary (rcount was already advanced past the failing site)
# -- looks like an off-by-one; confirm len(Rec_Rate) == len(data) before
# trusting the concat below.
Rec_Rate = []
idx = 0
rcount = 0
for i, row in genmap.iterrows():
    for i, pos in enumerate(data['Position'][idx:]):
        rcount += 1
        if pos >= row[4] and pos <= row[1]:
            Rec_Rate.append(row[2])
        else:
            idx = rcount+1
            break

rate = pd.DataFrame(data=np.array(Rec_Rate), columns=['Rate'])
# NOTE(review): concat with axis=1 and ignore_index=True discards all
# column names, leaving integer-labelled columns in the output file --
# confirm this is intended (ignore_index on axis=1 renumbers columns).
df = pd.concat([data, rate], ignore_index=True, axis=1)

df.to_csv('chr1-rec.txt', sep='\t', index=False, header=True)
47+
48+

bin/extractGVFInfo.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
This script extract population-specific annotations from a genome
5+
variation format (GVF) file.
6+
"""
7+
8+
from argparse import ArgumentParser
9+
import numpy as np
10+
import pandas as pd
11+
import urllib.request, urllib.parse, urllib.error
12+
13+
14+
def parseGFFAttributes(attributeString):
    """Parse the GFF3 attribute column and return a dict.

    The column is a ';'-separated list of 'key=value' pairs whose keys
    and values are percent-encoded; a bare '.' means "no attributes".

    Parameters
    ----------
    attributeString : str
        The ninth (attributes) column of a GFF3/GVF record.

    Returns
    -------
    dict
        Percent-decoded keys mapped to percent-decoded values.
    """
    if attributeString == ".":
        return {}
    ret = {}
    for attribute in attributeString.split(';'):
        # Skip empty fragments (e.g. produced by a trailing ';'),
        # which would otherwise crash the unpacking below.
        if not attribute:
            continue
        # Split on the first '=' only, so values that themselves
        # contain '=' are preserved instead of raising ValueError.
        key, _, value = attribute.partition('=')
        ret[urllib.parse.unquote(key)] = urllib.parse.unquote(value)
    return ret
22+
23+
24+
def parseGVF(row):
    """Parse one GVF data line into a flat record dict.

    A simple GVF parser, modified from a GFF3 parser.  Extracts the
    chromosome, position, variant IDs, reference/alternate alleles and
    the per-population frequency attributes (AFR, AMR, EAS, EUR, SAS);
    populations absent from the record default to 0.0.
    """
    fields = row.strip().split('\t')
    attributes = parseGFFAttributes(fields[8])

    record = {
        'Chr': fields[0],
        'Pos': fields[3],
        'ID': attributes['ID'],
        # Dbxref looks like 'dbSNP_xxx:rs12345' -- keep the rs accession.
        'rs': attributes['Dbxref'].split(':')[1],
        'Ref': attributes['Reference_seq'],
        'Alt': attributes['Variant_seq'],
    }

    # Population frequency columns; 0.0 when the GVF record lacks one.
    for population in ('AFR', 'AMR', 'EAS', 'EUR', 'SAS'):
        record[population] = attributes.get(population, 0.0)

    return record
54+
55+
56+
if __name__ == '__main__':
    # Command-line interface: -i/--infile GVF input, -o/--outfile TSV output.
    parser = ArgumentParser(description='Parse a file in GVF format and create a dataframe')
    parser.add_argument('-i', '--infile', help='Input a file in GVF format')
    parser.add_argument('-o', '--outfile', help='Outputfile name and path')
    args = parser.parse_args()

    # Parse every non-comment line of the GVF file into a record dict.
    with open(args.infile) as gvf:
        records = [parseGVF(line) for line in gvf if not line.startswith('#')]

    columns = ['Chr', 'ID', 'rs', 'Pos', 'Ref', 'Alt', 'AFR', 'AMR', 'EAS', 'EUR', 'SAS']
    df = pd.DataFrame(records, columns=columns)

    df.to_csv(args.outfile, sep='\t', index=False)
77+
78+

bin/getTriNucSeq.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
#!/usr/bin/env python3

"""
Read an atlas file and a reference chromosome FASTA and add
tri-nucleotide mutation-spectrum context columns ('Ancestral' and
'Derived') to the output.

Arguments:
    1: atlas file (CSV with '#' comment lines)
    2: reference chromosome FASTA file (read via pyfaidx)
    3: output file name (tab-separated)
"""

import pandas as pd
from pyfaidx import Fasta
import sys

fn = sys.argv[1] # atlas file
ref = sys.argv[2] # refseq file
out = sys.argv[3] # name for output file

# NOTE(review): usecols picks columns by position; they are assumed to
# include 'Position', the ref/alt alleles and 'AlleleAnc' -- verify
# against the atlas column layout.
df = pd.read_csv(fn, usecols=[1,2,3,4,5,6,23], comment='#', skipinitialspace=True, index_col=False)
# Drop sites with an unknown ('.') ancestral allele.
df = df[(df['AlleleAnc'] != '.')]
#df.drop_duplicates(subset=['Position'], keep='last', inplace=True)

chrom = Fasta(ref)
name = [i for i in chrom.keys()]  # sequence names; only the first is used


# Build the ancestral and derived tri-nucleotide contexts per site: the
# upper-cased flanking reference bases around the ancestral (resp.
# derived) allele.
Ancestral, Derived = [], []
for i, row in df.iterrows():
    pos = row[1]-1  # 1-based atlas position -> 0-based FASTA index
    prev_nuc = chrom[name[0]][pos-1].seq
    next_nuc = chrom[name[0]][pos+1].seq
    #nuc = chrom[name[0]][pos].seq
    #Ancestral.append(prev_nuc + nuc + next_nuc)
    # NOTE(review): row[2]/row[3]/row[4] are positional; assumed to be
    # the ref, alt and ancestral alleles respectively -- confirm order.
    Ancestral.append(prev_nuc.upper() + row[4] + next_nuc.upper())
    if row[4] == row[2]:
        # Ancestral equals the reference allele, so alt is derived.
        Derived.append(prev_nuc.upper() + row[3] + next_nuc.upper())
    # elif row[4] != row[2]:
    else:
        # Otherwise the reference allele is treated as derived.
        Derived.append(prev_nuc.upper() + row[2] + next_nuc.upper())
    # else:
    #     raise Exception('I do not know which base in anc/der')


df['Ancestral'] = Ancestral
df['Derived'] = Derived

# Drop TCC->TTC sites and their reverse-complement AGG->AAG sites.
# NOTE(review): presumably to exclude this known hypermutable context
# from the spectrum -- confirm the filtering rationale.
noTCC_index = df[(df['Ancestral'] == 'TCC') & (df['Derived'] == 'TTC')].index
df.drop(noTCC_index, inplace = True)

noAGG_index = df[(df['Ancestral'] == 'AGG') & (df['Derived'] == 'AAG')].index
df.drop(noAGG_index, inplace = True)

df.to_csv(out, sep='\t', index=False)
51+

bin/introgressedSites.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/usr/bin/env python

"""
Read a dataframe and a bed file with introgression coordinates, and
output a new bed file containing the introgressed sites to be removed
later.

Arguments:
    1: chromosome number (int)
    2: dataframe file (tab-separated; first two columns are
       'Chromosome' and 'Position')
    3: bed-like coordinate file (tab-separated, with 'chrom', 'start'
       and 'end' columns)

One 'chrom<TAB>position' line per introgressed site is printed to
stdout.
"""

import numpy as np
import pandas as pd
import sys

chrom = int(sys.argv[1])  # chromosome number
in_df = sys.argv[2]       # dataframe
in_bed = sys.argv[3]      # coordinate file

data = pd.read_csv(in_df, sep='\t', usecols=[0, 1])
mask = pd.read_csv(in_bed, sep='\t')

# Restrict both tables to the requested chromosome once, instead of
# re-filtering `data` on every region iteration.
sites = data[data['Chromosome'] == chrom]
regions = mask[mask['chrom'] == chrom]

for _, region in regions.iterrows():
    for _, site in sites.iterrows():
        # Chained comparison replaces np.logical_and on scalars.
        if region['start'] <= site['Position'] <= region['end']:
            print('%s\t%d' % (chrom, site['Position']))
        elif site['Position'] > region['end']:
            # Positions past the region end cannot match this region.
            # NOTE(review): assumes the dataframe is sorted by position.
            break
32+
33+

bin/makeIntrogressMap.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#!/usr/bin/env python

"""
Summarise introgression annotations across all individuals and all
populations found in the Steinrucken et al, 2018 paper.

The input is a directory containing one bed-like file (chr, start, end)
per individual.  For each file, the total introgressed length (sum of
end - start over all regions) is printed to stdout as
'<file-prefix><TAB><total-length>'.
"""

import os
import sys
import pandas as pd

indir = sys.argv[1]  # directory containing the per-individual bed files

for fname in os.listdir(indir):
    path = os.path.join(indir, fname)
    if not os.path.isfile(path):
        continue  # skip subdirectories and other non-file entries
    df = pd.read_csv(path, sep='\t', names=['chr', 'start', 'end'])
    # Total introgressed sequence length for this individual.
    df['length'] = df['end'] - df['start']
    sum_diff = df['length'].sum()
    # The first 7 characters of the file name identify the individual.
    print(f"{fname[:7]}\t{sum_diff}")
27+
28+

bin/splitfasta.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env python3

"""
Extract the per-chromosome (autosome) sequences from the GRCh37
reference genome downloaded from:
https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.25/

It takes the ref genome file as its only argument; the selected
sequences are written in FASTA format to stdout.
"""

from Bio import SeqIO
import sys

# RefSeq accessions of the 22 GRCh37 autosomes.
# BUG FIX: the original list was missing the comma after 'NC_000002.11',
# so implicit string concatenation fused chromosomes 2 and 3 into one
# bogus entry ('NC_000002.11NC_000003.11').
RefSeq = ['NC_000001.10', 'NC_000002.11',
          'NC_000003.11', 'NC_000004.11', 'NC_000005.9', 'NC_000006.11',
          'NC_000007.13', 'NC_000008.10', 'NC_000009.11', 'NC_000010.10',
          'NC_000011.9', 'NC_000012.11', 'NC_000013.10', 'NC_000014.8',
          'NC_000015.9', 'NC_000016.9', 'NC_000017.10', 'NC_000018.9',
          'NC_000019.9', 'NC_000020.10', 'NC_000021.8', 'NC_000022.10']

# Exact-match lookup set (the original substring test `seq.id in name`
# was accidental and fragile).
wanted = set(RefSeq)

# BUG FIX: SeqIO.parse returns a one-shot iterator; the original code
# looped over RefSeq and re-filtered the same, already-exhausted
# iterator, so only sequences matching the first accession were ever
# written.  A single pass emits every wanted sequence (in file order).
seqs = SeqIO.parse(sys.argv[1], 'fasta')
SeqIO.write((seq for seq in seqs if seq.id in wanted), sys.stdout, 'fasta')
25+

0 commit comments

Comments
 (0)