Skip to content

Commit 33594a2

Browse files
committed
add false primering check for db1, instead of using only db2
1 parent 9adef0f commit 33594a2

6 files changed

Lines changed: 259 additions & 74 deletions

File tree

primerdiffer/general_settings.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,3 @@
1-
#!/usr/bin/env python
2-
# -*- coding: utf-8 -*-
3-
# @Time : 9/13/16 3:13 PM
4-
# @Author : Runsheng
5-
# @Email : Runsheng.lee@gmail.com
6-
# @File : general_settings.py
7-
81
primer3_general_settings = {
92
'PRIMER_OPT_SIZE': 20,
103
'PRIMER_PICK_LEFT_PRIMER':1,

primerdiffer/primer_check.py

Lines changed: 106 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
# @Author : Runsheng
55
# @Email : Runsheng.lee@gmail.com
66
# @File : primer_check.py
7+
import primer3
8+
9+
from primerdiffer.general_settings import primer3_general_settings
710

811
try:
912
from StringIO import StringIO ## for Python 2
@@ -14,6 +17,21 @@
1417
from Bio.Blast import NCBIXML
1518

1619

20+
def my_design_primer(name,seq,primer3_settings=primer3_general_settings):
21+
"""
22+
general wrapper for primer3-py
23+
:param name: name for the sequence
24+
:param seq: string for sequence in UPPER case
25+
:param primer3_settings: general setting for primer_design
26+
:return: the dict storing the primer pairs specific for seq
27+
"""
28+
seq_args = {'SEQUENCE_ID': name,
29+
'SEQUENCE_TEMPLATE':seq}
30+
myprimer=primer3.bindings.designPrimers(seq_args,primer3_settings)
31+
return myprimer
32+
33+
34+
1735
def primer_blast(query, db):
1836
# query = myprimer['PRIMER_LEFT_0_SEQUENCE'] # The sequence
1937
blastn_cline = NcbiblastnCommandline(db=db, outfmt=5, task="blastn-short") #Blast command
@@ -22,7 +40,58 @@ def primer_blast(query, db):
2240
return blast_records
2341

2442

25-
def is_nofalse_primer(blast_records,query,debugmod=False):
43+
def filter_hsp(blast_records,query,cutoff_alignlength=16,cutoff_free3=2, debugmod=False):
44+
"""
45+
filter the hsp, keep only the hsps with align_length <cutoff or free3 <cutoff
46+
used mainly for insilicon_pcr
47+
"""
48+
keep=[]
49+
for n,alignment in enumerate(blast_records.alignments):
50+
# get all possible alignment position that pass the filter
51+
if debugmod==True:
52+
print ("chro is", alignment.hit_def)
53+
for hsp in alignment.hsps:
54+
if debugmod==True:
55+
print ("The subj end is", hsp.sbjct_end)
56+
print ("The query is", query)
57+
print (hsp)
58+
print (hsp.query_end, len(query))
59+
# get the cutoff
60+
if hsp.align_length>=cutoff_alignlength or len(query)-hsp.query_end<=cutoff_free3:
61+
strand=hsp.frame[-1] # the query is always 1, the target may be -1, 1 is plus and -1 is minus
62+
keep.append((alignment.hit_def, hsp.sbjct_start, hsp.sbjct_end, strand)) # no end , just one pos
63+
if debugmod==True:
64+
print ("===============Keep==============")
65+
66+
return keep
67+
68+
69+
def insilicon_pcr(primer_left, primer_right, db, cutoff_alignlength=16, cutoff_free3=2, profuct_cutoff=2000,
70+
debugmod=False):
71+
"""
72+
para: the left and right primers
73+
return: a bed-like tuple-list
74+
"""
75+
possible_product = []
76+
77+
blast_records_left = primer_blast(primer_left, db)
78+
blast_records_right = primer_blast(primer_right, db)
79+
80+
# p_left is a bed like tuple list like [("I", 10000)]
81+
p_left = filter_hsp(blast_records_left, primer_left, cutoff_alignlength, cutoff_free3)
82+
p_right = filter_hsp(blast_records_right, primer_right, cutoff_alignlength, cutoff_free3)
83+
84+
for pl in p_left: # may need to add a score sys to this function
85+
for pr in p_right:
86+
# print pl, pr
87+
# use only the start to get a approx length, also, the direction for left and right primer should be different
88+
if pl[0] == pr[0] and abs(pl[1] - pr[1]) <= profuct_cutoff and pl[-1] * pr[-1] == -1:
89+
possible_product.append((pl[0], pl[1], pr[1]))
90+
91+
return possible_product
92+
93+
94+
def _is_nofalse_primer(blast_records,query,debugmod=False):
2695
"""
2796
:param blast_records: input a blast record in XML format
2897
:param query: the query sequence (str)
@@ -43,21 +112,46 @@ def is_nofalse_primer(blast_records,query,debugmod=False):
43112
return True
44113

45114

46-
def primer_check(myprimer, db, primer_number=5, debugmod=False):
47-
'''primer is a return of function primer3.bindings.designPrimers'''
115+
def primer_check(myprimer, db1, db2, primer_number=5,
116+
cutoff_alignlength=16,cutoff_free3=2, profuct_cutoff=2000,
117+
db1_maxhit=1, db2_maxhit=0,
118+
debugmod=False):
119+
'''primer is a return of function primer3.bindings.designPrimers
120+
db1 and db2 are blastdb,
121+
122+
CASE1:
123+
db1 is the fasta genome used to design primer, so the product need to be only 1 (db1_maxhit=1)
124+
db2 is the fasta genome which should not be amplified, so the product need to be 0 (db2_maxhit=2)
125+
126+
CASE2:
127+
db1 is a short sequence used to design primer, products (db1_maxhit) need to be <=1
128+
db2 is the whole genome, which should has little
129+
130+
cutoff_alignlength=16,cutoff_free3=2, profuct_cutoff=2000 are used for in silicon PCR
131+
'''
132+
48133
for i in range(0, primer_number):
49134
left = myprimer['PRIMER_LEFT_' + str(i) + '_SEQUENCE']
50135
right = myprimer['PRIMER_RIGHT_' + str(i) + '_SEQUENCE']
51-
if debugmod:
52-
print ("The %d primer :" % i)
53-
print (left, right)
54-
blast_records_l = primer_blast(left,db=db)
55-
blast_records_r = primer_blast(right,db=db)
136+
# designed primer size
56137
product_size = myprimer['PRIMER_PAIR_' + str(i) + '_PRODUCT_SIZE']
57138

58-
if is_nofalse_primer(blast_records_l, left, debugmod=debugmod) and is_nofalse_primer(blast_records_r, right,
59-
debugmod=debugmod):
60-
print ("Both pass")
61-
return (left, right, product_size)
139+
# the original sequence to detect the false primer
140+
product_l1=insilicon_pcr(left, right, db1,
141+
cutoff_alignlength,cutoff_free3, profuct_cutoff,
142+
debugmod=debugmod)
143+
# the genome used to check false priming
144+
product_l2=insilicon_pcr(left, right, db2,
145+
cutoff_alignlength, cutoff_free3, profuct_cutoff,
146+
debugmod=debugmod)
147+
if debugmod:
148+
print("The %d primer :" % i)
149+
print(left, right)
150+
print("product_l1", product_l1)
151+
print("produect_l2", product_l2)
152+
153+
if len(product_l1)<=db1_maxhit and len(product_l2)<=db2_maxhit: # no false primer
154+
return left, right, product_size # return is a tuple
62155
return 0
63156

157+

primerdiffer/utils.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def fasta2dic(fastafile):
1818
Give a fasta file name, return a dict contains the name and seq
1919
Require Biopython SeqIO medule to parse the sequence into dict, a large genome may take a lot of RAM
2020
"""
21-
handle=open(fastafile, "rU")
21+
handle=open(fastafile, "r")
2222
record_dict=SeqIO.to_dict(SeqIO.parse(handle,"fasta"))
2323
handle.close()
2424
return record_dict
@@ -36,6 +36,25 @@ def dic2dic(record_dict):
3636
return seq_dict
3737

3838

39+
def tuple_to_pos_str(pos_t):
40+
"""
41+
reverse of pos_str_to_tuple
42+
"""
43+
pos_s=[str(x) for x in pos_t]
44+
chro, start,end=pos_s
45+
return chro+":"+start+"-"+end
46+
47+
48+
def pos_str_to_tuple(pos_str):
49+
"""
50+
pos_str: chro:start-end, same as the input for igv or ucsc web browser
51+
return (chro, start, end), start and end are int
52+
"""
53+
chro=pos_str.split(":")[0]
54+
start, end =pos_str.split(":")[1].split("-") # still str
55+
return (chro, int(start), int(end))
56+
57+
3958
def chr_select(seq_dict, chro, start,end):
4059
"""
4160
Note the start and end is 0 based
@@ -44,10 +63,8 @@ def chr_select(seq_dict, chro, start,end):
4463
for example, chrcut(record_dict, "I", 100,109) returns
4564
("I:100_109","AAAAAAAAAA")
4665
"""
47-
name=chro+ ":"+str(round(float(start)/1000000,2))+"M"
66+
name=tuple_to_pos_str((chro, start, end))
4867
seq=str(seq_dict[chro])[start:end]
4968
return name,seq
5069

5170

52-
if __name__=="__main__":
53-
pass

primerdiffer/walk_chr.py

Lines changed: 62 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,30 +6,18 @@
66
# @File : walk_chr.py
77

88
# third part import
9-
import primer3
109

1110
# self import
12-
from utils import chr_select
13-
from primer_check import primer_check
14-
from general_settings import primer3_general_settings
11+
import os
12+
from os.path import exists
1513

14+
from Bio.Blast.Applications import NcbimakeblastdbCommandline
1615

17-
def my_design_primer(name,seq,primer3_settings=primer3_general_settings):
18-
# todo: to make the setting for the primer number easier
19-
"""
20-
general wrapper for primer3-py
21-
:param name: name for the sequence
22-
:param seq: string for sequence in UPPER case
23-
:param primer3_settings: general setting for primer_design
24-
:return: the dict storing the primer pairs specific for seq
25-
"""
26-
seq_args = {'SEQUENCE_ID': name,
27-
'SEQUENCE_TEMPLATE':seq}
28-
myprimer=primer3.bindings.designPrimers(seq_args,primer3_settings)
29-
return myprimer
16+
from primerdiffer.primer_check import my_design_primer, primer_check
17+
from primerdiffer.utils import dic2dic, fasta2dic, chr_select, tuple_to_pos_str, pos_str_to_tuple
3018

3119

32-
def walk_chr_dense(genome, chro, db,
20+
def walk_chr_dense(genome, chro, start, end, db1, db2,
3321
interval=500000, jump=4000, out_prefix="primers"):
3422
"""
3523
:param genome: genome is a dict in name:seq,
@@ -40,26 +28,27 @@ def walk_chr_dense(genome, chro, db,
4028
:return:
4129
"""
4230
primer_dict = {}
43-
f_out = open(out_prefix + "_" + chro + ".txt", "w")
31+
pos_str= tuple_to_pos_str((chro, start, end))
32+
f_out = open(out_prefix + "_" + pos_str + ".txt", "w")
4433

45-
n = len(genome[chro]) / interval
34+
n = len(genome[chro][start:end]) / interval
4635
i = 0
4736
while i < n:
4837
offset = 0
4938
while offset <= (interval / 2) / jump:
50-
name, seq = chr_select(genome, chro, i * interval + offset * jump,
51-
i * interval + (offset + 1) * jump)
52-
# print name,seq
39+
name, seq = chr_select(genome, chro, start+i * interval + offset * jump,
40+
start+i * interval + (offset + 1) * jump)
41+
print ("Design primers for ", name)
5342
if "N" in seq.upper():
5443
offset += 1
5544
else:
5645
myprimer = my_design_primer(name=name, seq=seq)
57-
primer_status = primer_check(myprimer,db)
58-
if primer_status == 0:
46+
primer_used = primer_check(myprimer,db1,db2, debugmod=True)
47+
if primer_used == 0:
5948
offset += 1
6049
else:
6150
offset = interval / jump # just >(interval/2)/jump, used to indicate the sucess of primer finding
62-
left, right, product_size = primer_status
51+
left, right, product_size = primer_used
6352
i += 1
6453

6554
if offset == interval / jump:
@@ -71,3 +60,50 @@ def walk_chr_dense(genome, chro, db,
7160

7261
f_out.close()
7362
return primer_dict
63+
64+
65+
def makeblastdb(genomefile):
66+
cline = NcbimakeblastdbCommandline(dbtype="nucl",
67+
input_file=genomefile)
68+
NcbimakeblastdbCommandline(cmd='makeblastdb', dbtype='prot', input_file='NC_005816.faa')
69+
print(cline)
70+
cline()
71+
72+
73+
def checkblastdb(genomefile):
74+
"""
75+
check if the blastdb exist, if not, create one
76+
"""
77+
dbfile=genomefile+".nsq"
78+
if exists(dbfile):
79+
return 0
80+
else:
81+
makeblastdb(genomefile)
82+
return 1
83+
84+
85+
def flow_walk_chr(wkdir, genome1, genome2, pos_str, interval=4000, jump=400, out_prefix="primers"):
86+
"""
87+
88+
genome1: the genome fasta file used to design primers
89+
genome2: the genome fasta used to blast against, the primer should not amplify any product in genome2
90+
91+
pos_str: chro:start-end, same as the input for igv or ucsc web browser
92+
93+
"""
94+
# can add wkdir is os.getcwd() later
95+
os.chdir(wkdir)
96+
97+
# first to test if the blastdb for genome2 is there, if not, create one
98+
checkblastdb(genome1)
99+
checkblastdb(genome2)
100+
101+
# main
102+
g1=dic2dic(fasta2dic(genome1))
103+
#g2=dic2dic(fasta2dic(genome2))
104+
chro, start, end = pos_str_to_tuple(pos_str)
105+
106+
primer_dict=walk_chr_dense(genome=g1, chro=chro, start=start, end=end,
107+
db1=genome1,db2=genome2,
108+
interval=interval, jump=jump, out_prefix=out_prefix)
109+
print(primer_dict)

0 commit comments

Comments
 (0)