Add false-priming check against db1 instead of checking only db2

Runsheng · Runsheng · commit 33594a2ea3c3 · 2022-10-09T16:14:12.000+08:00
diff --git a/primerdiffer/general_settings.py b/primerdiffer/general_settings.py
@@ -1,10 +1,3 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# @Time    : 9/13/16 3:13 PM
-# @Author  : Runsheng
-# @Email   : Runsheng.lee@gmail.com
-# @File    : general_settings.py
-
 primer3_general_settings =  {
         'PRIMER_OPT_SIZE': 20,
         'PRIMER_PICK_LEFT_PRIMER':1,
diff --git a/primerdiffer/primer_check.py b/primerdiffer/primer_check.py
@@ -4,6 +4,9 @@
 # @Author  : Runsheng
 # @Email   : Runsheng.lee@gmail.com
 # @File    : primer_check.py
+import primer3
+
+from primerdiffer.general_settings import primer3_general_settings
 
 try:
     from StringIO import StringIO ## for Python 2
@@ -14,6 +17,21 @@
 from Bio.Blast import NCBIXML
 
 
+def my_design_primer(name,seq,primer3_settings=primer3_general_settings):
+    """
+    general wrapper for primer3-py
+    :param name: name for the sequence
+    :param seq: string for sequence in UPPER case
+    :param primer3_settings: general setting for primer_design
+    :return: the dict storing the primer pairs specific for seq
+    """
+    seq_args = {'SEQUENCE_ID': name,
+                'SEQUENCE_TEMPLATE':seq}
+    myprimer=primer3.bindings.designPrimers(seq_args,primer3_settings)
+    return myprimer
+
+
+
 def primer_blast(query, db):
     # query = myprimer['PRIMER_LEFT_0_SEQUENCE'] # The sequence
     blastn_cline = NcbiblastnCommandline(db=db, outfmt=5, task="blastn-short") #Blast command
@@ -22,7 +40,58 @@ def primer_blast(query, db):
     return blast_records
 
 
-def is_nofalse_primer(blast_records,query,debugmod=False):
+def filter_hsp(blast_records,query,cutoff_alignlength=16,cutoff_free3=2, debugmod=False):
+    """
+    filter the hsp, keep only the hsps with align_length >= cutoff_alignlength or an unaligned query tail (len(query) - query_end) <= cutoff_free3
+    used mainly for insilicon_pcr
+    """
+    keep=[]
+    for n,alignment in enumerate(blast_records.alignments):
+        # get all possible alignment position that pass the filter
+        if debugmod==True:
+            print ("chro is", alignment.hit_def)
+        for hsp in alignment.hsps:
+            if debugmod==True:
+                print ("The subj end is", hsp.sbjct_end)
+                print ("The query is", query)
+                print (hsp)
+                print (hsp.query_end, len(query))
+            # get the cutoff
+            if hsp.align_length>=cutoff_alignlength or len(query)-hsp.query_end<=cutoff_free3:
+                strand=hsp.frame[-1]  # the query is always 1, the target may be -1, 1 is plus and -1 is minus
+                keep.append((alignment.hit_def, hsp.sbjct_start, hsp.sbjct_end, strand)) # no end , just one pos
+                if debugmod==True:
+                    print ("===============Keep==============")
+
+    return keep
+
+
+def insilicon_pcr(primer_left, primer_right, db, cutoff_alignlength=16, cutoff_free3=2, profuct_cutoff=2000,
+                  debugmod=False):
+    """
+    para: the left and right primers
+    return: a bed-like tuple-list
+    """
+    possible_product = []
+
+    blast_records_left = primer_blast(primer_left, db)
+    blast_records_right = primer_blast(primer_right, db)
+
+    # p_left is a bed-like tuple list of (hit_def, sbjct_start, sbjct_end, strand), e.g. [("I", 10000, 10019, 1)]
+    p_left = filter_hsp(blast_records_left, primer_left, cutoff_alignlength, cutoff_free3)
+    p_right = filter_hsp(blast_records_right, primer_right, cutoff_alignlength, cutoff_free3)
+
+    for pl in p_left:  # may need to add a score sys to this function
+        for pr in p_right:
+            # print pl, pr
+            # use only the start positions to get an approximate product length; the left and right primers must also hit opposite strands
+            if pl[0] == pr[0] and abs(pl[1] - pr[1]) <= profuct_cutoff and pl[-1] * pr[-1] == -1:
+                possible_product.append((pl[0], pl[1], pr[1]))
+
+    return possible_product
+
+
+def _is_nofalse_primer(blast_records,query,debugmod=False):
     """
     :param blast_records: input a blast record in XML format
     :param query: the query sequence (str)
@@ -43,21 +112,46 @@ def is_nofalse_primer(blast_records,query,debugmod=False):
     return True
 
 
-def primer_check(myprimer, db, primer_number=5, debugmod=False):
-    '''primer is a return of function primer3.bindings.designPrimers'''
+def primer_check(myprimer, db1, db2, primer_number=5,
+                 cutoff_alignlength=16,cutoff_free3=2, profuct_cutoff=2000,
+                 db1_maxhit=1, db2_maxhit=0,
+                 debugmod=False):
+    '''primer is a return of function primer3.bindings.designPrimers
+    db1 and db2 are blastdb,
+
+    CASE1:
+    db1 is the fasta genome used to design primer, so the product need to be only 1 (db1_maxhit=1)
+    db2 is the fasta genome which should not be amplified, so the product need to be 0 (db2_maxhit=0)
+
+    CASE2:
+    db1 is a short sequence used to design primer, products (db1_maxhit) need to be <=1
+    db2 is the whole genome, which should yield at most db2_maxhit products
+
+    cutoff_alignlength=16, cutoff_free3=2, profuct_cutoff=2000 are passed to the in silico PCR (insilicon_pcr)
+    '''
+
     for i in range(0, primer_number):
         left = myprimer['PRIMER_LEFT_' + str(i) + '_SEQUENCE']
         right = myprimer['PRIMER_RIGHT_' + str(i) + '_SEQUENCE']
-        if debugmod:
-            print ("The %d primer :" % i)
-            print (left, right)
-        blast_records_l = primer_blast(left,db=db)
-        blast_records_r = primer_blast(right,db=db)
+        # designed primer size
         product_size = myprimer['PRIMER_PAIR_' + str(i) + '_PRODUCT_SIZE']
 
-        if is_nofalse_primer(blast_records_l, left, debugmod=debugmod) and is_nofalse_primer(blast_records_r, right,
-                                                                                             debugmod=debugmod):
-            print ("Both pass")
-            return (left, right, product_size)
+        # the original sequence to detect the false primer
+        product_l1=insilicon_pcr(left, right, db1,
+                                cutoff_alignlength,cutoff_free3, profuct_cutoff,
+                                debugmod=debugmod)
+        # the genome used to check false priming
+        product_l2=insilicon_pcr(left, right, db2,
+                                cutoff_alignlength, cutoff_free3, profuct_cutoff,
+                                debugmod=debugmod)
+        if debugmod:
+            print("The %d primer :" % i)
+            print(left, right)
+            print("product_l1", product_l1)
+            print("produect_l2", product_l2)
+
+        if len(product_l1)<=db1_maxhit and len(product_l2)<=db2_maxhit: # no false primer
+            return left, right, product_size # return is a tuple
     return 0
 
+
diff --git a/primerdiffer/utils.py b/primerdiffer/utils.py
@@ -18,7 +18,7 @@ def fasta2dic(fastafile):
     Give a fasta file name, return a dict contains the name and seq
     Require Biopython SeqIO medule to parse the sequence into dict, a large genome may take a lot of RAM
     """
-    handle=open(fastafile, "rU")
+    handle=open(fastafile, "r")
     record_dict=SeqIO.to_dict(SeqIO.parse(handle,"fasta"))
     handle.close()
     return record_dict
@@ -36,6 +36,25 @@ def dic2dic(record_dict):
     return seq_dict
 
 
+def tuple_to_pos_str(pos_t):
+    """
+    reverse of pos_str_to_tuple
+    """
+    pos_s=[str(x) for x in pos_t]
+    chro, start,end=pos_s
+    return chro+":"+start+"-"+end
+
+
+def pos_str_to_tuple(pos_str):
+    """
+    pos_str: chro:start-end, same as the input for igv or ucsc web browser
+    return (chro, start, end), start and end are int
+    """
+    chro=pos_str.split(":")[0]
+    start, end =pos_str.split(":")[1].split("-") # still str
+    return (chro, int(start), int(end))
+
+
 def chr_select(seq_dict, chro, start,end):
     """
     Note the start and end is 0 based
@@ -44,10 +63,8 @@ def chr_select(seq_dict, chro, start,end):
     for example, chrcut(record_dict, "I", 100,109) returns
      ("I:100_109","AAAAAAAAAA")
     """
-    name=chro+ ":"+str(round(float(start)/1000000,2))+"M"
+    name=tuple_to_pos_str((chro, start, end))
     seq=str(seq_dict[chro])[start:end]
     return name,seq
 
 
-if __name__=="__main__":
-    pass
diff --git a/primerdiffer/walk_chr.py b/primerdiffer/walk_chr.py
@@ -6,30 +6,18 @@
 # @File    : walk_chr.py
 
 # third part import
-import primer3
 
 # self import
-from utils import chr_select
-from primer_check import primer_check
-from general_settings import primer3_general_settings
+import os
+from os.path import exists
 
+from Bio.Blast.Applications import NcbimakeblastdbCommandline
 
-def my_design_primer(name,seq,primer3_settings=primer3_general_settings):
-    # todo: to make the setting for the primer number easier
-    """
-    general wrapper for primer3-py
-    :param name: name for the sequence
-    :param seq: string for sequence in UPPER case
-    :param primer3_settings: general setting for primer_design
-    :return: the dict storing the primer pairs specific for seq
-    """
-    seq_args = {'SEQUENCE_ID': name,
-                'SEQUENCE_TEMPLATE':seq}
-    myprimer=primer3.bindings.designPrimers(seq_args,primer3_settings)
-    return myprimer
+from primerdiffer.primer_check import my_design_primer, primer_check
+from primerdiffer.utils import dic2dic, fasta2dic, chr_select, tuple_to_pos_str, pos_str_to_tuple
 
 
-def walk_chr_dense(genome, chro, db,
+def walk_chr_dense(genome, chro, start, end, db1, db2,
                    interval=500000, jump=4000, out_prefix="primers"):
     """
     :param genome: genome is a dict in name:seq,
@@ -40,26 +28,27 @@ def walk_chr_dense(genome, chro, db,
     :return:
     """
     primer_dict = {}
-    f_out = open(out_prefix + "_" + chro + ".txt", "w")
+    pos_str= tuple_to_pos_str((chro, start, end))
+    f_out = open(out_prefix + "_" + pos_str + ".txt", "w")
 
-    n = len(genome[chro]) / interval
+    n = len(genome[chro][start:end]) / interval
     i = 0
     while i < n:
         offset = 0
         while offset <= (interval / 2) / jump:
-            name, seq = chr_select(genome, chro, i * interval + offset * jump,
-                                   i * interval + (offset + 1) * jump)
-            # print name,seq
+            name, seq = chr_select(genome, chro, start+i * interval + offset * jump,
+                                   start+i * interval + (offset + 1) * jump)
+            print ("Design primers for ", name)
             if "N" in seq.upper():
                 offset += 1
             else:
                 myprimer = my_design_primer(name=name, seq=seq)
-                primer_status = primer_check(myprimer,db)
-                if primer_status == 0:
+                primer_used = primer_check(myprimer,db1,db2, debugmod=True)
+                if primer_used == 0:
                     offset += 1
                 else:
                     offset = interval / jump  # just >(interval/2)/jump, used to indicate the sucess of primer finding
-                    left, right, product_size = primer_status
+                    left, right, product_size = primer_used
         i += 1
 
         if offset == interval / jump:
@@ -71,3 +60,50 @@ def walk_chr_dense(genome, chro, db,
 
     f_out.close()
     return primer_dict
+
+
+def makeblastdb(genomefile):
+    cline = NcbimakeblastdbCommandline(dbtype="nucl",
+                                       input_file=genomefile)
+    NcbimakeblastdbCommandline(cmd='makeblastdb', dbtype='prot', input_file='NC_005816.faa')
+    print(cline)
+    cline()
+
+
+def checkblastdb(genomefile):
+    """
+    check if the blastdb exist, if not, create one
+    """
+    dbfile=genomefile+".nsq"
+    if exists(dbfile):
+        return 0
+    else:
+        makeblastdb(genomefile)
+        return 1
+
+
+def flow_walk_chr(wkdir, genome1, genome2, pos_str, interval=4000, jump=400, out_prefix="primers"):
+    """
+
+    genome1: the genome fasta file used to design primers
+    genome2: the genome fasta used to blast against, the primer should not amplify any product in genome2
+
+    pos_str: chro:start-end, same as the input for igv or ucsc web browser
+
+    """
+    # TODO: could default wkdir to os.getcwd() later
+    os.chdir(wkdir)
+
+    # first check whether the blastdb for genome1 and genome2 exists; if not, create it
+    checkblastdb(genome1)
+    checkblastdb(genome2)
+
+    # main
+    g1=dic2dic(fasta2dic(genome1))
+    #g2=dic2dic(fasta2dic(genome2))
+    chro, start, end = pos_str_to_tuple(pos_str)
+
+    primer_dict=walk_chr_dense(genome=g1, chro=chro, start=start, end=end,
+                               db1=genome1,db2=genome2,
+                   interval=interval, jump=jump, out_prefix=out_prefix)
+    print(primer_dict)
diff --git a/test/test_primer_check.py b/test/test_primer_check.py
diff --git a/test/test_walk_chr.py b/test/test_walk_chr.py