From 81ff5646f5731583138e7bd773bdbc4bfcc52436 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 12 Apr 2026 22:00:47 +0200 Subject: [PATCH 1/5] Added miniprot as second search tool for candidate region determination and enabled single gene search --- fdog/data/conda_requirements.yml | 1 + fdog/data/dependencies.txt | 1 + fdog/fDOGassembly.py | 208 ++++++++++++++++++++++++------- 3 files changed, 164 insertions(+), 46 deletions(-) diff --git a/fdog/data/conda_requirements.yml b/fdog/data/conda_requirements.yml index d5deb2b..818e6b4 100644 --- a/fdog/data/conda_requirements.yml +++ b/fdog/data/conda_requirements.yml @@ -6,3 +6,4 @@ mafft muscle=5.1 augustus=3.5.0 metaeuk +miniprot diff --git a/fdog/data/dependencies.txt b/fdog/data/dependencies.txt index 6b36db4..e3502f7 100644 --- a/fdog/data/dependencies.txt +++ b/fdog/data/dependencies.txt @@ -5,3 +5,4 @@ mafft muscle augustus metaeuk +miniprot diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 850304a..7c20569 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -61,7 +61,6 @@ def check_ref_spec(species_list, fasta_file): print("Reference species is not part of the ortholog group. 
def parse_miniprot(line, results, cutoff):
    """Parse one PAF line produced by miniprot and record the hit in *results*.

    PAF columns (0-based index): 0 protein name, 1 protein length,
    2 protein start, 3 protein end, 4 strand ('+'/'-'), 5 contig name,
    6 contig length, 7 contig start, 8 contig end, 9 matching bases,
    10 alignment length excl. introns, 11 mapping quality (0-255, 255 = missing).

    Args:
        line: one tab-separated PAF line (trailing newline allowed).
        results: dict mapping contig name -> list of hit records
            [start, end, quality, refstart, refend, strand, quality];
            the record layout mirrors parse_blast() so merge() can consume both.
        cutoff: unused; kept so the signature is interchangeable with parse_blast.

    Returns:
        (results, 0) — the 0 stands in for the e-value parse_blast reports;
        miniprot has no e-value.
    """
    fields = line.rstrip("\n").split("\t")
    refstart, refend = int(fields[2]), int(fields[3])
    strand, contig = fields[4], fields[5]
    start, end = int(fields[7]), int(fields[8])
    quality = int(fields[11])
    hit = [start, end, quality, refstart, refend, strand, quality]
    # Group hits per contig; merge() later fuses nearby hits into candidate regions.
    results.setdefault(contig, []).append(hit)
    return results, 0

def candidate_regions(intron_length, cutoff_evalue, tmp_path, searchTool, x=10):
    """Extract candidate genomic regions from the forward-search output.

    Reads tmp_path/blast_results.out (tblastn, outfmt 6;
    see http://www.metagenomics.wiki/tools/blast/blastn-output-format-6)
    or tmp_path/miniprot_results.out (PAF), merges overlapping/nearby hits
    into candidate regions, and for blast caps the result at the *x*
    best-scoring regions (miniprot output is already capped via -N).

    Returns:
        (regions_dict, number_of_regions), or (0, 0) when no hit was found.
    """
    if searchTool == "blast":
        result_path = tmp_path + "/blast_results.out"
    else:
        result_path = tmp_path + "/miniprot_results.out"

    results = {}
    with open(result_path, "r") as search_file:
        for line in search_file:
            if searchTool == "blast":
                results, evalue = parse_blast(line, results, cutoff_evalue)
            else:
                results, evalue = parse_miniprot(line, results, cutoff_evalue)

    if results == {}:
        return 0, 0

    candidate_regions, number_regions, score_list = merge(results, intron_length)
    # BUG FIX: the previous condition compared the *file object* to 'blast'
    # (search_file == 'blast'), which is always False, so the x-best cap was
    # never applied even for blast searches. Compare the tool name instead.
    if number_regions > x and searchTool == "blast":
        candidate_regions, number_regions = get_x_results(candidate_regions, x, score_list)
    return candidate_regions, number_regions

def check_blast_version(blast_db, mode):
    """Probe *blast_db* with blastp to detect BLAST-DB/version mismatches.

    Runs a tiny blastp query against the database and exits with an error
    message when the call fails — typically a database built with an
    incompatible makeblastdb version.

    NOTE(review): only mode == "prot" triggers the probe, but the caller in
    tblastn() passes "nt", so the check is currently a no-op there — confirm
    whether a nucleotide-database probe was intended.
    """
    fdog_path = os.path.realpath(__file__).replace('fDOGassembly.py', '')
    if mode == "prot":
        query = os.path.join(fdog_path, 'data', 'infile.fa')
        try:
            cmd = ["blastp", "-query", query, "-db", blast_db]
            subprocess.run(cmd, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            sys.exit(f"ERROR: Error running BLAST (probably conflict with BLAST DB versions)\n{e.stderr}")
#print(file) @@ -469,12 +520,9 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva except KeyError: #print("The fDOG reference species isn't part of the core ortholog group, ... exciting") return 0, seed - if searchTool == "blast": - cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile - starting_subprocess(cmd, mode) - else: - print("diamonds are the girls best friends") - ##### diamond call + + cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + starting_subprocess(cmd, mode) alg_file = open(tmp_path + "blast_" + fdog_ref_species, "r") lines = alg_file.readlines() @@ -872,18 +920,7 @@ def run_fas(cmd): sys.exit() return output -def ortholog_search_tblastn(args): - (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms) = args - output = [] - asNamePath = asName.replace('@', '_') - cmd = 'mkdir ' + out + '/tmp/' + asNamePath - starting_subprocess(cmd, 'silent') - tmp_path = out + "tmp/" + asNamePath + "/" - candidatesOutFile = tmp_path + group + ".candidates.fa" - - output.append("Searching in species " + asName + "\n") - assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" - db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" +def tblastn(assemblyDir, asName, consensus_path, evalue, tmp_path, mode, output, db_path, assembly_path): blast_dir_path = assemblyDir + "/" + asName 
def miniprot(assemblyDir, asName, consensus_path, evalue, tmp_path, mode, output, assembly_path):
    """Run a miniprot search of *consensus_path* against assembly *asName*.

    Builds (and caches) the miniprot index for the assembly on first use,
    then writes the PAF hits to tmp_path/miniprot_results.out.

    Args:
        assemblyDir: root directory holding one sub-directory per assembly.
        asName: assembly/species identifier.
        consensus_path: FASTA with the protein query (consensus or reference gene).
        evalue: unused here; kept so the signature mirrors the tblastn path.
        tmp_path: per-species temporary directory for the result file.
        mode: verbosity mode forwarded to starting_subprocess for index building.
        output: list of log lines, appended to in place (timing message).
        assembly_path: genome FASTA used to build the index.

    Returns:
        0 on success, 1 when the miniprot search itself failed.
    """
    mini_db_path = assemblyDir + "/" + asName + "/miniprot_dir/" + asName + ".mpi"
    if not os.path.exists(mini_db_path):
        # Build the miniprot index once per assembly and cache it on disk.
        cmd = 'mkdir -p ' + os.path.dirname(assemblyDir + "/" + asName + "/miniprot_dir/")
        starting_subprocess(cmd, 'silent')
        cmd = 'miniprot -d ' + mini_db_path + ' ' + assembly_path
        starting_subprocess(cmd, mode)

    # FIX: dropped the leftover debug print(cmd) that polluted stdout for
    # every searched species.
    cmd = "miniprot %s %s > %s" % (mini_db_path, consensus_path, tmp_path + "/miniprot_results.out")
    time_miniprot_start = time.time()
    result = starting_subprocess(cmd, 'silent')
    time_miniprot = time.time() - time_miniprot_start
    if result == 1:
        return 1

    output.append("Time miniprot %s in species %s" % (str(time_miniprot), asName))
    return 0
average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms) = args + output = [] + asNamePath = asName.replace('@', '_') + cmd = 'mkdir ' + out + '/tmp/' + asNamePath + starting_subprocess(cmd, 'silent') + tmp_path = out + "tmp/" + asNamePath + "/" + candidatesOutFile = tmp_path + group + ".candidates.fa" + db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" + + output.append("Searching in species " + asName + "\n") + assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" + + if searchTool == 'blast': + exit_code = tblastn(assemblyDir, asName, consensus_path, evalue, tmp_path, mode, output, db_path, assembly_path) + if exit_code == 1: + output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName) + return [], candidatesOutFile, output + else: + exit_code = miniprot(assemblyDir, asName, consensus_path, evalue, tmp_path, mode, output, assembly_path) + + regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path, searchTool) if regions == 0: #no candidat region are available, no ortholog can be found output.append("No candidate region found for species %s!\n" % asName) @@ -916,7 +997,7 @@ def ortholog_search_tblastn(args): else: output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) extract_seq(regions, db_path, tmp_path, mode) - + print(regions) if gene_prediction == "augustus": ############### make Augustus PPX search ################################### time_augustus_start = time.time() @@ -960,7 +1041,7 @@ def blockProfiles(core_path, group, mode, out, msaTool): check_path(fasta_path) if msaTool == "muscle": if align_fn.get_muscle_version(msaTool) == 'v3': - print("muscle -quiet -in " + fasta_path + " -out " + msa_path) + cmd = "muscle -quiet -in " + fasta_path + " -out " + msa_path else: cmd = 
def extract_ref_gene(fasta, species_list):
    """Return the first sequence in *fasta* belonging to one of *species_list*.

    Used by --referenceGeneOnly to run the ortholog search with the reference
    species' own gene instead of a consensus sequence. Sequence IDs are
    expected to follow the fDOG convention "group|species|gene...".

    Args:
        fasta: path to the core-group FASTA file.
        species_list: species identifiers to accept (usually just the
            fDOG reference species).

    Returns:
        The first matching sequence record; exits with an error message when
        no sequence of any listed species is present.
    """
    seqs = readFasta(fasta)
    wanted = set(species_list)
    for seq in seqs:
        parts = seq.id.split('|')
        # Guard against malformed headers without a '|'-separated species
        # field — the previous version raised IndexError on those. Also
        # dropped a dead reassignment of the loop variable.
        if len(parts) > 1 and parts[1] in wanted:
            print("Gene of reference species identified:")
            print(seq.id)
            return seq
    sys.exit("No fitting reference gene found. Check if a sequence of the reference species is part of the ortholog group!")
(default: 0.00001)', action='store', default=0.00001, type=float) optional.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', action='store_true', default=False) optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. (default:muscle)', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') @@ -1119,8 +1215,10 @@ def main(): optional.add_argument('--augustusRefSpec', help='Augustus reference species identifier (use command: augustus --species=help to get precomputed augustus gene models)', action='store', default='') optional.add_argument('--augustusRefSpecFile', help='Mapping file tab seperated containing Assembly Names and augustus reference species that should be used', action='store', default='') optional.add_argument('--metaeukDb', help='Path to MetaEuk reference database', action='store', default='') + optional.add_argument('--miniprot', help='Path to MiniProt database', action='store', default='') optional.add_argument('--isoforms', help='All Isoforms of a gene passing the ortholog verification will be included in the output', action='store_true', default=False) optional.add_argument('--gff', help='GFF files will be included in output', action='store_true', default=False) + optional.add_argument('--referenceGeneOnly', help='Extracts the protein sequence of the reference Species from the core group and preforms ortholog search with this sequence only', default=False, action='store_true') args = parser.parse_args() # required @@ -1136,12 +1234,12 @@ def main(): tmp = args.tmp strict = args.strict checkCoorthologs = args.checkCoorthologsOff + refGene = args.referenceGeneOnly # print(checkCoorthologs) #others average_intron_length = args.avIntron length_extension = args.lengthExtension - #searchTool = args.searchTool - searchTool = 'blast' + searchTool = args.searchTool evalue = args.evalBlast msaTool = 
args.msaTool matrix = args.scoringmatrix @@ -1285,7 +1383,7 @@ def main(): ###################### preparations ######################################## - if augustus == True: + if augustus == True and refGene == False: group_computation_time_start = time.time() consensus_path = core_path + '/' + group + '/' + group + '.con' if check_path(consensus_path, exit=False) == 1: @@ -1293,11 +1391,11 @@ def main(): print(consensus_path) profile_path = core_path + '/' + group + '/' + group + '.prfl' if check_path(profile_path, exit=False) == 1: - profile_path = blockProfiles(core_path, group, mode, out, msaTool) + profile_path = blockProfiles(core_path, group, 'silent', out, msaTool) print(profile_path) group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start - else: + elif augustus != True and refGene == False: #print("test") profile_path = "" group_computation_time_start = time.time() @@ -1307,6 +1405,24 @@ def main(): #concatinade core_group sequences if metaeuk should be run without tblastn group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start + else: + group_computation_time_start = time.time() + consensus_path = core_path + '/' + group + '/' + group + '.con' + if augustus == True: + profile_path = core_path + '/' + group + '/' + group + '.prfl' + if check_path(profile_path, exit=False) == 1: + profile_path = blockProfiles(core_path, group, 'silent', out, msaTool) + print(profile_path) + else: + profile_path = "" + ref_gene = extract_ref_gene(core_path + '/' + group + '/' + group + '.fa', [fdog_ref_species]) + print(consensus_path) + with open(consensus_path, 'w') as file: + file.write('>' + ref_gene.id + '\n') + file.write(str(ref_gene.seq) + '\n') + group_computation_time_end = time.time() + time_group = group_computation_time_end - group_computation_time_start + ###################### ortholog search ##################################### @@ -1329,7 +1445,7 
@@ def main(): print("%s is not included in Augustus reference species mapping file. %s will be skipped" %(asName, asName)) print("Searching for orthologs ...", flush=True) - for i in tqdm(pool.imap_unordered(ortholog_search_tblastn, calls),total=len(calls)): + for i in tqdm(pool.imap_unordered(ortholog_search_forward, calls),total=len(calls)): ortholog_sequences.append([i[0], i[1]]) if mode == 'debug': for k in i[2]: @@ -1346,7 +1462,7 @@ def main(): args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms] except KeyError: print("%s is not included in Augustus reference species mapping file. %s will be skipped" % (asName, asName)) - reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args) + reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_forward(args) ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) if mode == 'debug': for k in output_ortholog_search: @@ -1377,7 +1493,7 @@ def main(): tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) cmd = ['fas.run', '--seed', fasta_path , '--query' , orthologsOutFile , '--annotation_dir' , tmp_path + 'anno_dir' ,'--bidirectional', '--tsv', '--phyloprofile', mappingFile, '--seed_id', fas_seed_id, '--out_dir', out, '--out_name', group] - # print(cmd) + #print(cmd) fas_out = run_fas(cmd) clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') From c1a35e1e8878f7ec4beb31db815177fb8dc2c1db Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 19 Apr 2026 17:28:27 +0200 Subject: [PATCH 2/5] enabled lowComplexity filter for blastp --- fdog/fDOGassembly.py | 63 ++++++++++++++++++++++---------------------- 1 file changed, 31 
insertions(+), 32 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 7c20569..a859525 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -165,7 +165,7 @@ def parse_blast(line, blast_results, cutoff): blast_results[node_name] = list else: blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]] - print(blast_results) + #print(blast_results) return blast_results, evalue def parse_miniprot(line, results, cutoff): @@ -181,9 +181,9 @@ def parse_miniprot(line, results, cutoff): #10 int Number of matching nucleotides #11 int Number of nucleotides in alignment excl. introns #12 int Mapping quality (0-255 with 255 for missing) - print(line) - print(results) - print(cutoff) + #print(line) + #print(results) + #print(cutoff) line = line.replace("\n", "") line_info = line.split("\t") @@ -197,7 +197,7 @@ def parse_miniprot(line, results, cutoff): results[contig] = list else: results[contig] = [[int(start),int(end), quality, int(refstart), int(refend), strand, quality]] - print(results) + #print(results) return results, 0 def get_x_results(blast_dic, x, score_list): @@ -218,7 +218,7 @@ def get_x_results(blast_dic, x, score_list): number_regions += len(key_list) return new_dic, number_regions -def candidate_regions(intron_length, cutoff_evalue, tmp_path, searchTool, x = 10): +def candidate_regions(intron_length, cutoff_evalue, tmp_path, searchTool, x): ###################### extracting candidate regions ######################## # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 @@ -239,15 +239,13 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path, searchTool, x = 10 results, evalue = parse_blast(line, results, cutoff_evalue) else: results, evalue = parse_miniprot(line, results, cutoff_evalue) - print(results) if results == {}: search_file.close() return 0,0 else: candidate_regions, number_regions, score_list = merge(results, intron_length) 
search_file.close() - print(score_list) - if number_regions > x: + if number_regions > x and search_file == 'blast': candidate_regions, number_regions = get_x_results(candidate_regions, x, score_list) return candidate_regions, number_regions @@ -293,7 +291,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug # augutus call cmd = "augustus --protein=1 --gff3=on --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" #print(cmd) - starting_subprocess(cmd, 'normal') + starting_subprocess(cmd, 'silent') # transfer augustus output to AS sequence #print(tmp_path) #print(key) @@ -387,7 +385,7 @@ def searching_for_db(assembly_path): def check_blast_version(blast_db, mode): """ Check if blast DBs are compatible with blastp version """ fdog_path = os.path.realpath(__file__).replace('fDOGassembly.py', '') - print(fdog_path) + #print(fdog_path) if mode == "prot": query = os.path.join(fdog_path, 'data', 'infile.fa') try: @@ -501,7 +499,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates #rejected return 0, distance_ref_hit, distance_hit_query -def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path, mode): +def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, checkCo, msaTool, matrix, dataPath, tmp_path, mode, filter): # the backward search uses the genes predicted from augustus and makes a blastp search #the blastp search is against all species that are part of the core_ortholog group if the option --strict was chosen or only against the ref taxa seedDic = getSeedInfo(fasta_path) @@ -521,7 +519,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva #print("The fDOG reference species isn't 
part of the core ortholog group, ... exciting") return 0, seed - cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + " -seg " + filter starting_subprocess(cmd, mode) alg_file = open(tmp_path + "blast_" + fdog_ref_species, "r") @@ -947,16 +945,15 @@ def tblastn(assemblyDir, asName, consensus_path, evalue, tmp_path, mode, output, output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName)) return 0 -def miniprot(assemblyDir, asName, consensus_path, evalue, tmp_path, mode, output, assembly_path): +def miniprot(assemblyDir, asName, consensus_path, tmp_path, mode, output, assembly_path, number_candidates): mini_db_path = assemblyDir + "/" + asName + "/miniprot_dir/" + asName + ".mpi" if not os.path.exists(mini_db_path): cmd = 'mkdir -p ' + os.path.dirname(assemblyDir + "/" + asName + "/miniprot_dir/") starting_subprocess(cmd, 'silent') cmd = 'miniprot -d ' + mini_db_path + ' ' + assembly_path starting_subprocess(cmd, mode) - - cmd = "miniprot %s %s > %s" % (mini_db_path, consensus_path, tmp_path + "/miniprot_results.out") - print(cmd) + cmd = "miniprot %s %s -N %i > %s" % (mini_db_path, consensus_path, number_candidates, tmp_path + "/miniprot_results.out") + #print(cmd) time_miniprot_start = time.time() result = starting_subprocess(cmd, 'silent') time_miniprot_end = time.time() @@ -968,7 +965,7 @@ def miniprot(assemblyDir, asName, consensus_path, evalue, tmp_path, mode, output return 0 def ortholog_search_forward(args): - (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, 
length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms) = args + (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter) = args output = [] asNamePath = asName.replace('@', '_') cmd = 'mkdir ' + out + '/tmp/' + asNamePath @@ -986,9 +983,9 @@ def ortholog_search_forward(args): output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName) return [], candidatesOutFile, output else: - exit_code = miniprot(assemblyDir, asName, consensus_path, evalue, tmp_path, mode, output, assembly_path) + exit_code = miniprot(assemblyDir, asName, consensus_path, tmp_path, mode, output, assembly_path, number_candidates) - regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path, searchTool) + regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path, searchTool, number_candidates) if regions == 0: #no candidat region are available, no ortholog can be found output.append("No candidate region found for species %s!\n" % asName) @@ -997,7 +994,7 @@ def ortholog_search_forward(args): else: output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) extract_seq(regions, db_path, tmp_path, mode) - print(regions) + #print(regions) if gene_prediction == "augustus": ############### make Augustus PPX search ################################### time_augustus_start = time.time() @@ -1021,7 +1018,7 @@ def ortholog_search_forward(args): #print("No genes found at candidate regions\n") return [], candidatesOutFile, output - reciprocal_sequences, taxa = 
backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, checkCoorthologs, msaTool, matrix, dataPath, tmp_path, mode, low_complexity_filter) if reciprocal_sequences == 0: if regions != 0: @@ -1062,7 +1059,7 @@ def blockProfiles(core_path, group, mode, out, msaTool): print("Building block profiles failed. Using prepareAlign to convert alignment\n") new_path = core_path + group +"/"+ group + "_new.aln" cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path - starting_subprocess(cmd, mode) + starting_subprocess(cmd, 'silent') cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path starting_subprocess(cmd, 'silent') print(" \t ...finished \n", flush=True) @@ -1203,7 +1200,8 @@ def main(): optional.add_argument('--checkCoorthologsOff', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_false', default=True) optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. 
(default: blosum62)', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[]) - #optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') + optional.add_argument('--numberCandidates', help='Number of candidate sequences to consider', action='store', default=10, type=int) + optional.add_argument('--lowComplexityFilter', help='Switch the low complexity filter for the blastp search on.', action='store', choices=['yes', 'no'], default='no') optional.add_argument('--fasoff', help='Turn off FAS support', action='store_true', default=False) optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') optional.add_argument('--searchTaxa', help='List of Taxa to search in, (default: all species located in assembly_dir)', action='store', nargs="+", default=[]) @@ -1235,7 +1233,6 @@ def main(): strict = args.strict checkCoorthologs = args.checkCoorthologsOff refGene = args.referenceGeneOnly - # print(checkCoorthologs) #others average_intron_length = args.avIntron length_extension = args.lengthExtension @@ -1255,6 +1252,8 @@ def main(): metaeuk_db = args.metaeukDb isoforms = args.isoforms gff = args.gff + number_candidates = args.numberCandidates + low_complexity_filter = args.lowComplexityFilter #gene prediction tool augustus = args.augustus @@ -1388,11 +1387,11 @@ def 
main(): consensus_path = core_path + '/' + group + '/' + group + '.con' if check_path(consensus_path, exit=False) == 1: consensus_path = consensusSequence(core_path, group, mode, out) - print(consensus_path) + #print(consensus_path) profile_path = core_path + '/' + group + '/' + group + '.prfl' if check_path(profile_path, exit=False) == 1: profile_path = blockProfiles(core_path, group, 'silent', out, msaTool) - print(profile_path) + #print(profile_path) group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start elif augustus != True and refGene == False: @@ -1416,7 +1415,7 @@ def main(): else: profile_path = "" ref_gene = extract_ref_gene(core_path + '/' + group + '/' + group + '.fa', [fdog_ref_species]) - print(consensus_path) + #print(consensus_path) with open(consensus_path, 'w') as file: file.write('>' + ref_gene.id + '\n') file.write(str(ref_gene.seq) + '\n') @@ -1437,10 +1436,10 @@ def main(): pool = mp.Pool(cpus) for asName in assembly_names: if mapping_augustus == '': - calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms]) + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter]) else: try: - calls.append([asName, out, assemblyDir, consensus_path, aug_ref_dict[asName], group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, 
isoforms]) + calls.append([asName, out, assemblyDir, consensus_path, aug_ref_dict[asName], group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter]) except KeyError: print("%s is not included in Augustus reference species mapping file. %s will be skipped" %(asName, asName)) @@ -1456,10 +1455,10 @@ def main(): ###################### computation species wise ################ for asName in tqdm(assembly_names): if mapping_augustus == '': - args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms] + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter] else: try: - args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms] + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter] except KeyError: print("%s is not included in Augustus reference species mapping file. 
%s will be skipped" % (asName, asName)) reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_forward(args) From 7dc9ac9bb017ed8ea3c1643669d729ade92959bf Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 19 Apr 2026 17:42:46 +0200 Subject: [PATCH 3/5] added miniprot to conda_requirements.txt --- fdog/data/conda_requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/data/conda_requirements.txt b/fdog/data/conda_requirements.txt index e1b8eba..a41972d 100644 --- a/fdog/data/conda_requirements.txt +++ b/fdog/data/conda_requirements.txt @@ -6,3 +6,4 @@ mafft muscle>=5.1 augustus>=3.5.0 metaeuk +miniprot From 0c36c393a883da328bf517300be5f3f68ed7309e Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sat, 25 Apr 2026 18:33:12 +0200 Subject: [PATCH 4/5] New fast track mode included using miniprot results for ortholog search --- fdog/fDOGassembly.py | 329 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 280 insertions(+), 49 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a859525..b79bb79 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -35,6 +35,7 @@ from tqdm import tqdm from pathlib import Path import pandas as pd +import re ########################### functions ########################################## def check_path(path, exit=True): @@ -200,6 +201,101 @@ def parse_miniprot(line, results, cutoff): #print(results) return results, 0 +def parse_miniprot_predictions(tmp_path, candidatesOutFile, asName, group_name): + search_file = open(os.path.join(tmp_path, "miniprot_results.out"), "r") + + # --- 1. 
Parse hit blocks --- + hits = [] + current = None + + for line in search_file: + line = line.rstrip("\n") + + if line.startswith("##PAF"): + if current is not None: + hits.append(current) + + fields = line.split("\t") + tstart = int(fields[8]) + tend = int(fields[9]) + aln_len = int(fields[11]) + + as_score = None + for f in fields[12:]: + m = re.match(r"AS:i:(-?\d+)", f) + if m: + as_score = int(m.group(1)) + break + + current = { + "paf_line": line, + "sta_line": None, + "gff_lines": [], + "gff_id": None, + "tstart": tstart, + "tend": tend, + "aln_len": aln_len, + "as_score": as_score, + "sequence": None, + } + + elif line.startswith("##STA") and current is not None: + current["sta_line"] = line + current["sequence"] = line.split(None, 1)[1] if len(line.split(None, 1)) > 1 else "" + + elif line.startswith("##"): + pass + + elif current is not None and line.strip(): + current["gff_lines"].append(line) + cols = line.split("\t") + if len(cols) >= 9 and cols[2] == "mRNA": + m = re.search(r"ID=([^;]+)", cols[8]) + if m: + current["gff_id"] = m.group(1) + + if current is not None: + hits.append(current) + + search_file.close() + + # --- 2. Group overlapping hits --- + groups = [] + + for hit in hits: + placed = False + for group in groups: + for member in group: + if hit["tstart"] <= member["tend"] and hit["tend"] >= member["tstart"]: + group.append(hit) + placed = True + break + if placed: + break + if not placed: + groups.append([hit]) + + # --- 3. Select best hit per group (highest AS score, then longest alignment) --- + best_hits = [] + for group in groups: + best = max( + group, + key=lambda h: ( + h["as_score"] if h["as_score"] is not None else float("-inf"), + h["aln_len"], + ), + ) + best_hits.append(best) + + # --- 4. 
Write FASTA output --- + with open(candidatesOutFile, "w") as out: + for hit in best_hits: + header = hit["gff_id"] if hit["gff_id"] else "unknown" + header_full = f"{group_name}|{asName}|{header}" + seq = hit["sequence"] if hit["sequence"] else "" + out.write(f">{header_full}\n{seq}\n") + return candidatesOutFile + def get_x_results(blast_dic, x, score_list): new_dic = {} @@ -218,6 +314,75 @@ def get_x_results(blast_dic, x, score_list): number_regions += len(key_list) return new_dic, number_regions +def merge_overlapping_entries(format_dict): + """ + For each node, merge overlapping intervals based on start/end positions. + Overlapping entries in the original list remain unchanged. + Returns a new dict with merged (non-overlapping) intervals per node, + the total number of merged entries across all nodes, and a flat list of all scores. + + Merging rules: + - start: minimum of all overlapping entries + - end: maximum of all overlapping entries + - evalue: minimum (best) evalue of all overlapping entries + - qstart: minimum of all overlapping entries + - qend: maximum of all overlapping entries + - strand: must be identical across all overlapping entries, raises ValueError otherwise + - score: minimum of all overlapping entries + + Args: + format_dict: {node_name: [(start, end, evalue, qstart, qend, strand, score), ...]} + + Returns: + merged_dict: {node_name: [(min_start, max_end, min_evalue, min_qstart, max_qend, strand, min_score), ...]} + total_entries: int, total number of merged entries across all nodes + all_scores: list of all scores (float/int) from all merged entries, in node iteration order + """ + merged_dict = {} + total_entries = 0 + all_scores = [] + + for node_name, entries in format_dict.items(): + if not entries: + merged_dict[node_name] = [] + continue + + # Sort entries by start position to make overlap detection easier + sorted_entries = sorted(entries, key=lambda x: x[0]) + + merged = [] + + # Initialize the running interval with the first 
entry + curr_start, curr_end, curr_evalue, curr_qstart, curr_qend, curr_strand, curr_score = sorted_entries[0] + + for entry in sorted_entries[1:]: + entry_start, entry_end, entry_evalue, entry_qstart, entry_qend, entry_strand, entry_score = entry + + # Check if the current entry overlaps with the running interval + if entry_start <= curr_end and entry_strand == curr_strand: + # Merge: extend interval and keep best (minimum) evalue and score + curr_end = max(curr_end, entry_end) + curr_qstart = min(curr_qstart, entry_qstart) + curr_qend = max(curr_qend, entry_qend) + curr_evalue = min(curr_evalue, entry_evalue) + curr_score = min(curr_score, entry_score) + + else: + # No overlap: save the completed merged interval and start a new one + merged.append((curr_start, curr_end, curr_evalue, curr_qstart, curr_qend, curr_strand, curr_score)) + curr_start, curr_end, curr_evalue, curr_qstart, curr_qend, curr_strand, curr_score = entry + + # Append the last remaining interval + merged.append((curr_start, curr_end, curr_evalue, curr_qstart, curr_qend, curr_strand, curr_score)) + + merged_dict[node_name] = merged + + # Accumulate totals across all nodes + total_entries += len(merged) + all_scores.extend(entry[6] for entry in merged) + + return merged_dict, total_entries, all_scores + def candidate_regions(intron_length, cutoff_evalue, tmp_path, searchTool, x): ###################### extracting candidate regions ######################## @@ -243,7 +408,10 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path, searchTool, x): search_file.close() return 0,0 else: - candidate_regions, number_regions, score_list = merge(results, intron_length) + if searchTool == "blast": + candidate_regions, number_regions, score_list = merge(results, intron_length) + else: + candidate_regions, number_regions, score_list = merge_overlapping_entries(results) search_file.close() if number_regions > x and search_file == 'blast': candidate_regions, number_regions = 
get_x_results(candidate_regions, x, score_list) @@ -732,6 +900,8 @@ def getLocationFromGff(gff_file, name, tool): #print(name) if tool == 'metaeuk': gene_count = int(name.split('_')[-1:][0]) + elif tool == "miniprot": + gene_count = int(name[2:]) else: gene_count = int(name.split('.')[-2].replace('g', '').split('_')[-1:][0]) counter = 0 @@ -741,7 +911,7 @@ def getLocationFromGff(gff_file, name, tool): pass else: contig, source, type, start, end, score, strand, phase, att = line.split('\t') - if type == 'gene': + if type == 'gene' or tool == "miniprot" and type == 'mRNA': counter += 1 if counter == gene_count: position = [contig, int(start), int(end), strand] @@ -839,7 +1009,10 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci distance = distances[ref_id , name] id = name.split('|')[2] if isoforms == False: - gff_file = tmp_path + '/' + '_'.join(id.split('_')[0:-1]) + '.gff' + if gene_prediction != "miniprot": + gff_file = tmp_path + '/' + '_'.join(id.split('_')[0:-1]) + '.gff' + else: + gff_file = tmp_path + '/miniprot_results.out' position[name] = getLocationFromGff(gff_file, id, gene_prediction) if distance <= min_dist: min_dist = distance @@ -945,14 +1118,50 @@ def tblastn(assemblyDir, asName, consensus_path, evalue, tmp_path, mode, output, output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName)) return 0 -def miniprot(assemblyDir, asName, consensus_path, tmp_path, mode, output, assembly_path, number_candidates): +def miniprot(assemblyDir, asName, consensus_path, tmp_path, mode, output, assembly_path, number_candidates, group): mini_db_path = assemblyDir + "/" + asName + "/miniprot_dir/" + asName + ".mpi" + asName = asName.replace('@', '_') + core_fasta_path = tmp_path.replace(asName,"") + group + ".fa" + #print(core_fasta_path) + + #create combined input file for miniprot, concat core group genes and consensus sequence + combined_fasta_path = tmp_path + "/miniprot_in.fa" + cmd = 'cat ' + core_fasta_path + ' 
' + consensus_path + ' > ' + combined_fasta_path + starting_subprocess(cmd, 'silent') + if not os.path.exists(mini_db_path): cmd = 'mkdir -p ' + os.path.dirname(assemblyDir + "/" + asName + "/miniprot_dir/") starting_subprocess(cmd, 'silent') cmd = 'miniprot -d ' + mini_db_path + ' ' + assembly_path starting_subprocess(cmd, mode) - cmd = "miniprot %s %s -N %i > %s" % (mini_db_path, consensus_path, number_candidates, tmp_path + "/miniprot_results.out") + cmd = "miniprot -I %s %s --outn %i > %s" % (mini_db_path, combined_fasta_path, number_candidates, tmp_path + "/miniprot_results.out") + #print(cmd) + time_miniprot_start = time.time() + result = starting_subprocess(cmd, 'silent') + time_miniprot_end = time.time() + time_miniprot = time_miniprot_end - time_miniprot_start + if result == 1: + return 1 + + output.append("Time miniprot %s in species %s" % (str(time_miniprot), asName)) + return 0 + +def miniprot_fast(assemblyDir, asName, consensus_path, tmp_path, mode, output, assembly_path, number_candidates, group): + mini_db_path = assemblyDir + "/" + asName + "/miniprot_dir/" + asName + ".mpi" + asName = asName.replace('@', '_') + core_fasta_path = tmp_path.replace(asName,"") + group + ".fa" + + #create combined input file for miniprot, concat core group genes and consensus sequence + combined_fasta_path = tmp_path + "/miniprot_in.fa" + cmd = 'cat ' + core_fasta_path + ' ' + consensus_path + ' > ' + combined_fasta_path + starting_subprocess(cmd, 'silent') + + if not os.path.exists(mini_db_path): + cmd = 'mkdir -p ' + os.path.dirname(assemblyDir + "/" + asName + "/miniprot_dir/") + starting_subprocess(cmd, 'silent') + cmd = 'miniprot -d ' + mini_db_path + ' ' + assembly_path + starting_subprocess(cmd, mode) + cmd = "miniprot -I --trans --gff %s %s --outn %i > %s" % (mini_db_path, combined_fasta_path, number_candidates, tmp_path + "/miniprot_results.out") #print(cmd) time_miniprot_start = time.time() result = starting_subprocess(cmd, 'silent') @@ -965,7 +1174,7 @@ def 
miniprot(assemblyDir, asName, consensus_path, tmp_path, mode, output, assemb return 0 def ortholog_search_forward(args): - (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter) = args + (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter,fast) = args output = [] asNamePath = asName.replace('@', '_') cmd = 'mkdir ' + out + '/tmp/' + asNamePath @@ -973,6 +1182,7 @@ def ortholog_search_forward(args): tmp_path = out + "tmp/" + asNamePath + "/" candidatesOutFile = tmp_path + group + ".candidates.fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" + regions = 0 output.append("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" @@ -982,44 +1192,50 @@ def ortholog_search_forward(args): if exit_code == 1: output.append("The tblastn search takes too long for species %s. Skipping species ..." 
% asName) return [], candidatesOutFile, output - else: - exit_code = miniprot(assemblyDir, asName, consensus_path, tmp_path, mode, output, assembly_path, number_candidates) - - regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path, searchTool, number_candidates) - if regions == 0: - #no candidat region are available, no ortholog can be found - output.append("No candidate region found for species %s!\n" % asName) - return [], candidatesOutFile, output + elif searchTool == 'miniprot' and fast == False: + exit_code = miniprot(assemblyDir, asName, consensus_path, tmp_path, mode, output, assembly_path, number_candidates, group) + elif fast == True: + exit_code = miniprot_fast(assemblyDir, asName, consensus_path, tmp_path, mode, output, assembly_path, number_candidates, group) + + if fast == False: + regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path, searchTool, number_candidates) + if regions == 0: + #no candidat region are available, no ortholog can be found + output.append("No candidate region found for species %s!\n" % asName) + return [], candidatesOutFile, output - else: - output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) - extract_seq(regions, db_path, tmp_path, mode) + else: + output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) + extract_seq(regions, db_path, tmp_path, mode) #print(regions) - if gene_prediction == "augustus": - ############### make Augustus PPX search ################################### - time_augustus_start = time.time() - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - time_augustus_end = time.time() - time_augustus = time_augustus_end - time_augustus_start - output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) - else: - time_metaeuk_start = time.time() - if metaeuk_db == '': - db = 
fasta_path + if gene_prediction == "augustus": + ############### make Augustus PPX search ################################### + time_augustus_start = time.time() + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + time_augustus_end = time.time() + time_augustus = time_augustus_end - time_augustus_start + output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) else: - db = metaeuk_db - metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, db) - time_metaeuk_end = time.time() - time_metaeuk = time_metaeuk_end - time_metaeuk_start - output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName)) - - ################# backward search to filter for orthologs################### - if int(os.path.getsize(candidatesOutFile)) <= 0: - #print("No genes found at candidate regions\n") - return [], candidatesOutFile, output - + time_metaeuk_start = time.time() + if metaeuk_db == '': + db = fasta_path + else: + db = metaeuk_db + metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, db) + time_metaeuk_end = time.time() + time_metaeuk = time_metaeuk_end - time_metaeuk_start + output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName)) + + ################# backward search to filter for orthologs################### + if int(os.path.getsize(candidatesOutFile)) <= 0: + #print("No genes found at candidate regions\n") + return [], candidatesOutFile, output + + else: + candidatesOutFile = parse_miniprot_predictions(tmp_path, candidatesOutFile,asName, group) reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, checkCoorthologs, msaTool, matrix, dataPath, tmp_path, mode, low_complexity_filter) + if reciprocal_sequences == 0: if regions != 0: output.append("No ortholog fulfilled the reciprocity criteria for species %s.\n" % 
asName) @@ -1085,6 +1301,7 @@ def consensusSequence(core_path, group, mode, out): def createGff(ortholog_sequences, out_folder, tool): #print(ortholog_sequences) #print(out_folder) + #print(tool) gff_folder = out_folder + "/gff/" os.system('mkdir %s >/dev/null 2>&1' %(gff_folder)) types_set = set(['gene', 'CDS', 'transcript', 'mRNA', 'exon']) @@ -1103,10 +1320,15 @@ def createGff(ortholog_sequences, out_folder, tool): region = '_'.join(gene.split('_')[0:-1]) if tool == 'metaeuk': gene_count = int(gene.split('_')[-1:][0]) + elif tool == "miniprot": + gene_count = int(gene[2:]) else: gene_count = int(gene.split('.')[-2].replace('g', '').split('_')[-1:][0]) #print(region, gene_count) - gff_file_gene = "%s/tmp/%s/%s.gff" %(out_folder, species.replace('@', '_'), region) + if tool != 'miniprot': + gff_file_gene = "%s/tmp/%s/%s.gff" %(out_folder, species.replace('@', '_'), region) + else: + gff_file_gene = "%s/tmp/%s/miniprot_results.out" %(out_folder, species.replace('@', '_')) #print(gff_file_gene) with open(gff_file_gene, 'r') as gff: counter = 0 @@ -1116,7 +1338,7 @@ def createGff(ortholog_sequences, out_folder, tool): else: line=line.rstrip() contig, source, type, start, end, score, strand, phase, att = line.split('\t') - if type == 'gene': + if type == 'gene' or tool == 'miniprot' and type == 'mRNA': counter += 1 if counter == gene_count: if source == 'AUGUSTUS': @@ -1216,6 +1438,7 @@ def main(): optional.add_argument('--miniprot', help='Path to MiniProt database', action='store', default='') optional.add_argument('--isoforms', help='All Isoforms of a gene passing the ortholog verification will be included in the output', action='store_true', default=False) optional.add_argument('--gff', help='GFF files will be included in output', action='store_true', default=False) + optional.add_argument('--fast', help='Enable FAST mode for faster processing by using miniprot only', action='store_true', default=False) optional.add_argument('--referenceGeneOnly', help='Extracts 
the protein sequence of the reference Species from the core group and preforms ortholog search with this sequence only', default=False, action='store_true') args = parser.parse_args() @@ -1254,6 +1477,7 @@ def main(): gff = args.gff number_candidates = args.numberCandidates low_complexity_filter = args.lowComplexityFilter + fast = args.fast #gene prediction tool augustus = args.augustus @@ -1265,9 +1489,11 @@ def main(): if mapping_augustus != '': check_path(mapping_augustus) aug_ref_dict = getAugustusRefSpec(mapping_augustus) + elif fast == True: + gene_prediction = "miniprot" else: gene_prediction = "metaeuk" - if metaeuk_db == '': + if metaeuk_db == '' and fast != True: print("MetaEuk DB is required when using MetaEuk as gene prediction tool") return 1 @@ -1377,6 +1603,11 @@ def main(): cmd = 'mkdir ' + out + '/tmp' starting_subprocess(cmd, 'silent') + #link fasta group to tmp + cmd = 'ln -s ' + os.path.abspath(fasta_path) + ' ' + tmp_folder + '/' + group + '.fa' + #print(cmd) + starting_subprocess(cmd, 'normal') + print("Gene: " + group, flush=True) print("fDOG reference species: " + fdog_ref_species + " \n",flush=True) @@ -1404,7 +1635,7 @@ def main(): #concatinade core_group sequences if metaeuk should be run without tblastn group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start - else: + elif fast == False and refGene == True or refGene == True and searchTool == 'miniprot': group_computation_time_start = time.time() consensus_path = core_path + '/' + group + '/' + group + '.con' if augustus == True: @@ -1421,8 +1652,8 @@ def main(): file.write(str(ref_gene.seq) + '\n') group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start - - + else: + sys.exit("--referenceGeneOnly can not be used in combination with --searchTool 'miniprot' or --fast. 
Please remove --referenceGeneOnly if miniprot should be used") ###################### ortholog search ##################################### @@ -1436,10 +1667,10 @@ def main(): pool = mp.Pool(cpus) for asName in assembly_names: if mapping_augustus == '': - calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter]) + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter, fast]) else: try: - calls.append([asName, out, assemblyDir, consensus_path, aug_ref_dict[asName], group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter]) + calls.append([asName, out, assemblyDir, consensus_path, aug_ref_dict[asName], group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter, fast]) except KeyError: print("%s is not included in Augustus reference species mapping file. 
%s will be skipped" %(asName, asName)) @@ -1455,10 +1686,10 @@ def main(): ###################### computation species wise ################ for asName in tqdm(assembly_names): if mapping_augustus == '': - args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter] + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter, fast] else: try: - args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter] + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter, fast] except KeyError: print("%s is not included in Augustus reference species mapping file. 
%s will be skipped" % (asName, asName)) reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_forward(args) From fc437cd5d9a282a2d269d59eea5be6c113188357 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 10 May 2026 20:47:59 +0200 Subject: [PATCH 5/5] new fallback strategy using miniprot predictions if metaeuk or augustus mode fail to predict an ortholog --- fdog/fDOGassembly.py | 94 ++++++++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 25 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b79bb79..28f1f97 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -185,12 +185,11 @@ def parse_miniprot(line, results, cutoff): #print(line) #print(results) #print(cutoff) - + line = line.replace("\n", "") line_info = line.split("\t") #add region to dictionary - - refstart, refend, strand, contig, start, end, quality = int(line_info[2]), int(line_info[3]), line_info[4], line_info[5], int(line_info[7]), int(line_info[8]), int(line_info[11]) + refstart, refend, strand, contig, start, end, quality = int(line_info[3]), int(line_info[4]), line_info[5], line_info[6], int(line_info[8]), int(line_info[9]), int(line_info[12]) if contig in results: list = results[contig] @@ -403,7 +402,8 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path, searchTool, x): if searchTool == "blast": results, evalue = parse_blast(line, results, cutoff_evalue) else: - results, evalue = parse_miniprot(line, results, cutoff_evalue) + if line.startswith("##PAF") == True: + results, evalue = parse_miniprot(line, results, cutoff_evalue) if results == {}: search_file.close() return 0,0 @@ -457,9 +457,24 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug end = str(i[1] + length_extension) name = key + "_" + str(counter) # augutus call + #print(augustus_ref_species) cmd = "augustus --protein=1 --gff3=on --proteinprofile=" + profile_path + " --predictionStart=" + start + " 
--predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" #print(cmd) - starting_subprocess(cmd, 'silent') + result = starting_subprocess(cmd, 'silent') + # Error handling + if result == 1: + print(f"[WARNING] Augustus timed out for {name}") + elif result is not None: + stderr_output = result.stderr.decode('utf-8').strip() if result.stderr else "" + + if stderr_output and any(kw in stderr_output.lower() + for kw in ["error", "failed", "abort", "exception"]): + print(f"[WARNING] Augustus reported an error for species {ass_name} region {name} from {start} to {end}:\n{stderr_output}") + + if result.returncode != 0: + print(f"[WARNING] Augustus exited with code {result.returncode} for species {ass_name} region {name} from {start} to {end}") + if stderr_output: + print(f" Details: {stderr_output}") # transfer augustus output to AS sequence #print(tmp_path) #print(key) @@ -505,7 +520,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group #metaeuk call sensitive #cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 --remove-tmp-files -s 6" #print(cmd) - cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 15 --max-overlap 15 --min-intron 5 --overlap 1 -s 4.5 --remove-tmp-files" + cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 15 --max-overlap 15 --min-intron 5 --overlap 1 -s 6 --split 0 --remove-tmp-files" # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 starting_subprocess(cmd, mode) # parsing header and sequences @@ -803,7 +818,6 @@ def 
backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva #print("No ortholog was found with option --strict") return 0, seed - #print(orthologs) orthologs = set(orthologs) return list(orthologs), seed @@ -896,14 +910,24 @@ def cleanup(tmp, tmp_path): if file.endswith(".fasta"): os.remove(os.path.join(root, file)) -def getLocationFromGff(gff_file, name, tool): +def getLocationFromGff(gff_file, name, tool, mini): #print(name) if tool == 'metaeuk': - gene_count = int(name.split('_')[-1:][0]) + try: + gene_count = int(name.split('_')[-1:][0]) + except (IndexError, ValueError): + gene_count = int(name[2:]) + tool = "miniprot" + gff_file = mini elif tool == "miniprot": gene_count = int(name[2:]) else: - gene_count = int(name.split('.')[-2].replace('g', '').split('_')[-1:][0]) + try: + gene_count = int(name.split('.')[-2].replace('g', '').split('_')[-1:][0]) + except IndexError: + gene_count = int(name[2:]) + tool = "miniprot" + gff_file = mini counter = 0 with open(gff_file,'r') as gff: for line in gff: @@ -959,7 +983,6 @@ def checkOverlap(position, n=30): def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix, isoforms, gene_prediction, mode='silent'): if len(candidate_names) == 1: return candidate_names - candidates = readFasta(candidatesFile) ref = readFasta(fasta) @@ -1013,7 +1036,8 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci gff_file = tmp_path + '/' + '_'.join(id.split('_')[0:-1]) + '.gff' else: gff_file = tmp_path + '/miniprot_results.out' - position[name] = getLocationFromGff(gff_file, id, gene_prediction) + mini = tmp_path + '/miniprot_results.out' + position[name] = getLocationFromGff(gff_file, id, gene_prediction, mini) if distance <= min_dist: min_dist = distance min_name = name @@ -1048,7 +1072,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci pass else: checked.append(name) - #print(checked) + return checked def 
clean_fas(path, file_type): @@ -1183,7 +1207,6 @@ def ortholog_search_forward(args): candidatesOutFile = tmp_path + group + ".candidates.fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" regions = 0 - output.append("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" @@ -1192,9 +1215,7 @@ def ortholog_search_forward(args): if exit_code == 1: output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName) return [], candidatesOutFile, output - elif searchTool == 'miniprot' and fast == False: - exit_code = miniprot(assemblyDir, asName, consensus_path, tmp_path, mode, output, assembly_path, number_candidates, group) - elif fast == True: + elif searchTool == 'miniprot' or fast == True: exit_code = miniprot_fast(assemblyDir, asName, consensus_path, tmp_path, mode, output, assembly_path, number_candidates, group) if fast == False: @@ -1229,15 +1250,28 @@ def ortholog_search_forward(args): ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: #print("No genes found at candidate regions\n") - return [], candidatesOutFile, output - + if searchTool == 'blast': + return [], candidatesOutFile, output + else: + print("No gene predicted with augustus or metaeuk, using miniprot predictions for species %s" % asName) + candidatesOutFile = parse_miniprot_predictions(tmp_path, candidatesOutFile,asName, group) + fast = True + else: candidatesOutFile = parse_miniprot_predictions(tmp_path, candidatesOutFile,asName, group) + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, checkCoorthologs, msaTool, matrix, dataPath, tmp_path, mode, low_complexity_filter) if reciprocal_sequences == 0: - if regions != 0: + if fast == False and searchTool != 'blast': + candidatesOutFile = parse_miniprot_predictions(tmp_path, candidatesOutFile,asName, group) + 
fast = True + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, checkCoorthologs, msaTool, matrix, dataPath, tmp_path, mode, low_complexity_filter) + if reciprocal_sequences == 0: + if regions != 0: + output.append("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + elif regions != 0: output.append("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) return [], candidatesOutFile, output else: @@ -1302,6 +1336,7 @@ def createGff(ortholog_sequences, out_folder, tool): #print(ortholog_sequences) #print(out_folder) #print(tool) + changed = False gff_folder = out_folder + "/gff/" os.system('mkdir %s >/dev/null 2>&1' %(gff_folder)) types_set = set(['gene', 'CDS', 'transcript', 'mRNA', 'exon']) @@ -1316,16 +1351,25 @@ def createGff(ortholog_sequences, out_folder, tool): continue #print(gene.split('|')) group, species, gene = gene.split('|') + changed = False #print(group, species, gene) region = '_'.join(gene.split('_')[0:-1]) if tool == 'metaeuk': - gene_count = int(gene.split('_')[-1:][0]) + try: + gene_count = int(gene.split('_')[-1:][0]) + except (IndexError, ValueError): + gene_count = int(gene[2:]) + changed = True elif tool == "miniprot": gene_count = int(gene[2:]) else: - gene_count = int(gene.split('.')[-2].replace('g', '').split('_')[-1:][0]) + try: + gene_count = int(gene.split('.')[-2].replace('g', '').split('_')[-1:][0]) + except IndexError: + gene_count = int(gene[2:]) + changed = True #print(region, gene_count) - if tool != 'miniprot': + if tool != 'miniprot' and changed == False: gff_file_gene = "%s/tmp/%s/%s.gff" %(out_folder, species.replace('@', '_'), region) else: gff_file_gene = "%s/tmp/%s/miniprot_results.out" %(out_folder, species.replace('@', '_')) @@ -1338,7 +1382,7 @@ def createGff(ortholog_sequences, out_folder, tool): else: line=line.rstrip() contig, source, type, start, end, score, strand, phase, att = line.split('\t') - if type 
== 'gene' or tool == 'miniprot' and type == 'mRNA': + if type == 'gene' or tool == 'miniprot' and type == 'mRNA' or changed == True and type == 'mRNA': counter += 1 if counter == gene_count: if source == 'AUGUSTUS': @@ -1689,7 +1733,7 @@ def main(): args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter, fast] else: try: - args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter, fast] + args = [asName, out, assemblyDir, consensus_path, aug_ref_dict[asName], group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms, number_candidates, low_complexity_filter, fast] except KeyError: print("%s is not included in Augustus reference species mapping file. %s will be skipped" % (asName, asName)) reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_forward(args)