seqsero_v2: SeqSero2/SeqSero2_package.py comparison

comparison SeqSero2/SeqSero2_package.py @ 7:87c7eebc6797

planemo upload

author	jpayne
date	Fri, 07 Jun 2019 15:48:15 -0400
parents	b82e5f3c9187
children	77d3edd25de7

comparison

equal deleted inserted replaced

-:5b84740f0463
+:87c7eebc6797
 def parse_args():
 "Parse the input arguments, use '-h' for help."
 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com')#add "-m <data_type>" in future
 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data")
 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)")
-parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode for mapping, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem")
+parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode used for mapping in allele workflow, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem")
-parser.add_argument("-p",default="1",help="<int>: threads used for mapping mode, if p>4, only 4 threads will be used for assembly, default=1")
+parser.add_argument("-p",default="1",help="<int>: threads used for allele workflow, if p>4, only 4 threads will be used for assembly step, default=1")
-parser.add_argument("-m",choices=['k','a'],default="k",help="<string>: 'k'(kmer mode), 'a'(allele mode), default=k")
+parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: 'k'(kmer workflow), 'a'(allele workflow), default=a")
 parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will use clean mode and only output serotyping prediction, the directory containing log files will be deleted")
 return parser.parse_args()
 def reverse_complement(sequence):
 ]
 star = ""
 star_line = ""
 if len(seronames) > 1:  #there are two possible predictions for serotypes
 star = "*"
-star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
+#changed 04072019
+#star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
 star="*"
-star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
+star_line=" The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme." + star_line
+#star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
 if  Otype=="":
 Otype="-"
 predict_form = Otype + ":" + fliC + ":" + fljB
 predict_sero = (" or ").join(seronames)
 ###special test for Enteritidis
 if predict_form == "9:g,m:-":
 sdf = "-"
 for x in special_gene_list:
 if x.startswith("sdf"):
 sdf = "+"
-predict_form = predict_form + " Sdf prediction:" + sdf
+#star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
+star_line=" sdf gene detected." # ed_SL_04152019: new output format
+#predict_form = predict_form + " Sdf prediction:" + sdf
+predict_form = predict_form #changed 04072019
 if sdf == "-":
 star = "*"
-star_line = "Additional characterization is necessary to assign a serotype to this strain.  Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
+#star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
-predict_sero = "Gallinarum/Enteritidis sdf -"
+star_line=" sdf gene not detected." # ed_SL_04152019: new output format
+#changed in 04072019, for new output
+#star_line = "Additional characterization is necessary to assign a serotype to this strain.  Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
+#predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
+predict_sero = "Gallinarum or Enteritidis" # ed_SL_04152019: new output format
 ###end of special test for Enteritidis
 elif predict_form == "4:i:-":
-predict_sero = "potential monophasic variant of Typhimurium"
+predict_sero = "4:i:-"
 elif predict_form == "4:r:-":
-predict_sero = "potential monophasic variant of Heidelberg"
+predict_sero = "4:r:-"
 elif predict_form == "4:b:-":
-predict_sero = "potential monophasic variant of Paratyphi B"
+predict_sero = "4:b:-"
 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
 #predict_sero = "Newport"
 #star = "*"
 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
-claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme.  New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies.  Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
+claim = " The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies.  Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
 if "N/A" in predict_sero:
 claim = ""
 #special test for Typhimurium
 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
 normal = 0
 elif "oafA-O-4_5-" in x:
 mutation = float(special_gene_list[x])
 if normal > mutation:
 pass
 elif normal < mutation:
-predict_sero = predict_sero.strip() + "(O5-)"
+#predict_sero = predict_sero.strip() + "(O5-)"
+predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019
 star = "*"
-star_line = "Detected the deletion of O5-."
+#star_line = "Detected the deletion of O5-."
+star_line = " Detected a deletion that causes O5- variant of Typhimurium." # ed_SL_04152019: new output format
 else:
 pass
 #special test for Paratyphi B
 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
 normal = 0
 normal = float(special_gene_list[x])
 elif "gntR-family-regulatory-protein_dt-negative" in x:
 mutation = float(special_gene_list[x])
 #print(normal,mutation)
 if normal > mutation:
-predict_sero = predict_sero.strip() + "(dt+)"
+#predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
+predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip() # ed_SL_04152019: new output format
 star = "*"
-star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
+#star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
+star_line = " The SNP that causes d-Tartrate nonfermentating phenotype was not detected. " # ed_SL_04152019: new output format
 elif normal < mutation:
-predict_sero = predict_sero.strip() + "(dt-)"
+#predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
+predict_sero = predict_sero.strip()
 star = "*"
-star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
+#star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
+star_line = " Detected the SNP for d-Tartrate nonfermenting phenotype." # ed_SL_04152019: new output format
 else:
 star = "*"
-star_line = "Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
+star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
 #special test for O13,22 and O13,23
 if Otype=="13":
 #ex_dir = os.path.dirname(os.path.realpath(__file__))
 f = open(dirpath + '/special.pickle', 'rb')
 special = pickle.load(f)
 #print(O22_score,O23_score)
 for z in O22_O23[0]:
 if predict_sero.split(" or ")[0] in z:
 if O22_score > O23_score:
 star = "*"
-star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'."
+#star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
 predict_sero = z[0]
 elif O22_score < O23_score:
 star = "*"
-star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'."
+#star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
 predict_sero = z[1]
 else:
 star = "*"
-star_line = "Fail to detect O22 and O23 differences."
+#star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
+if " or " in predict_sero:
+star_line = star_line + " The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
 #special test for O6,8
 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
 #for x in merge_O68_list:
 #  if x in predict_sero:
 #    predict_sero=x
 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
 #print "$$$No Otype, due to no hit"#may need to be changed
 O_choice="-"
 else:
-highest_O_coverage=max([float(x[0].split("_cov_")[-1]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
+highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
 O_list=[]
 O_list_less_contamination=[]
 for x in final_O:
 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
 O_list.append(x[0].split("__")[0])
 O_nodes_list.append(x[0].split("___")[1])
-if float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15:
+if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:
 O_list_less_contamination.append(x[0].split("__")[0])
 ### special test for O9,46 and O3,10 family
 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
 O_choice="O-9,46"
 ### end of special test for O9,46 and O3,10 family
 else:
 try:
 max_score=0
 for x in final_O:
-if x[2]>=max_score and float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
+if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
 max_score=x[2]#change from x[-1] to x[2],08172018
 O_choice=x[0].split("_")[0]
 if O_choice=="O-1,3,19":
 O_choice=final_O[1][0].split("_")[0]
 #print "$$$Most possilble Otype: ",O_choice
 fljB_choice="-"
 fliC_contig="NA"
 fljB_contig="NA"
 fliC_region=set([0])
 fljB_region=set([0,])
-fliC_length=0 #can be changed to coverage in future
+fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
-fljB_length=0 #can be changed to coverage in future
+fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
 O_choice="-"#no need to decide O contig for now, should be only one
 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
 O_choice=O_choice.split("-")[-1].strip()
 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
 O_choice="-"
 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
+#add alignment locations, used for further selection, 03312019
+for i in range(len(H_contig_roles)):
+x=H_contig_roles[i]
+for y in Final_list_passed:
+if x[1] in y[0] and y[0].startswith(x[0]):
+H_contig_roles[i]+=H_contig_roles[i]+(y[-1],)
+break
 log_file=open("SeqSero_log.txt","a")
 extract_file=open("Extracted_antigen_alleles.fasta","a")
 handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
 #print("O_contigs:")
 for z in handle_fasta:
 if x[0].split("___")[-1]==z.description:
 seqs=str(z.seq)
 extract_file.write(title+seqs+"\n")
 if len(H_contig_roles)!=0:
-highest_H_coverage=max([float(x[1].split("_cov_")[-1]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
+highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
 else:
 highest_H_coverage=0
 for x in H_contig_roles:
 #if multiple choices, temporately select the one with longest length for now, will revise in further change
-if "fliC" == x[0] and int(x[1].split("_")[3])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
+if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
 fliC_contig=x[1]
-fliC_length=int(x[1].split("_")[3])
+fliC_length=len(x[-1])
-elif "fljB" == x[0] and int(x[1].split("_")[3])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13:
+elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
 fljB_contig=x[1]
-fljB_length=int(x[1].split("_")[3])
+fljB_length=len(x[-1])
 for x in Final_list_passed:
 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
 fliC_choice=x[0].split("_")[1]
 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
 fljB_choice=x[0].split("_")[1]
 print("assembling...")
 if int(threads)>4:
 t="4"
 else:
 t=threads
-if os.path.getsize(combined_fq)>100 and os.path.getsize(mapped_fq1)>100:#if not, then it's "-:-:-"
+if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-"
 if fnameB!="":
 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
 else:
 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
 file=open("data_log.txt","a")
 for x in Final_list:
 file.write("\t".join(str(y) for y in x)+"\n")
 file.close()
-Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
+Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
 subspecies=judge_subspecies(fnameA) #predict subspecies
 ###output
 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
+claim="" #04132019, disable claim for new report requirement
 contamination_report=""
 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
 if contamination_O!="" and contamination_H=="":
 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
 elif contamination_O=="" and contamination_H!="":
 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
 elif contamination_O!="" and contamination_H!="":
 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
 if contamination_report!="":
-contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing
+#contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019
-claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences
+contamination_report="Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles."
+#claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences #04132019, need to change if using web-version
+if contamination_report+star_line+claim=="": #0413, new output style
+note=""
+else:
+note="Note:"
 if clean_mode:
 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
 make_dir="none-output-directory due to '-c' flag"
 else:
 #new_file=open("Seqsero_result.txt","w")
 O_antigen_prediction = O_choice,
 H1_antigen_prediction = fliC_choice,
 H2_antigen_prediction = fljB_choice,
 predicted_antigenic_profile = predict_form,
 predicted_subspecies = subspecies,
-predicted_serotype = predict_sero,
+predicted_serotype = "{}{}".format(predict_sero, star),
 note=claim.replace('\n','')
 )
+result['*'] = star_line
 with open("Seqsero_result.tsv","w") as new_file:
 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
 #new_file.close()
 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
 wrtr.writeheader()
 subspecies = judge_subspecies_Kmer(Special_dict)
 if subspecies=="IIb" or subspecies=="IIa":
 subspecies="II"
 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
-claim=""
+claim="" #no claim any more based on new output requirement
+if star_line+claim=="": #0413, new output style
+note=""
+else:
+note="Note:"
 if clean_mode:
 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
 make_dir="none-output-directory due to '-c' flag"
 else:
 if highest_O.split('-')[-1]=="":
 O_antigen_prediction = O_choice,
 H1_antigen_prediction = highest_fliC,
 H2_antigen_prediction = highest_fljB,
 predicted_antigenic_profile = predict_form,
 predicted_subspecies = subspecies,
-predicted_serotype = predict_sero,
+predicted_serotype = "{}{}".format(predict_sero, star),
 note=claim.replace('\n','')
 )
+result['*'] = star_line
 with open("Seqsero_result.tsv","w") as new_file:
 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
 #new_file.close()
 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
 wrtr.writeheader()

Mercurial > repos > jpayne > seqsero_v2

comparison SeqSero2/SeqSero2_package.py @ 7:87c7eebc6797