seqsero_v2: SeqSero2/SeqSero2_package.py comparison

comparison SeqSero2/SeqSero2_package.py @ 4:b82e5f3c9187

planemo upload

author	jpayne
date	Thu, 18 Apr 2019 15:18:14 -0400
parents	fae43708974d
children	87c7eebc6797

comparison

equal deleted inserted replaced

-:b18e8cdfdf81
+:b82e5f3c9187
 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
 #like test_output_06012017.txt
 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
-from Initial_Conditions import phase1
+from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
-from Initial_Conditions import phase2
+rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
-from Initial_Conditions import phaseO
+rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to
-from Initial_Conditions import sero
-from Initial_Conditions import subs
 seronames = []
 seronames_none_subspecies=[]
 for i in range(len(phase1)):
 fliC_combine = []
 fljB_combine = []
 new_fljB.sort()
 new_fljB = ",".join(new_fljB)
 if (new_fliC in fliC_combine
 or fliC in fliC_combine) and (new_fljB in fljB_combine
 or fljB in fljB_combine):
-if subs[i] == subspecies:
+######start, remove_list,rename_dict, added on 11/11/2018
-seronames.append(sero[i])
+if sero[i] not in remove_list:
-seronames_none_subspecies.append(sero[i])
+temp_sero=sero[i]
+if temp_sero in rename_dict:
+temp_sero=rename_dict[temp_sero] #rename if in the rename list
+if temp_sero not in seronames:#the new sero may already included, if yes, then not consider
+if subs[i] == subspecies:
+seronames.append(temp_sero)
+seronames_none_subspecies.append(temp_sero)
+else:
+pass
+else:
+pass
+######end, added on 11/11/2018
 #analyze seronames
 subspecies_pointer=""
 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
 seronames=seronames_none_subspecies
 subspecies_pointer="1"
 predict_sero = "potential monophasic variant of Typhimurium"
 elif predict_form == "4:r:-":
 predict_sero = "potential monophasic variant of Heidelberg"
 elif predict_form == "4:b:-":
 predict_sero = "potential monophasic variant of Paratyphi B"
-elif predict_form == "8:e,h:1,2":
+#elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
-predict_sero = "Newport"
+#predict_sero = "Newport"
-star = "*"
+#star = "*"
-star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
+#star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
 claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme.  New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies.  Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
 if "N/A" in predict_sero:
 claim = ""
 #special test for Typhimurium
 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
 if Otype=="13":
 #ex_dir = os.path.dirname(os.path.realpath(__file__))
 f = open(dirpath + '/special.pickle', 'rb')
 special = pickle.load(f)
 O22_O23=special['O22_O23']
-if predict_sero.split(" or ")[0] in O22_O23[-1]:
+if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
 O22_score=0
 O23_score=0
 for x in special_gene_list:
 if "O:22" in x:
 O22_score = O22_score+float(special_gene_list[x])
 predict_sero = z[1]
 else:
 star = "*"
 star_line = "Fail to detect O22 and O23 differences."
 #special test for O6,8
-merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"]
+#merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
-for x in merge_O68_list:
+#for x in merge_O68_list:
-if x in predict_sero:
+#  if x in predict_sero:
-predict_sero=x
+#    predict_sero=x
-star=""
+#    star=""
-star_line=""
+#    star_line=""
 #special test for Montevideo; most of them are monophasic
-if "Montevideo" in predict_sero and "1,2,7" in predict_form:
+#if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
-star="*"
+#star="*"
-star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
+#star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
 return predict_form, predict_sero, star, star_line, claim
 ### End of SeqSero Kmer part
 ### Begin of SeqSero2 allele prediction and output
 def xml_parse_score_comparision_seqsero(xmlfile):
 #print(potenial_new_gene)
 break
 if potenial_new_gene!="":
 print("two differnt genes in same contig, fix it for O antigen")
 print(potenial_new_gene[:3])
-final_O.append(potenial_new_gene)
+pointer=0
+for y in final_O:
+if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
+pointer=1
+if pointer!=1:
+final_O.append(potenial_new_gene)
 ### end of the two genes on same contig test
 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
 #print "$$$No Otype, due to no hit"#may need to be changed
 O_choice="-"
 contamination_O=""
 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
 contamination_O=""
 else:
 contamination_O="potential contamination from O antigen signals"
-return O_choice,O_nodes_list,special_genes,final_O,contamination_O
+return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
 ### End of SeqSero2 allele prediction and output
 def get_input_files(make_dir,input_file,data_type,dirpath):
 #tell input files from datatype
 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
 elif data_type in ["4","5","6"]:
 for_fq=input_file[0].split("/")[-1]
 os.chdir(old_dir)
 return for_fq,rev_fq
-def predict_O_and_H_types(Final_list,Final_list_passed):
+def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
 #get O and H types from Final_list from blast parsing; allele mode
+from Bio import SeqIO
 fliC_choice="-"
 fljB_choice="-"
 fliC_contig="NA"
 fljB_contig="NA"
 fliC_region=set([0])
 fljB_region=set([0,])
 fliC_length=0 #can be changed to coverage in future
 fljB_length=0 #can be changed to coverage in future
 O_choice="-"#no need to decide O contig for now, should be only one
-O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
+O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
 O_choice=O_choice.split("-")[-1].strip()
 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
 O_choice="-"
 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
 log_file=open("SeqSero_log.txt","a")
-print("O_contigs:")
+extract_file=open("Extracted_antigen_alleles.fasta","a")
+handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
+#print("O_contigs:")
 log_file.write("O_contigs:\n")
+extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
+extract_file.write("#O_contigs:\n")
 for x in O_nodes_roles:
 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
-print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
+#print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
-log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+" "+"blast score: "+str(x[1])+"identity%:"+str(round(x[2]*100,2))+"% "+str(min(x[-1]))+" to "+str(max(x[-1]))+"\n")
+log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
+title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
+seqs=""
+for z in handle_fasta:
+if x[0].split("___")[-1]==z.description:
+seqs=str(z.seq)
+extract_file.write(title+seqs+"\n")
 if len(H_contig_roles)!=0:
 highest_H_coverage=max([float(x[1].split("_cov_")[-1]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
 else:
 highest_H_coverage=0
 for x in H_contig_roles:
 H_contig_roles.remove(y)
 else:
 pass
 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
 #end of removing none-middle contigs
-print("H_contigs:")
+#print("H_contigs:")
 log_file.write("H_contigs:\n")
+extract_file.write("#H_contigs:\n")
 H_contig_stat=[]
 H1_cont_stat={}
 H2_cont_stat={}
 for i in range(len(H_contig_roles)):
 x=H_contig_roles[i]
 for y in Final_list_passed:
 if x[1] in y[0] and y[0].startswith(x[0]):
 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
-print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
+#print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
-log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"% "+str(min(y[-1]))+" to "+str(max(y[-1]))+"\n")
+log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
+title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n"
+seqs=""
+for z in handle_fasta:
+if x[1]==z.description:
+seqs=str(z.seq)
+extract_file.write(title+seqs+"\n")
 break
 else:
-print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
+#print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
-log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"% "+str(min(y[-1]))+" to "+str(max(y[-1]))+"\n")
+log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
+title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
+seqs=""
+for z in handle_fasta:
+if x[1]==z.description:
+seqs=str(z.seq)
+extract_file.write(title+seqs+"\n")
 if x[0]=="fliC":
 if y[0].split("_")[1] not in H1_cont_stat:
 H1_cont_stat[y[0].split("_")[1]]=y[2]
 else:
 H1_cont_stat[y[0].split("_")[1]]+=y[2]
 contamination_H=""
 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
 contamination_H="potential contamination from H antigen signals"
 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
-print(contamination_O)
+#get additional antigens
-print(contamination_H)
+"""
+if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
+if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
+O_choice="O-9,46"
+#print "$$$Most possilble Otype:  O-9,46"
+elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
+O_choice="O-9,46,27"
+#print "$$$Most possilble Otype:  O-9,46,27"
+elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
+if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
+O_choice="O-3,10"
+#print "$$$Most possilble Otype:  O-3,10 (contain O-3,10_not_in_1,3,19)"
+else:
+O_choice="O-1,3,19"
+#print "$$$Most possilble Otype:  O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
+### end of special test for O9,46 and O3,10 family
+if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
+if len(Otypes_uniq)>2:
+contamination_O="potential contamination from O antigen signals"
+else:
+if len(Otypes_uniq)>1:
+if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
+contamination_O=""
+elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
+contamination_O=""
+"""
+additonal_antigents=[]
+#print(contamination_O)
+#print(contamination_H)
 log_file.write(contamination_O+"\n")
 log_file.write(contamination_H+"\n")
 log_file.close()
-return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H
+return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
 def get_input_K(input_file,lib_dict,data_type,k_size):
 #kmer mode; get input_Ks from dict and data_type
 kmers = []
 for h in lib_dict:
 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
 subprocess.check_output('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl',shell=True, stderr=subprocess.STDOUT) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015
 subprocess.check_output("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5",shell=True, stderr=subprocess.STDOUT)###1/27/2015; 08272018, remove "-word_size 10"
 else:
 xmlfile="NA"
-return xmlfile
+return xmlfile,new_fasta
 def judge_subspecies(fnameA):
 #seqsero2 -a; judge subspecies on just forward raw reads fastq
 salmID_output=subprocess.check_output("../SalmID/SalmID.py -i "+fnameA, shell=True, stderr=subprocess.STDOUT)
 #out, err = salmID_output.communicate()
 fnameA=for_fq.split("/")[-1]
 fnameB=rev_fq.split("/")[-1]
 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
-xmlfile=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
+xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
 if xmlfile=="NA":
 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
 else:
 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
 file=open("data_log.txt","a")
 for x in Final_list:
 file.write("\t".join(str(y) for y in x)+"\n")
 file.close()
 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
-O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=predict_O_and_H_types(Final_list,Final_list_passed) #predict O, fliC and fljB
+O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
 subspecies=judge_subspecies(fnameA) #predict subspecies
 ###output
 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
 contamination_report=""
+H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
 if contamination_O!="" and contamination_H=="":
-contamination_report="#detected potential contamination of mixed serotypes from O antigen signals.\n"
+contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
 elif contamination_O=="" and contamination_H!="":
-contamination_report="#detected potential contamination of mixed serotypes or potential thrid H phase from H antigen signals.\n"
+contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
 elif contamination_O!="" and contamination_H!="":
-contamination_report="#detected potential contamination of mixed serotypes from both O and H antigen signals.\n"
+contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
+if contamination_report!="":
+contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing
+claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences
 if clean_mode:
 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
 make_dir="none-output-directory due to '-c' flag"
 else:
 #new_file=open("Seqsero_result.txt","w")
 subspecies = judge_subspecies_Kmer(Special_dict)
 if subspecies=="IIb" or subspecies=="IIa":
 subspecies="II"
 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
+claim=""
 if clean_mode:
 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
 make_dir="none-output-directory due to '-c' flag"
 else:
 if highest_O.split('-')[-1]=="":

Mercurial > repos > jpayne > seqsero_v2

comparison SeqSero2/SeqSero2_package.py @ 4:b82e5f3c9187