comparison SeqSero2/SeqSero2_package.py @ 4:b82e5f3c9187

planemo upload
author jpayne
date Thu, 18 Apr 2019 15:18:14 -0400
parents fae43708974d
children 87c7eebc6797
comparison
equal deleted inserted replaced
3:b18e8cdfdf81 4:b82e5f3c9187
261 261
262 262
263 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies): 263 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
264 #like test_output_06012017.txt 264 #like test_output_06012017.txt
265 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list) 265 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
266 from Initial_Conditions import phase1 266 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
267 from Initial_Conditions import phase2 267 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
268 from Initial_Conditions import phaseO 268 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to
269 from Initial_Conditions import sero
270 from Initial_Conditions import subs
271 seronames = [] 269 seronames = []
272 seronames_none_subspecies=[] 270 seronames_none_subspecies=[]
273 for i in range(len(phase1)): 271 for i in range(len(phase1)):
274 fliC_combine = [] 272 fliC_combine = []
275 fljB_combine = [] 273 fljB_combine = []
323 new_fljB.sort() 321 new_fljB.sort()
324 new_fljB = ",".join(new_fljB) 322 new_fljB = ",".join(new_fljB)
325 if (new_fliC in fliC_combine 323 if (new_fliC in fliC_combine
326 or fliC in fliC_combine) and (new_fljB in fljB_combine 324 or fliC in fliC_combine) and (new_fljB in fljB_combine
327 or fljB in fljB_combine): 325 or fljB in fljB_combine):
328 if subs[i] == subspecies: 326 ######start, remove_list,rename_dict, added on 11/11/2018
329 seronames.append(sero[i]) 327 if sero[i] not in remove_list:
330 seronames_none_subspecies.append(sero[i]) 328 temp_sero=sero[i]
329 if temp_sero in rename_dict:
330 temp_sero=rename_dict[temp_sero] #rename if in the rename list
331 if temp_sero not in seronames:#the new sero may already included, if yes, then not consider
332 if subs[i] == subspecies:
333 seronames.append(temp_sero)
334 seronames_none_subspecies.append(temp_sero)
335 else:
336 pass
337 else:
338 pass
339 ######end, added on 11/11/2018
331 #analyze seronames 340 #analyze seronames
332 subspecies_pointer="" 341 subspecies_pointer=""
333 if len(seronames) == 0 and len(seronames_none_subspecies)!=0: 342 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
334 seronames=seronames_none_subspecies 343 seronames=seronames_none_subspecies
335 subspecies_pointer="1" 344 subspecies_pointer="1"
365 predict_sero = "potential monophasic variant of Typhimurium" 374 predict_sero = "potential monophasic variant of Typhimurium"
366 elif predict_form == "4:r:-": 375 elif predict_form == "4:r:-":
367 predict_sero = "potential monophasic variant of Heidelberg" 376 predict_sero = "potential monophasic variant of Heidelberg"
368 elif predict_form == "4:b:-": 377 elif predict_form == "4:b:-":
369 predict_sero = "potential monophasic variant of Paratyphi B" 378 predict_sero = "potential monophasic variant of Paratyphi B"
370 elif predict_form == "8:e,h:1,2": 379 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
371 predict_sero = "Newport" 380 #predict_sero = "Newport"
372 star = "*" 381 #star = "*"
373 star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare." 382 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
374 claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n" 383 claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
375 if "N/A" in predict_sero: 384 if "N/A" in predict_sero:
376 claim = "" 385 claim = ""
377 #special test for Typhimurium 386 #special test for Typhimurium
378 if "Typhimurium" in predict_sero or predict_form == "4:i:-": 387 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
416 if Otype=="13": 425 if Otype=="13":
417 #ex_dir = os.path.dirname(os.path.realpath(__file__)) 426 #ex_dir = os.path.dirname(os.path.realpath(__file__))
418 f = open(dirpath + '/special.pickle', 'rb') 427 f = open(dirpath + '/special.pickle', 'rb')
419 special = pickle.load(f) 428 special = pickle.load(f)
420 O22_O23=special['O22_O23'] 429 O22_O23=special['O22_O23']
421 if predict_sero.split(" or ")[0] in O22_O23[-1]: 430 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
422 O22_score=0 431 O22_score=0
423 O23_score=0 432 O23_score=0
424 for x in special_gene_list: 433 for x in special_gene_list:
425 if "O:22" in x: 434 if "O:22" in x:
426 O22_score = O22_score+float(special_gene_list[x]) 435 O22_score = O22_score+float(special_gene_list[x])
439 predict_sero = z[1] 448 predict_sero = z[1]
440 else: 449 else:
441 star = "*" 450 star = "*"
442 star_line = "Fail to detect O22 and O23 differences." 451 star_line = "Fail to detect O22 and O23 differences."
443 #special test for O6,8 452 #special test for O6,8
444 merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] 453 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
445 for x in merge_O68_list: 454 #for x in merge_O68_list:
446 if x in predict_sero: 455 # if x in predict_sero:
447 predict_sero=x 456 # predict_sero=x
448 star="" 457 # star=""
449 star_line="" 458 # star_line=""
450 #special test for Montevideo; most of them are monophasic 459 #special test for Montevideo; most of them are monophasic
451 if "Montevideo" in predict_sero and "1,2,7" in predict_form: 460 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
452 star="*" 461 #star="*"
453 star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination." 462 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
454 return predict_form, predict_sero, star, star_line, claim 463 return predict_form, predict_sero, star, star_line, claim
455 ### End of SeqSero Kmer part 464 ### End of SeqSero Kmer part
456 465
457 ### Begin of SeqSero2 allele prediction and output 466 ### Begin of SeqSero2 allele prediction and output
458 def xml_parse_score_comparision_seqsero(xmlfile): 467 def xml_parse_score_comparision_seqsero(xmlfile):
646 #print(potenial_new_gene) 655 #print(potenial_new_gene)
647 break 656 break
648 if potenial_new_gene!="": 657 if potenial_new_gene!="":
649 print("two differnt genes in same contig, fix it for O antigen") 658 print("two differnt genes in same contig, fix it for O antigen")
650 print(potenial_new_gene[:3]) 659 print(potenial_new_gene[:3])
651 final_O.append(potenial_new_gene) 660 pointer=0
661 for y in final_O:
662 if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
663 pointer=1
664 if pointer!=1:
665 final_O.append(potenial_new_gene)
652 ### end of the two genes on same contig test 666 ### end of the two genes on same contig test
653 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted 667 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
654 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]): 668 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
655 #print "$$$No Otype, due to no hit"#may need to be changed 669 #print "$$$No Otype, due to no hit"#may need to be changed
656 O_choice="-" 670 O_choice="-"
729 contamination_O="" 743 contamination_O=""
730 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund 744 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
731 contamination_O="" 745 contamination_O=""
732 else: 746 else:
733 contamination_O="potential contamination from O antigen signals" 747 contamination_O="potential contamination from O antigen signals"
734 return O_choice,O_nodes_list,special_genes,final_O,contamination_O 748 return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
735 ### End of SeqSero2 allele prediction and output 749 ### End of SeqSero2 allele prediction and output
736 750
737 def get_input_files(make_dir,input_file,data_type,dirpath): 751 def get_input_files(make_dir,input_file,data_type,dirpath):
738 #tell input files from datatype 752 #tell input files from datatype
739 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)" 753 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
768 elif data_type in ["4","5","6"]: 782 elif data_type in ["4","5","6"]:
769 for_fq=input_file[0].split("/")[-1] 783 for_fq=input_file[0].split("/")[-1]
770 os.chdir(old_dir) 784 os.chdir(old_dir)
771 return for_fq,rev_fq 785 return for_fq,rev_fq
772 786
773 def predict_O_and_H_types(Final_list,Final_list_passed): 787 def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
774 #get O and H types from Final_list from blast parsing; allele mode 788 #get O and H types from Final_list from blast parsing; allele mode
789 from Bio import SeqIO
775 fliC_choice="-" 790 fliC_choice="-"
776 fljB_choice="-" 791 fljB_choice="-"
777 fliC_contig="NA" 792 fliC_contig="NA"
778 fljB_contig="NA" 793 fljB_contig="NA"
779 fliC_region=set([0]) 794 fliC_region=set([0])
780 fljB_region=set([0,]) 795 fljB_region=set([0,])
781 fliC_length=0 #can be changed to coverage in future 796 fliC_length=0 #can be changed to coverage in future
782 fljB_length=0 #can be changed to coverage in future 797 fljB_length=0 #can be changed to coverage in future
783 O_choice="-"#no need to decide O contig for now, should be only one 798 O_choice="-"#no need to decide O contig for now, should be only one
784 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification 799 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
785 O_choice=O_choice.split("-")[-1].strip() 800 O_choice=O_choice.split("-")[-1].strip()
786 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="": 801 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
787 O_choice="-" 802 O_choice="-"
788 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB 803 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
789 log_file=open("SeqSero_log.txt","a") 804 log_file=open("SeqSero_log.txt","a")
790 print("O_contigs:") 805 extract_file=open("Extracted_antigen_alleles.fasta","a")
806 handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
807
808 #print("O_contigs:")
791 log_file.write("O_contigs:\n") 809 log_file.write("O_contigs:\n")
810 extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
811 extract_file.write("#O_contigs:\n")
792 for x in O_nodes_roles: 812 for x in O_nodes_roles:
793 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker 813 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
794 print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1]))) 814 #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
795 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+" "+"blast score: "+str(x[1])+"identity%:"+str(round(x[2]*100,2))+"% "+str(min(x[-1]))+" to "+str(max(x[-1]))+"\n") 815 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
816 title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
817 seqs=""
818 for z in handle_fasta:
819 if x[0].split("___")[-1]==z.description:
820 seqs=str(z.seq)
821 extract_file.write(title+seqs+"\n")
796 if len(H_contig_roles)!=0: 822 if len(H_contig_roles)!=0:
797 highest_H_coverage=max([float(x[1].split("_cov_")[-1]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output 823 highest_H_coverage=max([float(x[1].split("_cov_")[-1]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
798 else: 824 else:
799 highest_H_coverage=0 825 highest_H_coverage=0
800 for x in H_contig_roles: 826 for x in H_contig_roles:
862 H_contig_roles.remove(y) 888 H_contig_roles.remove(y)
863 else: 889 else:
864 pass 890 pass
865 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5) 891 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
866 #end of removing none-middle contigs 892 #end of removing none-middle contigs
867 print("H_contigs:") 893 #print("H_contigs:")
868 log_file.write("H_contigs:\n") 894 log_file.write("H_contigs:\n")
895 extract_file.write("#H_contigs:\n")
869 H_contig_stat=[] 896 H_contig_stat=[]
870 H1_cont_stat={} 897 H1_cont_stat={}
871 H2_cont_stat={} 898 H2_cont_stat={}
872 for i in range(len(H_contig_roles)): 899 for i in range(len(H_contig_roles)):
873 x=H_contig_roles[i] 900 x=H_contig_roles[i]
875 for y in Final_list_passed: 902 for y in Final_list_passed:
876 if x[1] in y[0] and y[0].startswith(x[0]): 903 if x[1] in y[0] and y[0].startswith(x[0]):
877 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide 904 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
878 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it 905 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
879 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB 906 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
880 print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1]))) 907 #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
881 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"% "+str(min(y[-1]))+" to "+str(max(y[-1]))+"\n") 908 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
882 H_contig_roles[i]="can't decide fliC or fljB, may be third phase" 909 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
910 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n"
911 seqs=""
912 for z in handle_fasta:
913 if x[1]==z.description:
914 seqs=str(z.seq)
915 extract_file.write(title+seqs+"\n")
883 break 916 break
884 else: 917 else:
885 print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1]))) 918 #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
886 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"% "+str(min(y[-1]))+" to "+str(max(y[-1]))+"\n") 919 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
920 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
921 seqs=""
922 for z in handle_fasta:
923 if x[1]==z.description:
924 seqs=str(z.seq)
925 extract_file.write(title+seqs+"\n")
887 if x[0]=="fliC": 926 if x[0]=="fliC":
888 if y[0].split("_")[1] not in H1_cont_stat: 927 if y[0].split("_")[1] not in H1_cont_stat:
889 H1_cont_stat[y[0].split("_")[1]]=y[2] 928 H1_cont_stat[y[0].split("_")[1]]=y[2]
890 else: 929 else:
891 H1_cont_stat[y[0].split("_")[1]]+=y[2] 930 H1_cont_stat[y[0].split("_")[1]]+=y[2]
903 contamination_H="" 942 contamination_H=""
904 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1: 943 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
905 contamination_H="potential contamination from H antigen signals" 944 contamination_H="potential contamination from H antigen signals"
906 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA": 945 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
907 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected" 946 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
908 print(contamination_O) 947 #get additional antigens
909 print(contamination_H) 948 """
949 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
950 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
951 O_choice="O-9,46"
952 #print "$$$Most possilble Otype: O-9,46"
953 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
954 O_choice="O-9,46,27"
955 #print "$$$Most possilble Otype: O-9,46,27"
956 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
957 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
958 O_choice="O-3,10"
959 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
960 else:
961 O_choice="O-1,3,19"
962 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
963 ### end of special test for O9,46 and O3,10 family
964
965 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
966 if len(Otypes_uniq)>2:
967 contamination_O="potential contamination from O antigen signals"
968 else:
969 if len(Otypes_uniq)>1:
970 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
971 contamination_O=""
972 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
973 contamination_O=""
974 """
975 additonal_antigents=[]
976 #print(contamination_O)
977 #print(contamination_H)
910 log_file.write(contamination_O+"\n") 978 log_file.write(contamination_O+"\n")
911 log_file.write(contamination_H+"\n") 979 log_file.write(contamination_H+"\n")
912 log_file.close() 980 log_file.close()
913 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H 981 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
914 982
915 def get_input_K(input_file,lib_dict,data_type,k_size): 983 def get_input_K(input_file,lib_dict,data_type,k_size):
916 #kmer mode; get input_Ks from dict and data_type 984 #kmer mode; get input_Ks from dict and data_type
917 kmers = [] 985 kmers = []
918 for h in lib_dict: 986 for h in lib_dict:
1093 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml" 1161 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
1094 subprocess.check_output('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl',shell=True, stderr=subprocess.STDOUT) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015 1162 subprocess.check_output('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl',shell=True, stderr=subprocess.STDOUT) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015
1095 subprocess.check_output("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5",shell=True, stderr=subprocess.STDOUT)###1/27/2015; 08272018, remove "-word_size 10" 1163 subprocess.check_output("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5",shell=True, stderr=subprocess.STDOUT)###1/27/2015; 08272018, remove "-word_size 10"
1096 else: 1164 else:
1097 xmlfile="NA" 1165 xmlfile="NA"
1098 return xmlfile 1166 return xmlfile,new_fasta
1099 1167
1100 def judge_subspecies(fnameA): 1168 def judge_subspecies(fnameA):
1101 #seqsero2 -a; judge subspecies on just forward raw reads fastq 1169 #seqsero2 -a; judge subspecies on just forward raw reads fastq
1102 salmID_output=subprocess.check_output("../SalmID/SalmID.py -i "+fnameA, shell=True, stderr=subprocess.STDOUT) 1170 salmID_output=subprocess.check_output("../SalmID/SalmID.py -i "+fnameA, shell=True, stderr=subprocess.STDOUT)
1103 #out, err = salmID_output.communicate() 1171 #out, err = salmID_output.communicate()
1170 fnameA=for_fq.split("/")[-1] 1238 fnameA=for_fq.split("/")[-1]
1171 fnameB=rev_fq.split("/")[-1] 1239 fnameB=rev_fq.split("/")[-1]
1172 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()) 1240 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
1173 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id 1241 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
1174 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort 1242 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
1175 xmlfile=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast 1243 xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
1176 if xmlfile=="NA": 1244 if xmlfile=="NA":
1177 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","") 1245 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
1178 else: 1246 else:
1179 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results 1247 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
1180 file=open("data_log.txt","a") 1248 file=open("data_log.txt","a")
1181 for x in Final_list: 1249 for x in Final_list:
1182 file.write("\t".join(str(y) for y in x)+"\n") 1250 file.write("\t".join(str(y) for y in x)+"\n")
1183 file.close() 1251 file.close()
1184 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)] 1252 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
1185 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=predict_O_and_H_types(Final_list,Final_list_passed) #predict O, fliC and fljB 1253 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
1186 subspecies=judge_subspecies(fnameA) #predict subspecies 1254 subspecies=judge_subspecies(fnameA) #predict subspecies
1187 ###output 1255 ###output
1188 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies) 1256 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
1189 contamination_report="" 1257 contamination_report=""
1258 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
1190 if contamination_O!="" and contamination_H=="": 1259 if contamination_O!="" and contamination_H=="":
1191 contamination_report="#detected potential contamination of mixed serotypes from O antigen signals.\n" 1260 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
1192 elif contamination_O=="" and contamination_H!="": 1261 elif contamination_O=="" and contamination_H!="":
1193 contamination_report="#detected potential contamination of mixed serotypes or potential thrid H phase from H antigen signals.\n" 1262 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
1194 elif contamination_O!="" and contamination_H!="": 1263 elif contamination_O!="" and contamination_H!="":
1195 contamination_report="#detected potential contamination of mixed serotypes from both O and H antigen signals.\n" 1264 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
1265 if contamination_report!="":
1266 contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing
1267 claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences
1196 if clean_mode: 1268 if clean_mode:
1197 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT) 1269 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
1198 make_dir="none-output-directory due to '-c' flag" 1270 make_dir="none-output-directory due to '-c' flag"
1199 else: 1271 else:
1200 #new_file=open("Seqsero_result.txt","w") 1272 #new_file=open("Seqsero_result.txt","w")
1241 subspecies = judge_subspecies_Kmer(Special_dict) 1313 subspecies = judge_subspecies_Kmer(Special_dict)
1242 if subspecies=="IIb" or subspecies=="IIa": 1314 if subspecies=="IIb" or subspecies=="IIa":
1243 subspecies="II" 1315 subspecies="II"
1244 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes( 1316 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
1245 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies) 1317 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
1318 claim=""
1246 if clean_mode: 1319 if clean_mode:
1247 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT) 1320 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
1248 make_dir="none-output-directory due to '-c' flag" 1321 make_dir="none-output-directory due to '-c' flag"
1249 else: 1322 else:
1250 if highest_O.split('-')[-1]=="": 1323 if highest_O.split('-')[-1]=="":