comparison SeqSero2/SeqSero2_package.py @ 7:87c7eebc6797

planemo upload
author jpayne
date Fri, 07 Jun 2019 15:48:15 -0400
parents b82e5f3c9187
children 77d3edd25de7
comparison
equal deleted inserted replaced
6:5b84740f0463 7:87c7eebc6797
21 def parse_args(): 21 def parse_args():
22 "Parse the input arguments, use '-h' for help." 22 "Parse the input arguments, use '-h' for help."
23 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com')#add "-m <data_type>" in future 23 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com')#add "-m <data_type>" in future
24 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data") 24 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data")
25 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)") 25 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)")
26 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode for mapping, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem") 26 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode used for mapping in allele workflow, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem")
27 parser.add_argument("-p",default="1",help="<int>: threads used for mapping mode, if p>4, only 4 threads will be used for assembly, default=1") 27 parser.add_argument("-p",default="1",help="<int>: threads used for allele workflow, if p>4, only 4 threads will be used for assembly step, default=1")
28 parser.add_argument("-m",choices=['k','a'],default="k",help="<string>: 'k'(kmer mode), 'a'(allele mode), default=k") 28 parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: 'k'(kmer workflow), 'a'(allele workflow), default=a")
29 parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number") 29 parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
30 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will use clean mode and only output serotyping prediction, the directory containing log files will be deleted") 30 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will use clean mode and only output serotyping prediction, the directory containing log files will be deleted")
31 return parser.parse_args() 31 return parser.parse_args()
32 32
33 def reverse_complement(sequence): 33 def reverse_complement(sequence):
348 ] 348 ]
349 star = "" 349 star = ""
350 star_line = "" 350 star_line = ""
351 if len(seronames) > 1: #there are two possible predictions for serotypes 351 if len(seronames) > 1: #there are two possible predictions for serotypes
352 star = "*" 352 star = "*"
353 star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n" 353 #changed 04072019
354 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
354 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0: 355 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
355 star="*" 356 star="*"
356 star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line 357 star_line=" The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme." + star_line
358 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
357 if Otype=="": 359 if Otype=="":
358 Otype="-" 360 Otype="-"
359 predict_form = Otype + ":" + fliC + ":" + fljB 361 predict_form = Otype + ":" + fliC + ":" + fljB
360 predict_sero = (" or ").join(seronames) 362 predict_sero = (" or ").join(seronames)
361 ###special test for Enteritidis 363 ###special test for Enteritidis
362 if predict_form == "9:g,m:-": 364 if predict_form == "9:g,m:-":
363 sdf = "-" 365 sdf = "-"
364 for x in special_gene_list: 366 for x in special_gene_list:
365 if x.startswith("sdf"): 367 if x.startswith("sdf"):
366 sdf = "+" 368 sdf = "+"
367 predict_form = predict_form + " Sdf prediction:" + sdf 369 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
370 star_line=" sdf gene detected." # ed_SL_04152019: new output format
371 #predict_form = predict_form + " Sdf prediction:" + sdf
372 predict_form = predict_form #changed 04072019
368 if sdf == "-": 373 if sdf == "-":
369 star = "*" 374 star = "*"
370 star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n" 375 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
371 predict_sero = "Gallinarum/Enteritidis sdf -" 376 star_line=" sdf gene not detected." # ed_SL_04152019: new output format
377 #changed in 04072019, for new output
378 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
379 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
380 predict_sero = "Gallinarum or Enteritidis" # ed_SL_04152019: new output format
372 ###end of special test for Enteritidis 381 ###end of special test for Enteritidis
373 elif predict_form == "4:i:-": 382 elif predict_form == "4:i:-":
374 predict_sero = "potential monophasic variant of Typhimurium" 383 predict_sero = "4:i:-"
375 elif predict_form == "4:r:-": 384 elif predict_form == "4:r:-":
376 predict_sero = "potential monophasic variant of Heidelberg" 385 predict_sero = "4:r:-"
377 elif predict_form == "4:b:-": 386 elif predict_form == "4:b:-":
378 predict_sero = "potential monophasic variant of Paratyphi B" 387 predict_sero = "4:b:-"
379 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo 388 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
380 #predict_sero = "Newport" 389 #predict_sero = "Newport"
381 #star = "*" 390 #star = "*"
382 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare." 391 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
383 claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n" 392 claim = " The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
384 if "N/A" in predict_sero: 393 if "N/A" in predict_sero:
385 claim = "" 394 claim = ""
386 #special test for Typhimurium 395 #special test for Typhimurium
387 if "Typhimurium" in predict_sero or predict_form == "4:i:-": 396 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
388 normal = 0 397 normal = 0
393 elif "oafA-O-4_5-" in x: 402 elif "oafA-O-4_5-" in x:
394 mutation = float(special_gene_list[x]) 403 mutation = float(special_gene_list[x])
395 if normal > mutation: 404 if normal > mutation:
396 pass 405 pass
397 elif normal < mutation: 406 elif normal < mutation:
398 predict_sero = predict_sero.strip() + "(O5-)" 407 #predict_sero = predict_sero.strip() + "(O5-)"
408 predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019
399 star = "*" 409 star = "*"
400 star_line = "Detected the deletion of O5-." 410 #star_line = "Detected the deletion of O5-."
411 star_line = " Detected a deletion that causes O5- variant of Typhimurium." # ed_SL_04152019: new output format
401 else: 412 else:
402 pass 413 pass
403 #special test for Paratyphi B 414 #special test for Paratyphi B
404 if "Paratyphi B" in predict_sero or predict_form == "4:b:-": 415 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
405 normal = 0 416 normal = 0
409 normal = float(special_gene_list[x]) 420 normal = float(special_gene_list[x])
410 elif "gntR-family-regulatory-protein_dt-negative" in x: 421 elif "gntR-family-regulatory-protein_dt-negative" in x:
411 mutation = float(special_gene_list[x]) 422 mutation = float(special_gene_list[x])
412 #print(normal,mutation) 423 #print(normal,mutation)
413 if normal > mutation: 424 if normal > mutation:
414 predict_sero = predict_sero.strip() + "(dt+)" 425 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
426 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip() # ed_SL_04152019: new output format
415 star = "*" 427 star = "*"
416 star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)." 428 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
429 star_line = " The SNP that causes d-Tartrate nonfermentating phenotype was not detected. " # ed_SL_04152019: new output format
417 elif normal < mutation: 430 elif normal < mutation:
418 predict_sero = predict_sero.strip() + "(dt-)" 431 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
432 predict_sero = predict_sero.strip()
419 star = "*" 433 star = "*"
420 star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B." 434 #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
435 star_line = " Detected the SNP for d-Tartrate nonfermenting phenotype." # ed_SL_04152019: new output format
421 else: 436 else:
422 star = "*" 437 star = "*"
423 star_line = "Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not." 438 star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
424 #special test for O13,22 and O13,23 439 #special test for O13,22 and O13,23
425 if Otype=="13": 440 if Otype=="13":
426 #ex_dir = os.path.dirname(os.path.realpath(__file__)) 441 #ex_dir = os.path.dirname(os.path.realpath(__file__))
427 f = open(dirpath + '/special.pickle', 'rb') 442 f = open(dirpath + '/special.pickle', 'rb')
428 special = pickle.load(f) 443 special = pickle.load(f)
438 #print(O22_score,O23_score) 453 #print(O22_score,O23_score)
439 for z in O22_O23[0]: 454 for z in O22_O23[0]:
440 if predict_sero.split(" or ")[0] in z: 455 if predict_sero.split(" or ")[0] in z:
441 if O22_score > O23_score: 456 if O22_score > O23_score:
442 star = "*" 457 star = "*"
443 star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." 458 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
444 predict_sero = z[0] 459 predict_sero = z[0]
445 elif O22_score < O23_score: 460 elif O22_score < O23_score:
446 star = "*" 461 star = "*"
447 star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." 462 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
448 predict_sero = z[1] 463 predict_sero = z[1]
449 else: 464 else:
450 star = "*" 465 star = "*"
451 star_line = "Fail to detect O22 and O23 differences." 466 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
467 if " or " in predict_sero:
468 star_line = star_line + " The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
452 #special test for O6,8 469 #special test for O6,8
453 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list 470 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
454 #for x in merge_O68_list: 471 #for x in merge_O68_list:
455 # if x in predict_sero: 472 # if x in predict_sero:
456 # predict_sero=x 473 # predict_sero=x
667 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted 684 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
668 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]): 685 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
669 #print "$$$No Otype, due to no hit"#may need to be changed 686 #print "$$$No Otype, due to no hit"#may need to be changed
670 O_choice="-" 687 O_choice="-"
671 else: 688 else:
672 highest_O_coverage=max([float(x[0].split("_cov_")[-1]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]]) 689 highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
673 O_list=[] 690 O_list=[]
674 O_list_less_contamination=[] 691 O_list_less_contamination=[]
675 for x in final_O: 692 for x in final_O:
676 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off 693 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
677 O_list.append(x[0].split("__")[0]) 694 O_list.append(x[0].split("__")[0])
678 O_nodes_list.append(x[0].split("___")[1]) 695 O_nodes_list.append(x[0].split("___")[1])
679 if float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15: 696 if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:
680 O_list_less_contamination.append(x[0].split("__")[0]) 697 O_list_less_contamination.append(x[0].split("__")[0])
681 ### special test for O9,46 and O3,10 family 698 ### special test for O9,46 and O3,10 family
682 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1 699 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
683 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1 700 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
684 O_choice="O-9,46" 701 O_choice="O-9,46"
712 ### end of special test for O9,46 and O3,10 family 729 ### end of special test for O9,46 and O3,10 family
713 else: 730 else:
714 try: 731 try:
715 max_score=0 732 max_score=0
716 for x in final_O: 733 for x in final_O:
717 if x[2]>=max_score and float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold 734 if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
718 max_score=x[2]#change from x[-1] to x[2],08172018 735 max_score=x[2]#change from x[-1] to x[2],08172018
719 O_choice=x[0].split("_")[0] 736 O_choice=x[0].split("_")[0]
720 if O_choice=="O-1,3,19": 737 if O_choice=="O-1,3,19":
721 O_choice=final_O[1][0].split("_")[0] 738 O_choice=final_O[1][0].split("_")[0]
722 #print "$$$Most possilble Otype: ",O_choice 739 #print "$$$Most possilble Otype: ",O_choice
791 fljB_choice="-" 808 fljB_choice="-"
792 fliC_contig="NA" 809 fliC_contig="NA"
793 fljB_contig="NA" 810 fljB_contig="NA"
794 fliC_region=set([0]) 811 fliC_region=set([0])
795 fljB_region=set([0,]) 812 fljB_region=set([0,])
796 fliC_length=0 #can be changed to coverage in future 813 fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
797 fljB_length=0 #can be changed to coverage in future 814 fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
798 O_choice="-"#no need to decide O contig for now, should be only one 815 O_choice="-"#no need to decide O contig for now, should be only one
799 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification 816 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
800 O_choice=O_choice.split("-")[-1].strip() 817 O_choice=O_choice.split("-")[-1].strip()
801 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="": 818 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
802 O_choice="-" 819 O_choice="-"
803 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB 820 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
821 #add alignment locations, used for further selection, 03312019
822 for i in range(len(H_contig_roles)):
823 x=H_contig_roles[i]
824 for y in Final_list_passed:
825 if x[1] in y[0] and y[0].startswith(x[0]):
826 H_contig_roles[i]+=H_contig_roles[i]+(y[-1],)
827 break
804 log_file=open("SeqSero_log.txt","a") 828 log_file=open("SeqSero_log.txt","a")
805 extract_file=open("Extracted_antigen_alleles.fasta","a") 829 extract_file=open("Extracted_antigen_alleles.fasta","a")
806 handle_fasta=list(SeqIO.parse(new_fasta,"fasta")) 830 handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
807 831
808 #print("O_contigs:") 832 #print("O_contigs:")
818 for z in handle_fasta: 842 for z in handle_fasta:
819 if x[0].split("___")[-1]==z.description: 843 if x[0].split("___")[-1]==z.description:
820 seqs=str(z.seq) 844 seqs=str(z.seq)
821 extract_file.write(title+seqs+"\n") 845 extract_file.write(title+seqs+"\n")
822 if len(H_contig_roles)!=0: 846 if len(H_contig_roles)!=0:
823 highest_H_coverage=max([float(x[1].split("_cov_")[-1]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output 847 highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
824 else: 848 else:
825 highest_H_coverage=0 849 highest_H_coverage=0
826 for x in H_contig_roles: 850 for x in H_contig_roles:
827 #if multiple choices, temporately select the one with longest length for now, will revise in further change 851 #if multiple choices, temporately select the one with longest length for now, will revise in further change
828 if "fliC" == x[0] and int(x[1].split("_")[3])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list 852 if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
829 fliC_contig=x[1] 853 fliC_contig=x[1]
830 fliC_length=int(x[1].split("_")[3]) 854 fliC_length=len(x[-1])
831 elif "fljB" == x[0] and int(x[1].split("_")[3])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13: 855 elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
832 fljB_contig=x[1] 856 fljB_contig=x[1]
833 fljB_length=int(x[1].split("_")[3]) 857 fljB_length=len(x[-1])
834 for x in Final_list_passed: 858 for x in Final_list_passed:
835 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]: 859 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
836 fliC_choice=x[0].split("_")[1] 860 fliC_choice=x[0].split("_")[1]
837 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]: 861 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
838 fljB_choice=x[0].split("_")[1] 862 fljB_choice=x[0].split("_")[1]
1146 print("assembling...") 1170 print("assembling...")
1147 if int(threads)>4: 1171 if int(threads)>4:
1148 t="4" 1172 t="4"
1149 else: 1173 else:
1150 t=threads 1174 t=threads
1151 if os.path.getsize(combined_fq)>100 and os.path.getsize(mapped_fq1)>100:#if not, then it's "-:-:-" 1175 if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-"
1152 if fnameB!="": 1176 if fnameB!="":
1153 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT) 1177 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
1154 else: 1178 else:
1155 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT) 1179 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
1156 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta" 1180 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
1247 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results 1271 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
1248 file=open("data_log.txt","a") 1272 file=open("data_log.txt","a")
1249 for x in Final_list: 1273 for x in Final_list:
1250 file.write("\t".join(str(y) for y in x)+"\n") 1274 file.write("\t".join(str(y) for y in x)+"\n")
1251 file.close() 1275 file.close()
1252 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)] 1276 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
1253 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB 1277 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
1254 subspecies=judge_subspecies(fnameA) #predict subspecies 1278 subspecies=judge_subspecies(fnameA) #predict subspecies
1255 ###output 1279 ###output
1256 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies) 1280 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
1281 claim="" #04132019, disable claim for new report requirement
1257 contamination_report="" 1282 contamination_report=""
1258 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0] 1283 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
1259 if contamination_O!="" and contamination_H=="": 1284 if contamination_O!="" and contamination_H=="":
1260 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"." 1285 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
1261 elif contamination_O=="" and contamination_H!="": 1286 elif contamination_O=="" and contamination_H!="":
1262 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"." 1287 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
1263 elif contamination_O!="" and contamination_H!="": 1288 elif contamination_O!="" and contamination_H!="":
1264 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"." 1289 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
1265 if contamination_report!="": 1290 if contamination_report!="":
1266 contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing 1291 #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019
1267 claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences 1292 contamination_report="Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles."
1293 #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences #04132019, need to change if using web-version
1294 if contamination_report+star_line+claim=="": #0413, new output style
1295 note=""
1296 else:
1297 note="Note:"
1268 if clean_mode: 1298 if clean_mode:
1269 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT) 1299 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
1270 make_dir="none-output-directory due to '-c' flag" 1300 make_dir="none-output-directory due to '-c' flag"
1271 else: 1301 else:
1272 #new_file=open("Seqsero_result.txt","w") 1302 #new_file=open("Seqsero_result.txt","w")
1277 O_antigen_prediction = O_choice, 1307 O_antigen_prediction = O_choice,
1278 H1_antigen_prediction = fliC_choice, 1308 H1_antigen_prediction = fliC_choice,
1279 H2_antigen_prediction = fljB_choice, 1309 H2_antigen_prediction = fljB_choice,
1280 predicted_antigenic_profile = predict_form, 1310 predicted_antigenic_profile = predict_form,
1281 predicted_subspecies = subspecies, 1311 predicted_subspecies = subspecies,
1282 predicted_serotype = predict_sero, 1312 predicted_serotype = "{}{}".format(predict_sero, star),
1283 note=claim.replace('\n','') 1313 note=claim.replace('\n','')
1284 ) 1314 )
1315 result['*'] = star_line
1285 with open("Seqsero_result.tsv","w") as new_file: 1316 with open("Seqsero_result.tsv","w") as new_file:
1286 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+## 1317 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
1287 #new_file.close() 1318 #new_file.close()
1288 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys()) 1319 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
1289 wrtr.writeheader() 1320 wrtr.writeheader()
1313 subspecies = judge_subspecies_Kmer(Special_dict) 1344 subspecies = judge_subspecies_Kmer(Special_dict)
1314 if subspecies=="IIb" or subspecies=="IIa": 1345 if subspecies=="IIb" or subspecies=="IIa":
1315 subspecies="II" 1346 subspecies="II"
1316 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes( 1347 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
1317 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies) 1348 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
1318 claim="" 1349 claim="" #no claim any more based on new output requirement
1350 if star_line+claim=="": #0413, new output style
1351 note=""
1352 else:
1353 note="Note:"
1319 if clean_mode: 1354 if clean_mode:
1320 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT) 1355 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
1321 make_dir="none-output-directory due to '-c' flag" 1356 make_dir="none-output-directory due to '-c' flag"
1322 else: 1357 else:
1323 if highest_O.split('-')[-1]=="": 1358 if highest_O.split('-')[-1]=="":
1330 O_antigen_prediction = O_choice, 1365 O_antigen_prediction = O_choice,
1331 H1_antigen_prediction = highest_fliC, 1366 H1_antigen_prediction = highest_fliC,
1332 H2_antigen_prediction = highest_fljB, 1367 H2_antigen_prediction = highest_fljB,
1333 predicted_antigenic_profile = predict_form, 1368 predicted_antigenic_profile = predict_form,
1334 predicted_subspecies = subspecies, 1369 predicted_subspecies = subspecies,
1335 predicted_serotype = predict_sero, 1370 predicted_serotype = "{}{}".format(predict_sero, star),
1336 note=claim.replace('\n','') 1371 note=claim.replace('\n','')
1337 ) 1372 )
1373 result['*'] = star_line
1338 with open("Seqsero_result.tsv","w") as new_file: 1374 with open("Seqsero_result.tsv","w") as new_file:
1339 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+## 1375 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
1340 #new_file.close() 1376 #new_file.close()
1341 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys()) 1377 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
1342 wrtr.writeheader() 1378 wrtr.writeheader()