Mercurial > repos > jpayne > seqsero_v2
comparison SeqSero2/SeqSero2_package.py @ 7:87c7eebc6797
planemo upload
author | jpayne |
---|---|
date | Fri, 07 Jun 2019 15:48:15 -0400 |
parents | b82e5f3c9187 |
children | 77d3edd25de7 |
comparison
equal
deleted
inserted
replaced
6:5b84740f0463 | 7:87c7eebc6797 |
---|---|
21 def parse_args(): | 21 def parse_args(): |
22 "Parse the input arguments, use '-h' for help." | 22 "Parse the input arguments, use '-h' for help." |
23 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com')#add "-m <data_type>" in future | 23 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com')#add "-m <data_type>" in future |
24 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data") | 24 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data") |
25 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)") | 25 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)") |
26 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode for mapping, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem") | 26 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode used for mapping in allele workflow, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem") |
27 parser.add_argument("-p",default="1",help="<int>: threads used for mapping mode, if p>4, only 4 threads will be used for assembly, default=1") | 27 parser.add_argument("-p",default="1",help="<int>: threads used for allele workflow, if p>4, only 4 threads will be used for assembly step, default=1") |
28 parser.add_argument("-m",choices=['k','a'],default="k",help="<string>: 'k'(kmer mode), 'a'(allele mode), default=k") | 28 parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: 'k'(kmer workflow), 'a'(allele workflow), default=a") |
29 parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number") | 29 parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number") |
30 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will use clean mode and only output serotyping prediction, the directory containing log files will be deleted") | 30 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will use clean mode and only output serotyping prediction, the directory containing log files will be deleted") |
31 return parser.parse_args() | 31 return parser.parse_args() |
32 | 32 |
33 def reverse_complement(sequence): | 33 def reverse_complement(sequence): |
348 ] | 348 ] |
349 star = "" | 349 star = "" |
350 star_line = "" | 350 star_line = "" |
351 if len(seronames) > 1: #there are two possible predictions for serotypes | 351 if len(seronames) > 1: #there are two possible predictions for serotypes |
352 star = "*" | 352 star = "*" |
353 star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n" | 353 #changed 04072019 |
354 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n" | |
354 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0: | 355 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0: |
355 star="*" | 356 star="*" |
356 star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line | 357 star_line=" The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme." + star_line |
358 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line | |
357 if Otype=="": | 359 if Otype=="": |
358 Otype="-" | 360 Otype="-" |
359 predict_form = Otype + ":" + fliC + ":" + fljB | 361 predict_form = Otype + ":" + fliC + ":" + fljB |
360 predict_sero = (" or ").join(seronames) | 362 predict_sero = (" or ").join(seronames) |
361 ###special test for Enteritidis | 363 ###special test for Enteritidis |
362 if predict_form == "9:g,m:-": | 364 if predict_form == "9:g,m:-": |
363 sdf = "-" | 365 sdf = "-" |
364 for x in special_gene_list: | 366 for x in special_gene_list: |
365 if x.startswith("sdf"): | 367 if x.startswith("sdf"): |
366 sdf = "+" | 368 sdf = "+" |
367 predict_form = predict_form + " Sdf prediction:" + sdf | 369 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis" |
370 star_line=" sdf gene detected." # ed_SL_04152019: new output format | |
371 #predict_form = predict_form + " Sdf prediction:" + sdf | |
372 predict_form = predict_form #changed 04072019 | |
368 if sdf == "-": | 373 if sdf == "-": |
369 star = "*" | 374 star = "*" |
370 star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n" | 375 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis" |
371 predict_sero = "Gallinarum/Enteritidis sdf -" | 376 star_line=" sdf gene not detected." # ed_SL_04152019: new output format |
377 #changed in 04072019, for new output | |
378 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n" | |
379 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement | |
380 predict_sero = "Gallinarum or Enteritidis" # ed_SL_04152019: new output format | |
372 ###end of special test for Enteritidis | 381 ###end of special test for Enteritidis |
373 elif predict_form == "4:i:-": | 382 elif predict_form == "4:i:-": |
374 predict_sero = "potential monophasic variant of Typhimurium" | 383 predict_sero = "4:i:-" |
375 elif predict_form == "4:r:-": | 384 elif predict_form == "4:r:-": |
376 predict_sero = "potential monophasic variant of Heidelberg" | 385 predict_sero = "4:r:-" |
377 elif predict_form == "4:b:-": | 386 elif predict_form == "4:b:-": |
378 predict_sero = "potential monophasic variant of Paratyphi B" | 387 predict_sero = "4:b:-" |
379 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo | 388 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo |
380 #predict_sero = "Newport" | 389 #predict_sero = "Newport" |
381 #star = "*" | 390 #star = "*" |
382 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare." | 391 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare." |
383 claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n" | 392 claim = " The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n" |
384 if "N/A" in predict_sero: | 393 if "N/A" in predict_sero: |
385 claim = "" | 394 claim = "" |
386 #special test for Typhimurium | 395 #special test for Typhimurium |
387 if "Typhimurium" in predict_sero or predict_form == "4:i:-": | 396 if "Typhimurium" in predict_sero or predict_form == "4:i:-": |
388 normal = 0 | 397 normal = 0 |
393 elif "oafA-O-4_5-" in x: | 402 elif "oafA-O-4_5-" in x: |
394 mutation = float(special_gene_list[x]) | 403 mutation = float(special_gene_list[x]) |
395 if normal > mutation: | 404 if normal > mutation: |
396 pass | 405 pass |
397 elif normal < mutation: | 406 elif normal < mutation: |
398 predict_sero = predict_sero.strip() + "(O5-)" | 407 #predict_sero = predict_sero.strip() + "(O5-)" |
408 predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019 | |
399 star = "*" | 409 star = "*" |
400 star_line = "Detected the deletion of O5-." | 410 #star_line = "Detected the deletion of O5-." |
411 star_line = " Detected a deletion that causes O5- variant of Typhimurium." # ed_SL_04152019: new output format | |
401 else: | 412 else: |
402 pass | 413 pass |
403 #special test for Paratyphi B | 414 #special test for Paratyphi B |
404 if "Paratyphi B" in predict_sero or predict_form == "4:b:-": | 415 if "Paratyphi B" in predict_sero or predict_form == "4:b:-": |
405 normal = 0 | 416 normal = 0 |
409 normal = float(special_gene_list[x]) | 420 normal = float(special_gene_list[x]) |
410 elif "gntR-family-regulatory-protein_dt-negative" in x: | 421 elif "gntR-family-regulatory-protein_dt-negative" in x: |
411 mutation = float(special_gene_list[x]) | 422 mutation = float(special_gene_list[x]) |
412 #print(normal,mutation) | 423 #print(normal,mutation) |
413 if normal > mutation: | 424 if normal > mutation: |
414 predict_sero = predict_sero.strip() + "(dt+)" | 425 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019 |
426 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip() # ed_SL_04152019: new output format | |
415 star = "*" | 427 star = "*" |
416 star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)." | 428 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)." |
429 star_line = " The SNP that causes d-Tartrate nonfermentating phenotype was not detected. " # ed_SL_04152019: new output format | |
417 elif normal < mutation: | 430 elif normal < mutation: |
418 predict_sero = predict_sero.strip() + "(dt-)" | 431 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019 |
432 predict_sero = predict_sero.strip() | |
419 star = "*" | 433 star = "*" |
420 star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B." | 434 #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B." |
435 star_line = " Detected the SNP for d-Tartrate nonfermenting phenotype." # ed_SL_04152019: new output format | |
421 else: | 436 else: |
422 star = "*" | 437 star = "*" |
423 star_line = "Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not." | 438 star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not." |
424 #special test for O13,22 and O13,23 | 439 #special test for O13,22 and O13,23 |
425 if Otype=="13": | 440 if Otype=="13": |
426 #ex_dir = os.path.dirname(os.path.realpath(__file__)) | 441 #ex_dir = os.path.dirname(os.path.realpath(__file__)) |
427 f = open(dirpath + '/special.pickle', 'rb') | 442 f = open(dirpath + '/special.pickle', 'rb') |
428 special = pickle.load(f) | 443 special = pickle.load(f) |
438 #print(O22_score,O23_score) | 453 #print(O22_score,O23_score) |
439 for z in O22_O23[0]: | 454 for z in O22_O23[0]: |
440 if predict_sero.split(" or ")[0] in z: | 455 if predict_sero.split(" or ")[0] in z: |
441 if O22_score > O23_score: | 456 if O22_score > O23_score: |
442 star = "*" | 457 star = "*" |
443 star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." | 458 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019 |
444 predict_sero = z[0] | 459 predict_sero = z[0] |
445 elif O22_score < O23_score: | 460 elif O22_score < O23_score: |
446 star = "*" | 461 star = "*" |
447 star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." | 462 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019 |
448 predict_sero = z[1] | 463 predict_sero = z[1] |
449 else: | 464 else: |
450 star = "*" | 465 star = "*" |
451 star_line = "Fail to detect O22 and O23 differences." | 466 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019 |
467 if " or " in predict_sero: | |
468 star_line = star_line + " The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n" | |
452 #special test for O6,8 | 469 #special test for O6,8 |
453 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list | 470 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list |
454 #for x in merge_O68_list: | 471 #for x in merge_O68_list: |
455 # if x in predict_sero: | 472 # if x in predict_sero: |
456 # predict_sero=x | 473 # predict_sero=x |
667 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted | 684 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted |
668 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]): | 685 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]): |
669 #print "$$$No Otype, due to no hit"#may need to be changed | 686 #print "$$$No Otype, due to no hit"#may need to be changed |
670 O_choice="-" | 687 O_choice="-" |
671 else: | 688 else: |
672 highest_O_coverage=max([float(x[0].split("_cov_")[-1]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]]) | 689 highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]]) |
673 O_list=[] | 690 O_list=[] |
674 O_list_less_contamination=[] | 691 O_list_less_contamination=[] |
675 for x in final_O: | 692 for x in final_O: |
676 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off | 693 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off |
677 O_list.append(x[0].split("__")[0]) | 694 O_list.append(x[0].split("__")[0]) |
678 O_nodes_list.append(x[0].split("___")[1]) | 695 O_nodes_list.append(x[0].split("___")[1]) |
679 if float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15: | 696 if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15: |
680 O_list_less_contamination.append(x[0].split("__")[0]) | 697 O_list_less_contamination.append(x[0].split("__")[0]) |
681 ### special test for O9,46 and O3,10 family | 698 ### special test for O9,46 and O3,10 family |
682 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1 | 699 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1 |
683 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1 | 700 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1 |
684 O_choice="O-9,46" | 701 O_choice="O-9,46" |
712 ### end of special test for O9,46 and O3,10 family | 729 ### end of special test for O9,46 and O3,10 family |
713 else: | 730 else: |
714 try: | 731 try: |
715 max_score=0 | 732 max_score=0 |
716 for x in final_O: | 733 for x in final_O: |
717 if x[2]>=max_score and float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold | 734 if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold |
718 max_score=x[2]#change from x[-1] to x[2],08172018 | 735 max_score=x[2]#change from x[-1] to x[2],08172018 |
719 O_choice=x[0].split("_")[0] | 736 O_choice=x[0].split("_")[0] |
720 if O_choice=="O-1,3,19": | 737 if O_choice=="O-1,3,19": |
721 O_choice=final_O[1][0].split("_")[0] | 738 O_choice=final_O[1][0].split("_")[0] |
722 #print "$$$Most possilble Otype: ",O_choice | 739 #print "$$$Most possilble Otype: ",O_choice |
791 fljB_choice="-" | 808 fljB_choice="-" |
792 fliC_contig="NA" | 809 fliC_contig="NA" |
793 fljB_contig="NA" | 810 fljB_contig="NA" |
794 fliC_region=set([0]) | 811 fliC_region=set([0]) |
795 fljB_region=set([0,]) | 812 fljB_region=set([0,]) |
796 fliC_length=0 #can be changed to coverage in future | 813 fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length |
797 fljB_length=0 #can be changed to coverage in future | 814 fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length |
798 O_choice="-"#no need to decide O contig for now, should be only one | 815 O_choice="-"#no need to decide O contig for now, should be only one |
799 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification | 816 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification |
800 O_choice=O_choice.split("-")[-1].strip() | 817 O_choice=O_choice.split("-")[-1].strip() |
801 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="": | 818 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="": |
802 O_choice="-" | 819 O_choice="-" |
803 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB | 820 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB |
821 #add alignment locations, used for further selection, 03312019 | |
822 for i in range(len(H_contig_roles)): | |
823 x=H_contig_roles[i] | |
824 for y in Final_list_passed: | |
825 if x[1] in y[0] and y[0].startswith(x[0]): | |
826 H_contig_roles[i]+=H_contig_roles[i]+(y[-1],) | |
827 break | |
804 log_file=open("SeqSero_log.txt","a") | 828 log_file=open("SeqSero_log.txt","a") |
805 extract_file=open("Extracted_antigen_alleles.fasta","a") | 829 extract_file=open("Extracted_antigen_alleles.fasta","a") |
806 handle_fasta=list(SeqIO.parse(new_fasta,"fasta")) | 830 handle_fasta=list(SeqIO.parse(new_fasta,"fasta")) |
807 | 831 |
808 #print("O_contigs:") | 832 #print("O_contigs:") |
818 for z in handle_fasta: | 842 for z in handle_fasta: |
819 if x[0].split("___")[-1]==z.description: | 843 if x[0].split("___")[-1]==z.description: |
820 seqs=str(z.seq) | 844 seqs=str(z.seq) |
821 extract_file.write(title+seqs+"\n") | 845 extract_file.write(title+seqs+"\n") |
822 if len(H_contig_roles)!=0: | 846 if len(H_contig_roles)!=0: |
823 highest_H_coverage=max([float(x[1].split("_cov_")[-1]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output | 847 highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output |
824 else: | 848 else: |
825 highest_H_coverage=0 | 849 highest_H_coverage=0 |
826 for x in H_contig_roles: | 850 for x in H_contig_roles: |
827 #if multiple choices, temporately select the one with longest length for now, will revise in further change | 851 #if multiple choices, temporately select the one with longest length for now, will revise in further change |
828 if "fliC" == x[0] and int(x[1].split("_")[3])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list | 852 if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list |
829 fliC_contig=x[1] | 853 fliC_contig=x[1] |
830 fliC_length=int(x[1].split("_")[3]) | 854 fliC_length=len(x[-1]) |
831 elif "fljB" == x[0] and int(x[1].split("_")[3])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13: | 855 elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13: |
832 fljB_contig=x[1] | 856 fljB_contig=x[1] |
833 fljB_length=int(x[1].split("_")[3]) | 857 fljB_length=len(x[-1]) |
834 for x in Final_list_passed: | 858 for x in Final_list_passed: |
835 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]: | 859 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]: |
836 fliC_choice=x[0].split("_")[1] | 860 fliC_choice=x[0].split("_")[1] |
837 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]: | 861 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]: |
838 fljB_choice=x[0].split("_")[1] | 862 fljB_choice=x[0].split("_")[1] |
1146 print("assembling...") | 1170 print("assembling...") |
1147 if int(threads)>4: | 1171 if int(threads)>4: |
1148 t="4" | 1172 t="4" |
1149 else: | 1173 else: |
1150 t=threads | 1174 t=threads |
1151 if os.path.getsize(combined_fq)>100 and os.path.getsize(mapped_fq1)>100:#if not, then it's "-:-:-" | 1175 if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-" |
1152 if fnameB!="": | 1176 if fnameB!="": |
1153 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT) | 1177 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT) |
1154 else: | 1178 else: |
1155 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT) | 1179 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT) |
1156 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta" | 1180 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta" |
1247 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results | 1271 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results |
1248 file=open("data_log.txt","a") | 1272 file=open("data_log.txt","a") |
1249 for x in Final_list: | 1273 for x in Final_list: |
1250 file.write("\t".join(str(y) for y in x)+"\n") | 1274 file.write("\t".join(str(y) for y in x)+"\n") |
1251 file.close() | 1275 file.close() |
1252 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)] | 1276 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)] |
1253 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB | 1277 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB |
1254 subspecies=judge_subspecies(fnameA) #predict subspecies | 1278 subspecies=judge_subspecies(fnameA) #predict subspecies |
1255 ###output | 1279 ###output |
1256 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies) | 1280 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies) |
1281 claim="" #04132019, disable claim for new report requirement | |
1257 contamination_report="" | 1282 contamination_report="" |
1258 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0] | 1283 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0] |
1259 if contamination_O!="" and contamination_H=="": | 1284 if contamination_O!="" and contamination_H=="": |
1260 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"." | 1285 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"." |
1261 elif contamination_O=="" and contamination_H!="": | 1286 elif contamination_O=="" and contamination_H!="": |
1262 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"." | 1287 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"." |
1263 elif contamination_O!="" and contamination_H!="": | 1288 elif contamination_O!="" and contamination_H!="": |
1264 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"." | 1289 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"." |
1265 if contamination_report!="": | 1290 if contamination_report!="": |
1266 contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing | 1291 #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019 |
1267 claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences | 1292 contamination_report="Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles." |
1293 #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences #04132019, need to change if using web-version | |
1294 if contamination_report+star_line+claim=="": #0413, new output style | |
1295 note="" | |
1296 else: | |
1297 note="Note:" | |
1268 if clean_mode: | 1298 if clean_mode: |
1269 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT) | 1299 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT) |
1270 make_dir="none-output-directory due to '-c' flag" | 1300 make_dir="none-output-directory due to '-c' flag" |
1271 else: | 1301 else: |
1272 #new_file=open("Seqsero_result.txt","w") | 1302 #new_file=open("Seqsero_result.txt","w") |
1277 O_antigen_prediction = O_choice, | 1307 O_antigen_prediction = O_choice, |
1278 H1_antigen_prediction = fliC_choice, | 1308 H1_antigen_prediction = fliC_choice, |
1279 H2_antigen_prediction = fljB_choice, | 1309 H2_antigen_prediction = fljB_choice, |
1280 predicted_antigenic_profile = predict_form, | 1310 predicted_antigenic_profile = predict_form, |
1281 predicted_subspecies = subspecies, | 1311 predicted_subspecies = subspecies, |
1282 predicted_serotype = predict_sero, | 1312 predicted_serotype = "{}{}".format(predict_sero, star), |
1283 note=claim.replace('\n','') | 1313 note=claim.replace('\n','') |
1284 ) | 1314 ) |
1315 result['*'] = star_line | |
1285 with open("Seqsero_result.tsv","w") as new_file: | 1316 with open("Seqsero_result.tsv","w") as new_file: |
1286 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+## | 1317 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+## |
1287 #new_file.close() | 1318 #new_file.close() |
1288 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys()) | 1319 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys()) |
1289 wrtr.writeheader() | 1320 wrtr.writeheader() |
1313 subspecies = judge_subspecies_Kmer(Special_dict) | 1344 subspecies = judge_subspecies_Kmer(Special_dict) |
1314 if subspecies=="IIb" or subspecies=="IIa": | 1345 if subspecies=="IIb" or subspecies=="IIa": |
1315 subspecies="II" | 1346 subspecies="II" |
1316 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes( | 1347 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes( |
1317 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies) | 1348 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies) |
1318 claim="" | 1349 claim="" #no claim any more based on new output requirement |
1350 if star_line+claim=="": #0413, new output style | |
1351 note="" | |
1352 else: | |
1353 note="Note:" | |
1319 if clean_mode: | 1354 if clean_mode: |
1320 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT) | 1355 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT) |
1321 make_dir="none-output-directory due to '-c' flag" | 1356 make_dir="none-output-directory due to '-c' flag" |
1322 else: | 1357 else: |
1323 if highest_O.split('-')[-1]=="": | 1358 if highest_O.split('-')[-1]=="": |
1330 O_antigen_prediction = O_choice, | 1365 O_antigen_prediction = O_choice, |
1331 H1_antigen_prediction = highest_fliC, | 1366 H1_antigen_prediction = highest_fliC, |
1332 H2_antigen_prediction = highest_fljB, | 1367 H2_antigen_prediction = highest_fljB, |
1333 predicted_antigenic_profile = predict_form, | 1368 predicted_antigenic_profile = predict_form, |
1334 predicted_subspecies = subspecies, | 1369 predicted_subspecies = subspecies, |
1335 predicted_serotype = predict_sero, | 1370 predicted_serotype = "{}{}".format(predict_sero, star), |
1336 note=claim.replace('\n','') | 1371 note=claim.replace('\n','') |
1337 ) | 1372 ) |
1373 result['*'] = star_line | |
1338 with open("Seqsero_result.tsv","w") as new_file: | 1374 with open("Seqsero_result.tsv","w") as new_file: |
1339 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+## | 1375 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+## |
1340 #new_file.close() | 1376 #new_file.close() |
1341 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys()) | 1377 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys()) |
1342 wrtr.writeheader() | 1378 wrtr.writeheader() |