Mercurial > repos > jpayne > seqsero_v2
diff SeqSero2/SeqSero2_package.py @ 7:87c7eebc6797
planemo upload
author | jpayne |
---|---|
date | Fri, 07 Jun 2019 15:48:15 -0400 |
parents | b82e5f3c9187 |
children | 77d3edd25de7 |
line wrap: on
line diff
--- a/SeqSero2/SeqSero2_package.py Thu Apr 18 16:14:32 2019 -0400 +++ b/SeqSero2/SeqSero2_package.py Fri Jun 07 15:48:15 2019 -0400 @@ -23,9 +23,9 @@ parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com')#add "-m <data_type>" in future parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data") parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)") - parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode for mapping, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem") - parser.add_argument("-p",default="1",help="<int>: threads used for mapping mode, if p>4, only 4 threads will be used for assembly, default=1") - parser.add_argument("-m",choices=['k','a'],default="k",help="<string>: 'k'(kmer mode), 'a'(allele mode), default=k") + parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode used for mapping in allele workflow, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem") + parser.add_argument("-p",default="1",help="<int>: threads used for allele workflow, if p>4, only 4 threads will be used for assembly step, default=1") + parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: 'k'(kmer workflow), 'a'(allele workflow), default=a") parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number") parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will use clean mode and only output serotyping prediction, the directory containing log files will be deleted") return parser.parse_args() @@ -350,10 +350,12 @@ star_line = "" if len(seronames) > 1: #there are two possible predictions for serotypes star = "*" - star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n" + #changed 04072019 + #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n" if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0: star="*" - star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line + star_line=" The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme." + star_line + #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line if Otype=="": Otype="-" predict_form = Otype + ":" + fliC + ":" + fljB @@ -364,23 +366,30 @@ for x in special_gene_list: if x.startswith("sdf"): sdf = "+" - predict_form = predict_form + " Sdf prediction:" + sdf + #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis" + star_line=" sdf gene detected." # ed_SL_04152019: new output format + #predict_form = predict_form + " Sdf prediction:" + sdf + predict_form = predict_form #changed 04072019 if sdf == "-": star = "*" - star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n" - predict_sero = "Gallinarum/Enteritidis sdf -" + #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis" + star_line=" sdf gene not detected." # ed_SL_04152019: new output format + #changed in 04072019, for new output + #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n" + #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement + predict_sero = "Gallinarum or Enteritidis" # ed_SL_04152019: new output format ###end of special test for Enteritidis elif predict_form == "4:i:-": - predict_sero = "potential monophasic variant of Typhimurium" + predict_sero = "4:i:-" elif predict_form == "4:r:-": - predict_sero = "potential monophasic variant of Heidelberg" + predict_sero = "4:r:-" elif predict_form == "4:b:-": - predict_sero = "potential monophasic variant of Paratyphi B" + predict_sero = "4:b:-" #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo #predict_sero = "Newport" #star = "*" #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare." - claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n" + claim = " The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n" if "N/A" in predict_sero: claim = "" #special test for Typhimurium @@ -395,9 +404,11 @@ if normal > mutation: pass elif normal < mutation: - predict_sero = predict_sero.strip() + "(O5-)" + #predict_sero = predict_sero.strip() + "(O5-)" + predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019 star = "*" - star_line = "Detected the deletion of O5-." + #star_line = "Detected the deletion of O5-." + star_line = " Detected a deletion that causes O5- variant of Typhimurium." # ed_SL_04152019: new output format else: pass #special test for Paratyphi B @@ -411,16 +422,20 @@ mutation = float(special_gene_list[x]) #print(normal,mutation) if normal > mutation: - predict_sero = predict_sero.strip() + "(dt+)" + #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019 + predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip() # ed_SL_04152019: new output format star = "*" - star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)." + #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)." + star_line = " The SNP that causes d-Tartrate nonfermentating phenotype was not detected. " # ed_SL_04152019: new output format elif normal < mutation: - predict_sero = predict_sero.strip() + "(dt-)" + #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019 + predict_sero = predict_sero.strip() star = "*" - star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B." + #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B." + star_line = " Detected the SNP for d-Tartrate nonfermenting phenotype." # ed_SL_04152019: new output format else: star = "*" - star_line = "Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not." + star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not." #special test for O13,22 and O13,23 if Otype=="13": #ex_dir = os.path.dirname(os.path.realpath(__file__)) @@ -440,15 +455,17 @@ if predict_sero.split(" or ")[0] in z: if O22_score > O23_score: star = "*" - star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." + #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019 predict_sero = z[0] elif O22_score < O23_score: star = "*" - star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." + #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019 predict_sero = z[1] else: star = "*" - star_line = "Fail to detect O22 and O23 differences." + #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019 + if " or " in predict_sero: + star_line = star_line + " The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n" #special test for O6,8 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list #for x in merge_O68_list: @@ -669,14 +686,14 @@ #print "$$$No Otype, due to no hit"#may need to be changed O_choice="-" else: - highest_O_coverage=max([float(x[0].split("_cov_")[-1]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]]) + highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]]) O_list=[] O_list_less_contamination=[] for x in final_O: if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off O_list.append(x[0].split("__")[0]) O_nodes_list.append(x[0].split("___")[1]) - if float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15: + if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15: O_list_less_contamination.append(x[0].split("__")[0]) ### special test for O9,46 and O3,10 family if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1 @@ -714,7 +731,7 @@ try: max_score=0 for x in final_O: - if x[2]>=max_score and float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold + if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold max_score=x[2]#change from x[-1] to x[2],08172018 O_choice=x[0].split("_")[0] if O_choice=="O-1,3,19": @@ -793,14 +810,21 @@ fljB_contig="NA" fliC_region=set([0]) fljB_region=set([0,]) - fliC_length=0 #can be changed to coverage in future - fljB_length=0 #can be changed to coverage in future + fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length + fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length O_choice="-"#no need to decide O contig for now, should be only one O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification O_choice=O_choice.split("-")[-1].strip() if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="": O_choice="-" H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB + #add alignment locations, used for further selection, 03312019 + for i in range(len(H_contig_roles)): + x=H_contig_roles[i] + for y in Final_list_passed: + if x[1] in y[0] and y[0].startswith(x[0]): + H_contig_roles[i]+=H_contig_roles[i]+(y[-1],) + break log_file=open("SeqSero_log.txt","a") extract_file=open("Extracted_antigen_alleles.fasta","a") handle_fasta=list(SeqIO.parse(new_fasta,"fasta")) @@ -820,17 +844,17 @@ seqs=str(z.seq) extract_file.write(title+seqs+"\n") if len(H_contig_roles)!=0: - highest_H_coverage=max([float(x[1].split("_cov_")[-1]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output + highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output else: highest_H_coverage=0 for x in H_contig_roles: #if multiple choices, temporately select the one with longest length for now, will revise in further change - if "fliC" == x[0] and int(x[1].split("_")[3])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list + if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list fliC_contig=x[1] - fliC_length=int(x[1].split("_")[3]) - elif "fljB" == x[0] and int(x[1].split("_")[3])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13: + fliC_length=len(x[-1]) + elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13: fljB_contig=x[1] - fljB_length=int(x[1].split("_")[3]) + fljB_length=len(x[-1]) for x in Final_list_passed: if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]: fliC_choice=x[0].split("_")[1] @@ -1148,7 +1172,7 @@ t="4" else: t=threads - if os.path.getsize(combined_fq)>100 and os.path.getsize(mapped_fq1)>100:#if not, then it's "-:-:-" + if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-" if fnameB!="": subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT) else: @@ -1249,11 +1273,12 @@ for x in Final_list: file.write("\t".join(str(y) for y in x)+"\n") file.close() - Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)] + Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)] O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB subspecies=judge_subspecies(fnameA) #predict subspecies ###output predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies) + claim="" #04132019, disable claim for new report requirement contamination_report="" H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0] if contamination_O!="" and contamination_H=="": @@ -1263,8 +1288,13 @@ elif contamination_O!="" and contamination_H!="": contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"." if contamination_report!="": - contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing - claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences + #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019 + contamination_report="Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles." + #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences #04132019, need to change if using web-version + if contamination_report+star_line+claim=="": #0413, new output style + note="" + else: + note="Note:" if clean_mode: subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT) make_dir="none-output-directory due to '-c' flag" @@ -1279,9 +1309,10 @@ H2_antigen_prediction = fljB_choice, predicted_antigenic_profile = predict_form, predicted_subspecies = subspecies, - predicted_serotype = predict_sero, + predicted_serotype = "{}{}".format(predict_sero, star), note=claim.replace('\n','') ) + result['*'] = star_line with open("Seqsero_result.tsv","w") as new_file: #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+## #new_file.close() @@ -1315,7 +1346,11 @@ subspecies="II" predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes( highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies) - claim="" + claim="" #no claim any more based on new output requirement + if star_line+claim=="": #0413, new output style + note="" + else: + note="Note:" if clean_mode: subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT) make_dir="none-output-directory due to '-c' flag" @@ -1332,9 +1367,10 @@ H2_antigen_prediction = highest_fljB, predicted_antigenic_profile = predict_form, predicted_subspecies = subspecies, - predicted_serotype = predict_sero, + predicted_serotype = "{}{}".format(predict_sero, star), note=claim.replace('\n','') ) + result['*'] = star_line with open("Seqsero_result.tsv","w") as new_file: #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+## #new_file.close()