Mercurial > repos > jpayne > seqsero_v2

diff SeqSero2/SeqSero2_package.py @ 7:87c7eebc6797
planemo upload
author: jpayne
date: Fri, 07 Jun 2019 15:48:15 -0400
parents: b82e5f3c9187
children: 77d3edd25de7
--- a/SeqSero2/SeqSero2_package.py	Thu Apr 18 16:14:32 2019 -0400
+++ b/SeqSero2/SeqSero2_package.py	Fri Jun 07 15:48:15 2019 -0400
@@ -23,9 +23,9 @@
     parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com')#add "-m <data_type>" in future
     parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data")
     parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)")
-    parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode for mapping, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem") 
-    parser.add_argument("-p",default="1",help="<int>: threads used for mapping mode, if p>4, only 4 threads will be used for assembly, default=1") 
-    parser.add_argument("-m",choices=['k','a'],default="k",help="<string>: 'k'(kmer mode), 'a'(allele mode), default=k") 
+    parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode used for mapping in allele workflow, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem") 
+    parser.add_argument("-p",default="1",help="<int>: threads used for allele workflow, if p>4, only 4 threads will be used for assembly step, default=1") 
+    parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: 'k'(kmer workflow), 'a'(allele workflow), default=a") 
     parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
     parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will use clean mode and only output serotyping prediction, the directory containing log files will be deleted")
     return parser.parse_args()
@@ -350,10 +350,12 @@
     star_line = ""
     if len(seronames) > 1:  #there are two possible predictions for serotypes
         star = "*"
-        star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
+        #changed 04072019
+        #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
     if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
       star="*"
-      star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
+      star_line=" The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme." + star_line
+      #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
     if  Otype=="":
       Otype="-"
     predict_form = Otype + ":" + fliC + ":" + fljB
@@ -364,23 +366,30 @@
         for x in special_gene_list:
             if x.startswith("sdf"):
                 sdf = "+"
-        predict_form = predict_form + " Sdf prediction:" + sdf
+                #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
+                star_line=" sdf gene detected." # ed_SL_04152019: new output format
+        #predict_form = predict_form + " Sdf prediction:" + sdf
+        predict_form = predict_form #changed 04072019
         if sdf == "-":
             star = "*"
-            star_line = "Additional characterization is necessary to assign a serotype to this strain.  Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
-            predict_sero = "Gallinarum/Enteritidis sdf -"
+            #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
+            star_line=" sdf gene not detected." # ed_SL_04152019: new output format
+            #changed in 04072019, for new output
+            #star_line = "Additional characterization is necessary to assign a serotype to this strain.  Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
+            #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
+            predict_sero = "Gallinarum or Enteritidis" # ed_SL_04152019: new output format
     ###end of special test for Enteritidis
     elif predict_form == "4:i:-":
-        predict_sero = "potential monophasic variant of Typhimurium"
+        predict_sero = "4:i:-"
     elif predict_form == "4:r:-":
-        predict_sero = "potential monophasic variant of Heidelberg"
+        predict_sero = "4:r:-"
     elif predict_form == "4:b:-":
-        predict_sero = "potential monophasic variant of Paratyphi B"
+        predict_sero = "4:b:-"
     #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
         #predict_sero = "Newport"
         #star = "*"
         #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
-    claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme.  New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies.  Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
+    claim = " The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies.  Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
     if "N/A" in predict_sero:
         claim = ""
     #special test for Typhimurium
@@ -395,9 +404,11 @@
         if normal > mutation:
             pass
         elif normal < mutation:
-            predict_sero = predict_sero.strip() + "(O5-)"
+            #predict_sero = predict_sero.strip() + "(O5-)"
+            predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019
             star = "*"
-            star_line = "Detected the deletion of O5-."
+            #star_line = "Detected the deletion of O5-."
+            star_line = " Detected a deletion that causes O5- variant of Typhimurium." # ed_SL_04152019: new output format
         else:
             pass
     #special test for Paratyphi B
@@ -411,16 +422,20 @@
                 mutation = float(special_gene_list[x])
         #print(normal,mutation)
         if normal > mutation:
-            predict_sero = predict_sero.strip() + "(dt+)"
+            #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
+            predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip() # ed_SL_04152019: new output format
             star = "*"
-            star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
+            #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
+            star_line = " The SNP that causes d-Tartrate nonfermentating phenotype was not detected. " # ed_SL_04152019: new output format
         elif normal < mutation:
-            predict_sero = predict_sero.strip() + "(dt-)"
+            #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
+            predict_sero = predict_sero.strip()
             star = "*"
-            star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
+            #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
+            star_line = " Detected the SNP for d-Tartrate nonfermenting phenotype." # ed_SL_04152019: new output format
         else:
             star = "*"
-            star_line = "Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
+            star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
     #special test for O13,22 and O13,23
     if Otype=="13":
       #ex_dir = os.path.dirname(os.path.realpath(__file__))
@@ -440,15 +455,17 @@
           if predict_sero.split(" or ")[0] in z:
             if O22_score > O23_score:
               star = "*"
-              star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'."
+              #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
               predict_sero = z[0]
             elif O22_score < O23_score:
               star = "*"
-              star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'."
+              #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
               predict_sero = z[1]
             else:
               star = "*"
-              star_line = "Fail to detect O22 and O23 differences."
+              #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
+    if " or " in predict_sero:
+      star_line = star_line + " The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
     #special test for O6,8 
     #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
     #for x in merge_O68_list:
@@ -669,14 +686,14 @@
     #print "$$$No Otype, due to no hit"#may need to be changed
     O_choice="-"
   else:
-    highest_O_coverage=max([float(x[0].split("_cov_")[-1]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
+    highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
     O_list=[]
     O_list_less_contamination=[]
     for x in final_O:
       if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
         O_list.append(x[0].split("__")[0])
         O_nodes_list.append(x[0].split("___")[1])
-        if float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15:
+        if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:
           O_list_less_contamination.append(x[0].split("__")[0])
     ### special test for O9,46 and O3,10 family
     if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
@@ -714,7 +731,7 @@
       try: 
         max_score=0
         for x in final_O:
-          if x[2]>=max_score and float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
+          if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
             max_score=x[2]#change from x[-1] to x[2],08172018
             O_choice=x[0].split("_")[0]
         if O_choice=="O-1,3,19":
@@ -793,14 +810,21 @@
   fljB_contig="NA"
   fliC_region=set([0])
   fljB_region=set([0,])
-  fliC_length=0 #can be changed to coverage in future
-  fljB_length=0 #can be changed to coverage in future
+  fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
+  fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
   O_choice="-"#no need to decide O contig for now, should be only one
   O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
   O_choice=O_choice.split("-")[-1].strip()
   if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
     O_choice="-"
   H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
+  #add alignment locations, used for further selection, 03312019
+  for i in range(len(H_contig_roles)):
+    x=H_contig_roles[i]
+    for y in Final_list_passed:
+      if x[1] in y[0] and y[0].startswith(x[0]):
+        H_contig_roles[i]+=H_contig_roles[i]+(y[-1],)
+        break
   log_file=open("SeqSero_log.txt","a")
   extract_file=open("Extracted_antigen_alleles.fasta","a")
   handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
@@ -820,17 +844,17 @@
           seqs=str(z.seq)
       extract_file.write(title+seqs+"\n")
   if len(H_contig_roles)!=0:
-    highest_H_coverage=max([float(x[1].split("_cov_")[-1]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
+    highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
   else:
     highest_H_coverage=0
   for x in H_contig_roles:
     #if multiple choices, temporately select the one with longest length for now, will revise in further change
-    if "fliC" == x[0] and int(x[1].split("_")[3])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
+    if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
       fliC_contig=x[1]
-      fliC_length=int(x[1].split("_")[3])
-    elif "fljB" == x[0] and int(x[1].split("_")[3])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13:
+      fliC_length=len(x[-1])
+    elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
       fljB_contig=x[1]
-      fljB_length=int(x[1].split("_")[3])
+      fljB_length=len(x[-1])
   for x in Final_list_passed:
     if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
       fliC_choice=x[0].split("_")[1]
@@ -1148,7 +1172,7 @@
     t="4"
   else:
     t=threads
-  if os.path.getsize(combined_fq)>100 and os.path.getsize(mapped_fq1)>100:#if not, then it's "-:-:-"
+  if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-"
     if fnameB!="":
       subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
     else:
@@ -1249,11 +1273,12 @@
           for x in Final_list:
             file.write("\t".join(str(y) for y in x)+"\n")
           file.close()
-          Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
+          Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
           O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
         subspecies=judge_subspecies(fnameA) #predict subspecies
         ###output
         predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
+        claim="" #04132019, disable claim for new report requirement
         contamination_report=""
         H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
         if contamination_O!="" and contamination_H=="":
@@ -1263,8 +1288,13 @@
         elif contamination_O!="" and contamination_H!="":
           contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
         if contamination_report!="":
-          contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing
-        claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences
+          #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019
+          contamination_report="Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles."
+        #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences #04132019, need to change if using web-version
+        if contamination_report+star_line+claim=="": #0413, new output style
+          note=""
+        else:
+          note="Note:"
         if clean_mode:
           subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
           make_dir="none-output-directory due to '-c' flag"
@@ -1279,9 +1309,10 @@
           H2_antigen_prediction = fljB_choice,
           predicted_antigenic_profile = predict_form,
           predicted_subspecies = subspecies,
-          predicted_serotype = predict_sero,
+          predicted_serotype = "{}{}".format(predict_sero, star),
           note=claim.replace('\n','')
           )
+          result['*'] = star_line
           with open("Seqsero_result.tsv","w") as new_file:
          #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
          #new_file.close()
@@ -1315,7 +1346,11 @@
         subspecies="II"
       predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
           highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
-      claim=""
+      claim="" #no claim any more based on new output requirement
+      if star_line+claim=="": #0413, new output style
+        note=""
+      else:
+        note="Note:"
       if clean_mode:
         subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
         make_dir="none-output-directory due to '-c' flag"
@@ -1332,9 +1367,10 @@
           H2_antigen_prediction = highest_fljB,
           predicted_antigenic_profile = predict_form,
           predicted_subspecies = subspecies,
-          predicted_serotype = predict_sero,
+          predicted_serotype = "{}{}".format(predict_sero, star),
           note=claim.replace('\n','')
         )
+        result['*'] = star_line
         with open("Seqsero_result.tsv","w") as new_file:
          #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
          #new_file.close()
author	jpayne
date	Fri, 07 Jun 2019 15:48:15 -0400
parents	b82e5f3c9187
children	77d3edd25de7