diff SeqSero2/SeqSero2_package.py @ 4:b82e5f3c9187

planemo upload
author jpayne
date Thu, 18 Apr 2019 15:18:14 -0400
parents fae43708974d
children 87c7eebc6797
line wrap: on
line diff
--- a/SeqSero2/SeqSero2_package.py	Tue Nov 13 16:18:17 2018 -0500
+++ b/SeqSero2/SeqSero2_package.py	Thu Apr 18 15:18:14 2019 -0400
@@ -263,11 +263,9 @@
 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
     #like test_output_06012017.txt
     #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
-    from Initial_Conditions import phase1
-    from Initial_Conditions import phase2
-    from Initial_Conditions import phaseO
-    from Initial_Conditions import sero
-    from Initial_Conditions import subs
+    from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
+    rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
+    rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to 
     seronames = []
     seronames_none_subspecies=[]
     for i in range(len(phase1)):
@@ -325,9 +323,20 @@
             if (new_fliC in fliC_combine
                     or fliC in fliC_combine) and (new_fljB in fljB_combine
                                                   or fljB in fljB_combine):
-                if subs[i] == subspecies:
-                  seronames.append(sero[i])
-                seronames_none_subspecies.append(sero[i])
+              ######start, remove_list,rename_dict, added on 11/11/2018
+              if sero[i] not in remove_list:
+                temp_sero=sero[i]
+                if temp_sero in rename_dict:
+                  temp_sero=rename_dict[temp_sero] #rename if in the rename list
+                if temp_sero not in seronames:#the new sero may already included, if yes, then not consider
+                  if subs[i] == subspecies:
+                    seronames.append(temp_sero)
+                  seronames_none_subspecies.append(temp_sero)
+                else:
+                  pass
+              else:
+                pass
+              ######end, added on 11/11/2018
     #analyze seronames
     subspecies_pointer=""
     if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
@@ -367,10 +376,10 @@
         predict_sero = "potential monophasic variant of Heidelberg"
     elif predict_form == "4:b:-":
         predict_sero = "potential monophasic variant of Paratyphi B"
-    elif predict_form == "8:e,h:1,2":
-        predict_sero = "Newport"
-        star = "*"
-        star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
+    #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
+        #predict_sero = "Newport"
+        #star = "*"
+        #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
     claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme.  New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies.  Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
     if "N/A" in predict_sero:
         claim = ""
@@ -418,7 +427,7 @@
       f = open(dirpath + '/special.pickle', 'rb')
       special = pickle.load(f)
       O22_O23=special['O22_O23']
-      if predict_sero.split(" or ")[0] in O22_O23[-1]:
+      if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze 
         O22_score=0
         O23_score=0
         for x in special_gene_list:
@@ -441,16 +450,16 @@
               star = "*"
               star_line = "Fail to detect O22 and O23 differences."
     #special test for O6,8 
-    merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"]
-    for x in merge_O68_list:
-      if x in predict_sero:
-        predict_sero=x
-        star=""
-        star_line=""
+    #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
+    #for x in merge_O68_list:
+    #  if x in predict_sero:
+    #    predict_sero=x
+    #    star=""
+    #    star_line=""
     #special test for Montevideo; most of them are monophasic
-    if "Montevideo" in predict_sero and "1,2,7" in predict_form:
-      star="*"
-      star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
+    #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
+      #star="*"
+      #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
     return predict_form, predict_sero, star, star_line, claim
 ### End of SeqSero Kmer part
 
@@ -648,7 +657,12 @@
   if potenial_new_gene!="":
     print("two differnt genes in same contig, fix it for O antigen")
     print(potenial_new_gene[:3])
-    final_O.append(potenial_new_gene)
+    pointer=0
+    for y in final_O:
+      if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
+        pointer=1
+    if pointer!=1:
+      final_O.append(potenial_new_gene)
   ### end of the two genes on same contig test
   final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
   if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
@@ -731,7 +745,7 @@
         contamination_O=""
       else:
         contamination_O="potential contamination from O antigen signals"
-  return O_choice,O_nodes_list,special_genes,final_O,contamination_O
+  return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
 ### End of SeqSero2 allele prediction and output
 
 def get_input_files(make_dir,input_file,data_type,dirpath):
@@ -770,8 +784,9 @@
   os.chdir(old_dir)
   return for_fq,rev_fq
 
-def predict_O_and_H_types(Final_list,Final_list_passed):
+def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
   #get O and H types from Final_list from blast parsing; allele mode
+  from Bio import SeqIO
   fliC_choice="-"
   fljB_choice="-"
   fliC_contig="NA"
@@ -781,18 +796,29 @@
   fliC_length=0 #can be changed to coverage in future
   fljB_length=0 #can be changed to coverage in future
   O_choice="-"#no need to decide O contig for now, should be only one
-  O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
+  O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
   O_choice=O_choice.split("-")[-1].strip()
   if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
     O_choice="-"
   H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
   log_file=open("SeqSero_log.txt","a")
-  print("O_contigs:")
+  extract_file=open("Extracted_antigen_alleles.fasta","a")
+  handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
+
+  #print("O_contigs:")
   log_file.write("O_contigs:\n")
+  extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
+  extract_file.write("#O_contigs:\n")
   for x in O_nodes_roles:
     if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
-      print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
-      log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+" "+"blast score: "+str(x[1])+"identity%:"+str(round(x[2]*100,2))+"% "+str(min(x[-1]))+" to "+str(max(x[-1]))+"\n")
+      #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
+      log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
+      title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"      
+      seqs=""
+      for z in handle_fasta:
+        if x[0].split("___")[-1]==z.description:
+          seqs=str(z.seq)
+      extract_file.write(title+seqs+"\n")
   if len(H_contig_roles)!=0:
     highest_H_coverage=max([float(x[1].split("_cov_")[-1]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
   else:
@@ -864,8 +890,9 @@
       pass
       #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
   #end of removing none-middle contigs
-  print("H_contigs:")
+  #print("H_contigs:")
   log_file.write("H_contigs:\n")
+  extract_file.write("#H_contigs:\n")
   H_contig_stat=[]
   H1_cont_stat={}
   H2_cont_stat={}
@@ -877,13 +904,25 @@
         if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
           for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
             if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
-              print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
-              log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"% "+str(min(y[-1]))+" to "+str(max(y[-1]))+"\n")
+              #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
+              log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
               H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
+              title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n"
+              seqs=""
+              for z in handle_fasta:
+                if x[1]==z.description:
+                  seqs=str(z.seq)
+              extract_file.write(title+seqs+"\n")
               break
         else:
-          print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
-          log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"% "+str(min(y[-1]))+" to "+str(max(y[-1]))+"\n")
+          #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
+          log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
+          title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
+          seqs=""
+          for z in handle_fasta:
+            if x[1]==z.description:
+              seqs=str(z.seq)
+          extract_file.write(title+seqs+"\n")
         if x[0]=="fliC":
           if y[0].split("_")[1] not in H1_cont_stat:
             H1_cont_stat[y[0].split("_")[1]]=y[2]
@@ -905,12 +944,41 @@
     contamination_H="potential contamination from H antigen signals"
   elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
     contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
-  print(contamination_O)
-  print(contamination_H)
+  #get additional antigens
+  """
+    if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
+      if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
+        O_choice="O-9,46"
+        #print "$$$Most possilble Otype:  O-9,46"
+      elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
+        O_choice="O-9,46,27"
+        #print "$$$Most possilble Otype:  O-9,46,27"
+    elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
+      if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
+        O_choice="O-3,10"
+        #print "$$$Most possilble Otype:  O-3,10 (contain O-3,10_not_in_1,3,19)"
+      else:
+        O_choice="O-1,3,19"
+        #print "$$$Most possilble Otype:  O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
+    ### end of special test for O9,46 and O3,10 family
+
+    if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
+    if len(Otypes_uniq)>2:
+      contamination_O="potential contamination from O antigen signals"
+  else:
+    if len(Otypes_uniq)>1:
+      if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
+        contamination_O=""
+      elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
+        contamination_O=""
+  """
+  additonal_antigents=[]
+  #print(contamination_O)
+  #print(contamination_H)
   log_file.write(contamination_O+"\n")
   log_file.write(contamination_H+"\n")
   log_file.close()
-  return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H
+  return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
 
 def get_input_K(input_file,lib_dict,data_type,k_size):
   #kmer mode; get input_Ks from dict and data_type
@@ -1095,7 +1163,7 @@
     subprocess.check_output("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5",shell=True, stderr=subprocess.STDOUT)###1/27/2015; 08272018, remove "-word_size 10"
   else:
     xmlfile="NA"
-  return xmlfile
+  return xmlfile,new_fasta
 
 def judge_subspecies(fnameA):
   #seqsero2 -a; judge subspecies on just forward raw reads fastq
@@ -1172,7 +1240,7 @@
         current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
         sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
         map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
-        xmlfile=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
+        xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
         if xmlfile=="NA":
           O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
         else:
@@ -1182,17 +1250,21 @@
             file.write("\t".join(str(y) for y in x)+"\n")
           file.close()
           Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
-          O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=predict_O_and_H_types(Final_list,Final_list_passed) #predict O, fliC and fljB
+          O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
         subspecies=judge_subspecies(fnameA) #predict subspecies
         ###output
         predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
         contamination_report=""
+        H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
         if contamination_O!="" and contamination_H=="":
-          contamination_report="#detected potential contamination of mixed serotypes from O antigen signals.\n"
+          contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
         elif contamination_O=="" and contamination_H!="":
-          contamination_report="#detected potential contamination of mixed serotypes or potential thrid H phase from H antigen signals.\n"
+          contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
         elif contamination_O!="" and contamination_H!="":
-          contamination_report="#detected potential contamination of mixed serotypes from both O and H antigen signals.\n"
+          contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
+        if contamination_report!="":
+          contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing
+        claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences
         if clean_mode:
           subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
           make_dir="none-output-directory due to '-c' flag"
@@ -1243,6 +1315,7 @@
         subspecies="II"
       predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
           highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
+      claim=""
       if clean_mode:
         subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
         make_dir="none-output-directory due to '-c' flag"