Mercurial > repos > jpayne > seqsero_v2
diff SeqSero2/SeqSero2_package.py @ 4:b82e5f3c9187
planemo upload
author | jpayne |
---|---|
date | Thu, 18 Apr 2019 15:18:14 -0400 |
parents | fae43708974d |
children | 87c7eebc6797 |
line wrap: on
line diff
--- a/SeqSero2/SeqSero2_package.py Tue Nov 13 16:18:17 2018 -0500 +++ b/SeqSero2/SeqSero2_package.py Thu Apr 18 15:18:14 2019 -0400 @@ -263,11 +263,9 @@ def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies): #like test_output_06012017.txt #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list) - from Initial_Conditions import phase1 - from Initial_Conditions import phase2 - from Initial_Conditions import phaseO - from Initial_Conditions import sero - from Initial_Conditions import subs + from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict + rename_dict_not_anymore=[rename_dict[x] for x in rename_dict] + rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to seronames = [] seronames_none_subspecies=[] for i in range(len(phase1)): @@ -325,9 +323,20 @@ if (new_fliC in fliC_combine or fliC in fliC_combine) and (new_fljB in fljB_combine or fljB in fljB_combine): - if subs[i] == subspecies: - seronames.append(sero[i]) - seronames_none_subspecies.append(sero[i]) + ######start, remove_list,rename_dict, added on 11/11/2018 + if sero[i] not in remove_list: + temp_sero=sero[i] + if temp_sero in rename_dict: + temp_sero=rename_dict[temp_sero] #rename if in the rename list + if temp_sero not in seronames:#the new sero may already included, if yes, then not consider + if subs[i] == subspecies: + seronames.append(temp_sero) + seronames_none_subspecies.append(temp_sero) + else: + pass + else: + pass + ######end, added on 11/11/2018 #analyze seronames subspecies_pointer="" if len(seronames) == 0 and len(seronames_none_subspecies)!=0: @@ -367,10 +376,10 @@ predict_sero = "potential monophasic variant of Heidelberg" elif predict_form == "4:b:-": predict_sero = "potential monophasic variant of Paratyphi B" - elif predict_form == "8:e,h:1,2": - predict_sero = "Newport" - star = "*" - star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare." + #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo + #predict_sero = "Newport" + #star = "*" + #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare." claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n" if "N/A" in predict_sero: claim = "" @@ -418,7 +427,7 @@ f = open(dirpath + '/special.pickle', 'rb') special = pickle.load(f) O22_O23=special['O22_O23'] - if predict_sero.split(" or ")[0] in O22_O23[-1]: + if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze O22_score=0 O23_score=0 for x in special_gene_list: @@ -441,16 +450,16 @@ star = "*" star_line = "Fail to detect O22 and O23 differences." #special test for O6,8 - merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] - for x in merge_O68_list: - if x in predict_sero: - predict_sero=x - star="" - star_line="" + #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list + #for x in merge_O68_list: + # if x in predict_sero: + # predict_sero=x + # star="" + # star_line="" #special test for Montevideo; most of them are monophasic - if "Montevideo" in predict_sero and "1,2,7" in predict_form: - star="*" - star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination." + #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list + #star="*" + #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination." return predict_form, predict_sero, star, star_line, claim ### End of SeqSero Kmer part @@ -648,7 +657,12 @@ if potenial_new_gene!="": print("two differnt genes in same contig, fix it for O antigen") print(potenial_new_gene[:3]) - final_O.append(potenial_new_gene) + pointer=0 + for y in final_O: + if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]: + pointer=1 + if pointer!=1: + final_O.append(potenial_new_gene) ### end of the two genes on same contig test final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]): @@ -731,7 +745,7 @@ contamination_O="" else: contamination_O="potential contamination from O antigen signals" - return O_choice,O_nodes_list,special_genes,final_O,contamination_O + return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq ### End of SeqSero2 allele prediction and output def get_input_files(make_dir,input_file,data_type,dirpath): @@ -770,8 +784,9 @@ os.chdir(old_dir) return for_fq,rev_fq -def predict_O_and_H_types(Final_list,Final_list_passed): +def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta): #get O and H types from Final_list from blast parsing; allele mode + from Bio import SeqIO fliC_choice="-" fljB_choice="-" fliC_contig="NA" @@ -781,18 +796,29 @@ fliC_length=0 #can be changed to coverage in future fljB_length=0 #can be changed to coverage in future O_choice="-"#no need to decide O contig for now, should be only one - O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification + O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification O_choice=O_choice.split("-")[-1].strip() if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="": O_choice="-" H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB log_file=open("SeqSero_log.txt","a") - print("O_contigs:") + extract_file=open("Extracted_antigen_alleles.fasta","a") + handle_fasta=list(SeqIO.parse(new_fasta,"fasta")) + + #print("O_contigs:") log_file.write("O_contigs:\n") + extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n") + extract_file.write("#O_contigs:\n") for x in O_nodes_roles: if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker - print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1]))) - log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+" "+"blast score: "+str(x[1])+"identity%:"+str(round(x[2]*100,2))+"% "+str(min(x[-1]))+" to "+str(max(x[-1]))+"\n") + #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1]))) + log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n") + title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n" + seqs="" + for z in handle_fasta: + if x[0].split("___")[-1]==z.description: + seqs=str(z.seq) + extract_file.write(title+seqs+"\n") if len(H_contig_roles)!=0: highest_H_coverage=max([float(x[1].split("_cov_")[-1]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output else: @@ -864,8 +890,9 @@ pass #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5) #end of removing none-middle contigs - print("H_contigs:") + #print("H_contigs:") log_file.write("H_contigs:\n") + extract_file.write("#H_contigs:\n") H_contig_stat=[] H1_cont_stat={} H2_cont_stat={} @@ -877,13 +904,25 @@ if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB - print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1]))) - log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"% "+str(min(y[-1]))+" to "+str(max(y[-1]))+"\n") + #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1]))) + log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n") H_contig_roles[i]="can't decide fliC or fljB, may be third phase" + title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n" + seqs="" + for z in handle_fasta: + if x[1]==z.description: + seqs=str(z.seq) + extract_file.write(title+seqs+"\n") break else: - print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1]))) - log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"% "+str(min(y[-1]))+" to "+str(max(y[-1]))+"\n") + #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1]))) + log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n") + title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n" + seqs="" + for z in handle_fasta: + if x[1]==z.description: + seqs=str(z.seq) + extract_file.write(title+seqs+"\n") if x[0]=="fliC": if y[0].split("_")[1] not in H1_cont_stat: H1_cont_stat[y[0].split("_")[1]]=y[2] @@ -905,12 +944,41 @@ contamination_H="potential contamination from H antigen signals" elif len(H2_cont_stat_list)==1 and fljB_contig=="NA": contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected" - print(contamination_O) - print(contamination_H) + #get additional antigens + """ + if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1 + if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1 + O_choice="O-9,46" + #print "$$$Most possilble Otype: O-9,46" + elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1 + O_choice="O-9,46,27" + #print "$$$Most possilble Otype: O-9,46,27" + elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1 + if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1 + O_choice="O-3,10" + #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)" + else: + O_choice="O-1,3,19" + #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)" + ### end of special test for O9,46 and O3,10 family + + if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19": + if len(Otypes_uniq)>2: + contamination_O="potential contamination from O antigen signals" + else: + if len(Otypes_uniq)>1: + if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund + contamination_O="" + elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund + contamination_O="" + """ + additonal_antigents=[] + #print(contamination_O) + #print(contamination_H) log_file.write(contamination_O+"\n") log_file.write(contamination_H+"\n") log_file.close() - return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H + return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list def get_input_K(input_file,lib_dict,data_type,k_size): #kmer mode; get input_Ks from dict and data_type @@ -1095,7 +1163,7 @@ subprocess.check_output("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5",shell=True, stderr=subprocess.STDOUT)###1/27/2015; 08272018, remove "-word_size 10" else: xmlfile="NA" - return xmlfile + return xmlfile,new_fasta def judge_subspecies(fnameA): #seqsero2 -a; judge subspecies on just forward raw reads fastq @@ -1172,7 +1240,7 @@ current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()) sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort - xmlfile=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast + xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast if xmlfile=="NA": O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","") else: @@ -1182,17 +1250,21 @@ file.write("\t".join(str(y) for y in x)+"\n") file.close() Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)] - O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=predict_O_and_H_types(Final_list,Final_list_passed) #predict O, fliC and fljB + O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB subspecies=judge_subspecies(fnameA) #predict subspecies ###output predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies) contamination_report="" + H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0] if contamination_O!="" and contamination_H=="": - contamination_report="#detected potential contamination of mixed serotypes from O antigen signals.\n" + contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"." elif contamination_O=="" and contamination_H!="": - contamination_report="#detected potential contamination of mixed serotypes or potential thrid H phase from H antigen signals.\n" + contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"." elif contamination_O!="" and contamination_H!="": - contamination_report="#detected potential contamination of mixed serotypes from both O and H antigen signals.\n" + contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"." + if contamination_report!="": + contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing + claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences if clean_mode: subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT) make_dir="none-output-directory due to '-c' flag" @@ -1243,6 +1315,7 @@ subspecies="II" predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes( highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies) + claim="" if clean_mode: subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT) make_dir="none-output-directory due to '-c' flag"