annotate SeqSero2/SeqSero2_package.py @ 5:e258caab671c

planemo upload
author jpayne
date Thu, 18 Apr 2019 15:22:24 -0400
parents b82e5f3c9187
children 87c7eebc6797
rev   line source
jpayne@1 1 #!/usr/bin/env python3
jpayne@1 2
jpayne@1 3 import sys
jpayne@1 4 import time
jpayne@1 5 import random
jpayne@1 6 import os
jpayne@1 7 import subprocess
jpayne@1 8 import gzip
jpayne@1 9 import io
jpayne@1 10 import pickle
jpayne@1 11 import argparse
jpayne@1 12 import itertools
jpayne@1 13 from distutils.version import LooseVersion
jpayne@1 14
jpayne@1 15 from collections import OrderedDict
jpayne@1 16 from csv import DictWriter
jpayne@1 17
jpayne@1 18 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
jpayne@1 19
jpayne@1 20 ### SeqSero Kmer
jpayne@1 21 def parse_args():
jpayne@1 22 "Parse the input arguments, use '-h' for help."
jpayne@1 23 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopers: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email: seqsero@gmail.com')#add "-m <data_type>" in future
jpayne@1 24 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data")
jpayne@1 25 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1'(paired-end reads, interleaved),'2'(paired-end reads, separated),'3'(single-end reads),'4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)")
jpayne@1 26 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode for mapping, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem")
jpayne@1 27 parser.add_argument("-p",default="1",help="<int>: number of threads for mapping; if p>4, only 4 threads will be used for assembly, default=1")
jpayne@1 28 parser.add_argument("-m",choices=['k','a'],default="k",help="<string>: 'k'(kmer mode), 'a'(allele mode), default=k")
jpayne@1 29 parser.add_argument("-d",help="<string>: output directory name; if not set, the output directory will be 'SeqSero_result_'+time stamp+a random number")
jpayne@1 30 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' is set, SeqSero2 will run in clean mode and only output the serotype prediction; the directory containing log files will be deleted")
jpayne@1 31 return parser.parse_args()
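
# --- Illustrative example (added for clarity; not part of the original SeqSero2 code) ---
# A hypothetical invocation matching the usage string above: kmer mode on separated
# paired-end reads with four mapping threads. The file names are made up, and
# sys.argv is overwritten only for this sketch.
def _example_parse_args():
    import sys
    sys.argv = ["SeqSero2_package.py", "-t", "2", "-i", "R1.fastq", "R2.fastq", "-m", "k", "-p", "4"]
    args = parse_args()
    assert args.t == "2" and args.m == "k" and args.i == ["R1.fastq", "R2.fastq"]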
jpayne@1 32
jpayne@1 33 def reverse_complement(sequence):
jpayne@1 34 complement = {
jpayne@1 35 'A': 'T',
jpayne@1 36 'C': 'G',
jpayne@1 37 'G': 'C',
jpayne@1 38 'T': 'A',
jpayne@1 39 'N': 'N',
jpayne@1 40 'M': 'K',
jpayne@1 41 'R': 'Y',
jpayne@1 42 'W': 'W',
jpayne@1 43 'S': 'S',
jpayne@1 44 'Y': 'R',
jpayne@1 45 'K': 'M',
jpayne@1 46 'V': 'B',
jpayne@1 47 'H': 'D',
jpayne@1 48 'D': 'H',
jpayne@1 49 'B': 'V'
jpayne@1 50 }
jpayne@1 51 return "".join(complement[base] for base in reversed(sequence))
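
# --- Illustrative example (added for clarity; not part of the original SeqSero2 code) ---
# A minimal check of reverse_complement, including IUPAC ambiguity codes from the
# table above: the sequence is reversed and each base is mapped to its complement.
def _example_reverse_complement():
    assert reverse_complement("ATGC") == "GCAT"
    # ambiguity codes are complemented too: R<->Y, M<->K, N stays N
    assert reverse_complement("RMN") == "NKY"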
jpayne@1 52
jpayne@1 53
jpayne@1 54 def createKmerDict_reads(list_of_strings, kmer):
jpayne@1 55 kmer_table = {}
jpayne@1 56 for string in list_of_strings:
jpayne@1 57 sequence = string.strip('\n')
jpayne@1 58 for i in range(len(sequence) - kmer + 1):
jpayne@1 59 new_mer = sequence[i:i + kmer].upper()
jpayne@1 60 new_mer_rc = reverse_complement(new_mer)
jpayne@1 61 if new_mer in kmer_table:
jpayne@1 62 kmer_table[new_mer.upper()] += 1
jpayne@1 63 else:
jpayne@1 64 kmer_table[new_mer.upper()] = 1
jpayne@1 65 if new_mer_rc in kmer_table:
jpayne@1 66 kmer_table[new_mer_rc.upper()] += 1
jpayne@1 67 else:
jpayne@1 68 kmer_table[new_mer_rc.upper()] = 1
jpayne@1 69 return kmer_table
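
# --- Illustrative example (added for clarity; not part of the original SeqSero2 code) ---
# For "ACGTA" with k=3 the forward k-mers are ACG, CGT and GTA; the reverse
# complement of every k-mer is counted as well, so the ACG/CGT reverse-complement
# pair ends up with a count of 2 each.
def _example_createKmerDict_reads():
    table = createKmerDict_reads(["ACGTA"], 3)
    assert table == {"ACG": 2, "CGT": 2, "GTA": 1, "TAC": 1}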
jpayne@1 70
jpayne@1 71
jpayne@1 72 def multifasta_dict(multifasta):
jpayne@1 73 multifasta_list = [
jpayne@1 74 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0
jpayne@1 75 ]
jpayne@1 76 headers = [i for i in multifasta_list if i[0] == '>']
jpayne@1 77 multifasta_dict = {}
jpayne@1 78 for h in headers:
jpayne@1 79 start = multifasta_list.index(h)
jpayne@1 80 for element in multifasta_list[start + 1:]:
jpayne@1 81 if element[0] == '>':
jpayne@1 82 break
jpayne@1 83 else:
jpayne@1 84 if h[1:] in multifasta_dict:
jpayne@1 85 multifasta_dict[h[1:]] += element
jpayne@1 86 else:
jpayne@1 87 multifasta_dict[h[1:]] = element
jpayne@1 88 return multifasta_dict
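
# --- Illustrative example (added for clarity; not part of the original SeqSero2 code) ---
# multifasta_dict maps each FASTA header (without the '>') to its concatenated
# sequence; the temporary file and record names here are made up.
def _example_multifasta_dict():
    import os, tempfile
    with tempfile.NamedTemporaryFile("w", suffix=".fasta", delete=False) as fh:
        fh.write(">seq1\nACGT\nACGT\n>seq2\nTTTT\n")
        path = fh.name
    try:
        assert multifasta_dict(path) == {"seq1": "ACGTACGT", "seq2": "TTTT"}
    finally:
        os.remove(path)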
jpayne@1 89
jpayne@1 90
jpayne@1 91 def multifasta_single_string(multifasta):
jpayne@1 92 multifasta_list = [
jpayne@1 93 line.strip() for line in open(multifasta, 'r')
jpayne@1 94 if (len(line.strip()) > 0) and (line.strip()[0] != '>')
jpayne@1 95 ]
jpayne@1 96 return ''.join(multifasta_list)
jpayne@1 97
jpayne@1 98
jpayne@1 99 def chunk_a_long_sequence(long_sequence, chunk_size=60):
jpayne@1 100 chunk_list = []
jpayne@1 101 steps = len(long_sequence) // chunk_size #how many full chunks
jpayne@1 102 for i in range(steps):
jpayne@1 103 chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size])
jpayne@1 104 chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)])
jpayne@1 105 return chunk_list
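
# --- Illustrative example (added for clarity; not part of the original SeqSero2 code) ---
# A 130-base sequence chunked with the default size gives two 60-base chunks plus
# a 10-base remainder (an empty trailing chunk is appended when the length is an
# exact multiple of chunk_size).
def _example_chunk_a_long_sequence():
    chunks = chunk_a_long_sequence("A" * 130)
    assert [len(c) for c in chunks] == [60, 60, 10]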
jpayne@1 106
jpayne@1 107
jpayne@1 108 def target_multifasta_kmerizer(multifasta, k, kmerDict):
jpayne@1 109 forward_length = 300 #if the target is found, extend 300 bases forward
jpayne@1 110 reverse_length = 2200 #if the target is found, extend 2200 bases backward
jpayne@1 111 chunk_size = 60 #the single long sequence is first chunked into smaller sequences; this controls the size of those chunks
jpayne@1 112 target_mers = []
jpayne@1 113 long_single_string = multifasta_single_string(multifasta)
jpayne@1 114 multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
jpayne@1 115 unit_length = len(multifasta_list[0])
jpayne@1 116 forward_lines = int(forward_length / unit_length) + 1
jpayne@1 117 reverse_lines = int(reverse_length / unit_length) + 1
jpayne@1 118 start_num = 0
jpayne@1 119 end_num = 0
jpayne@1 120 for i in range(len(multifasta_list)):
jpayne@1 121 if i not in range(start_num, end_num): #avoid computational repetition
jpayne@1 122 line = multifasta_list[i]
jpayne@1 123 start = int((len(line) - k) // 2)
jpayne@1 124 s1 = line[start:k + start]
jpayne@1 125 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
jpayne@1 126 if i - forward_lines >= 0:
jpayne@1 127 start_num = i - forward_lines
jpayne@1 128 else:
jpayne@1 129 start_num = 0
jpayne@1 130 if i + reverse_lines <= len(multifasta_list) - 1:
jpayne@1 131 end_num = i + reverse_lines
jpayne@1 132 else:
jpayne@1 133 end_num = len(multifasta_list) - 1
jpayne@1 134 target_list = [
jpayne@1 135 x.strip() for x in multifasta_list[start_num:end_num]
jpayne@1 136 ]
jpayne@1 137 target_line = "".join(target_list)
jpayne@1 138 target_mers += [
jpayne@1 139 k1 for k1 in createKmerDict_reads([str(target_line)], k)
jpayne@1 140 ] ##changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
jpayne@1 141 else:
jpayne@1 142 pass
jpayne@1 143 return set(target_mers)
jpayne@1 144
jpayne@1 145
jpayne@1 146 def target_read_kmerizer(file, k, kmerDict):
jpayne@1 147 i = 1
jpayne@1 148 n_reads = 0
jpayne@1 149 total_coverage = 0
jpayne@1 150 target_mers = []
jpayne@1 151 if file.endswith(".gz"):
jpayne@1 152 file_content = io.BufferedReader(gzip.open(file))
jpayne@1 153 else:
jpayne@1 154 file_content = open(file, "r").readlines()
jpayne@1 155 for line in file_content:
jpayne@1 156 start = int((len(line) - k) // 2)
jpayne@1 157 if i % 4 == 2:
jpayne@1 158 if file.endswith(".gz"):
jpayne@1 159 s1 = line[start:k + start].decode()
jpayne@1 160 line = line.decode()
jpayne@1 161 else:
jpayne@1 162 s1 = line[start:k + start]
jpayne@1 163 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
jpayne@1 164 n_reads += 1
jpayne@1 165 total_coverage += len(line)
jpayne@1 166 target_mers += [
jpayne@1 167 k1 for k1 in createKmerDict_reads([str(line)], k)
jpayne@1 168 ] #changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
jpayne@1 169 i += 1
jpayne@1 170 if total_coverage >= 4000000:
jpayne@1 171 break
jpayne@1 172 return set(target_mers)
jpayne@1 173
jpayne@1 174
jpayne@1 175 def minion_fasta_kmerizer(file, k, kmerDict):
jpayne@1 176 i = 1
jpayne@1 177 n_reads = 0
jpayne@1 178 total_coverage = 0
jpayne@1 179 target_mers = {}
jpayne@1 180 for line in open(file):
jpayne@1 181 if i % 2 == 0:
jpayne@1 182 for kmer, rc_kmer in kmers(line.strip().upper(), k):
jpayne@1 183 if (kmer in kmerDict) or (rc_kmer in kmerDict):
jpayne@1 184 if kmer in target_mers:
jpayne@1 185 target_mers[kmer] += 1
jpayne@1 186 else:
jpayne@1 187 target_mers[kmer] = 1
jpayne@1 188 if rc_kmer in target_mers:
jpayne@1 189 target_mers[rc_kmer] += 1
jpayne@1 190 else:
jpayne@1 191 target_mers[rc_kmer] = 1
jpayne@1 192 i += 1
jpayne@1 193 return set([h for h in target_mers])
jpayne@1 194
jpayne@1 195
jpayne@1 196 def minion_fastq_kmerizer(file, k, kmerDict):
jpayne@1 197 i = 1
jpayne@1 198 n_reads = 0
jpayne@1 199 total_coverage = 0
jpayne@1 200 target_mers = {}
jpayne@1 201 for line in open(file):
jpayne@1 202 if i % 4 == 2:
jpayne@1 203 for kmer, rc_kmer in kmers(line.strip().upper(), k):
jpayne@1 204 if (kmer in kmerDict) or (rc_kmer in kmerDict):
jpayne@1 205 if kmer in target_mers:
jpayne@1 206 target_mers[kmer] += 1
jpayne@1 207 else:
jpayne@1 208 target_mers[kmer] = 1
jpayne@1 209 if rc_kmer in target_mers:
jpayne@1 210 target_mers[rc_kmer] += 1
jpayne@1 211 else:
jpayne@1 212 target_mers[rc_kmer] = 1
jpayne@1 213 i += 1
jpayne@1 214 return set([h for h in target_mers])
jpayne@1 215
jpayne@1 216
jpayne@1 217 def multifasta_single_string2(multifasta):
jpayne@1 218 single_string = ''
jpayne@1 219 with open(multifasta, 'r') as f:
jpayne@1 220 for line in f:
jpayne@1 221 if line.strip()[0] == '>':
jpayne@1 222 pass
jpayne@1 223 else:
jpayne@1 224 single_string += line.strip()
jpayne@1 225 return single_string
jpayne@1 226
jpayne@1 227
jpayne@1 228 def kmers(seq, k):
jpayne@1 229 rev_comp = reverse_complement(seq)
jpayne@1 230 for start in range(1, len(seq) - k + 1):
jpayne@1 231 yield seq[start:start + k], rev_comp[-(start + k):-start]
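
# --- Illustrative example (added for clarity; not part of the original SeqSero2 code) ---
# kmers() yields each k-mer together with the matching slice of the reverse
# complement; note that iteration starts at position 1, so the k-mer at position 0
# is not emitted.
def _example_kmers():
    assert list(kmers("ACGTA", 3)) == [("CGT", "ACG"), ("GTA", "TAC")]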
jpayne@1 232
jpayne@1 233
jpayne@1 234 def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set
jpayne@1 235 multi_seq_dict = multifasta_dict(multifasta)
jpayne@1 236 lib_dict = {}
jpayne@1 237 for h in multi_seq_dict:
jpayne@1 238 lib_dict[h] = set(
jpayne@1 239 [k for k in createKmerDict_reads([multi_seq_dict[h]], k_size)])
jpayne@1 240 return lib_dict
jpayne@1 241
jpayne@1 242
jpayne@1 243 def Combine(b, c):
jpayne@1 244 fliC_combinations = []
jpayne@1 245 fliC_combinations.append(",".join(c))
jpayne@1 246 temp_combinations = []
jpayne@1 247 for i in range(len(b)):
jpayne@1 248 for x in itertools.combinations(b, i + 1):
jpayne@1 249 temp_combinations.append(",".join(x))
jpayne@1 250 for x in temp_combinations:
jpayne@1 251 temp = []
jpayne@1 252 for y in c:
jpayne@1 253 temp.append(y)
jpayne@1 254 temp.append(x)
jpayne@1 255 temp = ",".join(temp)
jpayne@1 256 temp = temp.split(",")
jpayne@1 257 temp.sort()
jpayne@1 258 temp = ",".join(temp)
jpayne@1 259 fliC_combinations.append(temp)
jpayne@1 260 return fliC_combinations
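
# --- Illustrative example (added for clarity; not part of the original SeqSero2 code) ---
# Combine() expands an antigenic formula with optional (bracketed) factors into
# every possible combination: for f,[g],t the bracketed factor goes in b and the
# fixed factors in c.
def _example_Combine():
    assert Combine(["g"], ["f", "t"]) == ["f,t", "f,g,t"]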
jpayne@1 261
jpayne@1 262
jpayne@1 263 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
jpayne@1 264 #like test_output_06012017.txt
jpayne@1 265 #can add more variables like sdf-type, sub-species-type in the future (we can fold them into a special-gene-list)
jpayne@4 266 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
jpayne@4 267 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
jpayne@4 268 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to
jpayne@1 269 seronames = []
jpayne@1 270 seronames_none_subspecies=[]
jpayne@1 271 for i in range(len(phase1)):
jpayne@1 272 fliC_combine = []
jpayne@1 273 fljB_combine = []
jpayne@1 274 if phaseO[i] == Otype: # no VII in KW, but it's there
jpayne@1 275 ### for fliC, detect every possible combinations to avoid the effect of "["
jpayne@1 276 if phase1[i].count("[") == 0:
jpayne@1 277 fliC_combine.append(phase1[i])
jpayne@1 278 elif phase1[i].count("[") >= 1:
jpayne@1 279 c = []
jpayne@1 280 b = []
jpayne@1 281 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
jpayne@1 282 "[") == 1:
jpayne@1 283 content = phase1[i].replace("[", "").replace("]", "")
jpayne@1 284 fliC_combine.append(content)
jpayne@1 285 fliC_combine.append("-")
jpayne@1 286 else:
jpayne@1 287 for x in phase1[i].split(","):
jpayne@1 288 if "[" in x:
jpayne@1 289 b.append(x.replace("[", "").replace("]", ""))
jpayne@1 290 else:
jpayne@1 291 c.append(x)
jpayne@1 292 fliC_combine = Combine(
jpayne@1 293 b, c
jpayne@1 294 ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
jpayne@1 295 ### end of fliC "[" detect
jpayne@1 296 ### for fljB, detect every possible combinations to avoid the effect of "["
jpayne@1 297 if phase2[i].count("[") == 0:
jpayne@1 298 fljB_combine.append(phase2[i])
jpayne@1 299 elif phase2[i].count("[") >= 1:
jpayne@1 300 d = []
jpayne@1 301 e = []
jpayne@1 302 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
jpayne@1 303 "[") == 1:
jpayne@1 304 content = phase2[i].replace("[", "").replace("]", "")
jpayne@1 305 fljB_combine.append(content)
jpayne@1 306 fljB_combine.append("-")
jpayne@1 307 else:
jpayne@1 308 for x in phase2[i].split(","):
jpayne@1 309 if "[" in x:
jpayne@1 310 d.append(x.replace("[", "").replace("]", ""))
jpayne@1 311 else:
jpayne@1 312 e.append(x)
jpayne@1 313 fljB_combine = Combine(d, e)
jpayne@1 314 ### end of fljB "[" detect
jpayne@1 315 new_fliC = fliC.split(
jpayne@1 316 ","
jpayne@1 317 ) #because some antigens like r,[i] do not follow alphabetical order, sort before comparing to avoid misses
jpayne@1 318 new_fliC.sort()
jpayne@1 319 new_fliC = ",".join(new_fliC)
jpayne@1 320 new_fljB = fljB.split(",")
jpayne@1 321 new_fljB.sort()
jpayne@1 322 new_fljB = ",".join(new_fljB)
jpayne@1 323 if (new_fliC in fliC_combine
jpayne@1 324 or fliC in fliC_combine) and (new_fljB in fljB_combine
jpayne@1 325 or fljB in fljB_combine):
jpayne@4 326 ######start, remove_list,rename_dict, added on 11/11/2018
jpayne@4 327 if sero[i] not in remove_list:
jpayne@4 328 temp_sero=sero[i]
jpayne@4 329 if temp_sero in rename_dict:
jpayne@4 330 temp_sero=rename_dict[temp_sero] #rename if in the rename list
jpayne@4 331 if temp_sero not in seronames:#the new serotype may already be included; if so, do not consider it again
jpayne@4 332 if subs[i] == subspecies:
jpayne@4 333 seronames.append(temp_sero)
jpayne@4 334 seronames_none_subspecies.append(temp_sero)
jpayne@4 335 else:
jpayne@4 336 pass
jpayne@4 337 else:
jpayne@4 338 pass
jpayne@4 339 ######end, added on 11/11/2018
jpayne@1 340 #analyze seronames
jpayne@1 341 subspecies_pointer=""
jpayne@1 342 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
jpayne@1 343 seronames=seronames_none_subspecies
jpayne@1 344 subspecies_pointer="1"
jpayne@1 345 if len(seronames) == 0:
jpayne@1 346 seronames = [
jpayne@1 347 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
jpayne@1 348 ]
jpayne@1 349 star = ""
jpayne@1 350 star_line = ""
jpayne@1 351 if len(seronames) > 1: #there are two possible predictions for serotypes
jpayne@1 352 star = "*"
jpayne@1 353 star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
jpayne@1 354 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
jpayne@1 355 star="*"
jpayne@1 356 star_line="The formula with this subspecies prediction can't get a serotype in the KW manual, and the serotyping prediction was made without considering it."+star_line
jpayne@1 357 if Otype=="":
jpayne@1 358 Otype="-"
jpayne@1 359 predict_form = Otype + ":" + fliC + ":" + fljB
jpayne@1 360 predict_sero = (" or ").join(seronames)
jpayne@1 361 ###special test for Enteritidis
jpayne@1 362 if predict_form == "9:g,m:-":
jpayne@1 363 sdf = "-"
jpayne@1 364 for x in special_gene_list:
jpayne@1 365 if x.startswith("sdf"):
jpayne@1 366 sdf = "+"
jpayne@1 367 predict_form = predict_form + " Sdf prediction:" + sdf
jpayne@1 368 if sdf == "-":
jpayne@1 369 star = "*"
jpayne@1 370 star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
jpayne@1 371 predict_sero = "Gallinarum/Enteritidis sdf -"
jpayne@1 372 ###end of special test for Enteritidis
jpayne@1 373 elif predict_form == "4:i:-":
jpayne@1 374 predict_sero = "potential monophasic variant of Typhimurium"
jpayne@1 375 elif predict_form == "4:r:-":
jpayne@1 376 predict_sero = "potential monophasic variant of Heidelberg"
jpayne@1 377 elif predict_form == "4:b:-":
jpayne@1 378 predict_sero = "potential monophasic variant of Paratyphi B"
jpayne@4 379 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
jpayne@4 380 #predict_sero = "Newport"
jpayne@4 381 #star = "*"
jpayne@4 382 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
jpayne@1 383 claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
jpayne@1 384 if "N/A" in predict_sero:
jpayne@1 385 claim = ""
jpayne@1 386 #special test for Typhimurium
jpayne@1 387 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
jpayne@1 388 normal = 0
jpayne@1 389 mutation = 0
jpayne@1 390 for x in special_gene_list:
jpayne@1 391 if "oafA-O-4_full" in x:
jpayne@1 392 normal = float(special_gene_list[x])
jpayne@1 393 elif "oafA-O-4_5-" in x:
jpayne@1 394 mutation = float(special_gene_list[x])
jpayne@1 395 if normal > mutation:
jpayne@1 396 pass
jpayne@1 397 elif normal < mutation:
jpayne@1 398 predict_sero = predict_sero.strip() + "(O5-)"
jpayne@1 399 star = "*"
jpayne@1 400 star_line = "Detected the deletion of O5-."
jpayne@1 401 else:
jpayne@1 402 pass
jpayne@1 403 #special test for Paratyphi B
jpayne@1 404 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
jpayne@1 405 normal = 0
jpayne@1 406 mutation = 0
jpayne@1 407 for x in special_gene_list:
jpayne@1 408 if "gntR-family-regulatory-protein_dt-positive" in x:
jpayne@1 409 normal = float(special_gene_list[x])
jpayne@1 410 elif "gntR-family-regulatory-protein_dt-negative" in x:
jpayne@1 411 mutation = float(special_gene_list[x])
jpayne@1 412 #print(normal,mutation)
jpayne@1 413 if normal > mutation:
jpayne@1 414 predict_sero = predict_sero.strip() + "(dt+)"
jpayne@1 415 star = "*"
jpayne@1 416 star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
jpayne@1 417 elif normal < mutation:
jpayne@1 418 predict_sero = predict_sero.strip() + "(dt-)"
jpayne@1 419 star = "*"
jpayne@1 420 star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
jpayne@1 421 else:
jpayne@1 422 star = "*"
jpayne@1 423 star_line = "Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
jpayne@1 424 #special test for O13,22 and O13,23
jpayne@1 425 if Otype=="13":
jpayne@1 426 #ex_dir = os.path.dirname(os.path.realpath(__file__))
jpayne@1 427 f = open(dirpath + '/special.pickle', 'rb')
jpayne@1 428 special = pickle.load(f)
jpayne@1 429 O22_O23=special['O22_O23']
jpayne@4 430 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
jpayne@1 431 O22_score=0
jpayne@1 432 O23_score=0
jpayne@1 433 for x in special_gene_list:
jpayne@1 434 if "O:22" in x:
jpayne@1 435 O22_score = O22_score+float(special_gene_list[x])
jpayne@1 436 elif "O:23" in x:
jpayne@1 437 O23_score = O23_score+float(special_gene_list[x])
jpayne@1 438 #print(O22_score,O23_score)
jpayne@1 439 for z in O22_O23[0]:
jpayne@1 440 if predict_sero.split(" or ")[0] in z:
jpayne@1 441 if O22_score > O23_score:
jpayne@1 442 star = "*"
jpayne@1 443 star_line = "Detected O22 specific genes to further differentiate '"+predict_sero+"'."
jpayne@1 444 predict_sero = z[0]
jpayne@1 445 elif O22_score < O23_score:
jpayne@1 446 star = "*"
jpayne@1 447 star_line = "Detected O23 specific genes to further differentiate '"+predict_sero+"'."
jpayne@1 448 predict_sero = z[1]
jpayne@1 449 else:
jpayne@1 450 star = "*"
jpayne@1 451 star_line = "Failed to detect O22 and O23 differences."
jpayne@1 452 #special test for O6,8
jpayne@4 453 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
jpayne@4 454 #for x in merge_O68_list:
jpayne@4 455 # if x in predict_sero:
jpayne@4 456 # predict_sero=x
jpayne@4 457 # star=""
jpayne@4 458 # star_line=""
jpayne@1 459 #special test for Montevideo; most of them are monophasic
jpayne@4 460 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
jpayne@4 461 #star="*"
jpayne@4 462 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
jpayne@1 463 return predict_form, predict_sero, star, star_line, claim
jpayne@1 464 ### End of SeqSero Kmer part
jpayne@1 465
jpayne@1 466 ### Begin of SeqSero2 allele prediction and output
jpayne@1 467 def xml_parse_score_comparision_seqsero(xmlfile):
jpayne@1 468 #used to do seqsero xml analysis
jpayne@1 469 from Bio.Blast import NCBIXML
jpayne@1 470 handle=open(xmlfile)
jpayne@1 471 handle=NCBIXML.parse(handle)
jpayne@1 472 handle=list(handle)
jpayne@1 473 List=[]
jpayne@1 474 List_score=[]
jpayne@1 475 List_ids=[]
jpayne@1 476 List_query_region=[]
jpayne@1 477 for i in range(len(handle)):
jpayne@1 478 if len(handle[i].alignments)>0:
jpayne@1 479 for j in range(len(handle[i].alignments)):
jpayne@1 480 score=0
jpayne@1 481 ids=0
jpayne@1 482 cover_region=set() #fixed a problem where repeated counting led to a percentage > 1
jpayne@1 483 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def)
jpayne@1 484 for z in range(len(handle[i].alignments[j].hsps)):
jpayne@1 485 hsp=handle[i].alignments[j].hsps[z]
jpayne@1 486 temp=set(range(hsp.query_start,hsp.query_end))
jpayne@1 487 if len(cover_region)==0:
jpayne@1 488 cover_region=cover_region|temp
jpayne@1 489 fraction=1
jpayne@1 490 else:
jpayne@1 491 fraction=1-len(cover_region&temp)/float(len(temp))
jpayne@1 492 cover_region=cover_region|temp
jpayne@1 493 if "last" in handle[i].query or "first" in handle[i].query:
jpayne@1 494 score+=hsp.bits*fraction
jpayne@1 495 ids+=float(hsp.identities)/handle[i].query_length*fraction
jpayne@1 496 else:
jpayne@1 497 score+=hsp.bits*fraction
jpayne@1 498 ids+=float(hsp.identities)/handle[i].query_length*fraction
jpayne@1 499 List_score.append(score)
jpayne@1 500 List_ids.append(ids)
jpayne@1 501 List_query_region.append(cover_region)
jpayne@1 502 temp=zip(List,List_score,List_ids,List_query_region)
jpayne@1 503 Final_list=sorted(temp, key=lambda d:d[1], reverse = True)
jpayne@1 504 return Final_list
jpayne@1 505
jpayne@1 506
jpayne@1 507 def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
jpayne@1 508 Old=L
jpayne@1 509 L.sort()
jpayne@1 510 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]]
jpayne@1 511 count=[]
jpayne@1 512 for j in range(len(L)):
jpayne@1 513 y=0
jpayne@1 514 for x in Old:
jpayne@1 515 if L[j]==x:
jpayne@1 516 y+=1
jpayne@1 517 count.append(y)
jpayne@1 518 if sort_on_fre!="none":
jpayne@1 519 d=list(zip(*sorted(zip(count, L)))) #materialize: zip objects are not subscriptable in Python 3
jpayne@1 520 L=list(d[1])
jpayne@1 521 count=list(d[0])
jpayne@1 522 return (L,count)
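
# --- Illustrative example (added for clarity; not part of the original SeqSero2 code) ---
# Uniq() returns the unique items (sorted) and their counts; note that it also
# sorts the input list in place.
def _example_Uniq():
    items, counts = Uniq(["b", "a", "b"])
    assert items == ["a", "b"] and counts == [1, 2]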
jpayne@1 523
jpayne@1 524 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
jpayne@1 525 #used to predict whether one contig is fliC or fljB based on head and tail scores; also returns the score difference. If the difference is very small the call is not reliable, so use the blast score of the whole contig to test
jpayne@1 526 #this is mainly used for
jpayne@1 527 a=nodes_vs_score_list
jpayne@1 528 fliC_score=0
jpayne@1 529 fljB_score=0
jpayne@1 530 for z in a:
jpayne@1 531 if "fliC" in z[0]:
jpayne@1 532 fliC_score+=z[1]
jpayne@1 533 elif "fljB" in z[0]:
jpayne@1 534 fljB_score+=z[1]
jpayne@1 535 if fliC_score>=fljB_score:
jpayne@1 536 role="fliC"
jpayne@1 537 else:
jpayne@1 538 role="fljB"
jpayne@1 539 return (role,abs(fliC_score-fljB_score))
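
# --- Illustrative example (added for clarity; not part of the original SeqSero2 code) ---
# The contig role is whichever of fliC/fljB has the larger summed blast score; the
# absolute difference is returned so the caller can judge how reliable the call is.
# The hit names and scores below are made up.
def _example_judge_fliC_or_fljB_from_head_tail():
    hits = [("fliC_first_NODE_1", 120.0), ("fljB_last_NODE_1", 80.0)]
    assert judge_fliC_or_fljB_from_head_tail_for_one_contig(hits) == ("fliC", 40.0)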
jpayne@1 540
jpayne@1 541 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
jpayne@1 542 #used to predict whether a contig is fliC or fljB when the difference in the head_and_tail score above is less than 10 (quite small)
jpayne@1 543 #also used when no head or tail got blasted score for the contig
jpayne@1 544 role=""
jpayne@1 545 for z in Final_list_passed:
jpayne@1 546 if node_name in z[0]:
jpayne@1 547 role=z[0].split("_")[0]
jpayne@1 548 break
jpayne@1 549 return role
jpayne@1 550
jpayne@1 551 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
jpayne@1 552 #nodes_list is the c created by c,d=Uniq(nodes) in below function
jpayne@1 553 first_target=""
jpayne@1 554 role_list=[]
jpayne@1 555 for x in nodes_list:
jpayne@1 556 a=[]
jpayne@1 557 role=""
jpayne@1 558 for y in tail_head_list:
jpayne@1 559 if x in y[0]:
jpayne@1 560 a.append(y)
jpayne@1 561 if len(a)==4:
jpayne@1 562 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1 563 if diff<20:
jpayne@1 564 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1 565 elif len(a)==3:
jpayne@1 566 ###however, if the one with highest score is the fewer one, compare their accumulation score
jpayne@1 567 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1 568 if diff<20:
jpayne@1 569 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1 570 ###end of above score comparison
jpayne@1 571 elif len(a)==2:
jpayne@1 572 #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37)
jpayne@1 573 temp=[]
jpayne@1 574 for z in a:
jpayne@1 575 temp.append(z[0].split("_")[0])
jpayne@1 576 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
jpayne@1 577 if len(m)==1:
jpayne@1 578 pass
jpayne@1 579 else:
jpayne@1 580 pass
jpayne@1 581 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1 582 if diff<20:
jpayne@1 583 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1 584 ###need to design an algorithm to guess the most likely situation for nodes_list; see the situations of test evaluation
jpayne@1 585 elif len(a)==1:
jpayne@1 586 #that one
jpayne@1 587 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1 588 if diff<20:
jpayne@1 589 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1 590 #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0"
jpayne@1 591 else:#a==0
jpayne@1 592 #use Final_list_passed best match
jpayne@1 593 for z in Final_list_passed:
jpayne@1 594 if x in z[0]:
jpayne@1 595 role=z[0].split("_")[0]
jpayne@1 596 break
jpayne@1 597 #print x,role,len(a)
jpayne@1 598 role_list.append((role,x))
jpayne@1 599 if len(role_list)==2:
jpayne@1 600 if role_list[0][0]==role_list[1][0]:#this is the most common error: two antigens were assigned to the same phase
jpayne@1 601 #just use score to do a final test
jpayne@1 602 role_list=[]
jpayne@1 603 for x in nodes_list:
jpayne@1 604 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1 605 role_list.append((role,x))
jpayne@1 606 return role_list
jpayne@1 607
jpayne@1 608 def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
jpayne@1 609 #used to decide which contig is FliC and which one is fljB
jpayne@1 610 contigs=[]
jpayne@1 611 nodes=[]
jpayne@1 612 for x in Final_list_passed:
jpayne@1 613 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]:
jpayne@1 614 nodes.append(x[0].split("___")[1].strip())
jpayne@1 615 c,d=Uniq(nodes)#c is node_list
jpayne@1 616 #print c
jpayne@1 617 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])]
jpayne@1 618 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list,Final_list_passed)
jpayne@1 619 return roles
jpayne@1 620
jpayne@1 621 def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
jpayne@1 622 #decide O based on Final_list
jpayne@1 623 O_choice="?"
jpayne@1 624 O_list=[]
jpayne@1 625 special_genes={}
jpayne@1 626 nodes=[]
jpayne@1 627 for x in Final_list_passed:
jpayne@1 628 if x[0].startswith("O-"):
jpayne@1 629 nodes.append(x[0].split("___")[1].strip())
jpayne@1 630 elif not x[0].startswith("fl"):
jpayne@1 631 special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
jpayne@1 632 #print "special_genes:",special_genes
jpayne@1 633 c,d=Uniq(nodes)
jpayne@1 634 #print "potential O antigen contig",c
jpayne@1 635 final_O=[]
jpayne@1 636 O_nodes_list=[]
jpayne@1 637 for x in c:#c is the list for contigs
jpayne@1 638 temp=0
jpayne@1 639 for y in Final_list_passed:
jpayne@1 640 if x in y[0] and y[0].startswith("O-"):
jpayne@1 641 final_O.append(y)
jpayne@1 642 break
jpayne@1 643 ### O contig has the problem of two genes on same contig, so do additional test
jpayne@1 644 potenial_new_gene=""
jpayne@1 645 for x in final_O:
jpayne@1 646 pointer=0 #for genes merged or not
jpayne@1 647 #not consider O-1,3,19_not_in_3,10, too short compared with others
jpayne@1 648 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])*x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 300*2 (for secureity can use 400*2) as flank region
jpayne@1 649 pointer=x[0].split("___")[1].strip()#store the contig name
jpayne@1 650 print(pointer)
jpayne@1 651 if pointer!=0:#it has potential merge event
jpayne@1 652 for y in Final_list:
jpayne@1 653 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):#that's a relatively strict filter for now; if passed, there is a merge event, so add one more gene to final_O
jpayne@1 654 potenial_new_gene=y
jpayne@1 655 #print(potenial_new_gene)
jpayne@1 656 break
jpayne@1 657 if potenial_new_gene!="":
jpayne@1 658 print("two differnt genes in same contig, fix it for O antigen")
jpayne@1 659 print(potenial_new_gene[:3])
jpayne@4 660 pointer=0
jpayne@4 661 for y in final_O:
jpayne@4 662 if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
jpayne@4 663 pointer=1
jpayne@4 664 if pointer!=1:
jpayne@4 665 final_O.append(potenial_new_gene)
jpayne@1 666 ### end of the two genes on same contig test
jpayne@1 667 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
jpayne@1 668 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
jpayne@1 669 #print "$$$No Otype, due to no hit"#may need to be changed
jpayne@1 670 O_choice="-"
jpayne@1 671 else:
jpayne@1 672 highest_O_coverage=max([float(x[0].split("_cov_")[-1]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
jpayne@1 673 O_list=[]
jpayne@1 674 O_list_less_contamination=[]
jpayne@1 675 for x in final_O:
jpayne@1 676 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
jpayne@1 677 O_list.append(x[0].split("__")[0])
jpayne@1 678 O_nodes_list.append(x[0].split("___")[1])
jpayne@1 679 if float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15:
jpayne@1 680 O_list_less_contamination.append(x[0].split("__")[0])
jpayne@1 681 ### special test for O9,46 and O3,10 family
jpayne@1 682 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
jpayne@1 683 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
jpayne@1 684 O_choice="O-9,46"
jpayne@1 685 #print "$$$Most possilble Otype: O-9,46"
jpayne@1 686 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
jpayne@1 687 O_choice="O-9,46,27"
jpayne@1 688 #print "$$$Most possilble Otype: O-9,46,27"
jpayne@1 689 else:
jpayne@1 690 O_choice="O-9"#next, detect O9 vs O2?
jpayne@1 691 O2=0
jpayne@1 692 O9=0
jpayne@1 693 for z in special_genes:
jpayne@1 694 if "tyr-O-9" in z:
jpayne@1 695 O9=special_genes[z]
jpayne@1 696 elif "tyr-O-2" in z:
jpayne@1 697 O2=special_genes[z]
jpayne@1 698 if O2>O9:
jpayne@1 699 O_choice="O-2"
jpayne@1 700 elif O2<O9:
jpayne@1 701 pass
jpayne@1 702 else:
jpayne@1 703 pass
jpayne@1 704 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
jpayne@1 705 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
jpayne@1 706 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
jpayne@1 707 O_choice="O-3,10"
jpayne@1 708 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
jpayne@1 709 else:
jpayne@1 710 O_choice="O-1,3,19"
jpayne@1 711 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
jpayne@1 712 ### end of special test for O9,46 and O3,10 family
jpayne@1 713 else:
jpayne@1 714 try:
jpayne@1 715 max_score=0
jpayne@1 716 for x in final_O:
jpayne@1 717 if x[2]>=max_score and float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
jpayne@1 718 max_score=x[2]#change from x[-1] to x[2],08172018
jpayne@1 719 O_choice=x[0].split("_")[0]
jpayne@1 720 if O_choice=="O-1,3,19":
jpayne@1 721 O_choice=final_O[1][0].split("_")[0]
jpayne@1 722 #print "$$$Most possilble Otype: ",O_choice
jpayne@1 723 except:
jpayne@1 724 pass
jpayne@1 725 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
jpayne@1 726 #print "O:",O_choice,O_nodes_list
jpayne@1 727 Otypes=[]
jpayne@1 728 for x in O_list:
jpayne@1 729 if x!="O-1,3,19_not_in_3,10":
jpayne@1 730 if "O-9,46_" not in x:
jpayne@1 731 Otypes.append(x.split("_")[0])
jpayne@1 732 else:
jpayne@1 733 Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254
jpayne@1 734 #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"]
jpayne@1 735 Otypes_uniq,Otypes_fre=Uniq(Otypes)
jpayne@1 736 contamination_O=""
jpayne@1 737 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
jpayne@1 738 if len(Otypes_uniq)>2:
jpayne@1 739 contamination_O="potential contamination from O antigen signals"
jpayne@1 740 else:
jpayne@1 741 if len(Otypes_uniq)>1:
jpayne@1 742 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
jpayne@1 743 contamination_O=""
jpayne@1 744 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
jpayne@1 745 contamination_O=""
jpayne@1 746 else:
jpayne@1 747 contamination_O="potential contamination from O antigen signals"
jpayne@4 748 return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
jpayne@1 749 ### End of SeqSero2 allele prediction and output
jpayne@1 750
jpayne@1 751 def get_input_files(make_dir,input_file,data_type,dirpath):
jpayne@1 752 #tell input files from datatype
jpayne@1 753 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
jpayne@1 754 for_fq=""
jpayne@1 755 rev_fq=""
jpayne@1 756 old_dir = os.getcwd()
jpayne@1 757 os.chdir(make_dir)
jpayne@1 758 if data_type=="1":
jpayne@1 759 input_file=input_file[0].split("/")[-1]
jpayne@1 760 if input_file.endswith(".sra"):
jpayne@1 761 subprocess.check_output("fastq-dump --split-files "+input_file,shell=True, stderr=subprocess.STDOUT)
jpayne@1 762 for_fq=input_file.replace(".sra","_1.fastq")
jpayne@1 763 rev_fq=input_file.replace(".sra","_2.fastq")
jpayne@1 764 else:
jpayne@1 765 core_id=input_file.split(".fastq")[0].split(".fq")[0]
jpayne@1 766 for_fq=core_id+"_1.fastq"
jpayne@1 767 rev_fq=core_id+"_2.fastq"
jpayne@1 768 if input_file.endswith(".gz"):
jpayne@1 769 subprocess.check_output("gzip -dc "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True, stderr=subprocess.STDOUT)
jpayne@1 770 else:
jpayne@1 771 subprocess.check_output("cat "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True, stderr=subprocess.STDOUT)
jpayne@1 772 elif data_type=="2":
jpayne@1 773 for_fq=input_file[0].split("/")[-1]
jpayne@1 774 rev_fq=input_file[1].split("/")[-1]
jpayne@1 775 elif data_type=="3":
jpayne@1 776 input_file=input_file[0].split("/")[-1]
jpayne@1 777 if input_file.endswith(".sra"):
jpayne@1 778 subprocess.check_output("fastq-dump --split-files "+input_file,shell=True, stderr=subprocess.STDOUT)
jpayne@1 779 for_fq=input_file.replace(".sra","_1.fastq")
jpayne@1 780 else:
jpayne@1 781 for_fq=input_file
jpayne@1 782 elif data_type in ["4","5","6"]:
jpayne@1 783 for_fq=input_file[0].split("/")[-1]
jpayne@1 784 os.chdir(old_dir)
jpayne@1 785 return for_fq,rev_fq
jpayne@1 786
jpayne@4 787 def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
jpayne@1 788 #get O and H types from Final_list from blast parsing; allele mode
jpayne@4 789 from Bio import SeqIO
jpayne@1 790 fliC_choice="-"
jpayne@1 791 fljB_choice="-"
jpayne@1 792 fliC_contig="NA"
jpayne@1 793 fljB_contig="NA"
jpayne@1 794 fliC_region=set([0])
jpayne@1 795 fljB_region=set([0,])
jpayne@1 796 fliC_length=0 #can be changed to coverage in future
jpayne@1 797 fljB_length=0 #can be changed to coverage in future
jpayne@1 798 O_choice="-"#no need to decide O contig for now, should be only one
jpayne@4 799 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
jpayne@1 800 O_choice=O_choice.split("-")[-1].strip()
jpayne@1 801 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
jpayne@1 802 O_choice="-"
jpayne@1 803 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
jpayne@1 804 log_file=open("SeqSero_log.txt","a")
jpayne@4 805 extract_file=open("Extracted_antigen_alleles.fasta","a")
jpayne@4 806 handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
jpayne@4 807
jpayne@4 808 #print("O_contigs:")
jpayne@1 809 log_file.write("O_contigs:\n")
jpayne@4 810 extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
jpayne@4 811 extract_file.write("#O_contigs:\n")
jpayne@1 812 for x in O_nodes_roles:
jpayne@1 813 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
jpayne@4 814 #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
jpayne@4 815 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
jpayne@4 816 title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
jpayne@4 817 seqs=""
jpayne@4 818 for z in handle_fasta:
jpayne@4 819 if x[0].split("___")[-1]==z.description:
jpayne@4 820 seqs=str(z.seq)
jpayne@4 821 extract_file.write(title+seqs+"\n")
jpayne@1 822 if len(H_contig_roles)!=0:
jpayne@1 823 highest_H_coverage=max([float(x[1].split("_cov_")[-1]) for x in H_contig_roles]) #anything below highest*0.1 is regarded as contamination or noise; such contigs are still considered in contamination detection and logs, but not used for the final serotype output
jpayne@1 824 else:
jpayne@1 825 highest_H_coverage=0
jpayne@1 826 for x in H_contig_roles:
jpayne@1 827 #if there are multiple choices, temporarily select the one with the longest length for now; to be revised in a future change
jpayne@1 828 if "fliC" == x[0] and int(x[1].split("_")[3])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
jpayne@1 829 fliC_contig=x[1]
jpayne@1 830 fliC_length=int(x[1].split("_")[3])
jpayne@1 831 elif "fljB" == x[0] and int(x[1].split("_")[3])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13:
jpayne@1 832 fljB_contig=x[1]
jpayne@1 833 fljB_length=int(x[1].split("_")[3])
jpayne@1 834 for x in Final_list_passed:
jpayne@1 835 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
jpayne@1 836 fliC_choice=x[0].split("_")[1]
jpayne@1 837 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
jpayne@1 838 fljB_choice=x[0].split("_")[1]
jpayne@1 839 elif fliC_choice!="-" and fljB_choice!="-":
jpayne@1 840 break
jpayne@1 841 #now remove contigs not in middle core part
jpayne@1 842 first_allele="NA"
jpayne@1 843 first_allele_percentage=0
jpayne@1 844 for x in Final_list:
jpayne@1 845 if x[0].startswith("fliC") or x[0].startswith("fljB"):
jpayne@1 846 first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
jpayne@1 847 first_allele_percentage=x[2]
jpayne@1 848 break
jpayne@1 849 additional_contigs=[]
jpayne@1 850 for x in Final_list:
jpayne@1 851 if first_allele in x[0]:
jpayne@1 852 if (fliC_contig == x[0].split("___")[-1]):
jpayne@1 853 fliC_region=x[3]
jpayne@1 854 elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
jpayne@1 855 fljB_region=x[3]
jpayne@1 856 else:
jpayne@1 857 if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
jpayne@1 858 additional_contigs.append(x)
jpayne@1 859 #else:
jpayne@1 860 #print x[:3]
jpayne@1 861 #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off
jpayne@1 862 if first_allele_percentage>0.9:
jpayne@1 863 if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
jpayne@1 864 target_region=fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
jpayne@1 865 elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
jpayne@1 866 target_region=fliC_region|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
jpayne@1 867 else:
jpayne@1 868 target_region=set()#doesn't do anything
jpayne@1 869 else:
jpayne@1 870 target_region=set()#doesn't do anything
jpayne@1 871 #print(target_region)
jpayne@1 872 #print(additional_contigs)
jpayne@1 873 target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best
jpayne@1 874 target_region=target_region2|target_region
jpayne@1 875 for x in additional_contigs:
jpayne@1 876 removal=0
jpayne@1 877 contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
jpayne@1 878 if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length*0.5<len(x[3])<contig_length*1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
jpayne@1 879 removal=1
jpayne@1 880 else:
jpayne@1 881 if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similarity with the middle part of the first allele (first allele >0.9 already covers the middle part)
jpayne@1 882 removal=1
jpayne@1 883 else:
jpayne@1 884 pass
jpayne@1 885 if removal==1:
jpayne@1 886 for y in H_contig_roles:
jpayne@1 887 if y[1] in x[0]:
jpayne@1 888 H_contig_roles.remove(y)
jpayne@1 889 else:
jpayne@1 890 pass
jpayne@1 891 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
jpayne@1 892 #end of removing none-middle contigs
jpayne@4 893 #print("H_contigs:")
jpayne@1 894 log_file.write("H_contigs:\n")
jpayne@4 895 extract_file.write("#H_contigs:\n")
jpayne@1 896 H_contig_stat=[]
jpayne@1 897 H1_cont_stat={}
jpayne@1 898 H2_cont_stat={}
jpayne@1 899 for i in range(len(H_contig_roles)):
jpayne@1 900 x=H_contig_roles[i]
jpayne@1 901 a=0
jpayne@1 902 for y in Final_list_passed:
jpayne@1 903 if x[1] in y[0] and y[0].startswith(x[0]):
jpayne@1 904 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
jpayne@1 905 for y in Final_list_passed: #it's impossible to have the "first" and "last" allele as the prediction, so re-do it
jpayne@1 906 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
jpayne@4 907 #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
jpayne@4 908 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
jpayne@1 909 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
jpayne@4 910 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
jpayne@4 911 seqs=""
jpayne@4 912 for z in handle_fasta:
jpayne@4 913 if x[1]==z.description:
jpayne@4 914 seqs=str(z.seq)
jpayne@4 915 extract_file.write(title+seqs+"\n")
jpayne@1 916 break
jpayne@1 917 else:
jpayne@4 918 #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
jpayne@4 919 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
jpayne@4 920 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
jpayne@4 921 seqs=""
jpayne@4 922 for z in handle_fasta:
jpayne@4 923 if x[1]==z.description:
jpayne@4 924 seqs=str(z.seq)
jpayne@4 925 extract_file.write(title+seqs+"\n")
jpayne@1 926 if x[0]=="fliC":
jpayne@1 927 if y[0].split("_")[1] not in H1_cont_stat:
jpayne@1 928 H1_cont_stat[y[0].split("_")[1]]=y[2]
jpayne@1 929 else:
jpayne@1 930 H1_cont_stat[y[0].split("_")[1]]+=y[2]
jpayne@1 931 if x[0]=="fljB":
jpayne@1 932 if y[0].split("_")[1] not in H2_cont_stat:
jpayne@1 933 H2_cont_stat[y[0].split("_")[1]]=y[2]
jpayne@1 934 else:
jpayne@1 935 H2_cont_stat[y[0].split("_")[1]]+=y[2]
jpayne@1 936 break
jpayne@1 937 #detect contaminations
jpayne@1 938 #print(H1_cont_stat)
jpayne@1 939 #print(H2_cont_stat)
jpayne@1 940 H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
jpayne@1 941 H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
jpayne@1 942 contamination_H=""
jpayne@1 943 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
jpayne@1 944 contamination_H="potential contamination from H antigen signals"
jpayne@1 945 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
jpayne@1 946 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
jpayne@4 947 #get additional antigens
jpayne@4 948 """
jpayne@4 949 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
jpayne@4 950 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
jpayne@4 951 O_choice="O-9,46"
jpayne@4 952 #print "$$$Most possilble Otype: O-9,46"
jpayne@4 953 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
jpayne@4 954 O_choice="O-9,46,27"
jpayne@4 955 #print "$$$Most possilble Otype: O-9,46,27"
jpayne@4 956 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
jpayne@4 957 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
jpayne@4 958 O_choice="O-3,10"
jpayne@4 959 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
jpayne@4 960 else:
jpayne@4 961 O_choice="O-1,3,19"
jpayne@4 962 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
jpayne@4 963 ### end of special test for O9,46 and O3,10 family
jpayne@4 964
jpayne@4 965 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
jpayne@4 966 if len(Otypes_uniq)>2:
jpayne@4 967 contamination_O="potential contamination from O antigen signals"
jpayne@4 968 else:
jpayne@4 969 if len(Otypes_uniq)>1:
jpayne@4 970 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
jpayne@4 971 contamination_O=""
jpayne@4 972 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
jpayne@4 973 contamination_O=""
jpayne@4 974 """
jpayne@4 975 additonal_antigents=[]
jpayne@4 976 #print(contamination_O)
jpayne@4 977 #print(contamination_H)
jpayne@1 978 log_file.write(contamination_O+"\n")
jpayne@1 979 log_file.write(contamination_H+"\n")
jpayne@1 980 log_file.close()
jpayne@4 981 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
jpayne@1 982
jpayne@1 983 def get_input_K(input_file,lib_dict,data_type,k_size):
jpayne@1 984 #kmer mode; get input_Ks from dict and data_type
jpayne@1 985 kmers = []
jpayne@1 986 for h in lib_dict:
jpayne@1 987 kmers += lib_dict[h]
jpayne@1 988 if data_type == '4':
jpayne@1 989 input_Ks = target_multifasta_kmerizer(input_file, k_size, set(kmers))
jpayne@1 990 elif data_type == '1' or data_type == '2' or data_type == '3':#set it for now, will change later
jpayne@1 991 input_Ks = target_read_kmerizer(input_file, k_size, set(kmers))
jpayne@1 992 elif data_type == '5':#minion_2d_fasta
jpayne@1 993 input_Ks = minion_fasta_kmerizer(input_file, k_size, set(kmers))
jpayne@1 994 elif data_type == '6':#minion_2d_fastq
jpayne@1 995 input_Ks = minion_fastq_kmerizer(input_file, k_size, set(kmers))
jpayne@1 996 return input_Ks
jpayne@1 997
jpayne@1 998 def get_kmer_dict(lib_dict,input_Ks):
jpayne@1 999 #kmer mode; get predicted types
jpayne@1 1000 O_dict = {}
jpayne@1 1001 H_dict = {}
jpayne@1 1002 Special_dict = {}
jpayne@1 1003 for h in lib_dict:
jpayne@1 1004 score = (len(lib_dict[h] & input_Ks) / len(lib_dict[h])) * 100
jpayne@1 1005 if score > 1: # Arbitrary cut-off for similarity score very low but seems necessary to detect O-3,10 in some cases
jpayne@1 1006 if h.startswith('O-') and score > 25:
jpayne@1 1007 O_dict[h] = score
jpayne@1 1008 if h.startswith('fl') and score > 40:
jpayne@1 1009 H_dict[h] = score
jpayne@1 1010 if (h[:2] != 'fl') and (h[:2] != 'O-'):
jpayne@1 1011 Special_dict[h] = score
jpayne@1 1012 return O_dict,H_dict,Special_dict
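
# --- Illustrative example (added for clarity; not part of the original SeqSero2 code) ---
# Scores are the percentage of an allele's k-mers found in the input k-mer set,
# filtered with per-category thresholds (O > 25, fliC/fljB > 40, everything else > 1).
# The allele names and k-mers below are made up.
def _example_get_kmer_dict():
    lib = {"O-4_wzx": {"AAA", "CCC", "GGG", "TTT"}, "fliC_i": {"AAA", "CCC"}}
    input_Ks = {"AAA", "CCC", "GGG"}
    O_dict, H_dict, Special_dict = get_kmer_dict(lib, input_Ks)
    assert O_dict == {"O-4_wzx": 75.0} and H_dict == {"fliC_i": 100.0} and Special_dict == {}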
jpayne@1 1013
jpayne@1 1014 def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
jpayne@1 1015 log_file=open("SeqSero_log.txt","a")
jpayne@1 1016 log_file.write("O_scores:\n")
jpayne@1 1017 #call O:
jpayne@1 1018 highest_O = '-'
jpayne@1 1019 if len(O_dict) == 0:
jpayne@1 1020 pass
jpayne@1 1021 else:
jpayne@1 1022 for x in O_dict:
jpayne@1 1023 log_file.write(x+"\t"+str(O_dict[x])+"\n")
jpayne@1 1024 if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1
jpayne@1 1025 if 'O-9,46_wzy__1191' in O_dict: # and float(O946_wzy)/float(num_1) > 0.1
jpayne@1 1026 highest_O = "O-9,46"
jpayne@1 1027 elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1
jpayne@1 1028 highest_O = "O-9,46,27"
jpayne@1 1029 else:
jpayne@1 1030 highest_O = "O-9" # next, detect O9 vs O2?
jpayne@1 1031 O2 = 0
jpayne@1 1032 O9 = 0
jpayne@1 1033 for z in Special_dict:
jpayne@1 1034 if "tyr-O-9" in z:
jpayne@1 1035 O9 = float(Special_dict[z])
jpayne@1 1036 if "tyr-O-2" in z:
jpayne@1 1037 O2 = float(Special_dict[z])
jpayne@1 1038 if O2 > O9:
jpayne@1 1039 highest_O = "O-2"
jpayne@1 1040 elif ("O-3,10_wzx__1539" in O_dict) and (
jpayne@1 1041 "O-9,46_wzy__1191" in O_dict
jpayne@1 1042 ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
jpayne@1 1043 if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1
jpayne@1 1044 highest_O = "O-3,10"
jpayne@1 1045 else:
jpayne@1 1046 highest_O = "O-1,3,19"
jpayne@1 1047 ### end of special test for O9,46 and O3,10 family
jpayne@1 1048 else:
jpayne@1 1049 try:
jpayne@1 1050 max_score = 0
jpayne@1 1051 for x in O_dict:
jpayne@1 1052 if float(O_dict[x]) >= max_score:
jpayne@1 1053 max_score = float(O_dict[x])
jpayne@1 1054 highest_O = x.split("_")[0]
jpayne@1 1055 if highest_O == "O-1,3,19":
jpayne@1 1056 highest_O = '-'
jpayne@1 1057 max_score = 0
jpayne@1 1058 for x in O_dict:
jpayne@1 1059 if x == 'O-1,3,19_not_in_3,10__130':
jpayne@1 1060 pass
jpayne@1 1061 else:
jpayne@1 1062 if float(O_dict[x]) >= max_score:
jpayne@1 1063 max_score = float(O_dict[x])
jpayne@1 1064 highest_O = x.split("_")[0]
jpayne@1 1065 except:
jpayne@1 1066 pass
jpayne@1 1067 #call_fliC:
jpayne@1 1068 if len(H_dict)!=0:
jpayne@1 1069 highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not
jpayne@1 1070 else:
jpayne@1 1071 highest_H_score_both_BC=0
jpayne@1 1072 highest_fliC = '-'
jpayne@1 1073 highest_fliC_raw = '-'
jpayne@1 1074 highest_Score = 0
jpayne@1 1075 log_file.write("\nH_scores:\n")
jpayne@1 1076 for s in H_dict:
jpayne@1 1077 log_file.write(s+"\t"+str(H_dict[s])+"\n")
jpayne@1 1078 if s.startswith('fliC'):
jpayne@1 1079 if float(H_dict[s]) > highest_Score:
jpayne@1 1080 highest_fliC = s.split('_')[1]
jpayne@1 1081 highest_fliC_raw = s
jpayne@1 1082 highest_Score = float(H_dict[s])
jpayne@1 1083 #call_fljB
jpayne@1 1084 highest_fljB = '-'
jpayne@1 1085 highest_fljB_raw = '-'
jpayne@1 1086 highest_Score = 0
jpayne@1 1087 for s in H_dict:
jpayne@1 1088 if s.startswith('fljB'):
jpayne@1 1089 if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage; 0.65 currently seems to work well. A high value (0.65) is used because some fliC and fljB alleles share sequence with each other
jpayne@1 1090 highest_fljB = s.split('_')[1]
jpayne@1 1091 highest_fljB_raw = s
jpayne@1 1092 highest_Score = float(H_dict[s])
jpayne@1 1093 log_file.write("\nSpecial_scores:\n")
jpayne@1 1094 for s in Special_dict:
jpayne@1 1095 log_file.write(s+"\t"+str(Special_dict[s])+"\n")
jpayne@1 1096 log_file.close()
jpayne@1 1097 return highest_O,highest_fliC,highest_fljB
jpayne@1 1098
jpayne@1 1099 def get_temp_file_names(for_fq,rev_fq):
jpayne@1 1100 #seqsero2 -a; get temp file names
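# Naming convention, shown on hypothetical inputs:
#   get_temp_file_names("sample_R1.fq", "sample_R2.fq") returns
#   ("sample_R1.fq.sam", "sample_R1.fq.bam", "sample_R1.fq_sorted.bam",
#    "sample_R1.fq_mapped.fq", "sample_R2.fq_mapped.fq", "sample_R1.fq_combined.fq",
#    "sample_R1.fq.sai", "sample_R2.fq.sai")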
jpayne@1 1101 sam=for_fq+".sam"
jpayne@1 1102 bam=for_fq+".bam"
jpayne@1 1103 sorted_bam=for_fq+"_sorted.bam"
jpayne@1 1104 mapped_fq1=for_fq+"_mapped.fq"
jpayne@1 1105 mapped_fq2=rev_fq+"_mapped.fq"
jpayne@1 1106 combined_fq=for_fq+"_combined.fq"
jpayne@1 1107 for_sai=for_fq+".sai"
jpayne@1 1108 rev_sai=rev_fq+".sai"
jpayne@1 1109 return sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai
jpayne@1 1110
jpayne@1 1111 def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
jpayne@1 1112 #seqsero2 -a; do mapping and sort
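# Mapping pipeline sketch (bwa and samtools assumed on PATH):
#   bwa index <database>                  -- index the antigen database
#   bwa mem -k 17 / bwa aln+sampe(samse)  -- map the reads, depending on mapping_mode
#   samtools view -F 4                    -- keep only mapped reads
#   samtools sort -n                      -- name-sort so bamToFastq can pair reads downstream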
jpayne@1 1113 print("building database...")
jpayne@1 1114 subprocess.check_output("bwa index "+database,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1115 print("mapping...")
jpayne@1 1116 if mapping_mode=="mem":
jpayne@1 1117 subprocess.check_output("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1118 elif mapping_mode=="sam":
jpayne@1 1119 if fnameB!="":
jpayne@1 1120 subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1121 subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1122 subprocess.check_output("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1123 else:
jpayne@1 1124 subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1125 subprocess.check_output("bwa samse "+database+" "+for_sai+" "+for_fq+" > "+sam, stderr=subprocess.STDOUT)
jpayne@1 1126 subprocess.check_output("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1127 ### check the version of samtools, then use different commands
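# The version is scraped from the usage text samtools prints to stderr when run without arguments;
# samtools <=1.2 expects "samtools sort <in.bam> <out_prefix>", while newer releases write to
# stdout (or -o), hence the two sort commands below.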
jpayne@1 1128 samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
jpayne@1 1129 out, err = samtools_version.communicate()
jpayne@1 1130 version = str(err).split("ersion:")[1].strip().split(" ")[0].strip()
jpayne@1 1131 print("check samtools version:",version)
jpayne@1 1132 ### end of samtools version check and its analysis
jpayne@1 1133 if LooseVersion(version)<=LooseVersion("1.2"):
jpayne@1 1134 subprocess.check_output("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True, stderr=subprocess.STDOUT)
jpayne@1 1135 else:
jpayne@1 1136 subprocess.check_output("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1137
jpayne@1 1138 def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode):
jpayne@1 1139 #seqsero2 -a; extract, assembly and blast
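# Assembly/blast sketch (bamToFastq from bedtools, spades.py, makeblastdb and blastn assumed on PATH):
#   bamToFastq extracts the mapped reads from the name-sorted BAM, SPAdes micro-assembles them
#   (capped at 4 threads), and the antigen database is then blasted against the resulting contigs
#   with XML output (-outfmt 5) for parsing downstream.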
jpayne@1 1140 subprocess.check_output("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1141 if fnameB!="":
jpayne@1 1142 subprocess.check_output("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 ,shell=True, stderr=subprocess.STDOUT)#2> /dev/null if want no output
jpayne@1 1143 else:
jpayne@1 1144 pass
jpayne@1 1145 outdir=current_time+"_temp"
jpayne@1 1146 print("assembling...")
jpayne@1 1147 if int(threads)>4:
jpayne@1 1148 t="4"
jpayne@1 1149 else:
jpayne@1 1150 t=threads
jpayne@1 1151 if os.path.getsize(combined_fq)>100 and os.path.getsize(mapped_fq1)>100:#if not, then it's "-:-:-"
jpayne@1 1152 if fnameB!="":
jpayne@1 1153 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1154 else:
jpayne@1 1155 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1156 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
jpayne@1 1157 subprocess.check_output("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1 1158 #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta
jpayne@1 1159 subprocess.check_output("rm -rf "+outdir+ " 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1 1160 print("blasting...","\n")
jpayne@1 1161 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
jpayne@1 1162 subprocess.check_output('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl',shell=True, stderr=subprocess.STDOUT) #temp.txt was used to keep the blast result from interrupting the program's output ###1/27/2015
jpayne@1 1163 subprocess.check_output("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5",shell=True, stderr=subprocess.STDOUT)###1/27/2015; 08272018, remove "-word_size 10"
jpayne@1 1164 else:
jpayne@1 1165 xmlfile="NA"
jpayne@4 1166 return xmlfile,new_fasta
jpayne@1 1167
jpayne@1 1168 def judge_subspecies(fnameA):
jpayne@1 1169 #seqsero2 -a; judge subspecies on just forward raw reads fastq
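# Descriptive note: this shells out to SalmID.py (assumed to live at ../SalmID/ relative to the
# working directory) and parses its tab-delimited output: row 0 holds the column names and row 1
# the scores; columns from index 6 onward are the subspecies scores ranked below, while the
# comparison of index 4 vs index 5 decides bongori vs enterica.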
jpayne@1 1170 salmID_output=subprocess.check_output("../SalmID/SalmID.py -i "+fnameA, shell=True, stderr=subprocess.STDOUT)
jpayne@1 1171 #out, err = salmID_output.communicate()
jpayne@1 1172 #out=out.decode("utf-8")
jpayne@1 1173 out = salmID_output.decode("utf-8")
jpayne@1 1174 file=open("data_log.txt","a")
jpayne@1 1175 file.write(out)
jpayne@1 1176 file.close()
jpayne@1 1177 salm_species_scores=out.split("\n")[1].split("\t")[6:]
jpayne@1 1178 salm_species_results=out.split("\n")[0].split("\t")[6:]
jpayne@1 1179 max_score=0
jpayne@1 1180 max_score_index=1 #default is 1, means "I"
jpayne@1 1181 for i in range(len(salm_species_scores)):
jpayne@1 1182 if max_score<float(salm_species_scores[i]):
jpayne@1 1183 max_score=float(salm_species_scores[i])
jpayne@1 1184 max_score_index=i
jpayne@1 1185 prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0]
jpayne@1 1186 if float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): #bongori and enterica compare
jpayne@1 1187 prediction="bongori" #if not, the prediction would always be enterica, since they are located in the later part
jpayne@1 1188 if max_score<10:
jpayne@1 1189 prediction="-"
jpayne@1 1190 return prediction
jpayne@1 1191
jpayne@1 1192 def judge_subspecies_Kmer(Special_dict):
jpayne@1 1193 #seqsero2 -k;
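# Descriptive note: scan the special-gene hits whose names contain "mer", keep the best-scoring
# entry and use its trailing "_<subspecies>" suffix as the prediction; a bongori marker scoring
# above 95 short-circuits the search.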
jpayne@1 1194 max_score=0
jpayne@1 1195 prediction="-" #default should be I
jpayne@1 1196 for x in Special_dict:
jpayne@1 1197 if "mer" in x:
jpayne@1 1198 if max_score<float(Special_dict[x]):
jpayne@1 1199 max_score=float(Special_dict[x])
jpayne@1 1200 prediction=x.split("_")[-1].strip()
jpayne@1 1201 if x.split("_")[-1].strip()=="bongori" and float(Special_dict[x])>95:#if bongori already, then no need to test enterica
jpayne@1 1202 prediction="bongori"
jpayne@1 1203 break
jpayne@1 1204 return prediction
jpayne@1 1205
jpayne@1 1206 def main():
jpayne@1 1207 #combine SeqSeroK and SeqSero2, also with SalmID
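# Overall flow (descriptive note): parse arguments, create the output directory, symlink the inputs
# and the antigen database into it, then branch on the analysis mode:
#   '-m a' (allele mode, raw reads only): bwa mapping -> SPAdes micro-assembly -> blast -> O/fliC/fljB prediction -> SalmID subspecies call
#   '-m k' (kmer mode): k-mer lookup against antigens.pickle -> O/fliC/fljB prediction and kmer-based subspecies call
# A Seqsero_result.tsv summary is written into the output directory unless the '-c' flag removes it.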
jpayne@1 1208 args = parse_args()
jpayne@1 1209 input_file = args.i
jpayne@1 1210 data_type = args.t
jpayne@1 1211 analysis_mode = args.m
jpayne@1 1212 mapping_mode=args.b
jpayne@1 1213 threads=args.p
jpayne@1 1214 make_dir=args.d
jpayne@1 1215 clean_mode=args.c
jpayne@1 1216 k_size=27 #will change for bug fixing
jpayne@1 1217 database="H_and_O_and_specific_genes.fasta"
jpayne@1 1218
jpayne@1 1219 if len(sys.argv)==1:
jpayne@1 1220 subprocess.check_output(dirpath+"/SeqSero2_package.py -h",shell=True, stderr=subprocess.STDOUT)#change name of python file
jpayne@1 1221 else:
jpayne@1 1222 request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
jpayne@1 1223 request_id += str(random.randint(1, 10000000))
jpayne@1 1224 if make_dir is None:
jpayne@1 1225 make_dir="SeqSero_result_"+request_id
jpayne@1 1226 if os.path.isdir(make_dir):
jpayne@1 1227 pass
jpayne@1 1228 else:
jpayne@1 1229 subprocess.check_output(["mkdir",make_dir])
jpayne@1 1230 #subprocess.check_output("cp "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1231 subprocess.check_output("ln -srf "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1232 ############################begin the real analysis
jpayne@1 1233 if analysis_mode=="a":
jpayne@1 1234 if data_type in ["1","2","3"]:#use allele mode
jpayne@1 1235 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
jpayne@1 1236 os.chdir(make_dir)
jpayne@1 1237 ###TODO: add a function to detect the input file types
jpayne@1 1238 fnameA=for_fq.split("/")[-1]
jpayne@1 1239 fnameB=rev_fq.split("/")[-1]
jpayne@1 1240 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
jpayne@1 1241 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
jpayne@1 1242 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
jpayne@4 1243 xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
jpayne@1 1244 if xmlfile=="NA":
jpayne@1 1245 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
jpayne@1 1246 else:
jpayne@1 1247 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
jpayne@1 1248 file=open("data_log.txt","a")
jpayne@1 1249 for x in Final_list:
jpayne@1 1250 file.write("\t".join(str(y) for y in x)+"\n")
jpayne@1 1251 file.close()
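# The next line filters the parsed blast hits: it keeps entries whose name carries a "_cov_" value
# of at least 0.9 and whose score (x[1]) reaches the length/score threshold embedded in the allele
# name, or exceeds 1000.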
jpayne@1 1252 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
jpayne@4 1253 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
jpayne@1 1254 subspecies=judge_subspecies(fnameA) #predict subspecies
jpayne@1 1255 ###output
jpayne@1 1256 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
jpayne@1 1257 contamination_report=""
jpayne@4 1258 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
jpayne@1 1259 if contamination_O!="" and contamination_H=="":
jpayne@4 1260 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
jpayne@1 1261 elif contamination_O=="" and contamination_H!="":
jpayne@4 1262 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
jpayne@1 1263 elif contamination_O!="" and contamination_H!="":
jpayne@4 1264 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
jpayne@4 1265 if contamination_report!="":
jpayne@4 1266 contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing
jpayne@4 1267 claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences
jpayne@1 1268 if clean_mode:
jpayne@1 1269 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1270 make_dir="none-output-directory due to '-c' flag"
jpayne@1 1271 else:
jpayne@1 1272 #new_file=open("Seqsero_result.txt","w")
jpayne@1 1273 if O_choice=="":
jpayne@1 1274 O_choice="-"
jpayne@1 1275 result = OrderedDict(
jpayne@1 1276 sample_name = input_file[0],
jpayne@1 1277 O_antigen_prediction = O_choice,
jpayne@1 1278 H1_antigen_prediction = fliC_choice,
jpayne@1 1279 H2_antigen_prediction = fljB_choice,
jpayne@1 1280 predicted_antigenic_profile = predict_form,
jpayne@1 1281 predicted_subspecies = subspecies,
jpayne@1 1282 predicted_serotype = predict_sero,
jpayne@1 1283 note=claim.replace('\n','')
jpayne@1 1284 )
jpayne@1 1285 with open("Seqsero_result.tsv","w") as new_file:
jpayne@1 1286 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
jpayne@1 1287 #new_file.close()
jpayne@1 1288 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
jpayne@1 1289 wrtr.writeheader()
jpayne@1 1290 wrtr.writerow(result)
jpayne@1 1291
jpayne@1 1292
jpayne@1 1293 # new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+contamination_report+star+star_line+claim+"\n")#+##
jpayne@1 1294 # new_file.close()
jpayne@1 1295 #subprocess.check_output("cat Seqsero_result.txt",shell=True, stderr=subprocess.STDOUT)
jpayne@1 1296 #subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt *.xml "+fnameA+"*_db* 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1 1297 subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt "+fnameA+"*_db* 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1 1298 print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+contamination_report+star+star_line+claim+"\n")#+##
jpayne@1 1299 else:
jpayne@1 1300 print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'")
jpayne@1 1301 elif analysis_mode=="k":
jpayne@1 1302 ex_dir = os.path.dirname(os.path.realpath(__file__))
jpayne@1 1303 #output_mode = args.mode
jpayne@1 1304 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
jpayne@1 1305 input_file = for_fq #kmer mode only uses the forward reads, since not all reads are needed
jpayne@1 1306 os.chdir(make_dir)
jpayne@1 1307 f = open(ex_dir + '/antigens.pickle', 'rb')
jpayne@1 1308 lib_dict = pickle.load(f)
jpayne@1 1309 f.close() #fixed: call close(); the bare attribute access did nothing
jpayne@1 1310 input_Ks = get_input_K(input_file,lib_dict,data_type,k_size)
jpayne@1 1311 O_dict,H_dict,Special_dict = get_kmer_dict(lib_dict,input_Ks)
jpayne@1 1312 highest_O,highest_fliC,highest_fljB = call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir)
jpayne@1 1313 subspecies = judge_subspecies_Kmer(Special_dict)
jpayne@1 1314 if subspecies=="IIb" or subspecies=="IIa":
jpayne@1 1315 subspecies="II"
jpayne@1 1316 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
jpayne@1 1317 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
jpayne@4 1318 claim=""
jpayne@1 1319 if clean_mode:
jpayne@1 1320 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
jpayne@1 1321 make_dir="none-output-directory due to '-c' flag"
jpayne@1 1322 else:
jpayne@1 1323 if highest_O.split('-')[-1]=="":
jpayne@1 1324 O_choice="-"
jpayne@1 1325 else:
jpayne@1 1326 O_choice=highest_O.split('-')[-1]
jpayne@1 1327 #print("Output_directory:"+make_dir+"\tInput_file:"+input_file+"\tPredicted subpecies:"+subspecies + '\tPredicted antigenic profile:' + predict_form + '\tPredicted serotype(s):' + predict_sero)
jpayne@1 1328 result = OrderedDict(
jpayne@1 1329 sample_name = input_file,
jpayne@1 1330 O_antigen_prediction = O_choice,
jpayne@1 1331 H1_antigen_prediction = highest_fliC,
jpayne@1 1332 H2_antigen_prediction = highest_fljB,
jpayne@1 1333 predicted_antigenic_profile = predict_form,
jpayne@1 1334 predicted_subspecies = subspecies,
jpayne@1 1335 predicted_serotype = predict_sero,
jpayne@1 1336 note=claim.replace('\n','')
jpayne@1 1337 )
jpayne@1 1338 with open("Seqsero_result.tsv","w") as new_file:
jpayne@1 1339 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
jpayne@1 1340 #new_file.close()
jpayne@1 1341 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
jpayne@1 1342 wrtr.writeheader()
jpayne@1 1343 wrtr.writerow(result)
jpayne@1 1344 subprocess.call("rm *.fasta* *.fastq *.gz *.fq temp.txt *.sra 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1 1345 print("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
jpayne@1 1346 return 0
jpayne@1 1347
jpayne@1 1348 if __name__ == '__main__':
jpayne@1 1349 try:
jpayne@1 1350 sys.exit(main()) #use sys.exit rather than quit(), which depends on the site module
jpayne@1 1351 except subprocess.CalledProcessError as e:
jpayne@1 1352 print(e.output.decode(errors="replace") if e.output is not None else "") #decode instead of printing the bytes repr
jpayne@1 1353 print(e)
jpayne@1 1354 sys.exit(e.returncode)
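# Example invocations (a sketch; sample file names are hypothetical, flags as defined in parse_args):
#   SeqSero2_package.py -m k -t 2 -p 4 -i sample_R1.fastq.gz sample_R2.fastq.gz
#   SeqSero2_package.py -m a -b mem -t 2 -p 4 -d my_output_dir -i sample_R1.fastq.gz sample_R2.fastq.gz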