estrain@2
|
1 #!/usr/bin/env python3
|
estrain@2
|
2
|
estrain@2
|
3 import sys
|
estrain@2
|
4 import time
|
estrain@2
|
5 import random
|
estrain@2
|
6 import os
|
estrain@2
|
7 import subprocess
|
estrain@2
|
8 import gzip
|
estrain@2
|
9 import io
|
estrain@2
|
10 import pickle
|
estrain@2
|
11 import argparse
|
estrain@2
|
12 import itertools
|
estrain@2
|
13 from distutils.version import LooseVersion
|
estrain@2
|
14 from distutils.spawn import find_executable
|
estrain@2
|
15
|
estrain@2
|
16 try:
|
estrain@2
|
17 from .version import SeqSero2_version
|
estrain@2
|
18 except Exception: #ImportError
|
estrain@2
|
19 from version import SeqSero2_version
|
estrain@2
|
20
|
estrain@2
|
21 ### SeqSero Kmer
|
estrain@2
|
22 def parse_args():
|
estrain@2
|
23 "Parse the input arguments, use '-h' for help."
|
estrain@2
|
24 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-d <output_directory>] [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com\n\nVersion: v1.0.1')#add "-m <data_type>" in future
|
estrain@2
|
25 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data",type=os.path.abspath) ### ed_SL_05282019: add 'type=os.path.abspath' to generate absolute path of input data.
|
estrain@2
|
26 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1' for interleaved paired-end reads, '2' for separated paired-end reads, '3' for single reads, '4' for genome assembly, '5' for nanopore fasta, '6' for nanopore fastq")
|
estrain@2
|
27 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: algorithms for bwa mapping for allele mode; 'mem' for mem, 'sam' for samse/sampe; default=mem; optional; for now we only optimized for default 'mem' mode")
|
estrain@2
|
28 parser.add_argument("-p",default="1",help="<int>: number of threads for allele mode, if p >4, only 4 threads will be used for assembly since the amount of extracted reads is small, default=1")
|
estrain@2
|
29 parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: which workflow to apply, 'a'(raw reads allele micro-assembly), 'k'(raw reads and genome assembly k-mer), default=a")
|
estrain@2
|
30 parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
|
estrain@2
|
31 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will only output serotype prediction without the directory containing log files")
|
estrain@2
|
32 parser.add_argument("--check",action="store_true",help="<flag>: use '--check' flag to check the required dependencies")
|
estrain@2
|
33 parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SeqSero2_version)
|
estrain@2
|
34 return parser.parse_args()
|
estrain@2
|
35
|
estrain@2
|
36 ### ed_SL_05282019: check paths of dependencies
|
estrain@2
|
37 check_dependencies = parse_args().check
|
estrain@2
|
38 dependencies = ['bwa','samtools','blastn','fastq-dump','spades.py','bedtools','SalmID.py']
|
estrain@2
|
39 if check_dependencies:
|
estrain@2
|
40 for item in dependencies:
|
estrain@2
|
41 ext_path = find_executable(item)
|
estrain@2
|
42 if ext_path is not None:
|
estrain@2
|
43 print ("Using "+item+" - "+ext_path)
|
estrain@2
|
44 else:
|
estrain@2
|
45 print ("ERROR: can not find "+item+" in PATH")
|
estrain@2
|
46 sys.exit()
|
estrain@2
|
47 ### end of --check
|
estrain@2
|
48
|
estrain@2
|
49 def reverse_complement(sequence):
|
estrain@2
|
50 complement = {
|
estrain@2
|
51 'A': 'T',
|
estrain@2
|
52 'C': 'G',
|
estrain@2
|
53 'G': 'C',
|
estrain@2
|
54 'T': 'A',
|
estrain@2
|
55 'N': 'N',
|
estrain@2
|
56 'M': 'K',
|
estrain@2
|
57 'R': 'Y',
|
estrain@2
|
58 'W': 'W',
|
estrain@2
|
59 'S': 'S',
|
estrain@2
|
60 'Y': 'R',
|
estrain@2
|
61 'K': 'M',
|
estrain@2
|
62 'V': 'B',
|
estrain@2
|
63 'H': 'D',
|
estrain@2
|
64 'D': 'H',
|
estrain@2
|
65 'B': 'V'
|
estrain@2
|
66 }
|
estrain@2
|
67 return "".join(complement[base] for base in reversed(sequence))
|
estrain@2
|
68
|
estrain@2
|
69
|
estrain@2
|
70 def createKmerDict_reads(list_of_strings, kmer):
|
estrain@2
|
71 kmer_table = {}
|
estrain@2
|
72 for string in list_of_strings:
|
estrain@2
|
73 sequence = string.strip('\n')
|
estrain@2
|
74 for i in range(len(sequence) - kmer + 1):
|
estrain@2
|
75 new_mer = sequence[i:i + kmer].upper()
|
estrain@2
|
76 new_mer_rc = reverse_complement(new_mer)
|
estrain@2
|
77 if new_mer in kmer_table:
|
estrain@2
|
78 kmer_table[new_mer.upper()] += 1
|
estrain@2
|
79 else:
|
estrain@2
|
80 kmer_table[new_mer.upper()] = 1
|
estrain@2
|
81 if new_mer_rc in kmer_table:
|
estrain@2
|
82 kmer_table[new_mer_rc.upper()] += 1
|
estrain@2
|
83 else:
|
estrain@2
|
84 kmer_table[new_mer_rc.upper()] = 1
|
estrain@2
|
85 return kmer_table
|
estrain@2
|
86
|
estrain@2
|
87
|
estrain@2
|
88 def multifasta_dict(multifasta):
|
estrain@2
|
89 multifasta_list = [
|
estrain@2
|
90 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0
|
estrain@2
|
91 ]
|
estrain@2
|
92 headers = [i for i in multifasta_list if i[0] == '>']
|
estrain@2
|
93 multifasta_dict = {}
|
estrain@2
|
94 for h in headers:
|
estrain@2
|
95 start = multifasta_list.index(h)
|
estrain@2
|
96 for element in multifasta_list[start + 1:]:
|
estrain@2
|
97 if element[0] == '>':
|
estrain@2
|
98 break
|
estrain@2
|
99 else:
|
estrain@2
|
100 if h[1:] in multifasta_dict:
|
estrain@2
|
101 multifasta_dict[h[1:]] += element
|
estrain@2
|
102 else:
|
estrain@2
|
103 multifasta_dict[h[1:]] = element
|
estrain@2
|
104 return multifasta_dict
|
estrain@2
|
105
|
estrain@2
|
106
|
estrain@2
|
107 def multifasta_single_string(multifasta):
|
estrain@2
|
108 multifasta_list = [
|
estrain@2
|
109 line.strip() for line in open(multifasta, 'r')
|
estrain@2
|
110 if (len(line.strip()) > 0) and (line.strip()[0] != '>')
|
estrain@2
|
111 ]
|
estrain@2
|
112 return ''.join(multifasta_list)
|
estrain@2
|
113
|
estrain@2
|
114
|
estrain@2
|
115 def chunk_a_long_sequence(long_sequence, chunk_size=60):
|
estrain@2
|
116 chunk_list = []
|
estrain@2
|
117 steps = len(long_sequence) // 60 #how many chunks
|
estrain@2
|
118 for i in range(steps):
|
estrain@2
|
119 chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size])
|
estrain@2
|
120 chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)])
|
estrain@2
|
121 return chunk_list
|
estrain@2
|
122
|
estrain@2
|
123
|
estrain@2
|
124 def target_multifasta_kmerizer(multifasta, k, kmerDict):
|
estrain@2
|
125 forward_length = 300 #if find the target, put forward 300 bases
|
estrain@2
|
126 reverse_length = 2200 #if find the target, put backward 2200 bases
|
estrain@2
|
127 chunk_size = 60 #it will firstly chunk the single long sequence to multiple smaller sequences, it controls the size of those smaller sequences
|
estrain@2
|
128 target_mers = []
|
estrain@2
|
129 long_single_string = multifasta_single_string(multifasta)
|
estrain@2
|
130 multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
|
estrain@2
|
131 unit_length = len(multifasta_list[0])
|
estrain@2
|
132 forward_lines = int(forward_length / unit_length) + 1
|
estrain@2
|
133 reverse_lines = int(forward_length / unit_length) + 1
|
estrain@2
|
134 start_num = 0
|
estrain@2
|
135 end_num = 0
|
estrain@2
|
136 for i in range(len(multifasta_list)):
|
estrain@2
|
137 if i not in range(start_num, end_num): #avoid computational repetition
|
estrain@2
|
138 line = multifasta_list[i]
|
estrain@2
|
139 start = int((len(line) - k) // 2)
|
estrain@2
|
140 s1 = line[start:k + start]
|
estrain@2
|
141 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
|
estrain@2
|
142 if i - forward_lines >= 0:
|
estrain@2
|
143 start_num = i - forward_lines
|
estrain@2
|
144 else:
|
estrain@2
|
145 start_num = 0
|
estrain@2
|
146 if i + reverse_lines <= len(multifasta_list) - 1:
|
estrain@2
|
147 end_num = i + reverse_lines
|
estrain@2
|
148 else:
|
estrain@2
|
149 end_num = len(multifasta_list) - 1
|
estrain@2
|
150 target_list = [
|
estrain@2
|
151 x.strip() for x in multifasta_list[start_num:end_num]
|
estrain@2
|
152 ]
|
estrain@2
|
153 target_line = "".join(target_list)
|
estrain@2
|
154 target_mers += [
|
estrain@2
|
155 k1 for k1 in createKmerDict_reads([str(target_line)], k)
|
estrain@2
|
156 ] ##changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
|
estrain@2
|
157 else:
|
estrain@2
|
158 pass
|
estrain@2
|
159 return set(target_mers)
|
estrain@2
|
160
|
estrain@2
|
161
|
estrain@2
|
162 def target_read_kmerizer(file, k, kmerDict):
|
estrain@2
|
163 i = 1
|
estrain@2
|
164 n_reads = 0
|
estrain@2
|
165 total_coverage = 0
|
estrain@2
|
166 target_mers = []
|
estrain@2
|
167 if file.endswith(".gz"):
|
estrain@2
|
168 file_content = io.BufferedReader(gzip.open(file))
|
estrain@2
|
169 else:
|
estrain@2
|
170 file_content = open(file, "r").readlines()
|
estrain@2
|
171 for line in file_content:
|
estrain@2
|
172 start = int((len(line) - k) // 2)
|
estrain@2
|
173 if i % 4 == 2:
|
estrain@2
|
174 if file.endswith(".gz"):
|
estrain@2
|
175 s1 = line[start:k + start].decode()
|
estrain@2
|
176 line = line.decode()
|
estrain@2
|
177 else:
|
estrain@2
|
178 s1 = line[start:k + start]
|
estrain@2
|
179 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
|
estrain@2
|
180 n_reads += 1
|
estrain@2
|
181 total_coverage += len(line)
|
estrain@2
|
182 target_mers += [
|
estrain@2
|
183 k1 for k1 in createKmerDict_reads([str(line)], k)
|
estrain@2
|
184 ] #changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
|
estrain@2
|
185 i += 1
|
estrain@2
|
186 if total_coverage >= 4000000:
|
estrain@2
|
187 break
|
estrain@2
|
188 return set(target_mers)
|
estrain@2
|
189
|
estrain@2
|
190
|
estrain@2
|
191 def minion_fasta_kmerizer(file, k, kmerDict):
|
estrain@2
|
192 i = 1
|
estrain@2
|
193 n_reads = 0
|
estrain@2
|
194 total_coverage = 0
|
estrain@2
|
195 target_mers = {}
|
estrain@2
|
196 for line in open(file):
|
estrain@2
|
197 if i % 2 == 0:
|
estrain@2
|
198 for kmer, rc_kmer in kmers(line.strip().upper(), k):
|
estrain@2
|
199 if (kmer in kmerDict) or (rc_kmer in kmerDict):
|
estrain@2
|
200 if kmer in target_mers:
|
estrain@2
|
201 target_mers[kmer] += 1
|
estrain@2
|
202 else:
|
estrain@2
|
203 target_mers[kmer] = 1
|
estrain@2
|
204 if rc_kmer in target_mers:
|
estrain@2
|
205 target_mers[rc_kmer] += 1
|
estrain@2
|
206 else:
|
estrain@2
|
207 target_mers[rc_kmer] = 1
|
estrain@2
|
208 i += 1
|
estrain@2
|
209 return set([h for h in target_mers])
|
estrain@2
|
210
|
estrain@2
|
211
|
estrain@2
|
212 def minion_fastq_kmerizer(file, k, kmerDict):
|
estrain@2
|
213 i = 1
|
estrain@2
|
214 n_reads = 0
|
estrain@2
|
215 total_coverage = 0
|
estrain@2
|
216 target_mers = {}
|
estrain@2
|
217 for line in open(file):
|
estrain@2
|
218 if i % 4 == 2:
|
estrain@2
|
219 for kmer, rc_kmer in kmers(line.strip().upper(), k):
|
estrain@2
|
220 if (kmer in kmerDict) or (rc_kmer in kmerDict):
|
estrain@2
|
221 if kmer in target_mers:
|
estrain@2
|
222 target_mers[kmer] += 1
|
estrain@2
|
223 else:
|
estrain@2
|
224 target_mers[kmer] = 1
|
estrain@2
|
225 if rc_kmer in target_mers:
|
estrain@2
|
226 target_mers[rc_kmer] += 1
|
estrain@2
|
227 else:
|
estrain@2
|
228 target_mers[rc_kmer] = 1
|
estrain@2
|
229 i += 1
|
estrain@2
|
230 return set([h for h in target_mers])
|
estrain@2
|
231
|
estrain@2
|
232
|
estrain@2
|
233 def multifasta_single_string2(multifasta):
|
estrain@2
|
234 single_string = ''
|
estrain@2
|
235 with open(multifasta, 'r') as f:
|
estrain@2
|
236 for line in f:
|
estrain@2
|
237 if line.strip()[0] == '>':
|
estrain@2
|
238 pass
|
estrain@2
|
239 else:
|
estrain@2
|
240 single_string += line.strip()
|
estrain@2
|
241 return single_string
|
estrain@2
|
242
|
estrain@2
|
243
|
estrain@2
|
244 def kmers(seq, k):
|
estrain@2
|
245 rev_comp = reverse_complement(seq)
|
estrain@2
|
246 for start in range(1, len(seq) - k + 1):
|
estrain@2
|
247 yield seq[start:start + k], rev_comp[-(start + k):-start]
|
estrain@2
|
248
|
estrain@2
|
249
|
estrain@2
|
250 def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set
|
estrain@2
|
251 multi_seq_dict = multifasta_dict(multifasta)
|
estrain@2
|
252 lib_dict = {}
|
estrain@2
|
253 for h in multi_seq_dict:
|
estrain@2
|
254 lib_dict[h] = set(
|
estrain@2
|
255 [k for k in createKmerDict_reads([multi_seq_dict[h]], k_size)])
|
estrain@2
|
256 return lib_dict
|
estrain@2
|
257
|
estrain@2
|
258
|
estrain@2
|
259 def Combine(b, c):
|
estrain@2
|
260 fliC_combinations = []
|
estrain@2
|
261 fliC_combinations.append(",".join(c))
|
estrain@2
|
262 temp_combinations = []
|
estrain@2
|
263 for i in range(len(b)):
|
estrain@2
|
264 for x in itertools.combinations(b, i + 1):
|
estrain@2
|
265 temp_combinations.append(",".join(x))
|
estrain@2
|
266 for x in temp_combinations:
|
estrain@2
|
267 temp = []
|
estrain@2
|
268 for y in c:
|
estrain@2
|
269 temp.append(y)
|
estrain@2
|
270 temp.append(x)
|
estrain@2
|
271 temp = ",".join(temp)
|
estrain@2
|
272 temp = temp.split(",")
|
estrain@2
|
273 temp.sort()
|
estrain@2
|
274 temp = ",".join(temp)
|
estrain@2
|
275 fliC_combinations.append(temp)
|
estrain@2
|
276 return fliC_combinations
|
estrain@2
|
277
|
estrain@2
|
278
|
estrain@2
|
279 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
|
estrain@2
|
280 #like test_output_06012017.txt
|
estrain@2
|
281 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
|
estrain@2
|
282 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
|
estrain@2
|
283 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
|
estrain@2
|
284 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to
|
estrain@2
|
285 seronames = []
|
estrain@2
|
286 seronames_none_subspecies=[]
|
estrain@2
|
287 for i in range(len(phase1)):
|
estrain@2
|
288 fliC_combine = []
|
estrain@2
|
289 fljB_combine = []
|
estrain@2
|
290 if phaseO[i] == Otype: # no VII in KW, but it's there
|
estrain@2
|
291 ### for fliC, detect every possible combinations to avoid the effect of "["
|
estrain@2
|
292 if phase1[i].count("[") == 0:
|
estrain@2
|
293 fliC_combine.append(phase1[i])
|
estrain@2
|
294 elif phase1[i].count("[") >= 1:
|
estrain@2
|
295 c = []
|
estrain@2
|
296 b = []
|
estrain@2
|
297 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
|
estrain@2
|
298 "[") == 1:
|
estrain@2
|
299 content = phase1[i].replace("[", "").replace("]", "")
|
estrain@2
|
300 fliC_combine.append(content)
|
estrain@2
|
301 fliC_combine.append("-")
|
estrain@2
|
302 else:
|
estrain@2
|
303 for x in phase1[i].split(","):
|
estrain@2
|
304 if "[" in x:
|
estrain@2
|
305 b.append(x.replace("[", "").replace("]", ""))
|
estrain@2
|
306 else:
|
estrain@2
|
307 c.append(x)
|
estrain@2
|
308 fliC_combine = Combine(
|
estrain@2
|
309 b, c
|
estrain@2
|
310 ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
|
estrain@2
|
311 ### end of fliC "[" detect
|
estrain@2
|
312 ### for fljB, detect every possible combinations to avoid the effect of "["
|
estrain@2
|
313 if phase2[i].count("[") == 0:
|
estrain@2
|
314 fljB_combine.append(phase2[i])
|
estrain@2
|
315 elif phase2[i].count("[") >= 1:
|
estrain@2
|
316 d = []
|
estrain@2
|
317 e = []
|
estrain@2
|
318 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
|
estrain@2
|
319 "[") == 1:
|
estrain@2
|
320 content = phase2[i].replace("[", "").replace("]", "")
|
estrain@2
|
321 fljB_combine.append(content)
|
estrain@2
|
322 fljB_combine.append("-")
|
estrain@2
|
323 else:
|
estrain@2
|
324 for x in phase2[i].split(","):
|
estrain@2
|
325 if "[" in x:
|
estrain@2
|
326 d.append(x.replace("[", "").replace("]", ""))
|
estrain@2
|
327 else:
|
estrain@2
|
328 e.append(x)
|
estrain@2
|
329 fljB_combine = Combine(d, e)
|
estrain@2
|
330 ### end of fljB "[" detect
|
estrain@2
|
331 new_fliC = fliC.split(
|
estrain@2
|
332 ","
|
estrain@2
|
333 ) #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings
|
estrain@2
|
334 new_fliC.sort()
|
estrain@2
|
335 new_fliC = ",".join(new_fliC)
|
estrain@2
|
336 new_fljB = fljB.split(",")
|
estrain@2
|
337 new_fljB.sort()
|
estrain@2
|
338 new_fljB = ",".join(new_fljB)
|
estrain@2
|
339 if (new_fliC in fliC_combine
|
estrain@2
|
340 or fliC in fliC_combine) and (new_fljB in fljB_combine
|
estrain@2
|
341 or fljB in fljB_combine):
|
estrain@2
|
342 ######start, remove_list,rename_dict, added on 11/11/2018
|
estrain@2
|
343 if sero[i] not in remove_list:
|
estrain@2
|
344 temp_sero=sero[i]
|
estrain@2
|
345 if temp_sero in rename_dict:
|
estrain@2
|
346 temp_sero=rename_dict[temp_sero] #rename if in the rename list
|
estrain@2
|
347 if temp_sero not in seronames:#the new sero may already included, if yes, then not consider
|
estrain@2
|
348 if subs[i] == subspecies:
|
estrain@2
|
349 seronames.append(temp_sero)
|
estrain@2
|
350 seronames_none_subspecies.append(temp_sero)
|
estrain@2
|
351 else:
|
estrain@2
|
352 pass
|
estrain@2
|
353 else:
|
estrain@2
|
354 pass
|
estrain@2
|
355 ######end, added on 11/11/2018
|
estrain@2
|
356 #analyze seronames
|
estrain@2
|
357 subspecies_pointer=""
|
estrain@2
|
358 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
|
estrain@2
|
359 seronames=seronames_none_subspecies
|
estrain@2
|
360 subspecies_pointer="1"
|
estrain@2
|
361 if len(seronames) == 0:
|
estrain@2
|
362 seronames = [
|
estrain@2
|
363 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
|
estrain@2
|
364 ]
|
estrain@2
|
365 star = ""
|
estrain@2
|
366 star_line = ""
|
estrain@2
|
367 if len(seronames) > 1: #there are two possible predictions for serotypes
|
estrain@2
|
368 star = "*"
|
estrain@2
|
369 #changed 04072019
|
estrain@2
|
370 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
|
estrain@2
|
371 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
|
estrain@2
|
372 star="*"
|
estrain@2
|
373 star_line=" The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme." + star_line
|
estrain@2
|
374 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
|
estrain@2
|
375 if Otype=="":
|
estrain@2
|
376 Otype="-"
|
estrain@2
|
377 predict_form = Otype + ":" + fliC + ":" + fljB
|
estrain@2
|
378 predict_sero = (" or ").join(seronames)
|
estrain@2
|
379 ###special test for Enteritidis
|
estrain@2
|
380 if predict_form == "9:g,m:-":
|
estrain@2
|
381 sdf = "-"
|
estrain@2
|
382 for x in special_gene_list:
|
estrain@2
|
383 if x.startswith("sdf"):
|
estrain@2
|
384 sdf = "+"
|
estrain@2
|
385 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
|
estrain@2
|
386 star_line=" sdf gene detected." # ed_SL_04152019: new output format
|
estrain@2
|
387 #predict_form = predict_form + " Sdf prediction:" + sdf
|
estrain@2
|
388 predict_form = predict_form #changed 04072019
|
estrain@2
|
389 if sdf == "-":
|
estrain@2
|
390 star = "*"
|
estrain@2
|
391 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
|
estrain@2
|
392 star_line=" sdf gene not detected." # ed_SL_04152019: new output format
|
estrain@2
|
393 #changed in 04072019, for new output
|
estrain@2
|
394 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
|
estrain@2
|
395 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
|
estrain@2
|
396 predict_sero = "Gallinarum or Enteritidis" # ed_SL_04152019: new output format
|
estrain@2
|
397 ###end of special test for Enteritidis
|
estrain@2
|
398 elif predict_form == "4:i:-":
|
estrain@2
|
399 predict_sero = "4:i:-"
|
estrain@2
|
400 elif predict_form == "4:r:-":
|
estrain@2
|
401 predict_sero = "4:r:-"
|
estrain@2
|
402 elif predict_form == "4:b:-":
|
estrain@2
|
403 predict_sero = "4:b:-"
|
estrain@2
|
404 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
|
estrain@2
|
405 #predict_sero = "Newport"
|
estrain@2
|
406 #star = "*"
|
estrain@2
|
407 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
|
estrain@2
|
408 claim = " The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
|
estrain@2
|
409 if "N/A" in predict_sero:
|
estrain@2
|
410 claim = ""
|
estrain@2
|
411 #special test for Typhimurium
|
estrain@2
|
412 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
|
estrain@2
|
413 normal = 0
|
estrain@2
|
414 mutation = 0
|
estrain@2
|
415 for x in special_gene_list:
|
estrain@2
|
416 if "oafA-O-4_full" in x:
|
estrain@2
|
417 normal = float(special_gene_list[x])
|
estrain@2
|
418 elif "oafA-O-4_5-" in x:
|
estrain@2
|
419 mutation = float(special_gene_list[x])
|
estrain@2
|
420 if normal > mutation:
|
estrain@2
|
421 pass
|
estrain@2
|
422 elif normal < mutation:
|
estrain@2
|
423 #predict_sero = predict_sero.strip() + "(O5-)"
|
estrain@2
|
424 predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019
|
estrain@2
|
425 star = "*"
|
estrain@2
|
426 #star_line = "Detected the deletion of O5-."
|
estrain@2
|
427 star_line = " Detected a deletion that causes O5- variant of Typhimurium." # ed_SL_04152019: new output format
|
estrain@2
|
428 else:
|
estrain@2
|
429 pass
|
estrain@2
|
430 #special test for Paratyphi B
|
estrain@2
|
431 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
|
estrain@2
|
432 normal = 0
|
estrain@2
|
433 mutation = 0
|
estrain@2
|
434 for x in special_gene_list:
|
estrain@2
|
435 if "gntR-family-regulatory-protein_dt-positive" in x:
|
estrain@2
|
436 normal = float(special_gene_list[x])
|
estrain@2
|
437 elif "gntR-family-regulatory-protein_dt-negative" in x:
|
estrain@2
|
438 mutation = float(special_gene_list[x])
|
estrain@2
|
439 #print(normal,mutation)
|
estrain@2
|
440 if normal > mutation:
|
estrain@2
|
441 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
|
estrain@2
|
442 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip() # ed_SL_04152019: new output format
|
estrain@2
|
443 star = "*"
|
estrain@2
|
444 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
|
estrain@2
|
445 star_line = " The SNP that causes d-Tartrate nonfermentating phenotype was not detected. " # ed_SL_04152019: new output format
|
estrain@2
|
446 elif normal < mutation:
|
estrain@2
|
447 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
|
estrain@2
|
448 predict_sero = predict_sero.strip()
|
estrain@2
|
449 star = "*"
|
estrain@2
|
450 #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
|
estrain@2
|
451 star_line = " Detected the SNP for d-Tartrate nonfermenting phenotype." # ed_SL_04152019: new output format
|
estrain@2
|
452 else:
|
estrain@2
|
453 star = "*"
|
estrain@2
|
454 #star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
|
estrain@2
|
455 star_line = " " ## ed_SL_05152019: do not report this situation.
|
estrain@2
|
456 #special test for O13,22 and O13,23
|
estrain@2
|
457 if Otype=="13":
|
estrain@2
|
458 ex_dir = os.path.dirname(os.path.realpath(__file__))
|
estrain@2
|
459 f = open(ex_dir + '/special.pickle', 'rb')
|
estrain@2
|
460 special = pickle.load(f)
|
estrain@2
|
461 O22_O23=special['O22_O23']
|
estrain@2
|
462 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
|
estrain@2
|
463 O22_score=0
|
estrain@2
|
464 O23_score=0
|
estrain@2
|
465 for x in special_gene_list:
|
estrain@2
|
466 if "O:22" in x:
|
estrain@2
|
467 O22_score = O22_score+float(special_gene_list[x])
|
estrain@2
|
468 elif "O:23" in x:
|
estrain@2
|
469 O23_score = O23_score+float(special_gene_list[x])
|
estrain@2
|
470 #print(O22_score,O23_score)
|
estrain@2
|
471 for z in O22_O23[0]:
|
estrain@2
|
472 if predict_sero.split(" or ")[0] in z:
|
estrain@2
|
473 if O22_score > O23_score:
|
estrain@2
|
474 star = "*"
|
estrain@2
|
475 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
|
estrain@2
|
476 predict_sero = z[0]
|
estrain@2
|
477 elif O22_score < O23_score:
|
estrain@2
|
478 star = "*"
|
estrain@2
|
479 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
|
estrain@2
|
480 predict_sero = z[1]
|
estrain@2
|
481 else:
|
estrain@2
|
482 star = "*"
|
estrain@2
|
483 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
|
estrain@2
|
484 if " or " in predict_sero:
|
estrain@2
|
485 star_line = star_line + " The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
|
estrain@2
|
486 #special test for O6,8
|
estrain@2
|
487 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
|
estrain@2
|
488 #for x in merge_O68_list:
|
estrain@2
|
489 # if x in predict_sero:
|
estrain@2
|
490 # predict_sero=x
|
estrain@2
|
491 # star=""
|
estrain@2
|
492 # star_line=""
|
estrain@2
|
493 #special test for Montevideo; most of them are monophasic
|
estrain@2
|
494 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
|
estrain@2
|
495 #star="*"
|
estrain@2
|
496 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
|
estrain@2
|
497 return predict_form, predict_sero, star, star_line, claim
|
estrain@2
|
498 ### End of SeqSero Kmer part
|
estrain@2
|
499
|
estrain@2
|
500 ### Begin of SeqSero2 allele prediction and output
|
estrain@2
|
501 def xml_parse_score_comparision_seqsero(xmlfile):
|
estrain@2
|
502 #used to do seqsero xml analysis
|
estrain@2
|
503 from Bio.Blast import NCBIXML
|
estrain@2
|
504 handle=open(xmlfile)
|
estrain@2
|
505 handle=NCBIXML.parse(handle)
|
estrain@2
|
506 handle=list(handle)
|
estrain@2
|
507 List=[]
|
estrain@2
|
508 List_score=[]
|
estrain@2
|
509 List_ids=[]
|
estrain@2
|
510 List_query_region=[]
|
estrain@2
|
511 for i in range(len(handle)):
|
estrain@2
|
512 if len(handle[i].alignments)>0:
|
estrain@2
|
513 for j in range(len(handle[i].alignments)):
|
estrain@2
|
514 score=0
|
estrain@2
|
515 ids=0
|
estrain@2
|
516 cover_region=set() #fixed problem that repeated calculation leading percentage > 1
|
estrain@2
|
517 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def)
|
estrain@2
|
518 for z in range(len(handle[i].alignments[j].hsps)):
|
estrain@2
|
519 hsp=handle[i].alignments[j].hsps[z]
|
estrain@2
|
520 temp=set(range(hsp.query_start,hsp.query_end))
|
estrain@2
|
521 if len(cover_region)==0:
|
estrain@2
|
522 cover_region=cover_region|temp
|
estrain@2
|
523 fraction=1
|
estrain@2
|
524 else:
|
estrain@2
|
525 fraction=1-len(cover_region&temp)/float(len(temp))
|
estrain@2
|
526 cover_region=cover_region|temp
|
estrain@2
|
527 if "last" in handle[i].query or "first" in handle[i].query:
|
estrain@2
|
528 score+=hsp.bits*fraction
|
estrain@2
|
529 ids+=float(hsp.identities)/handle[i].query_length*fraction
|
estrain@2
|
530 else:
|
estrain@2
|
531 score+=hsp.bits*fraction
|
estrain@2
|
532 ids+=float(hsp.identities)/handle[i].query_length*fraction
|
estrain@2
|
533 List_score.append(score)
|
estrain@2
|
534 List_ids.append(ids)
|
estrain@2
|
535 List_query_region.append(cover_region)
|
estrain@2
|
536 temp=zip(List,List_score,List_ids,List_query_region)
|
estrain@2
|
537 Final_list=sorted(temp, key=lambda d:d[1], reverse = True)
|
estrain@2
|
538 return Final_list
|
estrain@2
|
539
|
estrain@2
|
540
|
estrain@2
|
541 def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
|
estrain@2
|
542 Old=L
|
estrain@2
|
543 L.sort()
|
estrain@2
|
544 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]]
|
estrain@2
|
545 count=[]
|
estrain@2
|
546 for j in range(len(L)):
|
estrain@2
|
547 y=0
|
estrain@2
|
548 for x in Old:
|
estrain@2
|
549 if L[j]==x:
|
estrain@2
|
550 y+=1
|
estrain@2
|
551 count.append(y)
|
estrain@2
|
552 if sort_on_fre!="none":
|
estrain@2
|
553 d=zip(*sorted(zip(count, L)))
|
estrain@2
|
554 L=d[1]
|
estrain@2
|
555 count=d[0]
|
estrain@2
|
556 return (L,count)
|
estrain@2
|
557
|
estrain@2
|
558 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
|
estrain@2
|
559 #used to predict it's fliC or fljB for one contig, based on tail and head score, but output the score difference,if it is very small, then not reliable, use blast score for whole contig to test
|
estrain@2
|
560 #this is mainly used for
|
estrain@2
|
561 a=nodes_vs_score_list
|
estrain@2
|
562 fliC_score=0
|
estrain@2
|
563 fljB_score=0
|
estrain@2
|
564 for z in a:
|
estrain@2
|
565 if "fliC" in z[0]:
|
estrain@2
|
566 fliC_score+=z[1]
|
estrain@2
|
567 elif "fljB" in z[0]:
|
estrain@2
|
568 fljB_score+=z[1]
|
estrain@2
|
569 if fliC_score>=fljB_score:
|
estrain@2
|
570 role="fliC"
|
estrain@2
|
571 else:
|
estrain@2
|
572 role="fljB"
|
estrain@2
|
573 return (role,abs(fliC_score-fljB_score))
|
estrain@2
|
574
|
estrain@2
|
575 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
|
estrain@2
|
576 #used to predict contig is fliC or fljB, if the differnce score value on above head_and_tail is less than 10 (quite small)
|
estrain@2
|
577 #also used when no head or tail got blasted score for the contig
|
estrain@2
|
578 role=""
|
estrain@2
|
579 for z in Final_list_passed:
|
estrain@2
|
580 if node_name in z[0]:
|
estrain@2
|
581 role=z[0].split("_")[0]
|
estrain@2
|
582 break
|
estrain@2
|
583 return role
|
estrain@2
|
584
|
estrain@2
|
585 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
|
estrain@2
|
586 #nodes_list is the c created by c,d=Uniq(nodes) in below function
|
estrain@2
|
587 first_target=""
|
estrain@2
|
588 role_list=[]
|
estrain@2
|
589 for x in nodes_list:
|
estrain@2
|
590 a=[]
|
estrain@2
|
591 role=""
|
estrain@2
|
592 for y in tail_head_list:
|
estrain@2
|
593 if x in y[0]:
|
estrain@2
|
594 a.append(y)
|
estrain@2
|
595 if len(a)==4:
|
estrain@2
|
596 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
|
estrain@2
|
597 if diff<20:
|
estrain@2
|
598 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
|
estrain@2
|
599 elif len(a)==3:
|
estrain@2
|
600 ###however, if the one with highest score is the fewer one, compare their accumulation score
|
estrain@2
|
601 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
|
estrain@2
|
602 if diff<20:
|
estrain@2
|
603 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
|
estrain@2
|
604 ###end of above score comparison
|
estrain@2
|
605 elif len(a)==2:
|
estrain@2
|
606 #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37)
|
estrain@2
|
607 temp=[]
|
estrain@2
|
608 for z in a:
|
estrain@2
|
609 temp.append(z[0].split("_")[0])
|
estrain@2
|
610 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
|
estrain@2
|
611 if len(m)==1:
|
estrain@2
|
612 pass
|
estrain@2
|
613 else:
|
estrain@2
|
614 pass
|
estrain@2
|
615 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
|
estrain@2
|
616 if diff<20:
|
estrain@2
|
617 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
|
estrain@2
|
618 ###need to desgin a algorithm to guess most possible situation for nodes_list, See the situations of test evaluation
|
estrain@2
|
619 elif len(a)==1:
|
estrain@2
|
620 #that one
|
estrain@2
|
621 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
|
estrain@2
|
622 if diff<20:
|
estrain@2
|
623 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
|
estrain@2
|
624 #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0"
|
estrain@2
|
625 else:#a==0
|
estrain@2
|
626 #use Final_list_passed best match
|
estrain@2
|
627 for z in Final_list_passed:
|
estrain@2
|
628 if x in z[0]:
|
estrain@2
|
629 role=z[0].split("_")[0]
|
estrain@2
|
630 break
|
estrain@2
|
631 #print x,role,len(a)
|
estrain@2
|
632 role_list.append((role,x))
|
estrain@2
|
633 if len(role_list)==2:
|
estrain@2
|
634 if role_list[0][0]==role_list[1][0]:#this is the most cocmmon error, two antigen were assigned to same phase
|
estrain@2
|
635 #just use score to do a final test
|
estrain@2
|
636 role_list=[]
|
estrain@2
|
637 for x in nodes_list:
|
estrain@2
|
638 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
|
estrain@2
|
639 role_list.append((role,x))
|
estrain@2
|
640 return role_list
|
estrain@2
|
641
|
estrain@2
|
642 def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
|
estrain@2
|
643 #used to decide which contig is FliC and which one is fljB
|
estrain@2
|
644 contigs=[]
|
estrain@2
|
645 nodes=[]
|
estrain@2
|
646 for x in Final_list_passed:
|
estrain@2
|
647 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]:
|
estrain@2
|
648 nodes.append(x[0].split("___")[1].strip())
|
estrain@2
|
649 c,d=Uniq(nodes)#c is node_list
|
estrain@2
|
650 #print c
|
estrain@2
|
651 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])]
|
estrain@2
|
652 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list,Final_list_passed)
|
estrain@2
|
653 return roles
|
estrain@2
|
654
|
estrain@2
|
655 def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
|
estrain@2
|
656 #decide O based on Final_list
|
estrain@2
|
657 O_choice="?"
|
estrain@2
|
658 O_list=[]
|
estrain@2
|
659 special_genes={}
|
estrain@2
|
660 nodes=[]
|
estrain@2
|
661 for x in Final_list_passed:
|
estrain@2
|
662 if x[0].startswith("O-"):
|
estrain@2
|
663 nodes.append(x[0].split("___")[1].strip())
|
estrain@2
|
664 elif not x[0].startswith("fl"):
|
estrain@2
|
665 special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
|
estrain@2
|
666 #print "special_genes:",special_genes
|
estrain@2
|
667 c,d=Uniq(nodes)
|
estrain@2
|
668 #print "potential O antigen contig",c
|
estrain@2
|
669 final_O=[]
|
estrain@2
|
670 O_nodes_list=[]
|
estrain@2
|
671 for x in c:#c is the list for contigs
|
estrain@2
|
672 temp=0
|
estrain@2
|
673 for y in Final_list_passed:
|
estrain@2
|
674 if x in y[0] and y[0].startswith("O-"):
|
estrain@2
|
675 final_O.append(y)
|
estrain@2
|
676 break
|
estrain@2
|
677 ### O contig has the problem of two genes on same contig, so do additional test
|
estrain@2
|
678 potenial_new_gene=""
|
estrain@2
|
679 for x in final_O:
|
estrain@2
|
680 pointer=0 #for genes merged or not
|
estrain@2
|
681 #not consider O-1,3,19_not_in_3,10, too short compared with others
|
estrain@2
|
682 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])*x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 300*2 (for secureity can use 400*2) as flank region
|
estrain@2
|
683 pointer=x[0].split("___")[1].strip()#store the contig name
|
estrain@2
|
684 print(pointer)
|
estrain@2
|
685 if pointer!=0:#it has potential merge event
|
estrain@2
|
686 for y in Final_list:
|
estrain@2
|
687 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):#that's a realtively strict filter now; if passed, it has merge event and add one more to final_O
|
estrain@2
|
688 potenial_new_gene=y
|
estrain@2
|
689 #print(potenial_new_gene)
|
estrain@2
|
690 break
|
estrain@2
|
691 if potenial_new_gene!="":
|
estrain@2
|
692 print("two differnt genes in same contig, fix it for O antigen")
|
estrain@2
|
693 print(potenial_new_gene[:3])
|
estrain@2
|
694 pointer=0
|
estrain@2
|
695 for y in final_O:
|
estrain@2
|
696 if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
|
estrain@2
|
697 pointer=1
|
estrain@2
|
698 if pointer!=0: #changed to consider two genes in same contig
|
estrain@2
|
699 final_O.append(potenial_new_gene)
|
estrain@2
|
700 ### end of the two genes on same contig test
|
estrain@2
|
701 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
|
estrain@2
|
702 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
|
estrain@2
|
703 #print "$$$No Otype, due to no hit"#may need to be changed
|
estrain@2
|
704 O_choice="-"
|
estrain@2
|
705 else:
|
estrain@2
|
706 highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
|
estrain@2
|
707 O_list=[]
|
estrain@2
|
708 O_list_less_contamination=[]
|
estrain@2
|
709 for x in final_O:
|
estrain@2
|
710 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
|
estrain@2
|
711 O_list.append(x[0].split("__")[0])
|
estrain@2
|
712 O_nodes_list.append(x[0].split("___")[1])
|
estrain@2
|
713 if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:
|
estrain@2
|
714 O_list_less_contamination.append(x[0].split("__")[0])
|
estrain@2
|
715 ### special test for O9,46 and O3,10 family
|
estrain@2
|
716 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
|
estrain@2
|
717 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
|
estrain@2
|
718 O_choice="O-9,46"
|
estrain@2
|
719 #print "$$$Most possilble Otype: O-9,46"
|
estrain@2
|
720 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
|
estrain@2
|
721 O_choice="O-9,46,27"
|
estrain@2
|
722 #print "$$$Most possilble Otype: O-9,46,27"
|
estrain@2
|
723 else:
|
estrain@2
|
724 O_choice="O-9"#next, detect O9 vs O2?
|
estrain@2
|
725 O2=0
|
estrain@2
|
726 O9=0
|
estrain@2
|
727 for z in special_genes:
|
estrain@2
|
728 if "tyr-O-9" in z:
|
estrain@2
|
729 O9=special_genes[z]
|
estrain@2
|
730 elif "tyr-O-2" in z:
|
estrain@2
|
731 O2=special_genes[z]
|
estrain@2
|
732 if O2>O9:
|
estrain@2
|
733 O_choice="O-2"
|
estrain@2
|
734 elif O2<O9:
|
estrain@2
|
735 pass
|
estrain@2
|
736 else:
|
estrain@2
|
737 pass
|
estrain@2
|
738 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
|
estrain@2
|
739 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
|
estrain@2
|
740 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
|
estrain@2
|
741 O_choice="O-3,10"
|
estrain@2
|
742 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
|
estrain@2
|
743 else:
|
estrain@2
|
744 O_choice="O-1,3,19"
|
estrain@2
|
745 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
|
estrain@2
|
746 ### end of special test for O9,46 and O3,10 family
|
estrain@2
|
747 else:
|
estrain@2
|
748 try:
|
estrain@2
|
749 max_score=0
|
estrain@2
|
750 for x in final_O:
|
estrain@2
|
751 if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
|
estrain@2
|
752 max_score=x[2]#change from x[-1] to x[2],08172018
|
estrain@2
|
753 O_choice=x[0].split("_")[0]
|
estrain@2
|
754 if O_choice=="O-1,3,19":
|
estrain@2
|
755 O_choice=final_O[1][0].split("_")[0]
|
estrain@2
|
756 #print "$$$Most possilble Otype: ",O_choice
|
estrain@2
|
757 except:
|
estrain@2
|
758 pass
|
estrain@2
|
759 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
|
estrain@2
|
760 #print "O:",O_choice,O_nodes_list
|
estrain@2
|
761 Otypes=[]
|
estrain@2
|
762 for x in O_list:
|
estrain@2
|
763 if x!="O-1,3,19_not_in_3,10":
|
estrain@2
|
764 if "O-9,46_" not in x:
|
estrain@2
|
765 Otypes.append(x.split("_")[0])
|
estrain@2
|
766 else:
|
estrain@2
|
767 Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254
|
estrain@2
|
768 #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"]
|
estrain@2
|
769 Otypes_uniq,Otypes_fre=Uniq(Otypes)
|
estrain@2
|
770 contamination_O=""
|
estrain@2
|
771 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
|
estrain@2
|
772 if len(Otypes_uniq)>2:
|
estrain@2
|
773 contamination_O="potential contamination from O antigen signals"
|
estrain@2
|
774 else:
|
estrain@2
|
775 if len(Otypes_uniq)>1:
|
estrain@2
|
776 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
|
estrain@2
|
777 contamination_O=""
|
estrain@2
|
778 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
|
estrain@2
|
779 contamination_O=""
|
estrain@2
|
780 else:
|
estrain@2
|
781 contamination_O="potential contamination from O antigen signals"
|
estrain@2
|
782 return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
|
estrain@2
|
783 ### End of SeqSero2 allele prediction and output
|
estrain@2
|
784
|
estrain@2
|
785 def get_input_files(make_dir,input_file,data_type,dirpath):
|
estrain@2
|
786 #tell input files from datatype
|
estrain@2
|
787 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
|
estrain@2
|
788 for_fq=""
|
estrain@2
|
789 rev_fq=""
|
estrain@2
|
790 os.chdir(make_dir)
|
estrain@2
|
791 if data_type=="1":
|
estrain@2
|
792 input_file=input_file[0].split("/")[-1]
|
estrain@2
|
793 if input_file.endswith(".sra"):
|
estrain@2
|
794 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True)
|
estrain@2
|
795 for_fq=input_file.replace(".sra","_1.fastq")
|
estrain@2
|
796 rev_fq=input_file.replace(".sra","_2.fastq")
|
estrain@2
|
797 else:
|
estrain@2
|
798 core_id=input_file.split(".fastq")[0].split(".fq")[0]
|
estrain@2
|
799 for_fq=core_id+"_1.fastq"
|
estrain@2
|
800 rev_fq=core_id+"_2.fastq"
|
estrain@2
|
801 if input_file.endswith(".gz"):
|
estrain@2
|
802 subprocess.check_call("gzip -dc "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True)
|
estrain@2
|
803 else:
|
estrain@2
|
804 subprocess.check_call("cat "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True)
|
estrain@2
|
805 elif data_type=="2":
|
estrain@2
|
806 for_fq=input_file[0].split("/")[-1]
|
estrain@2
|
807 rev_fq=input_file[1].split("/")[-1]
|
estrain@2
|
808 elif data_type=="3":
|
estrain@2
|
809 input_file=input_file[0].split("/")[-1]
|
estrain@2
|
810 if input_file.endswith(".sra"):
|
estrain@2
|
811 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True)
|
estrain@2
|
812 for_fq=input_file.replace(".sra","_1.fastq")
|
estrain@2
|
813 else:
|
estrain@2
|
814 for_fq=input_file
|
estrain@2
|
815 elif data_type in ["4","5","6"]:
|
estrain@2
|
816 for_fq=input_file[0].split("/")[-1]
|
estrain@2
|
817 os.chdir("..")
|
estrain@2
|
818 return for_fq,rev_fq
|
estrain@2
|
819
|
estrain@2
|
820 def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
|
estrain@2
|
821 #get O and H types from Final_list from blast parsing; allele mode
|
estrain@2
|
822 from Bio import SeqIO
|
estrain@2
|
823 fliC_choice="-"
|
estrain@2
|
824 fljB_choice="-"
|
estrain@2
|
825 fliC_contig="NA"
|
estrain@2
|
826 fljB_contig="NA"
|
estrain@2
|
827 fliC_region=set([0])
|
estrain@2
|
828 fljB_region=set([0,])
|
estrain@2
|
829 fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
|
estrain@2
|
830 fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
|
estrain@2
|
831 O_choice="-"#no need to decide O contig for now, should be only one
|
estrain@2
|
832 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
|
estrain@2
|
833 O_choice=O_choice.split("-")[-1].strip()
|
estrain@2
|
834 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
|
estrain@2
|
835 O_choice="-"
|
estrain@2
|
836 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
|
estrain@2
|
837 #add alignment locations, used for further selection, 03312019
|
estrain@2
|
838 for i in range(len(H_contig_roles)):
|
estrain@2
|
839 x=H_contig_roles[i]
|
estrain@2
|
840 for y in Final_list_passed:
|
estrain@2
|
841 if x[1] in y[0] and y[0].startswith(x[0]):
|
estrain@2
|
842 H_contig_roles[i]+=H_contig_roles[i]+(y[-1],)
|
estrain@2
|
843 break
|
estrain@2
|
844 log_file=open("SeqSero_log.txt","a")
|
estrain@2
|
845 extract_file=open("Extracted_antigen_alleles.fasta","a")
|
estrain@2
|
846 handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
|
estrain@2
|
847
|
estrain@2
|
848 #print("O_contigs:")
|
estrain@2
|
849 log_file.write("O_contigs:\n")
|
estrain@2
|
850 extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
|
estrain@2
|
851 extract_file.write("#O_contigs:\n")
|
estrain@2
|
852 for x in O_nodes_roles:
|
estrain@2
|
853 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
|
estrain@2
|
854 #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
|
estrain@2
|
855 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
|
estrain@2
|
856 title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
|
estrain@2
|
857 seqs=""
|
estrain@2
|
858 for z in handle_fasta:
|
estrain@2
|
859 if x[0].split("___")[-1]==z.description:
|
estrain@2
|
860 seqs=str(z.seq)
|
estrain@2
|
861 extract_file.write(title+seqs+"\n")
|
estrain@2
|
862 if len(H_contig_roles)!=0:
|
estrain@2
|
863 highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
|
estrain@2
|
864 else:
|
estrain@2
|
865 highest_H_coverage=0
|
estrain@2
|
866 for x in H_contig_roles:
|
estrain@2
|
867 #if multiple choices, temporately select the one with longest length for now, will revise in further change
|
estrain@2
|
868 if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
|
estrain@2
|
869 fliC_contig=x[1]
|
estrain@2
|
870 fliC_length=len(x[-1])
|
estrain@2
|
871 elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
|
estrain@2
|
872 fljB_contig=x[1]
|
estrain@2
|
873 fljB_length=len(x[-1])
|
estrain@2
|
874 for x in Final_list_passed:
|
estrain@2
|
875 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
|
estrain@2
|
876 fliC_choice=x[0].split("_")[1]
|
estrain@2
|
877 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
|
estrain@2
|
878 fljB_choice=x[0].split("_")[1]
|
estrain@2
|
879 elif fliC_choice!="-" and fljB_choice!="-":
|
estrain@2
|
880 break
|
estrain@2
|
881 #now remove contigs not in middle core part
|
estrain@2
|
882 first_allele="NA"
|
estrain@2
|
883 first_allele_percentage=0
|
estrain@2
|
884 for x in Final_list:
|
estrain@2
|
885 if x[0].startswith("fliC") or x[0].startswith("fljB"):
|
estrain@2
|
886 first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
|
estrain@2
|
887 first_allele_percentage=x[2]
|
estrain@2
|
888 break
|
estrain@2
|
889 additional_contigs=[]
|
estrain@2
|
890 for x in Final_list:
|
estrain@2
|
891 if first_allele in x[0]:
|
estrain@2
|
892 if (fliC_contig == x[0].split("___")[-1]):
|
estrain@2
|
893 fliC_region=x[3]
|
estrain@2
|
894 elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
|
estrain@2
|
895 fljB_region=x[3]
|
estrain@2
|
896 else:
|
estrain@2
|
897 if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
|
estrain@2
|
898 additional_contigs.append(x)
|
estrain@2
|
899 #else:
|
estrain@2
|
900 #print x[:3]
|
estrain@2
|
901 #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off
|
estrain@2
|
902 if first_allele_percentage>0.9:
|
estrain@2
|
903 if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
|
estrain@2
|
904 target_region=fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
|
estrain@2
|
905 elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
|
estrain@2
|
906 target_region=fliC_region|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
|
estrain@2
|
907 else:
|
estrain@2
|
908 target_region=set()#doesn't do anything
|
estrain@2
|
909 else:
|
estrain@2
|
910 target_region=set()#doesn't do anything
|
estrain@2
|
911 #print(target_region)
|
estrain@2
|
912 #print(additional_contigs)
|
estrain@2
|
913 target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best
|
estrain@2
|
914 target_region=target_region2|target_region
|
estrain@2
|
915 for x in additional_contigs:
|
estrain@2
|
916 removal=0
|
estrain@2
|
917 contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
|
estrain@2
|
918 if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length*0.5<len(x[3])<contig_length*1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
|
estrain@2
|
919 removal=1
|
estrain@2
|
920 else:
|
estrain@2
|
921 if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part)
|
estrain@2
|
922 removal=1
|
estrain@2
|
923 else:
|
estrain@2
|
924 pass
|
estrain@2
|
925 if removal==1:
|
estrain@2
|
926 for y in H_contig_roles:
|
estrain@2
|
927 if y[1] in x[0]:
|
estrain@2
|
928 H_contig_roles.remove(y)
|
estrain@2
|
929 else:
|
estrain@2
|
930 pass
|
estrain@2
|
931 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
|
estrain@2
|
932 #end of removing none-middle contigs
|
estrain@2
|
933 #print("H_contigs:")
|
estrain@2
|
934 log_file.write("H_contigs:\n")
|
estrain@2
|
935 extract_file.write("#H_contigs:\n")
|
estrain@2
|
936 H_contig_stat=[]
|
estrain@2
|
937 H1_cont_stat={}
|
estrain@2
|
938 H2_cont_stat={}
|
estrain@2
|
939 for i in range(len(H_contig_roles)):
|
estrain@2
|
940 x=H_contig_roles[i]
|
estrain@2
|
941 a=0
|
estrain@2
|
942 for y in Final_list_passed:
|
estrain@2
|
943 if x[1] in y[0] and y[0].startswith(x[0]):
|
estrain@2
|
944 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
|
estrain@2
|
945 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
|
estrain@2
|
946 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
|
estrain@2
|
947 #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
|
estrain@2
|
948 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
|
estrain@2
|
949 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
|
estrain@2
|
950 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n"
|
estrain@2
|
951 seqs=""
|
estrain@2
|
952 for z in handle_fasta:
|
estrain@2
|
953 if x[1]==z.description:
|
estrain@2
|
954 seqs=str(z.seq)
|
estrain@2
|
955 extract_file.write(title+seqs+"\n")
|
estrain@2
|
956 break
|
estrain@2
|
957 else:
|
estrain@2
|
958 #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
|
estrain@2
|
959 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
|
estrain@2
|
960 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
|
estrain@2
|
961 seqs=""
|
estrain@2
|
962 for z in handle_fasta:
|
estrain@2
|
963 if x[1]==z.description:
|
estrain@2
|
964 seqs=str(z.seq)
|
estrain@2
|
965 extract_file.write(title+seqs+"\n")
|
estrain@2
|
966 if x[0]=="fliC":
|
estrain@2
|
967 if y[0].split("_")[1] not in H1_cont_stat:
|
estrain@2
|
968 H1_cont_stat[y[0].split("_")[1]]=y[2]
|
estrain@2
|
969 else:
|
estrain@2
|
970 H1_cont_stat[y[0].split("_")[1]]+=y[2]
|
estrain@2
|
971 if x[0]=="fljB":
|
estrain@2
|
972 if y[0].split("_")[1] not in H2_cont_stat:
|
estrain@2
|
973 H2_cont_stat[y[0].split("_")[1]]=y[2]
|
estrain@2
|
974 else:
|
estrain@2
|
975 H2_cont_stat[y[0].split("_")[1]]+=y[2]
|
estrain@2
|
976 break
|
estrain@2
|
977 #detect contaminations
|
estrain@2
|
978 #print(H1_cont_stat)
|
estrain@2
|
979 #print(H2_cont_stat)
|
estrain@2
|
980 H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
|
estrain@2
|
981 H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
|
estrain@2
|
982 contamination_H=""
|
estrain@2
|
983 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
|
estrain@2
|
984 contamination_H="potential contamination from H antigen signals"
|
estrain@2
|
985 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
|
estrain@2
|
986 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
|
estrain@2
|
987 #get additional antigens
|
estrain@2
|
988 """
|
estrain@2
|
989 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
|
estrain@2
|
990 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
|
estrain@2
|
991 O_choice="O-9,46"
|
estrain@2
|
992 #print "$$$Most possilble Otype: O-9,46"
|
estrain@2
|
993 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
|
estrain@2
|
994 O_choice="O-9,46,27"
|
estrain@2
|
995 #print "$$$Most possilble Otype: O-9,46,27"
|
estrain@2
|
996 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
|
estrain@2
|
997 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
|
estrain@2
|
998 O_choice="O-3,10"
|
estrain@2
|
999 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
|
estrain@2
|
1000 else:
|
estrain@2
|
1001 O_choice="O-1,3,19"
|
estrain@2
|
1002 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
|
estrain@2
|
1003 ### end of special test for O9,46 and O3,10 family
|
estrain@2
|
1004
|
estrain@2
|
1005 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
|
estrain@2
|
1006 if len(Otypes_uniq)>2:
|
estrain@2
|
1007 contamination_O="potential contamination from O antigen signals"
|
estrain@2
|
1008 else:
|
estrain@2
|
1009 if len(Otypes_uniq)>1:
|
estrain@2
|
1010 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
|
estrain@2
|
1011 contamination_O=""
|
estrain@2
|
1012 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
|
estrain@2
|
1013 contamination_O=""
|
estrain@2
|
1014 """
|
estrain@2
|
1015 additonal_antigents=[]
|
estrain@2
|
1016 #print(contamination_O)
|
estrain@2
|
1017 #print(contamination_H)
|
estrain@2
|
1018 log_file.write(contamination_O+"\n")
|
estrain@2
|
1019 log_file.write(contamination_H+"\n")
|
estrain@2
|
1020 log_file.close()
|
estrain@2
|
1021 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
|
estrain@2
|
1022
|
estrain@2
|
1023 def get_input_K(input_file,lib_dict,data_type,k_size):
|
estrain@2
|
1024 #kmer mode; get input_Ks from dict and data_type
|
estrain@2
|
1025 kmers = []
|
estrain@2
|
1026 for h in lib_dict:
|
estrain@2
|
1027 kmers += lib_dict[h]
|
estrain@2
|
1028 if data_type == '4':
|
estrain@2
|
1029 input_Ks = target_multifasta_kmerizer(input_file, k_size, set(kmers))
|
estrain@2
|
1030 elif data_type == '1' or data_type == '2' or data_type == '3':#set it for now, will change later
|
estrain@2
|
1031 input_Ks = target_read_kmerizer(input_file, k_size, set(kmers))
|
estrain@2
|
1032 elif data_type == '5':#minion_2d_fasta
|
estrain@2
|
1033 input_Ks = minion_fasta_kmerizer(input_file, k_size, set(kmers))
|
estrain@2
|
1034 if data_type == '6':#minion_2d_fastq
|
estrain@2
|
1035 input_Ks = minion_fastq_kmerizer(input_file, k_size, set(kmers))
|
estrain@2
|
1036 return input_Ks
|
estrain@2
|
1037
|
estrain@2
|
1038 def get_kmer_dict(lib_dict,input_Ks):
|
estrain@2
|
1039 #kmer mode; get predicted types
|
estrain@2
|
1040 O_dict = {}
|
estrain@2
|
1041 H_dict = {}
|
estrain@2
|
1042 Special_dict = {}
|
estrain@2
|
1043 for h in lib_dict:
|
estrain@2
|
1044 score = (len(lib_dict[h] & input_Ks) / len(lib_dict[h])) * 100
|
estrain@2
|
1045 if score > 1: # Arbitrary cut-off for similarity score very low but seems necessary to detect O-3,10 in some cases
|
estrain@2
|
1046 if h.startswith('O-') and score > 25:
|
estrain@2
|
1047 O_dict[h] = score
|
estrain@2
|
1048 if h.startswith('fl') and score > 40:
|
estrain@2
|
1049 H_dict[h] = score
|
estrain@2
|
1050 if (h[:2] != 'fl') and (h[:2] != 'O-'):
|
estrain@2
|
1051 Special_dict[h] = score
|
estrain@2
|
1052 return O_dict,H_dict,Special_dict
|
estrain@2
|
1053
|
estrain@2
|
1054 def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
|
estrain@2
|
1055 log_file=open("SeqSero_log.txt","a")
|
estrain@2
|
1056 log_file.write("O_scores:\n")
|
estrain@2
|
1057 #call O:
|
estrain@2
|
1058 highest_O = '-'
|
estrain@2
|
1059 if len(O_dict) == 0:
|
estrain@2
|
1060 pass
|
estrain@2
|
1061 else:
|
estrain@2
|
1062 for x in O_dict:
|
estrain@2
|
1063 log_file.write(x+"\t"+str(O_dict[x])+"\n")
|
estrain@2
|
1064 if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1
|
estrain@2
|
1065 if 'O-9,46_wzy__1191' in O_dict: # and float(O946_wzy)/float(num_1) > 0.1
|
estrain@2
|
1066 highest_O = "O-9,46"
|
estrain@2
|
1067 elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1
|
estrain@2
|
1068 highest_O = "O-9,46,27"
|
estrain@2
|
1069 else:
|
estrain@2
|
1070 highest_O = "O-9" # next, detect O9 vs O2?
|
estrain@2
|
1071 O2 = 0
|
estrain@2
|
1072 O9 = 0
|
estrain@2
|
1073 for z in Special_dict:
|
estrain@2
|
1074 if "tyr-O-9" in z:
|
estrain@2
|
1075 O9 = float(Special_dict[z])
|
estrain@2
|
1076 if "tyr-O-2" in z:
|
estrain@2
|
1077 O2 = float(Special_dict[z])
|
estrain@2
|
1078 if O2 > O9:
|
estrain@2
|
1079 highest_O = "O-2"
|
estrain@2
|
1080 elif ("O-3,10_wzx__1539" in O_dict) and (
|
estrain@2
|
1081 "O-9,46_wzy__1191" in O_dict
|
estrain@2
|
1082 ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
|
estrain@2
|
1083 if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1
|
estrain@2
|
1084 highest_O = "O-3,10"
|
estrain@2
|
1085 else:
|
estrain@2
|
1086 highest_O = "O-1,3,19"
|
estrain@2
|
1087 ### end of special test for O9,46 and O3,10 family
|
estrain@2
|
1088 else:
|
estrain@2
|
1089 try:
|
estrain@2
|
1090 max_score = 0
|
estrain@2
|
1091 for x in O_dict:
|
estrain@2
|
1092 if float(O_dict[x]) >= max_score:
|
estrain@2
|
1093 max_score = float(O_dict[x])
|
estrain@2
|
1094 highest_O = x.split("_")[0]
|
estrain@2
|
1095 if highest_O == "O-1,3,19":
|
estrain@2
|
1096 highest_O = '-'
|
estrain@2
|
1097 max_score = 0
|
estrain@2
|
1098 for x in O_dict:
|
estrain@2
|
1099 if x == 'O-1,3,19_not_in_3,10__130':
|
estrain@2
|
1100 pass
|
estrain@2
|
1101 else:
|
estrain@2
|
1102 if float(O_dict[x]) >= max_score:
|
estrain@2
|
1103 max_score = float(O_dict[x])
|
estrain@2
|
1104 highest_O = x.split("_")[0]
|
estrain@2
|
1105 except:
|
estrain@2
|
1106 pass
|
estrain@2
|
1107 #call_fliC:
|
estrain@2
|
1108 if len(H_dict)!=0:
|
estrain@2
|
1109 highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not
|
estrain@2
|
1110 else:
|
estrain@2
|
1111 highest_H_score_both_BC=0
|
estrain@2
|
1112 highest_fliC = '-'
|
estrain@2
|
1113 highest_fliC_raw = '-'
|
estrain@2
|
1114 highest_Score = 0
|
estrain@2
|
1115 log_file.write("\nH_scores:\n")
|
estrain@2
|
1116 for s in H_dict:
|
estrain@2
|
1117 log_file.write(s+"\t"+str(H_dict[s])+"\n")
|
estrain@2
|
1118 if s.startswith('fliC'):
|
estrain@2
|
1119 if float(H_dict[s]) > highest_Score:
|
estrain@2
|
1120 highest_fliC = s.split('_')[1]
|
estrain@2
|
1121 highest_fliC_raw = s
|
estrain@2
|
1122 highest_Score = float(H_dict[s])
|
estrain@2
|
1123 #call_fljB
|
estrain@2
|
1124 highest_fljB = '-'
|
estrain@2
|
1125 highest_fljB_raw = '-'
|
estrain@2
|
1126 highest_Score = 0
|
estrain@2
|
1127 for s in H_dict:
|
estrain@2
|
1128 if s.startswith('fljB'):
|
estrain@2
|
1129 if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage, currently 0.65 seems pretty good; the reason use a high (0.65) is some fliC and fljB shared with each other
|
estrain@2
|
1130 highest_fljB = s.split('_')[1]
|
estrain@2
|
1131 highest_fljB_raw = s
|
estrain@2
|
1132 highest_Score = float(H_dict[s])
|
estrain@2
|
1133 log_file.write("\nSpecial_scores:\n")
|
estrain@2
|
1134 for s in Special_dict:
|
estrain@2
|
1135 log_file.write(s+"\t"+str(Special_dict[s])+"\n")
|
estrain@2
|
1136 log_file.close()
|
estrain@2
|
1137 return highest_O,highest_fliC,highest_fljB
|
estrain@2
|
1138
|
estrain@2
|
1139 def get_temp_file_names(for_fq,rev_fq):
|
estrain@2
|
1140 #seqsero2 -a; get temp file names
|
estrain@2
|
1141 sam=for_fq+".sam"
|
estrain@2
|
1142 bam=for_fq+".bam"
|
estrain@2
|
1143 sorted_bam=for_fq+"_sorted.bam"
|
estrain@2
|
1144 mapped_fq1=for_fq+"_mapped.fq"
|
estrain@2
|
1145 mapped_fq2=rev_fq+"_mapped.fq"
|
estrain@2
|
1146 combined_fq=for_fq+"_combined.fq"
|
estrain@2
|
1147 for_sai=for_fq+".sai"
|
estrain@2
|
1148 rev_sai=rev_fq+".sai"
|
estrain@2
|
1149 return sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai
|
estrain@2
|
1150
|
estrain@2
|
1151 def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
|
estrain@2
|
1152 #seqsero2 -a; do mapping and sort
|
estrain@2
|
1153 print("building database...")
|
estrain@2
|
1154 subprocess.check_call("bwa index "+database+ " 2>> data_log.txt",shell=True)
|
estrain@2
|
1155 print("mapping...")
|
estrain@2
|
1156 if mapping_mode=="mem":
|
estrain@2
|
1157 subprocess.check_call("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
|
estrain@2
|
1158 elif mapping_mode=="sam":
|
estrain@2
|
1159 if fnameB!="":
|
estrain@2
|
1160 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
|
estrain@2
|
1161 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai+ " 2>> data_log.txt",shell=True)
|
estrain@2
|
1162 subprocess.check_call("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
|
estrain@2
|
1163 else:
|
estrain@2
|
1164 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
|
estrain@2
|
1165 subprocess.check_call("bwa samse "+database+" "+for_sai+" "+for_fq+" > "+sam)
|
estrain@2
|
1166 subprocess.check_call("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True)
|
estrain@2
|
1167 ### check the version of samtools then use differnt commands
|
estrain@2
|
1168 samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
|
estrain@2
|
1169 out, err = samtools_version.communicate()
|
estrain@2
|
1170 version = str(err).split("ersion:")[1].strip().split(" ")[0].strip()
|
estrain@2
|
1171 print("check samtools version:",version)
|
estrain@2
|
1172 ### end of samtools version check and its analysis
|
estrain@2
|
1173 if LooseVersion(version)<=LooseVersion("1.2"):
|
estrain@2
|
1174 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True)
|
estrain@2
|
1175 else:
|
estrain@2
|
1176 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True)
|
estrain@2
|
1177
|
estrain@2
|
1178 def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode):
|
estrain@2
|
1179 #seqsero2 -a; extract, assembly and blast
|
estrain@2
|
1180 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True)
|
estrain@2
|
1181 #print("fnameA:",fnameA)
|
estrain@2
|
1182 #print("fnameB:",fnameB)
|
estrain@2
|
1183 if fnameB!="":
|
estrain@2
|
1184 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 + " 2>> data_log.txt",shell=True)#2> /dev/null if want no output
|
estrain@2
|
1185 else:
|
estrain@2
|
1186 pass
|
estrain@2
|
1187 outdir=current_time+"_temp"
|
estrain@2
|
1188 print("assembling...")
|
estrain@2
|
1189 if int(threads)>4:
|
estrain@2
|
1190 t="4"
|
estrain@2
|
1191 else:
|
estrain@2
|
1192 t=threads
|
estrain@2
|
1193 if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-"
|
estrain@2
|
1194 if fnameB!="":
|
estrain@2
|
1195 subprocess.check_call("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
|
estrain@2
|
1196 else:
|
estrain@2
|
1197 subprocess.check_call("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
|
estrain@2
|
1198 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
|
estrain@2
|
1199 subprocess.check_call("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True)
|
estrain@2
|
1200 #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta
|
estrain@2
|
1201 subprocess.check_call("rm -rf "+outdir+ " 2> /dev/null",shell=True)
|
estrain@2
|
1202 print("blasting...","\n")
|
estrain@2
|
1203 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
|
estrain@2
|
1204 subprocess.check_call('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl >> data_log.txt 2>&1',shell=True) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015
|
estrain@2
|
1205 subprocess.check_call("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5 >> data_log.txt 2>&1",shell=True)###1/27/2015; 08272018, remove "-word_size 10"
|
estrain@2
|
1206 else:
|
estrain@2
|
1207 xmlfile="NA"
|
estrain@2
|
1208 return xmlfile,new_fasta
|
estrain@2
|
1209
|
estrain@2
|
1210 def judge_subspecies(fnameA):
|
estrain@2
|
1211 #seqsero2 -a; judge subspecies on just forward raw reads fastq
|
estrain@2
|
1212 salmID_output=subprocess.Popen("SalmID.py -i "+fnameA,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
|
estrain@2
|
1213 out, err = salmID_output.communicate()
|
estrain@2
|
1214 out=out.decode("utf-8")
|
estrain@2
|
1215 file=open("data_log.txt","a")
|
estrain@2
|
1216 file.write(out)
|
estrain@2
|
1217 file.close()
|
estrain@2
|
1218 salm_species_scores=out.split("\n")[1].split("\t")[6:]
|
estrain@2
|
1219 salm_species_results=out.split("\n")[0].split("\t")[6:]
|
estrain@2
|
1220 max_score=0
|
estrain@2
|
1221 max_score_index=1 #default is 1, means "I"
|
estrain@2
|
1222 for i in range(len(salm_species_scores)):
|
estrain@2
|
1223 if max_score<float(salm_species_scores[i]):
|
estrain@2
|
1224 max_score=float(salm_species_scores[i])
|
estrain@2
|
1225 max_score_index=i
|
estrain@2
|
1226 prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0]
|
estrain@2
|
1227 if float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): #bongori and enterica compare
|
estrain@2
|
1228 prediction="bongori" #if not, the prediction would always be enterica, since they are located in the later part
|
estrain@2
|
1229 if max_score<10:
|
estrain@2
|
1230 prediction="-"
|
estrain@2
|
1231 return prediction
|
estrain@2
|
1232
|
estrain@2
|
1233 def judge_subspecies_Kmer(Special_dict):
|
estrain@2
|
1234 #seqsero2 -k;
|
estrain@2
|
1235 max_score=0
|
estrain@2
|
1236 prediction="-" #default should be I
|
estrain@2
|
1237 for x in Special_dict:
|
estrain@2
|
1238 if "mer" in x:
|
estrain@2
|
1239 if max_score<float(Special_dict[x]):
|
estrain@2
|
1240 max_score=float(Special_dict[x])
|
estrain@2
|
1241 prediction=x.split("_")[-1].strip()
|
estrain@2
|
1242 if x.split("_")[-1].strip()=="bongori" and float(Special_dict[x])>95:#if bongori already, then no need to test enterica
|
estrain@2
|
1243 prediction="bongori"
|
estrain@2
|
1244 break
|
estrain@2
|
1245 return prediction
|
estrain@2
|
1246
|
estrain@2
|
1247 def main():
|
estrain@2
|
1248 #combine SeqSeroK and SeqSero2, also with SalmID
|
estrain@2
|
1249 args = parse_args()
|
estrain@2
|
1250 input_file = args.i
|
estrain@2
|
1251 data_type = args.t
|
estrain@2
|
1252 analysis_mode = args.m
|
estrain@2
|
1253 mapping_mode=args.b
|
estrain@2
|
1254 threads=args.p
|
estrain@2
|
1255 make_dir=args.d
|
estrain@2
|
1256 clean_mode=args.c
|
estrain@2
|
1257 k_size=27 #will change for bug fixing
|
estrain@2
|
1258 database="H_and_O_and_specific_genes.fasta"
|
estrain@2
|
1259 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
|
estrain@2
|
1260 if len(sys.argv)==1:
|
estrain@2
|
1261 subprocess.check_call(dirpath+"/SeqSero2_package.py -h",shell=True)#change name of python file
|
estrain@2
|
1262 else:
|
estrain@2
|
1263 request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
|
estrain@2
|
1264 request_id += str(random.randint(1, 10000000))
|
estrain@2
|
1265 if make_dir is None:
|
estrain@2
|
1266 make_dir="SeqSero_result_"+request_id
|
estrain@2
|
1267 if os.path.isdir(make_dir):
|
estrain@2
|
1268 pass
|
estrain@2
|
1269 else:
|
estrain@2
|
1270 subprocess.check_call(["mkdir",make_dir])
|
estrain@2
|
1271 #subprocess.check_call("cp "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
|
estrain@2
|
1272 #subprocess.check_call("ln -sr "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
|
estrain@2
|
1273 subprocess.check_call("ln -f -s "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) ### ed_SL_05282019: use -f option to force the replacement of links, remove -r and use absolute path instead to avoid link issue (use 'type=os.path.abspath' in -i argument).
|
estrain@2
|
1274 ############################begin the real analysis
|
estrain@2
|
1275 if analysis_mode=="a":
|
estrain@2
|
1276 if data_type in ["1","2","3"]:#use allele mode
|
estrain@2
|
1277 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
|
estrain@2
|
1278 os.chdir(make_dir)
|
estrain@2
|
1279 ###add a function to tell input files
|
estrain@2
|
1280 fnameA=for_fq.split("/")[-1]
|
estrain@2
|
1281 fnameB=rev_fq.split("/")[-1]
|
estrain@2
|
1282 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
|
estrain@2
|
1283 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
|
estrain@2
|
1284 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
|
estrain@2
|
1285 xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
|
estrain@2
|
1286 if xmlfile=="NA":
|
estrain@2
|
1287 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
|
estrain@2
|
1288 else:
|
estrain@2
|
1289 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
|
estrain@2
|
1290 file=open("data_log.txt","a")
|
estrain@2
|
1291 for x in Final_list:
|
estrain@2
|
1292 file.write("\t".join(str(y) for y in x)+"\n")
|
estrain@2
|
1293 file.close()
|
estrain@2
|
1294 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
|
estrain@2
|
1295 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
|
estrain@2
|
1296 subspecies=judge_subspecies(fnameA) #predict subspecies
|
estrain@2
|
1297 ###output
|
estrain@2
|
1298 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
|
estrain@2
|
1299 claim="" #04132019, disable claim for new report requirement
|
estrain@2
|
1300 contamination_report=""
|
estrain@2
|
1301 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
|
estrain@2
|
1302 if contamination_O!="" and contamination_H=="":
|
estrain@2
|
1303 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
|
estrain@2
|
1304 elif contamination_O=="" and contamination_H!="":
|
estrain@2
|
1305 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
|
estrain@2
|
1306 elif contamination_O!="" and contamination_H!="":
|
estrain@2
|
1307 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
|
estrain@2
|
1308 if contamination_report!="":
|
estrain@2
|
1309 #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019
|
estrain@2
|
1310 contamination_report=" Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles."
|
estrain@2
|
1311 #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences #04132019, need to change if using web-version
|
estrain@2
|
1312 if contamination_report+star_line+claim=="": #0413, new output style
|
estrain@2
|
1313 note=""
|
estrain@2
|
1314 else:
|
estrain@2
|
1315 note="Note:"
|
estrain@2
|
1316 if clean_mode:
|
estrain@2
|
1317 subprocess.check_call("rm -rf ../"+make_dir,shell=True)
|
estrain@2
|
1318 make_dir="none-output-directory due to '-c' flag"
|
estrain@2
|
1319 else:
|
estrain@2
|
1320 new_file=open("SeqSero_result.txt","w")
|
estrain@2
|
1321 if O_choice=="":
|
estrain@2
|
1322 O_choice="-"
|
estrain@2
|
1323 if "N/A" not in predict_sero:
|
estrain@2
|
1324 new_file.write("Output_directory:\t"+make_dir+"\n"+
|
estrain@2
|
1325 "Input files:\t"+"\t".join(input_file)+"\n"+
|
estrain@2
|
1326 "O antigen prediction:\t"+O_choice+"\n"+
|
estrain@2
|
1327 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
|
estrain@2
|
1328 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
|
estrain@2
|
1329 "Predicted subspecies:\t"+subspecies+"\n"+
|
estrain@2
|
1330 "Predicted antigenic profile:\t"+predict_form+"\n"+
|
estrain@2
|
1331 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype
|
estrain@2
|
1332 note+contamination_report+star_line+claim+"\n")#+##
|
estrain@2
|
1333 else:
|
estrain@2
|
1334 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
|
estrain@2
|
1335 star_line="" #04132019, for new output requirement, diable star_line if "NA" in output
|
estrain@2
|
1336 new_file.write("Output_directory:\t"+make_dir+"\n"+
|
estrain@2
|
1337 "Input files:\t"+"\t".join(input_file)+"\n"+
|
estrain@2
|
1338 "O antigen prediction:\t"+O_choice+"\n"+
|
estrain@2
|
1339 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
|
estrain@2
|
1340 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
|
estrain@2
|
1341 "Predicted subspecies:\t"+subspecies+"\n"+
|
estrain@2
|
1342 "Predicted antigenic profile:\t"+predict_form+"\n"+
|
estrain@2
|
1343 note+contamination_report+star_line+claim+"\n")#+##
|
estrain@2
|
1344 new_file.close()
|
estrain@2
|
1345 print("\n")
|
estrain@2
|
1346 #subprocess.check_call("cat Seqsero_result.txt",shell=True)
|
estrain@2
|
1347 #subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt *.xml "+fnameA+"*_db* 2> /dev/null",shell=True)
|
estrain@2
|
1348 subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt "+fnameA+"*_db* 2> /dev/null",shell=True)
|
estrain@2
|
1349 if "N/A" not in predict_sero:
|
estrain@2
|
1350 #print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\nNote:"+contamination_report+star+star_line+claim+"\n")#+##
|
estrain@2
|
1351 print("Output_directory:\t"+make_dir+"\n"+
|
estrain@2
|
1352 "Input files:\t"+"\t".join(input_file)+"\n"+
|
estrain@2
|
1353 "O antigen prediction:\t"+O_choice+"\n"+
|
estrain@2
|
1354 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
|
estrain@2
|
1355 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
|
estrain@2
|
1356 "Predicted subspecies:\t"+subspecies+"\n"+
|
estrain@2
|
1357 "Predicted antigenic profile:\t"+predict_form+"\n"+
|
estrain@2
|
1358 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype
|
estrain@2
|
1359 note+contamination_report+star_line+claim+"\n")#+##
|
estrain@2
|
1360 else:
|
estrain@2
|
1361 print("Output_directory:\t"+make_dir+"\n"+
|
estrain@2
|
1362 "Input files:\t"+"\t".join(input_file)+"\n"+
|
estrain@2
|
1363 "O antigen prediction:\t"+O_choice+"\n"+
|
estrain@2
|
1364 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
|
estrain@2
|
1365 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
|
estrain@2
|
1366 "Predicted subspecies:\t"+subspecies+"\n"+
|
estrain@2
|
1367 "Predicted antigenic profile:\t"+predict_form+"\n"+
|
estrain@2
|
1368 note+contamination_report+star_line+claim+"\n")
|
estrain@2
|
1369 else:
|
estrain@2
|
1370 print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'")
|
estrain@2
|
1371 elif analysis_mode=="k":
|
estrain@2
|
1372 ex_dir = os.path.dirname(os.path.realpath(__file__))
|
estrain@2
|
1373 #output_mode = args.mode
|
estrain@2
|
1374 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
|
estrain@2
|
1375 input_file = for_fq #-k will just use forward because not all reads were used
|
estrain@2
|
1376 os.chdir(make_dir)
|
estrain@2
|
1377 f = open(ex_dir + '/antigens.pickle', 'rb')
|
estrain@2
|
1378 lib_dict = pickle.load(f)
|
estrain@2
|
1379 f.close
|
estrain@2
|
1380 input_Ks=get_input_K(input_file,lib_dict,data_type,k_size)
|
estrain@2
|
1381 O_dict,H_dict,Special_dict=get_kmer_dict(lib_dict,input_Ks)
|
estrain@2
|
1382 highest_O,highest_fliC,highest_fljB=call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir)
|
estrain@2
|
1383 subspecies=judge_subspecies_Kmer(Special_dict)
|
estrain@2
|
1384 if subspecies=="IIb" or subspecies=="IIa":
|
estrain@2
|
1385 subspecies="II"
|
estrain@2
|
1386 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
|
estrain@2
|
1387 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
|
estrain@2
|
1388 claim="" #no claim any more based on new output requirement
|
estrain@2
|
1389 if star_line+claim=="": #0413, new output style
|
estrain@2
|
1390 note=""
|
estrain@2
|
1391 else:
|
estrain@2
|
1392 note="Note:"
|
estrain@2
|
1393 if clean_mode:
|
estrain@2
|
1394 subprocess.check_call("rm -rf ../"+make_dir,shell=True)
|
estrain@2
|
1395 make_dir="none-output-directory due to '-c' flag"
|
estrain@2
|
1396 ### ed_SL_05282019, fix the assignment issue of variable 'O_choice' using "-m k -c"
|
estrain@2
|
1397 if highest_O.split('-')[-1]=="":
|
estrain@2
|
1398 O_choice="-"
|
estrain@2
|
1399 else:
|
estrain@2
|
1400 O_choice=highest_O.split('-')[-1]
|
estrain@2
|
1401 ###
|
estrain@2
|
1402 else:
|
estrain@2
|
1403 if highest_O.split('-')[-1]=="":
|
estrain@2
|
1404 O_choice="-"
|
estrain@2
|
1405 else:
|
estrain@2
|
1406 O_choice=highest_O.split('-')[-1]
|
estrain@2
|
1407 #print("Output_directory:"+make_dir+"\tInput_file:"+input_file+"\tPredicted subpecies:"+subspecies + '\tPredicted antigenic profile:' + predict_form + '\tPredicted serotype(s):' + predict_sero)
|
estrain@2
|
1408 new_file=open("SeqSero_result.txt","w")
|
estrain@2
|
1409 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
|
estrain@2
|
1410 if "N/A" not in predict_sero:
|
estrain@2
|
1411 new_file.write("Output_directory:\t"+make_dir+"\n"+
|
estrain@2
|
1412 "Input files:\t"+input_file+"\n"+
|
estrain@2
|
1413 "O antigen prediction:\t"+O_choice+"\n"+
|
estrain@2
|
1414 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
|
estrain@2
|
1415 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
|
estrain@2
|
1416 "Predicted subspecies:\t"+subspecies+"\n"+
|
estrain@2
|
1417 "Predicted antigenic profile:\t"+predict_form+"\n"+
|
estrain@2
|
1418 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype
|
estrain@2
|
1419 note+star_line+claim+"\n")#+##
|
estrain@2
|
1420 else:
|
estrain@2
|
1421 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
|
estrain@2
|
1422 star_line = "" #changed for new output requirement, 04132019
|
estrain@2
|
1423 new_file.write("Output_directory:\t"+make_dir+"\n"+
|
estrain@2
|
1424 "Input files:\t"+input_file+"\n"+
|
estrain@2
|
1425 "O antigen prediction:\t"+O_choice+"\n"+
|
estrain@2
|
1426 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
|
estrain@2
|
1427 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
|
estrain@2
|
1428 "Predicted subspecies:\t"+subspecies+"\n"+
|
estrain@2
|
1429 "Predicted antigenic profile:\t"+predict_form+"\n"+
|
estrain@2
|
1430 note+star_line+claim+"\n")#+##
|
estrain@2
|
1431 new_file.close()
|
estrain@2
|
1432 subprocess.call("rm *.fasta* *.fastq *.gz *.fq temp.txt *.sra 2> /dev/null",shell=True)
|
estrain@2
|
1433 if "N/A" not in predict_sero:
|
estrain@2
|
1434 print("Output_directory:\t"+make_dir+"\n"+
|
estrain@2
|
1435 "Input files:\t"+input_file+"\n"+
|
estrain@2
|
1436 "O antigen prediction:\t"+O_choice+"\n"+
|
estrain@2
|
1437 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
|
estrain@2
|
1438 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
|
estrain@2
|
1439 "Predicted subspecies:\t"+subspecies+"\n"+
|
estrain@2
|
1440 "Predicted antigenic profile:\t"+predict_form+"\n"+
|
estrain@2
|
1441 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype
|
estrain@2
|
1442 note+star_line+claim+"\n")#+##
|
estrain@2
|
1443 else:
|
estrain@2
|
1444 print("Output_directory:\t"+make_dir+"\n"+
|
estrain@2
|
1445 "Input files:\t"+input_file+"\n"+
|
estrain@2
|
1446 "O antigen prediction:\t"+O_choice+"\n"+
|
estrain@2
|
1447 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
|
estrain@2
|
1448 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
|
estrain@2
|
1449 "Predicted subspecies:\t"+subspecies+"\n"+
|
estrain@2
|
1450 "Predicted antigenic profile:\t"+predict_form+"\n"+
|
estrain@2
|
1451 note+star_line+claim+"\n")#+##
|
estrain@2
|
1452
|
estrain@2
|
1453 if __name__ == '__main__':
|
estrain@2
|
1454 main()
|