jpayne@1
|
1 #!/usr/bin/env python3
|
jpayne@1
|
2
|
jpayne@1
|
3 import sys
|
jpayne@1
|
4 import time
|
jpayne@1
|
5 import random
|
jpayne@1
|
6 import os
|
jpayne@1
|
7 import subprocess
|
jpayne@1
|
8 import gzip
|
jpayne@1
|
9 import io
|
jpayne@1
|
10 import pickle
|
jpayne@1
|
11 import argparse
|
jpayne@1
|
12 import itertools
|
jpayne@1
|
13 from distutils.version import LooseVersion
|
jpayne@1
|
14
|
jpayne@1
|
15 from collections import OrderedDict
|
jpayne@1
|
16 from csv import DictWriter
|
jpayne@1
|
17
|
jpayne@1
|
18 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
|
jpayne@1
|
19
|
jpayne@1
|
20 ### SeqSero Kmer
|
jpayne@1
|
21 def parse_args():
|
jpayne@1
|
22 "Parse the input arguments, use '-h' for help."
|
jpayne@1
|
23 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com')#add "-m <data_type>" in future
|
jpayne@1
|
24 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data")
|
jpayne@1
|
25 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)")
|
jpayne@7
|
26 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode used for mapping in allele workflow, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem")
|
jpayne@7
|
27 parser.add_argument("-p",default="1",help="<int>: threads used for allele workflow, if p>4, only 4 threads will be used for assembly step, default=1")
|
jpayne@7
|
28 parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: 'k'(kmer workflow), 'a'(allele workflow), default=a")
|
jpayne@1
|
29 parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
|
jpayne@1
|
30 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will use clean mode and only output serotyping prediction, the directory containing log files will be deleted")
|
jpayne@1
|
31 return parser.parse_args()
|
jpayne@1
|
32
|
jpayne@1
|
33 def reverse_complement(sequence):
|
jpayne@1
|
34 complement = {
|
jpayne@1
|
35 'A': 'T',
|
jpayne@1
|
36 'C': 'G',
|
jpayne@1
|
37 'G': 'C',
|
jpayne@1
|
38 'T': 'A',
|
jpayne@1
|
39 'N': 'N',
|
jpayne@1
|
40 'M': 'K',
|
jpayne@1
|
41 'R': 'Y',
|
jpayne@1
|
42 'W': 'W',
|
jpayne@1
|
43 'S': 'S',
|
jpayne@1
|
44 'Y': 'R',
|
jpayne@1
|
45 'K': 'M',
|
jpayne@1
|
46 'V': 'B',
|
jpayne@1
|
47 'H': 'D',
|
jpayne@1
|
48 'D': 'H',
|
jpayne@1
|
49 'B': 'V'
|
jpayne@1
|
50 }
|
jpayne@1
|
51 return "".join(complement[base] for base in reversed(sequence))
|
jpayne@1
|
52
|
jpayne@1
|
53
|
jpayne@1
|
54 def createKmerDict_reads(list_of_strings, kmer):
|
jpayne@1
|
55 kmer_table = {}
|
jpayne@1
|
56 for string in list_of_strings:
|
jpayne@1
|
57 sequence = string.strip('\n')
|
jpayne@1
|
58 for i in range(len(sequence) - kmer + 1):
|
jpayne@1
|
59 new_mer = sequence[i:i + kmer].upper()
|
jpayne@1
|
60 new_mer_rc = reverse_complement(new_mer)
|
jpayne@1
|
61 if new_mer in kmer_table:
|
jpayne@1
|
62 kmer_table[new_mer.upper()] += 1
|
jpayne@1
|
63 else:
|
jpayne@1
|
64 kmer_table[new_mer.upper()] = 1
|
jpayne@1
|
65 if new_mer_rc in kmer_table:
|
jpayne@1
|
66 kmer_table[new_mer_rc.upper()] += 1
|
jpayne@1
|
67 else:
|
jpayne@1
|
68 kmer_table[new_mer_rc.upper()] = 1
|
jpayne@1
|
69 return kmer_table
|
jpayne@1
|
70
|
jpayne@1
|
71
|
jpayne@1
|
72 def multifasta_dict(multifasta):
|
jpayne@1
|
73 multifasta_list = [
|
jpayne@1
|
74 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0
|
jpayne@1
|
75 ]
|
jpayne@1
|
76 headers = [i for i in multifasta_list if i[0] == '>']
|
jpayne@1
|
77 multifasta_dict = {}
|
jpayne@1
|
78 for h in headers:
|
jpayne@1
|
79 start = multifasta_list.index(h)
|
jpayne@1
|
80 for element in multifasta_list[start + 1:]:
|
jpayne@1
|
81 if element[0] == '>':
|
jpayne@1
|
82 break
|
jpayne@1
|
83 else:
|
jpayne@1
|
84 if h[1:] in multifasta_dict:
|
jpayne@1
|
85 multifasta_dict[h[1:]] += element
|
jpayne@1
|
86 else:
|
jpayne@1
|
87 multifasta_dict[h[1:]] = element
|
jpayne@1
|
88 return multifasta_dict
|
jpayne@1
|
89
|
jpayne@1
|
90
|
jpayne@1
|
91 def multifasta_single_string(multifasta):
|
jpayne@1
|
92 multifasta_list = [
|
jpayne@1
|
93 line.strip() for line in open(multifasta, 'r')
|
jpayne@1
|
94 if (len(line.strip()) > 0) and (line.strip()[0] != '>')
|
jpayne@1
|
95 ]
|
jpayne@1
|
96 return ''.join(multifasta_list)
|
jpayne@1
|
97
|
jpayne@1
|
98
|
jpayne@1
|
99 def chunk_a_long_sequence(long_sequence, chunk_size=60):
|
jpayne@1
|
100 chunk_list = []
|
jpayne@1
|
101 steps = len(long_sequence) // 60 #how many chunks
|
jpayne@1
|
102 for i in range(steps):
|
jpayne@1
|
103 chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size])
|
jpayne@1
|
104 chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)])
|
jpayne@1
|
105 return chunk_list
|
jpayne@1
|
106
|
jpayne@1
|
107
|
jpayne@1
|
108 def target_multifasta_kmerizer(multifasta, k, kmerDict):
|
jpayne@1
|
109 forward_length = 300 #if find the target, put forward 300 bases
|
jpayne@1
|
110 reverse_length = 2200 #if find the target, put backward 2200 bases
|
jpayne@1
|
111 chunk_size = 60 #it will firstly chunk the single long sequence to multiple smaller sequences, it controls the size of those smaller sequences
|
jpayne@1
|
112 target_mers = []
|
jpayne@1
|
113 long_single_string = multifasta_single_string(multifasta)
|
jpayne@1
|
114 multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
|
jpayne@1
|
115 unit_length = len(multifasta_list[0])
|
jpayne@1
|
116 forward_lines = int(forward_length / unit_length) + 1
|
jpayne@1
|
117 reverse_lines = int(forward_length / unit_length) + 1
|
jpayne@1
|
118 start_num = 0
|
jpayne@1
|
119 end_num = 0
|
jpayne@1
|
120 for i in range(len(multifasta_list)):
|
jpayne@1
|
121 if i not in range(start_num, end_num): #avoid computational repetition
|
jpayne@1
|
122 line = multifasta_list[i]
|
jpayne@1
|
123 start = int((len(line) - k) // 2)
|
jpayne@1
|
124 s1 = line[start:k + start]
|
jpayne@1
|
125 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
|
jpayne@1
|
126 if i - forward_lines >= 0:
|
jpayne@1
|
127 start_num = i - forward_lines
|
jpayne@1
|
128 else:
|
jpayne@1
|
129 start_num = 0
|
jpayne@1
|
130 if i + reverse_lines <= len(multifasta_list) - 1:
|
jpayne@1
|
131 end_num = i + reverse_lines
|
jpayne@1
|
132 else:
|
jpayne@1
|
133 end_num = len(multifasta_list) - 1
|
jpayne@1
|
134 target_list = [
|
jpayne@1
|
135 x.strip() for x in multifasta_list[start_num:end_num]
|
jpayne@1
|
136 ]
|
jpayne@1
|
137 target_line = "".join(target_list)
|
jpayne@1
|
138 target_mers += [
|
jpayne@1
|
139 k1 for k1 in createKmerDict_reads([str(target_line)], k)
|
jpayne@1
|
140 ] ##changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
|
jpayne@1
|
141 else:
|
jpayne@1
|
142 pass
|
jpayne@1
|
143 return set(target_mers)
|
jpayne@1
|
144
|
jpayne@1
|
145
|
jpayne@1
|
146 def target_read_kmerizer(file, k, kmerDict):
|
jpayne@1
|
147 i = 1
|
jpayne@1
|
148 n_reads = 0
|
jpayne@1
|
149 total_coverage = 0
|
jpayne@1
|
150 target_mers = []
|
jpayne@1
|
151 if file.endswith(".gz"):
|
jpayne@1
|
152 file_content = io.BufferedReader(gzip.open(file))
|
jpayne@1
|
153 else:
|
jpayne@1
|
154 file_content = open(file, "r").readlines()
|
jpayne@1
|
155 for line in file_content:
|
jpayne@1
|
156 start = int((len(line) - k) // 2)
|
jpayne@1
|
157 if i % 4 == 2:
|
jpayne@1
|
158 if file.endswith(".gz"):
|
jpayne@1
|
159 s1 = line[start:k + start].decode()
|
jpayne@1
|
160 line = line.decode()
|
jpayne@1
|
161 else:
|
jpayne@1
|
162 s1 = line[start:k + start]
|
jpayne@1
|
163 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
|
jpayne@1
|
164 n_reads += 1
|
jpayne@1
|
165 total_coverage += len(line)
|
jpayne@1
|
166 target_mers += [
|
jpayne@1
|
167 k1 for k1 in createKmerDict_reads([str(line)], k)
|
jpayne@1
|
168 ] #changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
|
jpayne@1
|
169 i += 1
|
jpayne@1
|
170 if total_coverage >= 4000000:
|
jpayne@1
|
171 break
|
jpayne@1
|
172 return set(target_mers)
|
jpayne@1
|
173
|
jpayne@1
|
174
|
jpayne@1
|
175 def minion_fasta_kmerizer(file, k, kmerDict):
|
jpayne@1
|
176 i = 1
|
jpayne@1
|
177 n_reads = 0
|
jpayne@1
|
178 total_coverage = 0
|
jpayne@1
|
179 target_mers = {}
|
jpayne@1
|
180 for line in open(file):
|
jpayne@1
|
181 if i % 2 == 0:
|
jpayne@1
|
182 for kmer, rc_kmer in kmers(line.strip().upper(), k):
|
jpayne@1
|
183 if (kmer in kmerDict) or (rc_kmer in kmerDict):
|
jpayne@1
|
184 if kmer in target_mers:
|
jpayne@1
|
185 target_mers[kmer] += 1
|
jpayne@1
|
186 else:
|
jpayne@1
|
187 target_mers[kmer] = 1
|
jpayne@1
|
188 if rc_kmer in target_mers:
|
jpayne@1
|
189 target_mers[rc_kmer] += 1
|
jpayne@1
|
190 else:
|
jpayne@1
|
191 target_mers[rc_kmer] = 1
|
jpayne@1
|
192 i += 1
|
jpayne@1
|
193 return set([h for h in target_mers])
|
jpayne@1
|
194
|
jpayne@1
|
195
|
jpayne@1
|
196 def minion_fastq_kmerizer(file, k, kmerDict):
|
jpayne@1
|
197 i = 1
|
jpayne@1
|
198 n_reads = 0
|
jpayne@1
|
199 total_coverage = 0
|
jpayne@1
|
200 target_mers = {}
|
jpayne@1
|
201 for line in open(file):
|
jpayne@1
|
202 if i % 4 == 2:
|
jpayne@1
|
203 for kmer, rc_kmer in kmers(line.strip().upper(), k):
|
jpayne@1
|
204 if (kmer in kmerDict) or (rc_kmer in kmerDict):
|
jpayne@1
|
205 if kmer in target_mers:
|
jpayne@1
|
206 target_mers[kmer] += 1
|
jpayne@1
|
207 else:
|
jpayne@1
|
208 target_mers[kmer] = 1
|
jpayne@1
|
209 if rc_kmer in target_mers:
|
jpayne@1
|
210 target_mers[rc_kmer] += 1
|
jpayne@1
|
211 else:
|
jpayne@1
|
212 target_mers[rc_kmer] = 1
|
jpayne@1
|
213 i += 1
|
jpayne@1
|
214 return set([h for h in target_mers])
|
jpayne@1
|
215
|
jpayne@1
|
216
|
jpayne@1
|
217 def multifasta_single_string2(multifasta):
|
jpayne@1
|
218 single_string = ''
|
jpayne@1
|
219 with open(multifasta, 'r') as f:
|
jpayne@1
|
220 for line in f:
|
jpayne@1
|
221 if line.strip()[0] == '>':
|
jpayne@1
|
222 pass
|
jpayne@1
|
223 else:
|
jpayne@1
|
224 single_string += line.strip()
|
jpayne@1
|
225 return single_string
|
jpayne@1
|
226
|
jpayne@1
|
227
|
jpayne@1
|
228 def kmers(seq, k):
|
jpayne@1
|
229 rev_comp = reverse_complement(seq)
|
jpayne@1
|
230 for start in range(1, len(seq) - k + 1):
|
jpayne@1
|
231 yield seq[start:start + k], rev_comp[-(start + k):-start]
|
jpayne@1
|
232
|
jpayne@1
|
233
|
jpayne@1
|
234 def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set
|
jpayne@1
|
235 multi_seq_dict = multifasta_dict(multifasta)
|
jpayne@1
|
236 lib_dict = {}
|
jpayne@1
|
237 for h in multi_seq_dict:
|
jpayne@1
|
238 lib_dict[h] = set(
|
jpayne@1
|
239 [k for k in createKmerDict_reads([multi_seq_dict[h]], k_size)])
|
jpayne@1
|
240 return lib_dict
|
jpayne@1
|
241
|
jpayne@1
|
242
|
jpayne@1
|
243 def Combine(b, c):
|
jpayne@1
|
244 fliC_combinations = []
|
jpayne@1
|
245 fliC_combinations.append(",".join(c))
|
jpayne@1
|
246 temp_combinations = []
|
jpayne@1
|
247 for i in range(len(b)):
|
jpayne@1
|
248 for x in itertools.combinations(b, i + 1):
|
jpayne@1
|
249 temp_combinations.append(",".join(x))
|
jpayne@1
|
250 for x in temp_combinations:
|
jpayne@1
|
251 temp = []
|
jpayne@1
|
252 for y in c:
|
jpayne@1
|
253 temp.append(y)
|
jpayne@1
|
254 temp.append(x)
|
jpayne@1
|
255 temp = ",".join(temp)
|
jpayne@1
|
256 temp = temp.split(",")
|
jpayne@1
|
257 temp.sort()
|
jpayne@1
|
258 temp = ",".join(temp)
|
jpayne@1
|
259 fliC_combinations.append(temp)
|
jpayne@1
|
260 return fliC_combinations
|
jpayne@1
|
261
|
jpayne@1
|
262
|
jpayne@1
|
263 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
|
jpayne@1
|
264 #like test_output_06012017.txt
|
jpayne@1
|
265 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
|
jpayne@4
|
266 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
|
jpayne@4
|
267 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
|
jpayne@4
|
268 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to
|
jpayne@1
|
269 seronames = []
|
jpayne@1
|
270 seronames_none_subspecies=[]
|
jpayne@1
|
271 for i in range(len(phase1)):
|
jpayne@1
|
272 fliC_combine = []
|
jpayne@1
|
273 fljB_combine = []
|
jpayne@1
|
274 if phaseO[i] == Otype: # no VII in KW, but it's there
|
jpayne@1
|
275 ### for fliC, detect every possible combinations to avoid the effect of "["
|
jpayne@1
|
276 if phase1[i].count("[") == 0:
|
jpayne@1
|
277 fliC_combine.append(phase1[i])
|
jpayne@1
|
278 elif phase1[i].count("[") >= 1:
|
jpayne@1
|
279 c = []
|
jpayne@1
|
280 b = []
|
jpayne@1
|
281 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
|
jpayne@1
|
282 "[") == 1:
|
jpayne@1
|
283 content = phase1[i].replace("[", "").replace("]", "")
|
jpayne@1
|
284 fliC_combine.append(content)
|
jpayne@1
|
285 fliC_combine.append("-")
|
jpayne@1
|
286 else:
|
jpayne@1
|
287 for x in phase1[i].split(","):
|
jpayne@1
|
288 if "[" in x:
|
jpayne@1
|
289 b.append(x.replace("[", "").replace("]", ""))
|
jpayne@1
|
290 else:
|
jpayne@1
|
291 c.append(x)
|
jpayne@1
|
292 fliC_combine = Combine(
|
jpayne@1
|
293 b, c
|
jpayne@1
|
294 ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
|
jpayne@1
|
295 ### end of fliC "[" detect
|
jpayne@1
|
296 ### for fljB, detect every possible combinations to avoid the effect of "["
|
jpayne@1
|
297 if phase2[i].count("[") == 0:
|
jpayne@1
|
298 fljB_combine.append(phase2[i])
|
jpayne@1
|
299 elif phase2[i].count("[") >= 1:
|
jpayne@1
|
300 d = []
|
jpayne@1
|
301 e = []
|
jpayne@1
|
302 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
|
jpayne@1
|
303 "[") == 1:
|
jpayne@1
|
304 content = phase2[i].replace("[", "").replace("]", "")
|
jpayne@1
|
305 fljB_combine.append(content)
|
jpayne@1
|
306 fljB_combine.append("-")
|
jpayne@1
|
307 else:
|
jpayne@1
|
308 for x in phase2[i].split(","):
|
jpayne@1
|
309 if "[" in x:
|
jpayne@1
|
310 d.append(x.replace("[", "").replace("]", ""))
|
jpayne@1
|
311 else:
|
jpayne@1
|
312 e.append(x)
|
jpayne@1
|
313 fljB_combine = Combine(d, e)
|
jpayne@1
|
314 ### end of fljB "[" detect
|
jpayne@1
|
315 new_fliC = fliC.split(
|
jpayne@1
|
316 ","
|
jpayne@1
|
317 ) #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings
|
jpayne@1
|
318 new_fliC.sort()
|
jpayne@1
|
319 new_fliC = ",".join(new_fliC)
|
jpayne@1
|
320 new_fljB = fljB.split(",")
|
jpayne@1
|
321 new_fljB.sort()
|
jpayne@1
|
322 new_fljB = ",".join(new_fljB)
|
jpayne@1
|
323 if (new_fliC in fliC_combine
|
jpayne@1
|
324 or fliC in fliC_combine) and (new_fljB in fljB_combine
|
jpayne@1
|
325 or fljB in fljB_combine):
|
jpayne@4
|
326 ######start, remove_list,rename_dict, added on 11/11/2018
|
jpayne@4
|
327 if sero[i] not in remove_list:
|
jpayne@4
|
328 temp_sero=sero[i]
|
jpayne@4
|
329 if temp_sero in rename_dict:
|
jpayne@4
|
330 temp_sero=rename_dict[temp_sero] #rename if in the rename list
|
jpayne@4
|
331 if temp_sero not in seronames:#the new sero may already included, if yes, then not consider
|
jpayne@4
|
332 if subs[i] == subspecies:
|
jpayne@4
|
333 seronames.append(temp_sero)
|
jpayne@4
|
334 seronames_none_subspecies.append(temp_sero)
|
jpayne@4
|
335 else:
|
jpayne@4
|
336 pass
|
jpayne@4
|
337 else:
|
jpayne@4
|
338 pass
|
jpayne@4
|
339 ######end, added on 11/11/2018
|
jpayne@1
|
340 #analyze seronames
|
jpayne@1
|
341 subspecies_pointer=""
|
jpayne@1
|
342 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
|
jpayne@1
|
343 seronames=seronames_none_subspecies
|
jpayne@1
|
344 subspecies_pointer="1"
|
jpayne@1
|
345 if len(seronames) == 0:
|
jpayne@1
|
346 seronames = [
|
jpayne@1
|
347 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
|
jpayne@1
|
348 ]
|
jpayne@1
|
349 star = ""
|
jpayne@1
|
350 star_line = ""
|
jpayne@1
|
351 if len(seronames) > 1: #there are two possible predictions for serotypes
|
jpayne@1
|
352 star = "*"
|
jpayne@7
|
353 #changed 04072019
|
jpayne@7
|
354 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
|
jpayne@1
|
355 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
|
jpayne@1
|
356 star="*"
|
jpayne@7
|
357 star_line=" The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme." + star_line
|
jpayne@7
|
358 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
|
jpayne@1
|
359 if Otype=="":
|
jpayne@1
|
360 Otype="-"
|
jpayne@1
|
361 predict_form = Otype + ":" + fliC + ":" + fljB
|
jpayne@1
|
362 predict_sero = (" or ").join(seronames)
|
jpayne@1
|
363 ###special test for Enteritidis
|
jpayne@1
|
364 if predict_form == "9:g,m:-":
|
jpayne@1
|
365 sdf = "-"
|
jpayne@1
|
366 for x in special_gene_list:
|
jpayne@1
|
367 if x.startswith("sdf"):
|
jpayne@1
|
368 sdf = "+"
|
jpayne@7
|
369 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
|
jpayne@7
|
370 star_line=" sdf gene detected." # ed_SL_04152019: new output format
|
jpayne@7
|
371 #predict_form = predict_form + " Sdf prediction:" + sdf
|
jpayne@7
|
372 predict_form = predict_form #changed 04072019
|
jpayne@1
|
373 if sdf == "-":
|
jpayne@1
|
374 star = "*"
|
jpayne@7
|
375 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
|
jpayne@7
|
376 star_line=" sdf gene not detected." # ed_SL_04152019: new output format
|
jpayne@7
|
377 #changed in 04072019, for new output
|
jpayne@7
|
378 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
|
jpayne@7
|
379 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
|
jpayne@7
|
380 predict_sero = "Gallinarum or Enteritidis" # ed_SL_04152019: new output format
|
jpayne@1
|
381 ###end of special test for Enteritidis
|
jpayne@1
|
382 elif predict_form == "4:i:-":
|
jpayne@7
|
383 predict_sero = "4:i:-"
|
jpayne@1
|
384 elif predict_form == "4:r:-":
|
jpayne@7
|
385 predict_sero = "4:r:-"
|
jpayne@1
|
386 elif predict_form == "4:b:-":
|
jpayne@7
|
387 predict_sero = "4:b:-"
|
jpayne@4
|
388 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
|
jpayne@4
|
389 #predict_sero = "Newport"
|
jpayne@4
|
390 #star = "*"
|
jpayne@4
|
391 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
|
jpayne@7
|
392 claim = " The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
|
jpayne@1
|
393 if "N/A" in predict_sero:
|
jpayne@1
|
394 claim = ""
|
jpayne@1
|
395 #special test for Typhimurium
|
jpayne@1
|
396 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
|
jpayne@1
|
397 normal = 0
|
jpayne@1
|
398 mutation = 0
|
jpayne@1
|
399 for x in special_gene_list:
|
jpayne@1
|
400 if "oafA-O-4_full" in x:
|
jpayne@1
|
401 normal = float(special_gene_list[x])
|
jpayne@1
|
402 elif "oafA-O-4_5-" in x:
|
jpayne@1
|
403 mutation = float(special_gene_list[x])
|
jpayne@1
|
404 if normal > mutation:
|
jpayne@1
|
405 pass
|
jpayne@1
|
406 elif normal < mutation:
|
jpayne@7
|
407 #predict_sero = predict_sero.strip() + "(O5-)"
|
jpayne@7
|
408 predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019
|
jpayne@1
|
409 star = "*"
|
jpayne@7
|
410 #star_line = "Detected the deletion of O5-."
|
jpayne@7
|
411 star_line = " Detected a deletion that causes O5- variant of Typhimurium." # ed_SL_04152019: new output format
|
jpayne@1
|
412 else:
|
jpayne@1
|
413 pass
|
jpayne@1
|
414 #special test for Paratyphi B
|
jpayne@1
|
415 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
|
jpayne@1
|
416 normal = 0
|
jpayne@1
|
417 mutation = 0
|
jpayne@1
|
418 for x in special_gene_list:
|
jpayne@1
|
419 if "gntR-family-regulatory-protein_dt-positive" in x:
|
jpayne@1
|
420 normal = float(special_gene_list[x])
|
jpayne@1
|
421 elif "gntR-family-regulatory-protein_dt-negative" in x:
|
jpayne@1
|
422 mutation = float(special_gene_list[x])
|
jpayne@1
|
423 #print(normal,mutation)
|
jpayne@1
|
424 if normal > mutation:
|
jpayne@7
|
425 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
|
jpayne@7
|
426 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip() # ed_SL_04152019: new output format
|
jpayne@1
|
427 star = "*"
|
jpayne@7
|
428 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
|
jpayne@7
|
429 star_line = " The SNP that causes d-Tartrate nonfermentating phenotype was not detected. " # ed_SL_04152019: new output format
|
jpayne@1
|
430 elif normal < mutation:
|
jpayne@7
|
431 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
|
jpayne@7
|
432 predict_sero = predict_sero.strip()
|
jpayne@1
|
433 star = "*"
|
jpayne@7
|
434 #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
|
jpayne@7
|
435 star_line = " Detected the SNP for d-Tartrate nonfermenting phenotype." # ed_SL_04152019: new output format
|
jpayne@1
|
436 else:
|
jpayne@1
|
437 star = "*"
|
jpayne@7
|
438 star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
|
jpayne@1
|
439 #special test for O13,22 and O13,23
|
jpayne@1
|
440 if Otype=="13":
|
jpayne@1
|
441 #ex_dir = os.path.dirname(os.path.realpath(__file__))
|
jpayne@1
|
442 f = open(dirpath + '/special.pickle', 'rb')
|
jpayne@1
|
443 special = pickle.load(f)
|
jpayne@1
|
444 O22_O23=special['O22_O23']
|
jpayne@4
|
445 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
|
jpayne@1
|
446 O22_score=0
|
jpayne@1
|
447 O23_score=0
|
jpayne@1
|
448 for x in special_gene_list:
|
jpayne@1
|
449 if "O:22" in x:
|
jpayne@1
|
450 O22_score = O22_score+float(special_gene_list[x])
|
jpayne@1
|
451 elif "O:23" in x:
|
jpayne@1
|
452 O23_score = O23_score+float(special_gene_list[x])
|
jpayne@1
|
453 #print(O22_score,O23_score)
|
jpayne@1
|
454 for z in O22_O23[0]:
|
jpayne@1
|
455 if predict_sero.split(" or ")[0] in z:
|
jpayne@1
|
456 if O22_score > O23_score:
|
jpayne@1
|
457 star = "*"
|
jpayne@7
|
458 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
|
jpayne@1
|
459 predict_sero = z[0]
|
jpayne@1
|
460 elif O22_score < O23_score:
|
jpayne@1
|
461 star = "*"
|
jpayne@7
|
462 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
|
jpayne@1
|
463 predict_sero = z[1]
|
jpayne@1
|
464 else:
|
jpayne@1
|
465 star = "*"
|
jpayne@7
|
466 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
|
jpayne@7
|
467 if " or " in predict_sero:
|
jpayne@7
|
468 star_line = star_line + " The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
|
jpayne@1
|
469 #special test for O6,8
|
jpayne@4
|
470 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
|
jpayne@4
|
471 #for x in merge_O68_list:
|
jpayne@4
|
472 # if x in predict_sero:
|
jpayne@4
|
473 # predict_sero=x
|
jpayne@4
|
474 # star=""
|
jpayne@4
|
475 # star_line=""
|
jpayne@1
|
476 #special test for Montevideo; most of them are monophasic
|
jpayne@4
|
477 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
|
jpayne@4
|
478 #star="*"
|
jpayne@4
|
479 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
|
jpayne@1
|
480 return predict_form, predict_sero, star, star_line, claim
|
jpayne@1
|
481 ### End of SeqSero Kmer part
|
jpayne@1
|
482
|
jpayne@1
|
483 ### Begin of SeqSero2 allele prediction and output
|
jpayne@1
|
484 def xml_parse_score_comparision_seqsero(xmlfile):
|
jpayne@1
|
485 #used to do seqsero xml analysis
|
jpayne@1
|
486 from Bio.Blast import NCBIXML
|
jpayne@1
|
487 handle=open(xmlfile)
|
jpayne@1
|
488 handle=NCBIXML.parse(handle)
|
jpayne@1
|
489 handle=list(handle)
|
jpayne@1
|
490 List=[]
|
jpayne@1
|
491 List_score=[]
|
jpayne@1
|
492 List_ids=[]
|
jpayne@1
|
493 List_query_region=[]
|
jpayne@1
|
494 for i in range(len(handle)):
|
jpayne@1
|
495 if len(handle[i].alignments)>0:
|
jpayne@1
|
496 for j in range(len(handle[i].alignments)):
|
jpayne@1
|
497 score=0
|
jpayne@1
|
498 ids=0
|
jpayne@1
|
499 cover_region=set() #fixed problem that repeated calculation leading percentage > 1
|
jpayne@1
|
500 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def)
|
jpayne@1
|
501 for z in range(len(handle[i].alignments[j].hsps)):
|
jpayne@1
|
502 hsp=handle[i].alignments[j].hsps[z]
|
jpayne@1
|
503 temp=set(range(hsp.query_start,hsp.query_end))
|
jpayne@1
|
504 if len(cover_region)==0:
|
jpayne@1
|
505 cover_region=cover_region|temp
|
jpayne@1
|
506 fraction=1
|
jpayne@1
|
507 else:
|
jpayne@1
|
508 fraction=1-len(cover_region&temp)/float(len(temp))
|
jpayne@1
|
509 cover_region=cover_region|temp
|
jpayne@1
|
510 if "last" in handle[i].query or "first" in handle[i].query:
|
jpayne@1
|
511 score+=hsp.bits*fraction
|
jpayne@1
|
512 ids+=float(hsp.identities)/handle[i].query_length*fraction
|
jpayne@1
|
513 else:
|
jpayne@1
|
514 score+=hsp.bits*fraction
|
jpayne@1
|
515 ids+=float(hsp.identities)/handle[i].query_length*fraction
|
jpayne@1
|
516 List_score.append(score)
|
jpayne@1
|
517 List_ids.append(ids)
|
jpayne@1
|
518 List_query_region.append(cover_region)
|
jpayne@1
|
519 temp=zip(List,List_score,List_ids,List_query_region)
|
jpayne@1
|
520 Final_list=sorted(temp, key=lambda d:d[1], reverse = True)
|
jpayne@1
|
521 return Final_list
|
jpayne@1
|
522
|
jpayne@1
|
523
|
jpayne@1
|
524 def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
|
jpayne@1
|
525 Old=L
|
jpayne@1
|
526 L.sort()
|
jpayne@1
|
527 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]]
|
jpayne@1
|
528 count=[]
|
jpayne@1
|
529 for j in range(len(L)):
|
jpayne@1
|
530 y=0
|
jpayne@1
|
531 for x in Old:
|
jpayne@1
|
532 if L[j]==x:
|
jpayne@1
|
533 y+=1
|
jpayne@1
|
534 count.append(y)
|
jpayne@1
|
535 if sort_on_fre!="none":
|
jpayne@1
|
536 d=zip(*sorted(zip(count, L)))
|
jpayne@1
|
537 L=d[1]
|
jpayne@1
|
538 count=d[0]
|
jpayne@1
|
539 return (L,count)
|
jpayne@1
|
540
|
jpayne@1
|
541 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
|
jpayne@1
|
542 #used to predict it's fliC or fljB for one contig, based on tail and head score, but output the score difference,if it is very small, then not reliable, use blast score for whole contig to test
|
jpayne@1
|
543 #this is mainly used for
|
jpayne@1
|
544 a=nodes_vs_score_list
|
jpayne@1
|
545 fliC_score=0
|
jpayne@1
|
546 fljB_score=0
|
jpayne@1
|
547 for z in a:
|
jpayne@1
|
548 if "fliC" in z[0]:
|
jpayne@1
|
549 fliC_score+=z[1]
|
jpayne@1
|
550 elif "fljB" in z[0]:
|
jpayne@1
|
551 fljB_score+=z[1]
|
jpayne@1
|
552 if fliC_score>=fljB_score:
|
jpayne@1
|
553 role="fliC"
|
jpayne@1
|
554 else:
|
jpayne@1
|
555 role="fljB"
|
jpayne@1
|
556 return (role,abs(fliC_score-fljB_score))
|
jpayne@1
|
557
|
jpayne@1
|
558 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
|
jpayne@1
|
559 #used to predict contig is fliC or fljB, if the differnce score value on above head_and_tail is less than 10 (quite small)
|
jpayne@1
|
560 #also used when no head or tail got blasted score for the contig
|
jpayne@1
|
561 role=""
|
jpayne@1
|
562 for z in Final_list_passed:
|
jpayne@1
|
563 if node_name in z[0]:
|
jpayne@1
|
564 role=z[0].split("_")[0]
|
jpayne@1
|
565 break
|
jpayne@1
|
566 return role
|
jpayne@1
|
567
|
jpayne@1
|
568 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
|
jpayne@1
|
569 #nodes_list is the c created by c,d=Uniq(nodes) in below function
|
jpayne@1
|
570 first_target=""
|
jpayne@1
|
571 role_list=[]
|
jpayne@1
|
572 for x in nodes_list:
|
jpayne@1
|
573 a=[]
|
jpayne@1
|
574 role=""
|
jpayne@1
|
575 for y in tail_head_list:
|
jpayne@1
|
576 if x in y[0]:
|
jpayne@1
|
577 a.append(y)
|
jpayne@1
|
578 if len(a)==4:
|
jpayne@1
|
579 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
|
jpayne@1
|
580 if diff<20:
|
jpayne@1
|
581 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
|
jpayne@1
|
582 elif len(a)==3:
|
jpayne@1
|
583 ###however, if the one with highest score is the fewer one, compare their accumulation score
|
jpayne@1
|
584 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
|
jpayne@1
|
585 if diff<20:
|
jpayne@1
|
586 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
|
jpayne@1
|
587 ###end of above score comparison
|
jpayne@1
|
588 elif len(a)==2:
|
jpayne@1
|
589 #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37)
|
jpayne@1
|
590 temp=[]
|
jpayne@1
|
591 for z in a:
|
jpayne@1
|
592 temp.append(z[0].split("_")[0])
|
jpayne@1
|
593 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
|
jpayne@1
|
594 if len(m)==1:
|
jpayne@1
|
595 pass
|
jpayne@1
|
596 else:
|
jpayne@1
|
597 pass
|
jpayne@1
|
598 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
|
jpayne@1
|
599 if diff<20:
|
jpayne@1
|
600 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
|
jpayne@1
|
601 ###need to desgin a algorithm to guess most possible situation for nodes_list, See the situations of test evaluation
|
jpayne@1
|
602 elif len(a)==1:
|
jpayne@1
|
603 #that one
|
jpayne@1
|
604 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
|
jpayne@1
|
605 if diff<20:
|
jpayne@1
|
606 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
|
jpayne@1
|
607 #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0"
|
jpayne@1
|
608 else:#a==0
|
jpayne@1
|
609 #use Final_list_passed best match
|
jpayne@1
|
610 for z in Final_list_passed:
|
jpayne@1
|
611 if x in z[0]:
|
jpayne@1
|
612 role=z[0].split("_")[0]
|
jpayne@1
|
613 break
|
jpayne@1
|
614 #print x,role,len(a)
|
jpayne@1
|
615 role_list.append((role,x))
|
jpayne@1
|
616 if len(role_list)==2:
|
jpayne@1
|
617 if role_list[0][0]==role_list[1][0]:#this is the most cocmmon error, two antigen were assigned to same phase
|
jpayne@1
|
618 #just use score to do a final test
|
jpayne@1
|
619 role_list=[]
|
jpayne@1
|
620 for x in nodes_list:
|
jpayne@1
|
621 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
|
jpayne@1
|
622 role_list.append((role,x))
|
jpayne@1
|
623 return role_list
|
jpayne@1
|
624
|
jpayne@1
|
625 def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
|
jpayne@1
|
626 #used to decide which contig is FliC and which one is fljB
|
jpayne@1
|
627 contigs=[]
|
jpayne@1
|
628 nodes=[]
|
jpayne@1
|
629 for x in Final_list_passed:
|
jpayne@1
|
630 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]:
|
jpayne@1
|
631 nodes.append(x[0].split("___")[1].strip())
|
jpayne@1
|
632 c,d=Uniq(nodes)#c is node_list
|
jpayne@1
|
633 #print c
|
jpayne@1
|
634 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])]
|
jpayne@1
|
635 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list,Final_list_passed)
|
jpayne@1
|
636 return roles
|
jpayne@1
|
637
|
jpayne@1
|
638 def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
|
jpayne@1
|
639 #decide O based on Final_list
|
jpayne@1
|
640 O_choice="?"
|
jpayne@1
|
641 O_list=[]
|
jpayne@1
|
642 special_genes={}
|
jpayne@1
|
643 nodes=[]
|
jpayne@1
|
644 for x in Final_list_passed:
|
jpayne@1
|
645 if x[0].startswith("O-"):
|
jpayne@1
|
646 nodes.append(x[0].split("___")[1].strip())
|
jpayne@1
|
647 elif not x[0].startswith("fl"):
|
jpayne@1
|
648 special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
|
jpayne@1
|
649 #print "special_genes:",special_genes
|
jpayne@1
|
650 c,d=Uniq(nodes)
|
jpayne@1
|
651 #print "potential O antigen contig",c
|
jpayne@1
|
652 final_O=[]
|
jpayne@1
|
653 O_nodes_list=[]
|
jpayne@1
|
654 for x in c:#c is the list for contigs
|
jpayne@1
|
655 temp=0
|
jpayne@1
|
656 for y in Final_list_passed:
|
jpayne@1
|
657 if x in y[0] and y[0].startswith("O-"):
|
jpayne@1
|
658 final_O.append(y)
|
jpayne@1
|
659 break
|
jpayne@1
|
660 ### O contig has the problem of two genes on same contig, so do additional test
|
jpayne@1
|
661 potenial_new_gene=""
|
jpayne@1
|
662 for x in final_O:
|
jpayne@1
|
663 pointer=0 #for genes merged or not
|
jpayne@1
|
664 #not consider O-1,3,19_not_in_3,10, too short compared with others
|
jpayne@1
|
665 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])*x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 300*2 (for secureity can use 400*2) as flank region
|
jpayne@1
|
666 pointer=x[0].split("___")[1].strip()#store the contig name
|
jpayne@1
|
667 print(pointer)
|
jpayne@1
|
668 if pointer!=0:#it has potential merge event
|
jpayne@1
|
669 for y in Final_list:
|
jpayne@1
|
670 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):#that's a realtively strict filter now; if passed, it has merge event and add one more to final_O
|
jpayne@1
|
671 potenial_new_gene=y
|
jpayne@1
|
672 #print(potenial_new_gene)
|
jpayne@1
|
673 break
|
jpayne@1
|
674 if potenial_new_gene!="":
|
jpayne@1
|
675 print("two differnt genes in same contig, fix it for O antigen")
|
jpayne@1
|
676 print(potenial_new_gene[:3])
|
jpayne@4
|
677 pointer=0
|
jpayne@4
|
678 for y in final_O:
|
jpayne@4
|
679 if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
|
jpayne@4
|
680 pointer=1
|
jpayne@4
|
681 if pointer!=1:
|
jpayne@4
|
682 final_O.append(potenial_new_gene)
|
jpayne@1
|
683 ### end of the two genes on same contig test
|
jpayne@1
|
684 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
|
jpayne@1
|
685 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
|
jpayne@1
|
686 #print "$$$No Otype, due to no hit"#may need to be changed
|
jpayne@1
|
687 O_choice="-"
|
jpayne@1
|
688 else:
|
jpayne@7
|
689 highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
|
jpayne@1
|
690 O_list=[]
|
jpayne@1
|
691 O_list_less_contamination=[]
|
jpayne@1
|
692 for x in final_O:
|
jpayne@1
|
693 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
|
jpayne@1
|
694 O_list.append(x[0].split("__")[0])
|
jpayne@1
|
695 O_nodes_list.append(x[0].split("___")[1])
|
jpayne@7
|
696 if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:
|
jpayne@1
|
697 O_list_less_contamination.append(x[0].split("__")[0])
|
jpayne@1
|
698 ### special test for O9,46 and O3,10 family
|
jpayne@1
|
699 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
|
jpayne@1
|
700 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
|
jpayne@1
|
701 O_choice="O-9,46"
|
jpayne@1
|
702 #print "$$$Most possilble Otype: O-9,46"
|
jpayne@1
|
703 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
|
jpayne@1
|
704 O_choice="O-9,46,27"
|
jpayne@1
|
705 #print "$$$Most possilble Otype: O-9,46,27"
|
jpayne@1
|
706 else:
|
jpayne@1
|
707 O_choice="O-9"#next, detect O9 vs O2?
|
jpayne@1
|
708 O2=0
|
jpayne@1
|
709 O9=0
|
jpayne@1
|
710 for z in special_genes:
|
jpayne@1
|
711 if "tyr-O-9" in z:
|
jpayne@1
|
712 O9=special_genes[z]
|
jpayne@1
|
713 elif "tyr-O-2" in z:
|
jpayne@1
|
714 O2=special_genes[z]
|
jpayne@1
|
715 if O2>O9:
|
jpayne@1
|
716 O_choice="O-2"
|
jpayne@1
|
717 elif O2<O9:
|
jpayne@1
|
718 pass
|
jpayne@1
|
719 else:
|
jpayne@1
|
720 pass
|
jpayne@1
|
721 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
|
jpayne@1
|
722 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
|
jpayne@1
|
723 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
|
jpayne@1
|
724 O_choice="O-3,10"
|
jpayne@1
|
725 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
|
jpayne@1
|
726 else:
|
jpayne@1
|
727 O_choice="O-1,3,19"
|
jpayne@1
|
728 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
|
jpayne@1
|
729 ### end of special test for O9,46 and O3,10 family
|
jpayne@1
|
730 else:
|
jpayne@1
|
731 try:
|
jpayne@1
|
732 max_score=0
|
jpayne@1
|
733 for x in final_O:
|
jpayne@7
|
734 if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
|
jpayne@1
|
735 max_score=x[2]#change from x[-1] to x[2],08172018
|
jpayne@1
|
736 O_choice=x[0].split("_")[0]
|
jpayne@1
|
737 if O_choice=="O-1,3,19":
|
jpayne@1
|
738 O_choice=final_O[1][0].split("_")[0]
|
jpayne@1
|
739 #print "$$$Most possilble Otype: ",O_choice
|
jpayne@1
|
740 except:
|
jpayne@1
|
741 pass
|
jpayne@1
|
742 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
|
jpayne@1
|
743 #print "O:",O_choice,O_nodes_list
|
jpayne@1
|
744 Otypes=[]
|
jpayne@1
|
745 for x in O_list:
|
jpayne@1
|
746 if x!="O-1,3,19_not_in_3,10":
|
jpayne@1
|
747 if "O-9,46_" not in x:
|
jpayne@1
|
748 Otypes.append(x.split("_")[0])
|
jpayne@1
|
749 else:
|
jpayne@1
|
750 Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254
|
jpayne@1
|
751 #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"]
|
jpayne@1
|
752 Otypes_uniq,Otypes_fre=Uniq(Otypes)
|
jpayne@1
|
753 contamination_O=""
|
jpayne@1
|
754 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
|
jpayne@1
|
755 if len(Otypes_uniq)>2:
|
jpayne@1
|
756 contamination_O="potential contamination from O antigen signals"
|
jpayne@1
|
757 else:
|
jpayne@1
|
758 if len(Otypes_uniq)>1:
|
jpayne@1
|
759 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
|
jpayne@1
|
760 contamination_O=""
|
jpayne@1
|
761 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
|
jpayne@1
|
762 contamination_O=""
|
jpayne@1
|
763 else:
|
jpayne@1
|
764 contamination_O="potential contamination from O antigen signals"
|
jpayne@4
|
765 return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
|
jpayne@1
|
766 ### End of SeqSero2 allele prediction and output
|
jpayne@1
|
767
|
jpayne@1
|
768 def get_input_files(make_dir,input_file,data_type,dirpath):
|
jpayne@1
|
769 #tell input files from datatype
|
jpayne@1
|
770 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
|
jpayne@1
|
771 for_fq=""
|
jpayne@1
|
772 rev_fq=""
|
jpayne@1
|
773 old_dir = os.getcwd()
|
jpayne@1
|
774 os.chdir(make_dir)
|
jpayne@1
|
775 if data_type=="1":
|
jpayne@1
|
776 input_file=input_file[0].split("/")[-1]
|
jpayne@1
|
777 if input_file.endswith(".sra"):
|
jpayne@1
|
778 subprocess.check_output("fastq-dump --split-files "+input_file,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
779 for_fq=input_file.replace(".sra","_1.fastq")
|
jpayne@1
|
780 rev_fq=input_file.replace(".sra","_2.fastq")
|
jpayne@1
|
781 else:
|
jpayne@1
|
782 core_id=input_file.split(".fastq")[0].split(".fq")[0]
|
jpayne@1
|
783 for_fq=core_id+"_1.fastq"
|
jpayne@1
|
784 rev_fq=core_id+"_2.fastq"
|
jpayne@1
|
785 if input_file.endswith(".gz"):
|
jpayne@1
|
786 subprocess.check_output("gzip -dc "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
787 else:
|
jpayne@1
|
788 subprocess.check_output("cat "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
789 elif data_type=="2":
|
jpayne@1
|
790 for_fq=input_file[0].split("/")[-1]
|
jpayne@1
|
791 rev_fq=input_file[1].split("/")[-1]
|
jpayne@1
|
792 elif data_type=="3":
|
jpayne@1
|
793 input_file=input_file[0].split("/")[-1]
|
jpayne@1
|
794 if input_file.endswith(".sra"):
|
jpayne@1
|
795 subprocess.check_output("fastq-dump --split-files "+input_file,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
796 for_fq=input_file.replace(".sra","_1.fastq")
|
jpayne@1
|
797 else:
|
jpayne@1
|
798 for_fq=input_file
|
jpayne@1
|
799 elif data_type in ["4","5","6"]:
|
jpayne@1
|
800 for_fq=input_file[0].split("/")[-1]
|
jpayne@1
|
801 os.chdir(old_dir)
|
jpayne@1
|
802 return for_fq,rev_fq
|
jpayne@1
|
803
|
jpayne@4
|
804 def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
|
jpayne@1
|
805 #get O and H types from Final_list from blast parsing; allele mode
|
jpayne@4
|
806 from Bio import SeqIO
|
jpayne@1
|
807 fliC_choice="-"
|
jpayne@1
|
808 fljB_choice="-"
|
jpayne@1
|
809 fliC_contig="NA"
|
jpayne@1
|
810 fljB_contig="NA"
|
jpayne@1
|
811 fliC_region=set([0])
|
jpayne@1
|
812 fljB_region=set([0,])
|
jpayne@7
|
813 fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
|
jpayne@7
|
814 fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
|
jpayne@1
|
815 O_choice="-"#no need to decide O contig for now, should be only one
|
jpayne@4
|
816 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
|
jpayne@1
|
817 O_choice=O_choice.split("-")[-1].strip()
|
jpayne@1
|
818 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
|
jpayne@1
|
819 O_choice="-"
|
jpayne@1
|
820 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
|
jpayne@7
|
821 #add alignment locations, used for further selection, 03312019
|
jpayne@7
|
822 for i in range(len(H_contig_roles)):
|
jpayne@7
|
823 x=H_contig_roles[i]
|
jpayne@7
|
824 for y in Final_list_passed:
|
jpayne@7
|
825 if x[1] in y[0] and y[0].startswith(x[0]):
|
jpayne@7
|
826 H_contig_roles[i]+=H_contig_roles[i]+(y[-1],)
|
jpayne@7
|
827 break
|
jpayne@1
|
828 log_file=open("SeqSero_log.txt","a")
|
jpayne@4
|
829 extract_file=open("Extracted_antigen_alleles.fasta","a")
|
jpayne@4
|
830 handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
|
jpayne@4
|
831
|
jpayne@4
|
832 #print("O_contigs:")
|
jpayne@1
|
833 log_file.write("O_contigs:\n")
|
jpayne@4
|
834 extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
|
jpayne@4
|
835 extract_file.write("#O_contigs:\n")
|
jpayne@1
|
836 for x in O_nodes_roles:
|
jpayne@1
|
837 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
|
jpayne@4
|
838 #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
|
jpayne@4
|
839 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
|
jpayne@4
|
840 title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
|
jpayne@4
|
841 seqs=""
|
jpayne@4
|
842 for z in handle_fasta:
|
jpayne@4
|
843 if x[0].split("___")[-1]==z.description:
|
jpayne@4
|
844 seqs=str(z.seq)
|
jpayne@4
|
845 extract_file.write(title+seqs+"\n")
|
jpayne@1
|
846 if len(H_contig_roles)!=0:
|
jpayne@7
|
847 highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
|
jpayne@1
|
848 else:
|
jpayne@1
|
849 highest_H_coverage=0
|
jpayne@1
|
850 for x in H_contig_roles:
|
jpayne@1
|
851 #if multiple choices, temporately select the one with longest length for now, will revise in further change
|
jpayne@7
|
852 if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
|
jpayne@1
|
853 fliC_contig=x[1]
|
jpayne@7
|
854 fliC_length=len(x[-1])
|
jpayne@7
|
855 elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
|
jpayne@1
|
856 fljB_contig=x[1]
|
jpayne@7
|
857 fljB_length=len(x[-1])
|
jpayne@1
|
858 for x in Final_list_passed:
|
jpayne@1
|
859 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
|
jpayne@1
|
860 fliC_choice=x[0].split("_")[1]
|
jpayne@1
|
861 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
|
jpayne@1
|
862 fljB_choice=x[0].split("_")[1]
|
jpayne@1
|
863 elif fliC_choice!="-" and fljB_choice!="-":
|
jpayne@1
|
864 break
|
jpayne@1
|
865 #now remove contigs not in middle core part
|
jpayne@1
|
866 first_allele="NA"
|
jpayne@1
|
867 first_allele_percentage=0
|
jpayne@1
|
868 for x in Final_list:
|
jpayne@1
|
869 if x[0].startswith("fliC") or x[0].startswith("fljB"):
|
jpayne@1
|
870 first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
|
jpayne@1
|
871 first_allele_percentage=x[2]
|
jpayne@1
|
872 break
|
jpayne@1
|
873 additional_contigs=[]
|
jpayne@1
|
874 for x in Final_list:
|
jpayne@1
|
875 if first_allele in x[0]:
|
jpayne@1
|
876 if (fliC_contig == x[0].split("___")[-1]):
|
jpayne@1
|
877 fliC_region=x[3]
|
jpayne@1
|
878 elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
|
jpayne@1
|
879 fljB_region=x[3]
|
jpayne@1
|
880 else:
|
jpayne@1
|
881 if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
|
jpayne@1
|
882 additional_contigs.append(x)
|
jpayne@1
|
883 #else:
|
jpayne@1
|
884 #print x[:3]
|
jpayne@1
|
885 #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off
|
jpayne@1
|
886 if first_allele_percentage>0.9:
|
jpayne@1
|
887 if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
|
jpayne@1
|
888 target_region=fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
|
jpayne@1
|
889 elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
|
jpayne@1
|
890 target_region=fliC_region|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
|
jpayne@1
|
891 else:
|
jpayne@1
|
892 target_region=set()#doesn't do anything
|
jpayne@1
|
893 else:
|
jpayne@1
|
894 target_region=set()#doesn't do anything
|
jpayne@1
|
895 #print(target_region)
|
jpayne@1
|
896 #print(additional_contigs)
|
jpayne@1
|
897 target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best
|
jpayne@1
|
898 target_region=target_region2|target_region
|
jpayne@1
|
899 for x in additional_contigs:
|
jpayne@1
|
900 removal=0
|
jpayne@1
|
901 contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
|
jpayne@1
|
902 if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length*0.5<len(x[3])<contig_length*1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
|
jpayne@1
|
903 removal=1
|
jpayne@1
|
904 else:
|
jpayne@1
|
905 if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part)
|
jpayne@1
|
906 removal=1
|
jpayne@1
|
907 else:
|
jpayne@1
|
908 pass
|
jpayne@1
|
909 if removal==1:
|
jpayne@1
|
910 for y in H_contig_roles:
|
jpayne@1
|
911 if y[1] in x[0]:
|
jpayne@1
|
912 H_contig_roles.remove(y)
|
jpayne@1
|
913 else:
|
jpayne@1
|
914 pass
|
jpayne@1
|
915 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
|
jpayne@1
|
916 #end of removing none-middle contigs
|
jpayne@4
|
917 #print("H_contigs:")
|
jpayne@1
|
918 log_file.write("H_contigs:\n")
|
jpayne@4
|
919 extract_file.write("#H_contigs:\n")
|
jpayne@1
|
920 H_contig_stat=[]
|
jpayne@1
|
921 H1_cont_stat={}
|
jpayne@1
|
922 H2_cont_stat={}
|
jpayne@1
|
923 for i in range(len(H_contig_roles)):
|
jpayne@1
|
924 x=H_contig_roles[i]
|
jpayne@1
|
925 a=0
|
jpayne@1
|
926 for y in Final_list_passed:
|
jpayne@1
|
927 if x[1] in y[0] and y[0].startswith(x[0]):
|
jpayne@1
|
928 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
|
jpayne@1
|
929 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
|
jpayne@1
|
930 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
|
jpayne@4
|
931 #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
|
jpayne@4
|
932 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
|
jpayne@1
|
933 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
|
jpayne@4
|
934 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n"
|
jpayne@4
|
935 seqs=""
|
jpayne@4
|
936 for z in handle_fasta:
|
jpayne@4
|
937 if x[1]==z.description:
|
jpayne@4
|
938 seqs=str(z.seq)
|
jpayne@4
|
939 extract_file.write(title+seqs+"\n")
|
jpayne@1
|
940 break
|
jpayne@1
|
941 else:
|
jpayne@4
|
942 #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
|
jpayne@4
|
943 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
|
jpayne@4
|
944 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
|
jpayne@4
|
945 seqs=""
|
jpayne@4
|
946 for z in handle_fasta:
|
jpayne@4
|
947 if x[1]==z.description:
|
jpayne@4
|
948 seqs=str(z.seq)
|
jpayne@4
|
949 extract_file.write(title+seqs+"\n")
|
jpayne@1
|
950 if x[0]=="fliC":
|
jpayne@1
|
951 if y[0].split("_")[1] not in H1_cont_stat:
|
jpayne@1
|
952 H1_cont_stat[y[0].split("_")[1]]=y[2]
|
jpayne@1
|
953 else:
|
jpayne@1
|
954 H1_cont_stat[y[0].split("_")[1]]+=y[2]
|
jpayne@1
|
955 if x[0]=="fljB":
|
jpayne@1
|
956 if y[0].split("_")[1] not in H2_cont_stat:
|
jpayne@1
|
957 H2_cont_stat[y[0].split("_")[1]]=y[2]
|
jpayne@1
|
958 else:
|
jpayne@1
|
959 H2_cont_stat[y[0].split("_")[1]]+=y[2]
|
jpayne@1
|
960 break
|
jpayne@1
|
961 #detect contaminations
|
jpayne@1
|
962 #print(H1_cont_stat)
|
jpayne@1
|
963 #print(H2_cont_stat)
|
jpayne@1
|
964 H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
|
jpayne@1
|
965 H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
|
jpayne@1
|
966 contamination_H=""
|
jpayne@1
|
967 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
|
jpayne@1
|
968 contamination_H="potential contamination from H antigen signals"
|
jpayne@1
|
969 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
|
jpayne@1
|
970 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
|
jpayne@4
|
971 #get additional antigens
|
jpayne@4
|
972 """
|
jpayne@4
|
973 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
|
jpayne@4
|
974 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
|
jpayne@4
|
975 O_choice="O-9,46"
|
jpayne@4
|
976 #print "$$$Most possilble Otype: O-9,46"
|
jpayne@4
|
977 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
|
jpayne@4
|
978 O_choice="O-9,46,27"
|
jpayne@4
|
979 #print "$$$Most possilble Otype: O-9,46,27"
|
jpayne@4
|
980 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
|
jpayne@4
|
981 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
|
jpayne@4
|
982 O_choice="O-3,10"
|
jpayne@4
|
983 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
|
jpayne@4
|
984 else:
|
jpayne@4
|
985 O_choice="O-1,3,19"
|
jpayne@4
|
986 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
|
jpayne@4
|
987 ### end of special test for O9,46 and O3,10 family
|
jpayne@4
|
988
|
jpayne@4
|
989 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
|
jpayne@4
|
990 if len(Otypes_uniq)>2:
|
jpayne@4
|
991 contamination_O="potential contamination from O antigen signals"
|
jpayne@4
|
992 else:
|
jpayne@4
|
993 if len(Otypes_uniq)>1:
|
jpayne@4
|
994 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
|
jpayne@4
|
995 contamination_O=""
|
jpayne@4
|
996 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
|
jpayne@4
|
997 contamination_O=""
|
jpayne@4
|
998 """
|
jpayne@4
|
999 additonal_antigents=[]
|
jpayne@4
|
1000 #print(contamination_O)
|
jpayne@4
|
1001 #print(contamination_H)
|
jpayne@1
|
1002 log_file.write(contamination_O+"\n")
|
jpayne@1
|
1003 log_file.write(contamination_H+"\n")
|
jpayne@1
|
1004 log_file.close()
|
jpayne@4
|
1005 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
|
jpayne@1
|
1006
|
jpayne@1
|
1007 def get_input_K(input_file,lib_dict,data_type,k_size):
|
jpayne@1
|
1008 #kmer mode; get input_Ks from dict and data_type
|
jpayne@1
|
1009 kmers = []
|
jpayne@1
|
1010 for h in lib_dict:
|
jpayne@1
|
1011 kmers += lib_dict[h]
|
jpayne@1
|
1012 if data_type == '4':
|
jpayne@1
|
1013 input_Ks = target_multifasta_kmerizer(input_file, k_size, set(kmers))
|
jpayne@1
|
1014 elif data_type == '1' or data_type == '2' or data_type == '3':#set it for now, will change later
|
jpayne@1
|
1015 input_Ks = target_read_kmerizer(input_file, k_size, set(kmers))
|
jpayne@1
|
1016 elif data_type == '5':#minion_2d_fasta
|
jpayne@1
|
1017 input_Ks = minion_fasta_kmerizer(input_file, k_size, set(kmers))
|
jpayne@1
|
1018 if data_type == '6':#minion_2d_fastq
|
jpayne@1
|
1019 input_Ks = minion_fastq_kmerizer(input_file, k_size, set(kmers))
|
jpayne@1
|
1020 return input_Ks
|
jpayne@1
|
1021
|
jpayne@1
|
1022 def get_kmer_dict(lib_dict,input_Ks):
|
jpayne@1
|
1023 #kmer mode; get predicted types
|
jpayne@1
|
1024 O_dict = {}
|
jpayne@1
|
1025 H_dict = {}
|
jpayne@1
|
1026 Special_dict = {}
|
jpayne@1
|
1027 for h in lib_dict:
|
jpayne@1
|
1028 score = (len(lib_dict[h] & input_Ks) / len(lib_dict[h])) * 100
|
jpayne@1
|
1029 if score > 1: # Arbitrary cut-off for similarity score very low but seems necessary to detect O-3,10 in some cases
|
jpayne@1
|
1030 if h.startswith('O-') and score > 25:
|
jpayne@1
|
1031 O_dict[h] = score
|
jpayne@1
|
1032 if h.startswith('fl') and score > 40:
|
jpayne@1
|
1033 H_dict[h] = score
|
jpayne@1
|
1034 if (h[:2] != 'fl') and (h[:2] != 'O-'):
|
jpayne@1
|
1035 Special_dict[h] = score
|
jpayne@1
|
1036 return O_dict,H_dict,Special_dict
|
jpayne@1
|
1037
|
jpayne@1
|
1038 def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
|
jpayne@1
|
1039 log_file=open("SeqSero_log.txt","a")
|
jpayne@1
|
1040 log_file.write("O_scores:\n")
|
jpayne@1
|
1041 #call O:
|
jpayne@1
|
1042 highest_O = '-'
|
jpayne@1
|
1043 if len(O_dict) == 0:
|
jpayne@1
|
1044 pass
|
jpayne@1
|
1045 else:
|
jpayne@1
|
1046 for x in O_dict:
|
jpayne@1
|
1047 log_file.write(x+"\t"+str(O_dict[x])+"\n")
|
jpayne@1
|
1048 if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1
|
jpayne@1
|
1049 if 'O-9,46_wzy__1191' in O_dict: # and float(O946_wzy)/float(num_1) > 0.1
|
jpayne@1
|
1050 highest_O = "O-9,46"
|
jpayne@1
|
1051 elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1
|
jpayne@1
|
1052 highest_O = "O-9,46,27"
|
jpayne@1
|
1053 else:
|
jpayne@1
|
1054 highest_O = "O-9" # next, detect O9 vs O2?
|
jpayne@1
|
1055 O2 = 0
|
jpayne@1
|
1056 O9 = 0
|
jpayne@1
|
1057 for z in Special_dict:
|
jpayne@1
|
1058 if "tyr-O-9" in z:
|
jpayne@1
|
1059 O9 = float(Special_dict[z])
|
jpayne@1
|
1060 if "tyr-O-2" in z:
|
jpayne@1
|
1061 O2 = float(Special_dict[z])
|
jpayne@1
|
1062 if O2 > O9:
|
jpayne@1
|
1063 highest_O = "O-2"
|
jpayne@1
|
1064 elif ("O-3,10_wzx__1539" in O_dict) and (
|
jpayne@1
|
1065 "O-9,46_wzy__1191" in O_dict
|
jpayne@1
|
1066 ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
|
jpayne@1
|
1067 if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1
|
jpayne@1
|
1068 highest_O = "O-3,10"
|
jpayne@1
|
1069 else:
|
jpayne@1
|
1070 highest_O = "O-1,3,19"
|
jpayne@1
|
1071 ### end of special test for O9,46 and O3,10 family
|
jpayne@1
|
1072 else:
|
jpayne@1
|
1073 try:
|
jpayne@1
|
1074 max_score = 0
|
jpayne@1
|
1075 for x in O_dict:
|
jpayne@1
|
1076 if float(O_dict[x]) >= max_score:
|
jpayne@1
|
1077 max_score = float(O_dict[x])
|
jpayne@1
|
1078 highest_O = x.split("_")[0]
|
jpayne@1
|
1079 if highest_O == "O-1,3,19":
|
jpayne@1
|
1080 highest_O = '-'
|
jpayne@1
|
1081 max_score = 0
|
jpayne@1
|
1082 for x in O_dict:
|
jpayne@1
|
1083 if x == 'O-1,3,19_not_in_3,10__130':
|
jpayne@1
|
1084 pass
|
jpayne@1
|
1085 else:
|
jpayne@1
|
1086 if float(O_dict[x]) >= max_score:
|
jpayne@1
|
1087 max_score = float(O_dict[x])
|
jpayne@1
|
1088 highest_O = x.split("_")[0]
|
jpayne@1
|
1089 except:
|
jpayne@1
|
1090 pass
|
jpayne@1
|
1091 #call_fliC:
|
jpayne@1
|
1092 if len(H_dict)!=0:
|
jpayne@1
|
1093 highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not
|
jpayne@1
|
1094 else:
|
jpayne@1
|
1095 highest_H_score_both_BC=0
|
jpayne@1
|
1096 highest_fliC = '-'
|
jpayne@1
|
1097 highest_fliC_raw = '-'
|
jpayne@1
|
1098 highest_Score = 0
|
jpayne@1
|
1099 log_file.write("\nH_scores:\n")
|
jpayne@1
|
1100 for s in H_dict:
|
jpayne@1
|
1101 log_file.write(s+"\t"+str(H_dict[s])+"\n")
|
jpayne@1
|
1102 if s.startswith('fliC'):
|
jpayne@1
|
1103 if float(H_dict[s]) > highest_Score:
|
jpayne@1
|
1104 highest_fliC = s.split('_')[1]
|
jpayne@1
|
1105 highest_fliC_raw = s
|
jpayne@1
|
1106 highest_Score = float(H_dict[s])
|
jpayne@1
|
1107 #call_fljB
|
jpayne@1
|
1108 highest_fljB = '-'
|
jpayne@1
|
1109 highest_fljB_raw = '-'
|
jpayne@1
|
1110 highest_Score = 0
|
jpayne@1
|
1111 for s in H_dict:
|
jpayne@1
|
1112 if s.startswith('fljB'):
|
jpayne@1
|
1113 if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage, currently 0.65 seems pretty good; the reason use a high (0.65) is some fliC and fljB shared with each other
|
jpayne@1
|
1114 highest_fljB = s.split('_')[1]
|
jpayne@1
|
1115 highest_fljB_raw = s
|
jpayne@1
|
1116 highest_Score = float(H_dict[s])
|
jpayne@1
|
1117 log_file.write("\nSpecial_scores:\n")
|
jpayne@1
|
1118 for s in Special_dict:
|
jpayne@1
|
1119 log_file.write(s+"\t"+str(Special_dict[s])+"\n")
|
jpayne@1
|
1120 log_file.close()
|
jpayne@1
|
1121 return highest_O,highest_fliC,highest_fljB
|
jpayne@1
|
1122
|
jpayne@1
|
1123 def get_temp_file_names(for_fq,rev_fq):
|
jpayne@1
|
1124 #seqsero2 -a; get temp file names
|
jpayne@1
|
1125 sam=for_fq+".sam"
|
jpayne@1
|
1126 bam=for_fq+".bam"
|
jpayne@1
|
1127 sorted_bam=for_fq+"_sorted.bam"
|
jpayne@1
|
1128 mapped_fq1=for_fq+"_mapped.fq"
|
jpayne@1
|
1129 mapped_fq2=rev_fq+"_mapped.fq"
|
jpayne@1
|
1130 combined_fq=for_fq+"_combined.fq"
|
jpayne@1
|
1131 for_sai=for_fq+".sai"
|
jpayne@1
|
1132 rev_sai=rev_fq+".sai"
|
jpayne@1
|
1133 return sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai
|
jpayne@1
|
1134
|
jpayne@1
|
1135 def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
|
jpayne@1
|
1136 #seqsero2 -a; do mapping and sort
|
jpayne@1
|
1137 print("building database...")
|
jpayne@1
|
1138 subprocess.check_output("bwa index "+database,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1139 print("mapping...")
|
jpayne@1
|
1140 if mapping_mode=="mem":
|
jpayne@1
|
1141 subprocess.check_output("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1142 elif mapping_mode=="sam":
|
jpayne@1
|
1143 if fnameB!="":
|
jpayne@1
|
1144 subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1145 subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1146 subprocess.check_output("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1147 else:
|
jpayne@1
|
1148 subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1149 subprocess.check_output("bwa samse "+database+" "+for_sai+" "+for_fq+" > "+sam, stderr=subprocess.STDOUT)
|
jpayne@1
|
1150 subprocess.check_output("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1151 ### check the version of samtools then use differnt commands
|
jpayne@1
|
1152 samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
|
jpayne@1
|
1153 out, err = samtools_version.communicate()
|
jpayne@1
|
1154 version = str(err).split("ersion:")[1].strip().split(" ")[0].strip()
|
jpayne@1
|
1155 print("check samtools version:",version)
|
jpayne@1
|
1156 ### end of samtools version check and its analysis
|
jpayne@1
|
1157 if LooseVersion(version)<=LooseVersion("1.2"):
|
jpayne@1
|
1158 subprocess.check_output("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1159 else:
|
jpayne@1
|
1160 subprocess.check_output("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1161
|
jpayne@1
|
1162 def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode):
|
jpayne@1
|
1163 #seqsero2 -a; extract, assembly and blast
|
jpayne@1
|
1164 subprocess.check_output("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1165 if fnameB!="":
|
jpayne@1
|
1166 subprocess.check_output("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 ,shell=True, stderr=subprocess.STDOUT)#2> /dev/null if want no output
|
jpayne@1
|
1167 else:
|
jpayne@1
|
1168 pass
|
jpayne@1
|
1169 outdir=current_time+"_temp"
|
jpayne@1
|
1170 print("assembling...")
|
jpayne@1
|
1171 if int(threads)>4:
|
jpayne@1
|
1172 t="4"
|
jpayne@1
|
1173 else:
|
jpayne@1
|
1174 t=threads
|
jpayne@12
|
1175 new_fasta = None
|
jpayne@7
|
1176 if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-"
|
jpayne@1
|
1177 if fnameB!="":
|
jpayne@1
|
1178 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1179 else:
|
jpayne@1
|
1180 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1181 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
|
jpayne@1
|
1182 subprocess.check_output("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1183 #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta
|
jpayne@1
|
1184 subprocess.check_output("rm -rf "+outdir+ " 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1185 print("blasting...","\n")
|
jpayne@1
|
1186 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
|
jpayne@1
|
1187 subprocess.check_output('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl',shell=True, stderr=subprocess.STDOUT) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015
|
jpayne@1
|
1188 subprocess.check_output("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5",shell=True, stderr=subprocess.STDOUT)###1/27/2015; 08272018, remove "-word_size 10"
|
jpayne@1
|
1189 else:
|
jpayne@1
|
1190 xmlfile="NA"
|
jpayne@4
|
1191 return xmlfile,new_fasta
|
jpayne@1
|
1192
|
jpayne@1
|
1193 def judge_subspecies(fnameA):
|
jpayne@1
|
1194 #seqsero2 -a; judge subspecies on just forward raw reads fastq
|
jpayne@12
|
1195 salmID_output=subprocess.check_output(os.path.join(dirpath, "SalmID/SalmID.py") + ' -i ' +fnameA, shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1196 #out, err = salmID_output.communicate()
|
jpayne@1
|
1197 #out=out.decode("utf-8")
|
jpayne@1
|
1198 out = salmID_output.decode("utf-8")
|
jpayne@1
|
1199 file=open("data_log.txt","a")
|
jpayne@1
|
1200 file.write(out)
|
jpayne@1
|
1201 file.close()
|
jpayne@1
|
1202 salm_species_scores=out.split("\n")[1].split("\t")[6:]
|
jpayne@1
|
1203 salm_species_results=out.split("\n")[0].split("\t")[6:]
|
jpayne@1
|
1204 max_score=0
|
jpayne@1
|
1205 max_score_index=1 #default is 1, means "I"
|
jpayne@1
|
1206 for i in range(len(salm_species_scores)):
|
jpayne@1
|
1207 if max_score<float(salm_species_scores[i]):
|
jpayne@1
|
1208 max_score=float(salm_species_scores[i])
|
jpayne@1
|
1209 max_score_index=i
|
jpayne@1
|
1210 prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0]
|
jpayne@1
|
1211 if float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): #bongori and enterica compare
|
jpayne@1
|
1212 prediction="bongori" #if not, the prediction would always be enterica, since they are located in the later part
|
jpayne@1
|
1213 if max_score<10:
|
jpayne@1
|
1214 prediction="-"
|
jpayne@1
|
1215 return prediction
|
jpayne@1
|
1216
|
jpayne@1
|
1217 def judge_subspecies_Kmer(Special_dict):
|
jpayne@1
|
1218 #seqsero2 -k;
|
jpayne@1
|
1219 max_score=0
|
jpayne@1
|
1220 prediction="-" #default should be I
|
jpayne@1
|
1221 for x in Special_dict:
|
jpayne@1
|
1222 if "mer" in x:
|
jpayne@1
|
1223 if max_score<float(Special_dict[x]):
|
jpayne@1
|
1224 max_score=float(Special_dict[x])
|
jpayne@1
|
1225 prediction=x.split("_")[-1].strip()
|
jpayne@1
|
1226 if x.split("_")[-1].strip()=="bongori" and float(Special_dict[x])>95:#if bongori already, then no need to test enterica
|
jpayne@1
|
1227 prediction="bongori"
|
jpayne@1
|
1228 break
|
jpayne@1
|
1229 return prediction
|
jpayne@1
|
1230
|
jpayne@1
|
1231 def main():
|
jpayne@1
|
1232 #combine SeqSeroK and SeqSero2, also with SalmID
|
jpayne@1
|
1233 args = parse_args()
|
jpayne@1
|
1234 input_file = args.i
|
jpayne@1
|
1235 data_type = args.t
|
jpayne@1
|
1236 analysis_mode = args.m
|
jpayne@1
|
1237 mapping_mode=args.b
|
jpayne@1
|
1238 threads=args.p
|
jpayne@1
|
1239 make_dir=args.d
|
jpayne@1
|
1240 clean_mode=args.c
|
jpayne@1
|
1241 k_size=27 #will change for bug fixing
|
jpayne@1
|
1242 database="H_and_O_and_specific_genes.fasta"
|
jpayne@1
|
1243
|
jpayne@1
|
1244 if len(sys.argv)==1:
|
jpayne@1
|
1245 subprocess.check_output(dirpath+"/SeqSero2_package.py -h",shell=True, stderr=subprocess.STDOUT)#change name of python file
|
jpayne@1
|
1246 else:
|
jpayne@1
|
1247 request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
|
jpayne@1
|
1248 request_id += str(random.randint(1, 10000000))
|
jpayne@1
|
1249 if make_dir is None:
|
jpayne@1
|
1250 make_dir="SeqSero_result_"+request_id
|
jpayne@1
|
1251 if os.path.isdir(make_dir):
|
jpayne@1
|
1252 pass
|
jpayne@1
|
1253 else:
|
jpayne@1
|
1254 subprocess.check_output(["mkdir",make_dir])
|
jpayne@1
|
1255 #subprocess.check_output("cp "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1256 subprocess.check_output("ln -srf "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1257 ############################begin the real analysis
|
jpayne@1
|
1258 if analysis_mode=="a":
|
jpayne@1
|
1259 if data_type in ["1","2","3"]:#use allele mode
|
jpayne@12
|
1260 H1_cont_stat_list = H2_cont_stat_list = []
|
jpayne@1
|
1261 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
|
jpayne@1
|
1262 os.chdir(make_dir)
|
jpayne@1
|
1263 ###add a function to tell input files
|
jpayne@1
|
1264 fnameA=for_fq.split("/")[-1]
|
jpayne@1
|
1265 fnameB=rev_fq.split("/")[-1]
|
jpayne@1
|
1266 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
|
jpayne@1
|
1267 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
|
jpayne@1
|
1268 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
|
jpayne@4
|
1269 xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
|
jpayne@1
|
1270 if xmlfile=="NA":
|
jpayne@1
|
1271 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
|
jpayne@1
|
1272 else:
|
jpayne@1
|
1273 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
|
jpayne@1
|
1274 file=open("data_log.txt","a")
|
jpayne@1
|
1275 for x in Final_list:
|
jpayne@1
|
1276 file.write("\t".join(str(y) for y in x)+"\n")
|
jpayne@1
|
1277 file.close()
|
jpayne@7
|
1278 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
|
jpayne@4
|
1279 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
|
jpayne@1
|
1280 subspecies=judge_subspecies(fnameA) #predict subspecies
|
jpayne@1
|
1281 ###output
|
jpayne@1
|
1282 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
|
jpayne@7
|
1283 claim="" #04132019, disable claim for new report requirement
|
jpayne@1
|
1284 contamination_report=""
|
jpayne@4
|
1285 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
|
jpayne@1
|
1286 if contamination_O!="" and contamination_H=="":
|
jpayne@4
|
1287 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
|
jpayne@1
|
1288 elif contamination_O=="" and contamination_H!="":
|
jpayne@4
|
1289 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
|
jpayne@1
|
1290 elif contamination_O!="" and contamination_H!="":
|
jpayne@4
|
1291 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
|
jpayne@4
|
1292 if contamination_report!="":
|
jpayne@7
|
1293 #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019
|
jpayne@7
|
1294 contamination_report="Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles."
|
jpayne@7
|
1295 #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences #04132019, need to change if using web-version
|
jpayne@7
|
1296 if contamination_report+star_line+claim=="": #0413, new output style
|
jpayne@7
|
1297 note=""
|
jpayne@7
|
1298 else:
|
jpayne@7
|
1299 note="Note:"
|
jpayne@1
|
1300 if clean_mode:
|
jpayne@1
|
1301 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1302 make_dir="none-output-directory due to '-c' flag"
|
jpayne@1
|
1303 else:
|
jpayne@1
|
1304 #new_file=open("Seqsero_result.txt","w")
|
jpayne@1
|
1305 if O_choice=="":
|
jpayne@1
|
1306 O_choice="-"
|
jpayne@1
|
1307 result = OrderedDict(
|
jpayne@1
|
1308 sample_name = input_file[0],
|
jpayne@1
|
1309 O_antigen_prediction = O_choice,
|
jpayne@1
|
1310 H1_antigen_prediction = fliC_choice,
|
jpayne@1
|
1311 H2_antigen_prediction = fljB_choice,
|
jpayne@1
|
1312 predicted_antigenic_profile = predict_form,
|
jpayne@1
|
1313 predicted_subspecies = subspecies,
|
jpayne@7
|
1314 predicted_serotype = "{}{}".format(predict_sero, star),
|
jpayne@1
|
1315 note=claim.replace('\n','')
|
jpayne@1
|
1316 )
|
jpayne@9
|
1317 result['*'] = star_line.replace('\t', ' ').replace('\n','')
|
jpayne@1
|
1318 with open("Seqsero_result.tsv","w") as new_file:
|
jpayne@1
|
1319 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
|
jpayne@1
|
1320 #new_file.close()
|
jpayne@1
|
1321 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
|
jpayne@1
|
1322 wrtr.writeheader()
|
jpayne@1
|
1323 wrtr.writerow(result)
|
jpayne@1
|
1324
|
jpayne@1
|
1325
|
jpayne@1
|
1326 # new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+contamination_report+star+star_line+claim+"\n")#+##
|
jpayne@1
|
1327 # new_file.close()
|
jpayne@1
|
1328 #subprocess.check_output("cat Seqsero_result.txt",shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1329 #subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt *.xml "+fnameA+"*_db* 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1330 subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt "+fnameA+"*_db* 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1331 print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+contamination_report+star+star_line+claim+"\n")#+##
|
jpayne@1
|
1332 else:
|
jpayne@1
|
1333 print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'")
|
jpayne@1
|
1334 elif analysis_mode=="k":
|
jpayne@1
|
1335 ex_dir = os.path.dirname(os.path.realpath(__file__))
|
jpayne@1
|
1336 #output_mode = args.mode
|
jpayne@1
|
1337 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
|
jpayne@1
|
1338 input_file = for_fq #-k will just use forward because not all reads were used
|
jpayne@1
|
1339 os.chdir(make_dir)
|
jpayne@1
|
1340 f = open(ex_dir + '/antigens.pickle', 'rb')
|
jpayne@1
|
1341 lib_dict = pickle.load(f)
|
jpayne@1
|
1342 f.close
|
jpayne@1
|
1343 input_Ks = get_input_K(input_file,lib_dict,data_type,k_size)
|
jpayne@1
|
1344 O_dict,H_dict,Special_dict = get_kmer_dict(lib_dict,input_Ks)
|
jpayne@1
|
1345 highest_O,highest_fliC,highest_fljB = call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir)
|
jpayne@1
|
1346 subspecies = judge_subspecies_Kmer(Special_dict)
|
jpayne@1
|
1347 if subspecies=="IIb" or subspecies=="IIa":
|
jpayne@1
|
1348 subspecies="II"
|
jpayne@1
|
1349 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
|
jpayne@1
|
1350 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
|
jpayne@7
|
1351 claim="" #no claim any more based on new output requirement
|
jpayne@7
|
1352 if star_line+claim=="": #0413, new output style
|
jpayne@7
|
1353 note=""
|
jpayne@7
|
1354 else:
|
jpayne@7
|
1355 note="Note:"
|
jpayne@1
|
1356 if clean_mode:
|
jpayne@1
|
1357 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1358 make_dir="none-output-directory due to '-c' flag"
|
jpayne@1
|
1359 else:
|
jpayne@1
|
1360 if highest_O.split('-')[-1]=="":
|
jpayne@1
|
1361 O_choice="-"
|
jpayne@1
|
1362 else:
|
jpayne@1
|
1363 O_choice=highest_O.split('-')[-1]
|
jpayne@1
|
1364 #print("Output_directory:"+make_dir+"\tInput_file:"+input_file+"\tPredicted subpecies:"+subspecies + '\tPredicted antigenic profile:' + predict_form + '\tPredicted serotype(s):' + predict_sero)
|
jpayne@1
|
1365 result = OrderedDict(
|
jpayne@1
|
1366 sample_name = input_file,
|
jpayne@1
|
1367 O_antigen_prediction = O_choice,
|
jpayne@1
|
1368 H1_antigen_prediction = highest_fliC,
|
jpayne@1
|
1369 H2_antigen_prediction = highest_fljB,
|
jpayne@1
|
1370 predicted_antigenic_profile = predict_form,
|
jpayne@1
|
1371 predicted_subspecies = subspecies,
|
jpayne@7
|
1372 predicted_serotype = "{}{}".format(predict_sero, star),
|
jpayne@1
|
1373 note=claim.replace('\n','')
|
jpayne@1
|
1374 )
|
jpayne@9
|
1375 result['*'] = star_line.replace('\t', ' ').replace('\n','')
|
jpayne@1
|
1376 with open("Seqsero_result.tsv","w") as new_file:
|
jpayne@1
|
1377 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
|
jpayne@1
|
1378 #new_file.close()
|
jpayne@1
|
1379 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
|
jpayne@1
|
1380 wrtr.writeheader()
|
jpayne@1
|
1381 wrtr.writerow(result)
|
jpayne@1
|
1382 subprocess.call("rm *.fasta* *.fastq *.gz *.fq temp.txt *.sra 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
|
jpayne@1
|
1383 print("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
|
jpayne@1
|
1384 return 0
|
jpayne@1
|
1385
|
jpayne@1
|
1386 if __name__ == '__main__':
|
jpayne@1
|
1387 try:
|
jpayne@1
|
1388 quit(main())
|
jpayne@1
|
1389 except subprocess.CalledProcessError as e:
|
jpayne@1
|
1390 print(str(e.output))
|
jpayne@1
|
1391 print(e)
|
jpayne@1
|
1392 quit(e.returncode)
|