comparison SeqSero2/SalmID/SalmID.py @ 12:4ac593d4b40f

planemo upload
author jpayne
date Wed, 14 Aug 2019 17:45:41 -0400
comparison 11:f6f0702de3b4 → 12:4ac593d4b40f
#!/usr/bin/env python3


import gzip
import io
import pickle
import os
import sys

from argparse import ArgumentParser

try:
    from version import SalmID_version
except ImportError:
    SalmID_version = "version unknown"


def reverse_complement(sequence):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'M': 'K', 'R': 'Y', 'W': 'W',
                  'S': 'S', 'Y': 'R', 'K': 'M', 'V': 'B', 'H': 'D', 'D': 'H', 'B': 'V'}
    return "".join(complement[base] for base in reversed(sequence))

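# Illustrative behaviour of reverse_complement() (only the uppercase bases and IUPAC
# ambiguity codes listed in the table above are handled; any other character raises KeyError):
#   reverse_complement('ACGT') -> 'ACGT'
#   reverse_complement('AAGC') -> 'GCTT'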

def parse_args():
    "Parse the input arguments, use '-h' for help."
    parser = ArgumentParser(description='SalmID - rapid Kmer based Salmonella identifier from sequence data')
    # inputs
    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SalmID_version)
    parser.add_argument(
        '-i', '--input_file', type=str, required=False, default='None', metavar='your_fastqgz',
        help='Single fastq.gz file input; include the path to the file if it is not in the current directory.')
    parser.add_argument(
        '-e', '--extension', type=str, required=False, default='.fastq.gz', metavar='file_extension',
        help='File extension; if specified without "--input_dir", SalmID will attempt to ID all files '
             'with this extension in the current directory, otherwise files in the input directory.')
    parser.add_argument(
        '-d', '--input_dir', type=str, required=False, default='.', metavar='directory',
        help='Directory which contains the data for identification; when not specified, files in the current directory will be analyzed.')
    parser.add_argument(
        '-r', '--report', type=str, required=False, default='percentage', metavar='percentage, coverage or taxonomy',
        help='Report either the percentage ("percentage") of clade-specific kmers recovered, the average kmer coverage ("cov"), or '
             'the taxonomy ("taxonomy": taxonomic species ID, plus observed mean k-mer coverages and expected coverage).')
    parser.add_argument(
        '-m', '--mode', type=str, required=False, default='quick', metavar='quick or thorough',
        help='Quick [quick] or thorough [thorough] mode')
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    return parser.parse_args()

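# Example invocations (illustrative only; file and directory names are placeholders, the
# flags are those defined in parse_args(), and main() expects the pickled k-mer
# dictionaries to sit next to this script):
#   SalmID.py -i sample.fastq.gz -r taxonomy
#   SalmID.py -d /path/to/fastq_dir -e .fastq.gz -m thorough -r cov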

def get_av_read_length(file):
    """Estimate the mean read length from (at most) the first 100 reads of a FASTQ(.gz) file."""
    i = 1
    n_reads = 0
    total_length = 0
    if file.endswith(".gz"):
        file_content = io.BufferedReader(gzip.open(file))
    else:
        file_content = open(file, "r").readlines()
    for line in file_content:
        if i % 4 == 2:  # the second line of every 4-line FASTQ record holds the sequence
            total_length += len(line.strip())
            n_reads += 1
        i += 1
        if n_reads == 100:
            break
    # average over the reads actually sampled (at most 100)
    return total_length / n_reads if n_reads else 0

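# For example, a FASTQ file whose first 100 reads are all 151 bp long yields
# get_av_read_length(path) == 151.0; only the first 100 reads are ever sampled.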

def createKmerDict_reads(list_of_strings, kmer):
    """Count every k-mer and its reverse complement across a list of read sequences."""
    kmer_table = {}
    for string in list_of_strings:
        sequence = string.strip('\n')
        for i in range(len(sequence) - kmer + 1):
            new_mer = sequence[i:i + kmer].upper()
            new_mer_rc = reverse_complement(new_mer)
            kmer_table[new_mer] = kmer_table.get(new_mer, 0) + 1
            kmer_table[new_mer_rc] = kmer_table.get(new_mer_rc, 0) + 1
    return kmer_table

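# Worked example (assuming plain uppercase reads): createKmerDict_reads(['ATGCA'], 3)
# returns {'ATG': 1, 'CAT': 1, 'TGC': 2, 'GCA': 2} -- 'GCA' is counted both as a forward
# 3-mer and as the reverse complement of 'TGC', so both strands contribute.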

def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode):
    """Screen reads by their central k-mer against two k-mer sets and return the per-set
    k-mer count dictionaries, mean k-mer frequencies and the total number of reads seen."""
    mean_1 = None
    mean_2 = None
    i = 1
    n_reads_1 = 0
    n_reads_2 = 0
    total_coverage_1 = 0
    total_coverage_2 = 0
    reads_1 = []
    reads_2 = []
    total_reads = 0
    if file.endswith(".gz"):
        file_content = io.BufferedReader(gzip.open(file))
    else:
        file_content = open(file, "r").readlines()
    for line in file_content:
        start = int((len(line) - k) // 2)
        if i % 4 == 2:  # sequence line of a FASTQ record
            total_reads += 1
            if file.endswith(".gz"):
                s1 = line[start:k + start].decode()
                line = line.decode()
            else:
                s1 = line[start:k + start]
            if s1 in kmerDict_1:
                n_reads_1 += 1
                total_coverage_1 += len(line)
                reads_1.append(line)
            if s1 in kmerDict_2:
                n_reads_2 += 1
                total_coverage_2 += len(line)
                reads_2.append(line)
        i += 1
        if mode == 'quick':
            if total_coverage_2 >= 800000:
                break

    if len(reads_1) == 0:
        kmer_Dict1 = {}
    else:
        kmer_Dict1 = createKmerDict_reads(reads_1, k)
        mers_1 = set(kmer_Dict1)
        mean_1 = sum(kmer_Dict1.values()) / len(mers_1)
    if len(reads_2) == 0:
        kmer_Dict2 = {}
    else:
        kmer_Dict2 = createKmerDict_reads(reads_2, k)
        mers_2 = set(kmer_Dict2)
        mean_2 = sum(kmer_Dict2.values()) / len(mers_2)
    return kmer_Dict1, kmer_Dict2, mean_1, mean_2, total_reads

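# Only the central k-mer of each read is looked up: with k = 27 and a 150 bp read
# (151 characters including its newline), start = (151 - 27) // 2 = 62, so the 27-mer
# line[62:89] from the middle of the read decides whether the read is kept for counting.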

def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers):
    '''
    Given an iterable (list, set, dictionary), returns the mean coverage for the kmers in the iterable
    :param iterable: set, list or dictionary containing kmers
    :param kmer_dict: dictionary with kmers as keys, kmer frequency as value
    :param clade_specific_kmers: list, dict or set of clade-specific kmers
    :return: mean frequency as float
    '''
    if len(iterable) == 0:
        return 0
    return sum([kmer_dict[value] for value in iterable]) / len(clade_specific_kmers)

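# Note that the denominator is the full clade-specific set, not just the kmers observed:
# if 3 of 10 clade-specific kmers are present with counts 4, 5 and 6, the function
# returns (4 + 5 + 6) / 10 = 1.5.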

def kmer_lists(query_fastq_gz, k,
               allmers, allmers_rpoB,
               uniqmers_bongori,
               uniqmers_I,
               uniqmers_IIa,
               uniqmers_IIb,
               uniqmers_IIIa,
               uniqmers_IIIb,
               uniqmers_IV,
               uniqmers_VI,
               uniqmers_VII,
               uniqmers_VIII,
               uniqmers_bongori_rpoB,
               uniqmers_S_enterica_rpoB,
               uniqmers_Escherichia_rpoB,
               uniqmers_Listeria_ss_rpoB,
               uniqmers_Lmono_rpoB,
               mode):
    dict_invA, dict_rpoB, mean_invA, mean_rpoB, total_reads = target_read_kmerizer_multi(query_fastq_gz, k, allmers,
                                                                                          allmers_rpoB, mode)
    target_mers_invA = set(dict_invA)
    target_mers_rpoB = set(dict_rpoB)
    if len(target_mers_invA) == 0:
        print('No reads found matching invA, no Salmonella in sample?')
    # Percentages and coverages are computed in every case (they are simply zero when no
    # matching reads were found) so that the values returned below are always defined.
    p_bongori = (len(uniqmers_bongori & target_mers_invA) / len(uniqmers_bongori)) * 100
    p_I = (len(uniqmers_I & target_mers_invA) / len(uniqmers_I)) * 100
    p_IIa = (len(uniqmers_IIa & target_mers_invA) / len(uniqmers_IIa)) * 100
    p_IIb = (len(uniqmers_IIb & target_mers_invA) / len(uniqmers_IIb)) * 100
    p_IIIa = (len(uniqmers_IIIa & target_mers_invA) / len(uniqmers_IIIa)) * 100
    p_IIIb = (len(uniqmers_IIIb & target_mers_invA) / len(uniqmers_IIIb)) * 100
    p_VI = (len(uniqmers_VI & target_mers_invA) / len(uniqmers_VI)) * 100
    p_IV = (len(uniqmers_IV & target_mers_invA) / len(uniqmers_IV)) * 100
    p_VII = (len(uniqmers_VII & target_mers_invA) / len(uniqmers_VII)) * 100
    p_VIII = (len(uniqmers_VIII & target_mers_invA) / len(uniqmers_VIII)) * 100
    p_bongori_rpoB = (len(uniqmers_bongori_rpoB & target_mers_rpoB) / len(uniqmers_bongori_rpoB)) * 100
    p_Senterica = (len(uniqmers_S_enterica_rpoB & target_mers_rpoB) / len(uniqmers_S_enterica_rpoB)) * 100
    p_Escherichia = (len(uniqmers_Escherichia_rpoB & target_mers_rpoB) / len(uniqmers_Escherichia_rpoB)) * 100
    p_Listeria_ss = (len(uniqmers_Listeria_ss_rpoB & target_mers_rpoB) / len(uniqmers_Listeria_ss_rpoB)) * 100
    p_Lmono = (len(uniqmers_Lmono_rpoB & target_mers_rpoB) / len(uniqmers_Lmono_rpoB)) * 100
    bongori_invA_cov = mean_cov_selected_kmers(uniqmers_bongori & target_mers_invA, dict_invA, uniqmers_bongori)
    I_invA_cov = mean_cov_selected_kmers(uniqmers_I & target_mers_invA, dict_invA, uniqmers_I)
    IIa_invA_cov = mean_cov_selected_kmers(uniqmers_IIa & target_mers_invA, dict_invA, uniqmers_IIa)
    IIb_invA_cov = mean_cov_selected_kmers(uniqmers_IIb & target_mers_invA, dict_invA, uniqmers_IIb)
    IIIa_invA_cov = mean_cov_selected_kmers(uniqmers_IIIa & target_mers_invA, dict_invA, uniqmers_IIIa)
    IIIb_invA_cov = mean_cov_selected_kmers(uniqmers_IIIb & target_mers_invA, dict_invA, uniqmers_IIIb)
    IV_invA_cov = mean_cov_selected_kmers(uniqmers_IV & target_mers_invA, dict_invA, uniqmers_IV)
    VI_invA_cov = mean_cov_selected_kmers(uniqmers_VI & target_mers_invA, dict_invA, uniqmers_VI)
    VII_invA_cov = mean_cov_selected_kmers(uniqmers_VII & target_mers_invA, dict_invA, uniqmers_VII)
    VIII_invA_cov = mean_cov_selected_kmers(uniqmers_VIII & target_mers_invA, dict_invA, uniqmers_VIII)
    S_enterica_rpoB_cov = mean_cov_selected_kmers((uniqmers_S_enterica_rpoB & target_mers_rpoB), dict_rpoB,
                                                  uniqmers_S_enterica_rpoB)
    S_bongori_rpoB_cov = mean_cov_selected_kmers((uniqmers_bongori_rpoB & target_mers_rpoB), dict_rpoB,
                                                 uniqmers_bongori_rpoB)
    Escherichia_rpoB_cov = mean_cov_selected_kmers((uniqmers_Escherichia_rpoB & target_mers_rpoB), dict_rpoB,
                                                   uniqmers_Escherichia_rpoB)
    Listeria_ss_rpoB_cov = mean_cov_selected_kmers((uniqmers_Listeria_ss_rpoB & target_mers_rpoB), dict_rpoB,
                                                   uniqmers_Listeria_ss_rpoB)
    Lmono_rpoB_cov = mean_cov_selected_kmers((uniqmers_Lmono_rpoB & target_mers_rpoB), dict_rpoB,
                                             uniqmers_Lmono_rpoB)
    coverages = [Listeria_ss_rpoB_cov, Lmono_rpoB_cov, Escherichia_rpoB_cov, S_bongori_rpoB_cov,
                 S_enterica_rpoB_cov, bongori_invA_cov, I_invA_cov, IIa_invA_cov, IIb_invA_cov,
                 IIIa_invA_cov, IIIb_invA_cov, IV_invA_cov, VI_invA_cov, VII_invA_cov, VIII_invA_cov]
    locus_scores = [p_Listeria_ss, p_Lmono, p_Escherichia, p_bongori_rpoB, p_Senterica, p_bongori,
                    p_I, p_IIa, p_IIb, p_IIIa, p_IIIb, p_IV, p_VI, p_VII, p_VIII]
    return locus_scores, coverages, total_reads


def report_taxon(locus_covs, average_read_length, number_of_reads):
    list_taxa = ['Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.',
                 'Salmonella bongori (rpoB)', 'Salmonella enterica (rpoB)',
                 'Salmonella bongori (invA)', 'S. enterica subsp. enterica (invA)',
                 'S. enterica subsp. salamae (invA: clade a)', 'S. enterica subsp. salamae (invA: clade b)',
                 'S. enterica subsp. arizonae (invA)', 'S. enterica subsp. diarizonae (invA)',
                 'S. enterica subsp. houtenae (invA)', 'S. enterica subsp. indica (invA)',
                 'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)']
    if sum(locus_covs) < 1:
        rpoB = ('No rpoB matches!', 0)
        invA = ('No invA matches!', 0)
        return rpoB, invA, 0.0
    else:
        # given the list of coverages, pick the best-supported taxon for each locus
        if sum(locus_covs[0:5]) > 0:
            # best rpoB hit below the Listeria sensu stricto level (indices 1-4) ...
            best_rpoB = max(range(len(locus_covs[1:5])), key=lambda x: locus_covs[1:5][x]) + 1
            # ... and the best rpoB hit overall (index 0 = Listeria sensu stricto)
            all_rpoB = max(range(len(locus_covs[0:5])), key=lambda x: locus_covs[0:5][x])
            if (locus_covs[best_rpoB] != 0) and (all_rpoB == 0):
                rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB])
            elif (all_rpoB == 0) and (round(sum(locus_covs[1:5]), 1) < 1):
                # Listeria sensu stricto is reported only when the more specific rpoB taxa
                # have essentially no coverage
                rpoB = (list_taxa[0], locus_covs[0])
            else:
                rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB])
        else:
            rpoB = ('No rpoB matches!', 0)
        if sum(locus_covs[5:]) > 0:
            best_invA = max(range(len(locus_covs[5:])), key=lambda x: locus_covs[5:][x]) + 5
            invA = (list_taxa[best_invA], locus_covs[best_invA])
        else:
            invA = ('No invA matches!', 0)
        # expected coverage = total sequenced bases divided by an approximate genome size
        # (~3 Mb when Listeria is called, ~5 Mb otherwise)
        if 'Listeria' in rpoB[0]:
            return rpoB, invA, (average_read_length * number_of_reads) / 3000000
        else:
            return rpoB, invA, (average_read_length * number_of_reads) / 5000000

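# Expected-coverage example: with an average read length of 150 bp and 1,000,000 reads,
# a non-Listeria sample reports (150 * 1000000) / 5000000 = 30.0x expected coverage.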


def main():
    ex_dir = os.path.dirname(os.path.realpath(__file__))
    args = parse_args()
    input_file = args.input_file
    if input_file != 'None':
        files = [input_file]
    else:
        extension = args.extension
        inputdir = args.input_dir
        files = [inputdir + '/' + f for f in os.listdir(inputdir) if f.endswith(extension)]
    report = args.report
    mode = args.mode
    f_invA = open(ex_dir + "/invA_mers_dict", "rb")
    sets_dict_invA = pickle.load(f_invA)
    f_invA.close()
    allmers = sets_dict_invA['allmers']
    uniqmers_I = sets_dict_invA['uniqmers_I']
    uniqmers_IIa = sets_dict_invA['uniqmers_IIa']
    uniqmers_IIb = sets_dict_invA['uniqmers_IIb']
    uniqmers_IIIa = sets_dict_invA['uniqmers_IIIa']
    uniqmers_IIIb = sets_dict_invA['uniqmers_IIIb']
    uniqmers_IV = sets_dict_invA['uniqmers_IV']
    uniqmers_VI = sets_dict_invA['uniqmers_VI']
    uniqmers_VII = sets_dict_invA['uniqmers_VII']
    uniqmers_VIII = sets_dict_invA['uniqmers_VIII']
    uniqmers_bongori = sets_dict_invA['uniqmers_bongori']

    f = open(ex_dir + "/rpoB_mers_dict", "rb")
    sets_dict = pickle.load(f)
    f.close()

    allmers_rpoB = sets_dict['allmers']
    uniqmers_bongori_rpoB = sets_dict['uniqmers_bongori']
    uniqmers_S_enterica_rpoB = sets_dict['uniqmers_S_enterica']
    uniqmers_Escherichia_rpoB = sets_dict['uniqmers_Escherichia']
    uniqmers_Listeria_ss_rpoB = sets_dict['uniqmers_Listeria_ss']
    uniqmers_Lmono_rpoB = sets_dict['uniqmers_L_mono']
    # todo: run kmer_lists() once and build a list of tuples containing the data to be used for the different reports
    if report == 'taxonomy':
        print('file\trpoB\tinvA\texpected coverage')
        for f in files:
            locus_scores, coverages, reads = kmer_lists(f, 27,
                                                        allmers, allmers_rpoB,
                                                        uniqmers_bongori,
                                                        uniqmers_I,
                                                        uniqmers_IIa,
                                                        uniqmers_IIb,
                                                        uniqmers_IIIa,
                                                        uniqmers_IIIb,
                                                        uniqmers_IV,
                                                        uniqmers_VI,
                                                        uniqmers_VII,
                                                        uniqmers_VIII,
                                                        uniqmers_bongori_rpoB,
                                                        uniqmers_S_enterica_rpoB,
                                                        uniqmers_Escherichia_rpoB,
                                                        uniqmers_Listeria_ss_rpoB,
                                                        uniqmers_Lmono_rpoB,
                                                        mode)
            pretty_covs = [round(cov, 1) for cov in coverages]
            # use a distinct name so the selected report type ('report') is not rebound in the loop
            taxon_report = report_taxon(pretty_covs, get_av_read_length(f), reads)
            print(f.split('/')[-1] + '\t' + taxon_report[0][0] + '[' + str(taxon_report[0][1]) + ']' + '\t' +
                  taxon_report[1][0] + '[' + str(taxon_report[1][1]) + ']' +
                  '\t' + str(round(taxon_report[2], 1)))
    else:
        print(
            'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica ' +
            '(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' +
            ' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp. IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' +
            '\tsubsp. II (clade VIII: invA)')
        if report == 'percentage':
            for f in files:
                locus_scores, coverages, reads = kmer_lists(f, 27,
                                                            allmers, allmers_rpoB,
                                                            uniqmers_bongori,
                                                            uniqmers_I,
                                                            uniqmers_IIa,
                                                            uniqmers_IIb,
                                                            uniqmers_IIIa,
                                                            uniqmers_IIIb,
                                                            uniqmers_IV,
                                                            uniqmers_VI,
                                                            uniqmers_VII,
                                                            uniqmers_VIII,
                                                            uniqmers_bongori_rpoB,
                                                            uniqmers_S_enterica_rpoB,
                                                            uniqmers_Escherichia_rpoB,
                                                            uniqmers_Listeria_ss_rpoB,
                                                            uniqmers_Lmono_rpoB,
                                                            mode)
                pretty_scores = [str(round(score)) for score in locus_scores]
                print(f.split('/')[-1] + '\t' + '\t'.join(pretty_scores))
        else:
            for f in files:
                locus_scores, coverages, reads = kmer_lists(f, 27,
                                                            allmers, allmers_rpoB,
                                                            uniqmers_bongori,
                                                            uniqmers_I,
                                                            uniqmers_IIa,
                                                            uniqmers_IIb,
                                                            uniqmers_IIIa,
                                                            uniqmers_IIIb,
                                                            uniqmers_IV,
                                                            uniqmers_VI,
                                                            uniqmers_VII,
                                                            uniqmers_VIII,
                                                            uniqmers_bongori_rpoB,
                                                            uniqmers_S_enterica_rpoB,
                                                            uniqmers_Escherichia_rpoB,
                                                            uniqmers_Listeria_ss_rpoB,
                                                            uniqmers_Lmono_rpoB,
                                                            mode)
                pretty_covs = [str(round(cov, 1)) for cov in coverages]
                print(f.split('/')[-1] + '\t' + '\t'.join(pretty_covs))


if __name__ == '__main__':
    main()