Mercurial > repos > jpayne > seqsero_v2
comparison SalmID/SalmID.py @ 1:fae43708974d
planemo upload
author | jpayne |
---|---|
date | Fri, 09 Nov 2018 11:30:45 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:4ff2aee11e5b | 1:fae43708974d |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 | |
4 import gzip | |
5 import io | |
6 import pickle | |
7 import os | |
8 import sys | |
9 | |
10 from argparse import ArgumentParser | |
11 try: | |
12 from version import SalmID_version | |
13 except: | |
14 SalmID_version = "version unknown" | |
15 | |
16 | |
17 def reverse_complement(sequence): | |
18 complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'M': 'K', 'R': 'Y', 'W': 'W', | |
19 'S': 'S', 'Y': 'R', 'K': 'M', 'V': 'B', 'H': 'D', 'D': 'H', 'B': 'V'} | |
20 return "".join(complement[base] for base in reversed(sequence)) | |
21 | |
22 | |
23 def parse_args(): | |
24 "Parse the input arguments, use '-h' for help." | |
25 parser = ArgumentParser(description='SalmID - rapid Kmer based Salmonella identifier from sequence data') | |
26 # inputs | |
27 parser.add_argument('-v','--version', action='version', version='%(prog)s ' + SalmID_version) | |
28 parser.add_argument( | |
29 '-i','--input_file', type=str, required=False, default= 'None', metavar = 'your_fastqgz', | |
30 help='Single fastq.gz file input, include path to file if file is not in same directory ') | |
31 parser.add_argument( | |
32 '-e', '--extension', type=str, required=False, default= '.fastq.gz', metavar = 'file_extension', | |
33 help='File extension, if specified without "--input_dir", SalmID will attempt to ID all files\n' + | |
34 ' with this extension in current directory, otherwise files in input directory') | |
35 | |
36 parser.add_argument( | |
37 '-d','--input_dir', type=str, required=False, default='.', metavar = 'directory', | |
38 help='Directory which contains data for identification, when not specified files in current directory will be analyzed.') | |
39 parser.add_argument( | |
40 '-r', '--report', type=str, required=False, default='percentage', metavar = 'percentage, coverage or taxonomy', | |
41 help='Report either percentage ("percentage") of clade specific kmers recovered, average kmer-coverage ("cov"), or ' | |
42 'taxonomy (taxonomic species ID, plus observed mean k-mer coverages and expected coverage).') | |
43 parser.add_argument( | |
44 '-m', '--mode', type=str, required=False, default='quick', metavar = 'quick or thorough', | |
45 help='Quick [quick] or thorough [thorough] mode') | |
46 if len(sys.argv)==1: | |
47 parser.print_help(sys.stderr) | |
48 sys.exit(1) | |
49 return parser.parse_args() | |
50 | |
51 def get_av_read_length(file): | |
52 i = 1 | |
53 n_reads = 0 | |
54 total_length = 0 | |
55 if file.endswith(".gz"): | |
56 file_content=io.BufferedReader(gzip.open(file)) | |
57 else: | |
58 file_content=open(file,"r").readlines() | |
59 for line in file_content: | |
60 if i % 4 == 2: | |
61 total_length += len(line.strip()) | |
62 n_reads +=1 | |
63 i += 1 | |
64 if n_reads == 100: | |
65 break | |
66 return total_length/100 | |
67 | |
68 | |
69 def createKmerDict_reads(list_of_strings, kmer): | |
70 kmer_table = {} | |
71 for string in list_of_strings: | |
72 sequence = string.strip('\n') | |
73 for i in range(len(sequence)-kmer+1): | |
74 new_mer =sequence[i:i+kmer] | |
75 new_mer_rc = reverse_complement(new_mer) | |
76 if new_mer in kmer_table: | |
77 kmer_table[new_mer.upper()] += 1 | |
78 else: | |
79 kmer_table[new_mer.upper()] = 1 | |
80 if new_mer_rc in kmer_table: | |
81 kmer_table[new_mer_rc.upper()] += 1 | |
82 else: | |
83 kmer_table[new_mer_rc.upper()] = 1 | |
84 return kmer_table | |
85 | |
86 | |
87 def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode): | |
88 mean_1 = None | |
89 mean_2 = None | |
90 i = 1 | |
91 n_reads_1 = 0 | |
92 n_reads_2 = 0 | |
93 total_coverage_1 = 0 | |
94 total_coverage_2 = 0 | |
95 reads_1 = [] | |
96 reads_2 = [] | |
97 total_reads = 0 | |
98 if file.endswith(".gz"): | |
99 file_content=io.BufferedReader(gzip.open(file)) | |
100 else: | |
101 file_content=open(file,"r").readlines() | |
102 for line in file_content: | |
103 start = int((len(line) - k) // 2) | |
104 if i % 4 == 2: | |
105 total_reads += 1 | |
106 if file.endswith(".gz"): | |
107 s1 = line[start:k + start].decode() | |
108 line=line.decode() | |
109 else: | |
110 s1 = line[start:k + start] | |
111 if s1 in kmerDict_1: | |
112 n_reads_1 += 1 | |
113 total_coverage_1 += len(line) | |
114 reads_1.append(line) | |
115 if s1 in kmerDict_2: | |
116 n_reads_2 += 1 | |
117 total_coverage_2 += len(line) | |
118 reads_2.append(line) | |
119 i += 1 | |
120 if mode == 'quick': | |
121 if total_coverage_2 >= 800000: | |
122 break | |
123 | |
124 if len(reads_1) == 0: | |
125 kmer_Dict1 = {} | |
126 else: | |
127 kmer_Dict1 = createKmerDict_reads(reads_1, k) | |
128 mers_1 = set([key for key in kmer_Dict1]) | |
129 mean_1 = sum([kmer_Dict1[key] for key in kmer_Dict1])/len(mers_1) | |
130 if len(reads_2) == 0: | |
131 kmer_Dict2 = {} | |
132 else: | |
133 kmer_Dict2 = createKmerDict_reads(reads_2, k) | |
134 mers_2 = set([key for key in kmer_Dict2]) | |
135 mean_2 = sum([kmer_Dict2[key] for key in kmer_Dict2])/len(mers_2) | |
136 return kmer_Dict1, kmer_Dict2, mean_1, mean_2, total_reads | |
137 | |
138 def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers): | |
139 ''' | |
140 Given an iterable (list, set, dictrionary) returns mean coverage for the kmers in iterable | |
141 :param iterable: set, list or dictionary containing kmers | |
142 :param kmer_dict: dictionary with kmers as keys, kmer-frequency as value | |
143 :param clade_specific_kmers: list, dict or set of clade specific kmers | |
144 :return: mean frequency as float | |
145 ''' | |
146 if len(iterable) == 0: | |
147 return 0 | |
148 return sum([kmer_dict[value] for value in iterable])/len(clade_specific_kmers) | |
149 | |
150 def kmer_lists(query_fastq_gz, k, | |
151 allmers,allmers_rpoB, | |
152 uniqmers_bongori, | |
153 uniqmers_I, | |
154 uniqmers_IIa, | |
155 uniqmers_IIb, | |
156 uniqmers_IIIa, | |
157 uniqmers_IIIb, | |
158 uniqmers_IV, | |
159 uniqmers_VI, | |
160 uniqmers_VII, | |
161 uniqmers_VIII, | |
162 uniqmers_bongori_rpoB, | |
163 uniqmers_S_enterica_rpoB, | |
164 uniqmers_Escherichia_rpoB, | |
165 uniqmers_Listeria_ss_rpoB, | |
166 uniqmers_Lmono_rpoB, | |
167 mode): | |
168 dict_invA, dict_rpoB, mean_invA, mean_rpoB , total_reads = target_read_kmerizer_multi(query_fastq_gz, k, allmers, | |
169 allmers_rpoB, mode) | |
170 target_mers_invA = set([key for key in dict_invA]) | |
171 target_mers_rpoB = set([key for key in dict_rpoB]) | |
172 if target_mers_invA == 0: | |
173 print('No reads found matching invA, no Salmonella in sample?') | |
174 else: | |
175 p_bongori = (len(uniqmers_bongori & target_mers_invA) / len(uniqmers_bongori)) * 100 | |
176 p_I = (len(uniqmers_I & target_mers_invA) / len(uniqmers_I)) * 100 | |
177 p_IIa = (len(uniqmers_IIa & target_mers_invA) / len(uniqmers_IIa)) * 100 | |
178 p_IIb = (len(uniqmers_IIb & target_mers_invA) / len(uniqmers_IIb)) * 100 | |
179 p_IIIa = (len(uniqmers_IIIa & target_mers_invA) / len(uniqmers_IIIa)) * 100 | |
180 p_IIIb = (len(uniqmers_IIIb & target_mers_invA) / len(uniqmers_IIIb)) * 100 | |
181 p_VI = (len(uniqmers_VI & target_mers_invA) / len(uniqmers_VI)) * 100 | |
182 p_IV = (len(uniqmers_IV & target_mers_invA) / len(uniqmers_IV)) * 100 | |
183 p_VII = (len(uniqmers_VII & target_mers_invA) / len(uniqmers_VII)) * 100 | |
184 p_VIII = (len(uniqmers_VIII & target_mers_invA) / len(uniqmers_VIII)) * 100 | |
185 p_bongori_rpoB = (len(uniqmers_bongori_rpoB & target_mers_rpoB) / len(uniqmers_bongori_rpoB)) * 100 | |
186 p_Senterica = (len(uniqmers_S_enterica_rpoB & target_mers_rpoB) / len(uniqmers_S_enterica_rpoB)) * 100 | |
187 p_Escherichia = (len(uniqmers_Escherichia_rpoB & target_mers_rpoB) / len(uniqmers_Escherichia_rpoB)) * 100 | |
188 p_Listeria_ss = (len(uniqmers_Listeria_ss_rpoB & target_mers_rpoB) / len(uniqmers_Listeria_ss_rpoB)) * 100 | |
189 p_Lmono = (len(uniqmers_Lmono_rpoB & target_mers_rpoB) / len(uniqmers_Lmono_rpoB)) * 100 | |
190 bongori_invA_cov = mean_cov_selected_kmers(uniqmers_bongori & target_mers_invA, dict_invA, uniqmers_bongori) | |
191 I_invA_cov = mean_cov_selected_kmers(uniqmers_I & target_mers_invA, dict_invA, uniqmers_I) | |
192 IIa_invA_cov = mean_cov_selected_kmers(uniqmers_IIa & target_mers_invA, dict_invA, uniqmers_IIa) | |
193 IIb_invA_cov = mean_cov_selected_kmers(uniqmers_IIb & target_mers_invA, dict_invA, uniqmers_IIb) | |
194 IIIa_invA_cov = mean_cov_selected_kmers(uniqmers_IIIa & target_mers_invA, dict_invA, uniqmers_IIIa) | |
195 IIIb_invA_cov = mean_cov_selected_kmers(uniqmers_IIIb & target_mers_invA, dict_invA, uniqmers_IIIb) | |
196 IV_invA_cov = mean_cov_selected_kmers(uniqmers_IV & target_mers_invA, dict_invA, uniqmers_IV) | |
197 VI_invA_cov = mean_cov_selected_kmers(uniqmers_VI & target_mers_invA, dict_invA, uniqmers_VI) | |
198 VII_invA_cov = mean_cov_selected_kmers(uniqmers_VII & target_mers_invA, dict_invA, uniqmers_VII) | |
199 VIII_invA_cov = mean_cov_selected_kmers(uniqmers_VIII & target_mers_invA, dict_invA, uniqmers_VIII) | |
200 S_enterica_rpoB_cov = mean_cov_selected_kmers((uniqmers_S_enterica_rpoB & target_mers_rpoB), dict_rpoB, | |
201 uniqmers_S_enterica_rpoB) | |
202 S_bongori_rpoB_cov = mean_cov_selected_kmers((uniqmers_bongori_rpoB & target_mers_rpoB), dict_rpoB, | |
203 uniqmers_bongori_rpoB) | |
204 Escherichia_rpoB_cov = mean_cov_selected_kmers((uniqmers_Escherichia_rpoB & target_mers_rpoB), dict_rpoB, | |
205 uniqmers_Escherichia_rpoB) | |
206 Listeria_ss_rpoB_cov = mean_cov_selected_kmers((uniqmers_Listeria_ss_rpoB & target_mers_rpoB), dict_rpoB, | |
207 uniqmers_Listeria_ss_rpoB) | |
208 Lmono_rpoB_cov = mean_cov_selected_kmers((uniqmers_Lmono_rpoB & target_mers_rpoB), dict_rpoB, | |
209 uniqmers_Lmono_rpoB) | |
210 coverages = [Listeria_ss_rpoB_cov, Lmono_rpoB_cov, Escherichia_rpoB_cov, S_bongori_rpoB_cov, | |
211 S_enterica_rpoB_cov, bongori_invA_cov, I_invA_cov, IIa_invA_cov, IIb_invA_cov, | |
212 IIIa_invA_cov, IIIb_invA_cov, IV_invA_cov, VI_invA_cov, VII_invA_cov, VIII_invA_cov] | |
213 locus_scores = [p_Listeria_ss, p_Lmono, p_Escherichia, p_bongori_rpoB, p_Senterica, p_bongori, | |
214 p_I, p_IIa,p_IIb, p_IIIa, p_IIIb, p_IV, p_VI, p_VII, p_VIII] | |
215 return locus_scores, coverages, total_reads | |
216 | |
217 def report_taxon(locus_covs, average_read_length, number_of_reads): | |
218 list_taxa = [ 'Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.', | |
219 'Salmonella bongori (rpoB)', 'Salmonella enterica (rpoB)', | |
220 'Salmonella bongori (invA)', 'S. enterica subsp. enterica (invA)', | |
221 'S. enterica subsp. salamae (invA: clade a)','S. enterica subsp. salamae (invA: clade b)', | |
222 'S. enterica subsp. arizonae (invA)', 'S. enterica subsp. diarizonae (invA)', | |
223 'S. enterica subsp. houtenae (invA)', 'S. enterica subsp. indica (invA)', | |
224 'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)'] | |
225 if sum(locus_covs) < 1: | |
226 rpoB = ('No rpoB matches!', 0) | |
227 invA = ('No invA matches!', 0) | |
228 return rpoB, invA, 0.0 | |
229 else: | |
230 # given list of scores get taxon | |
231 if sum(locus_covs[0:5]) > 0: | |
232 best_rpoB = max(range(len(locus_covs[1:5])), key=lambda x: locus_covs[1:5][x])+1 | |
233 all_rpoB = max(range(len(locus_covs[0:5])), key=lambda x: locus_covs[0:5][x]) | |
234 if (locus_covs[best_rpoB] != 0) & (all_rpoB == 0): | |
235 rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB]) | |
236 elif (all_rpoB == 0) & (round(sum(locus_covs[1:5]),1) < 1): | |
237 rpoB = (list_taxa[0], locus_covs[0]) | |
238 else: | |
239 rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB]) | |
240 else: | |
241 rpoB = ('No rpoB matches!', 0) | |
242 if sum(locus_covs[5:]) > 0: | |
243 best_invA = max(range(len(locus_covs[5:])), key=lambda x: locus_covs[5:][x])+5 | |
244 invA = (list_taxa[best_invA], locus_covs[best_invA]) | |
245 else: | |
246 invA = ('No invA matches!', 0) | |
247 if 'Listeria' in rpoB[0]: | |
248 return rpoB, invA, (average_read_length * number_of_reads) / 3000000 | |
249 else: | |
250 return rpoB, invA, (average_read_length * number_of_reads) / 5000000 | |
251 | |
252 | |
253 | |
254 def main(): | |
255 ex_dir = os.path.dirname(os.path.realpath(__file__)) | |
256 args = parse_args() | |
257 input_file = args.input_file | |
258 if input_file != 'None': | |
259 files = [input_file] | |
260 else: | |
261 extension = args.extension | |
262 inputdir = args.input_dir | |
263 files = [inputdir + '/'+ f for f in os.listdir(inputdir) if f.endswith(extension)] | |
264 report = args.report | |
265 mode = args.mode | |
266 f_invA = open(ex_dir + "/invA_mers_dict", "rb") | |
267 sets_dict_invA = pickle.load(f_invA) | |
268 f_invA.close() | |
269 allmers = sets_dict_invA['allmers'] | |
270 uniqmers_I = sets_dict_invA['uniqmers_I'] | |
271 uniqmers_IIa = sets_dict_invA['uniqmers_IIa'] | |
272 uniqmers_IIb = sets_dict_invA['uniqmers_IIb'] | |
273 uniqmers_IIIa = sets_dict_invA['uniqmers_IIIa'] | |
274 uniqmers_IIIb = sets_dict_invA['uniqmers_IIIb'] | |
275 uniqmers_IV = sets_dict_invA['uniqmers_IV'] | |
276 uniqmers_VI = sets_dict_invA['uniqmers_VI'] | |
277 uniqmers_VII = sets_dict_invA['uniqmers_VII'] | |
278 uniqmers_VIII = sets_dict_invA['uniqmers_VIII'] | |
279 uniqmers_bongori = sets_dict_invA['uniqmers_bongori'] | |
280 | |
281 f = open(ex_dir + "/rpoB_mers_dict", "rb") | |
282 sets_dict = pickle.load(f) | |
283 f.close() | |
284 | |
285 allmers_rpoB = sets_dict['allmers'] | |
286 uniqmers_bongori_rpoB = sets_dict['uniqmers_bongori'] | |
287 uniqmers_S_enterica_rpoB = sets_dict['uniqmers_S_enterica'] | |
288 uniqmers_Escherichia_rpoB = sets_dict['uniqmers_Escherichia'] | |
289 uniqmers_Listeria_ss_rpoB = sets_dict['uniqmers_Listeria_ss'] | |
290 uniqmers_Lmono_rpoB = sets_dict['uniqmers_L_mono'] | |
291 #todo: run kmer_lists() once, create list of tuples containing data to be used fro different reports | |
292 if report == 'taxonomy': | |
293 print('file\trpoB\tinvA\texpected coverage') | |
294 for f in files: | |
295 locus_scores, coverages, reads = kmer_lists(f, 27, | |
296 allmers, allmers_rpoB, | |
297 uniqmers_bongori, | |
298 uniqmers_I, | |
299 uniqmers_IIa, | |
300 uniqmers_IIb, | |
301 uniqmers_IIIa, | |
302 uniqmers_IIIb, | |
303 uniqmers_IV, | |
304 uniqmers_VI, | |
305 uniqmers_VII, | |
306 uniqmers_VIII, | |
307 uniqmers_bongori_rpoB, | |
308 uniqmers_S_enterica_rpoB, | |
309 uniqmers_Escherichia_rpoB, | |
310 uniqmers_Listeria_ss_rpoB, | |
311 uniqmers_Lmono_rpoB, | |
312 mode) | |
313 pretty_covs = [round(cov, 1) for cov in coverages] | |
314 report = report_taxon(pretty_covs, get_av_read_length(f), reads) | |
315 print(f.split('/')[-1] + '\t' + report[0][0] + '[' + str(report[0][1]) + ']' + '\t' + report[1][0] + | |
316 '[' + str(report[1][1]) + ']' + | |
317 '\t' + str(round(report[2], 1))) | |
318 else: | |
319 print( | |
320 'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica' + | |
321 '(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' + | |
322 ' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp.IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' + | |
323 '\tsubsp. II (clade VIII : invA)') | |
324 if report == 'percentage': | |
325 for f in files: | |
326 locus_scores, coverages , reads = kmer_lists( f, 27, | |
327 allmers,allmers_rpoB, | |
328 uniqmers_bongori, | |
329 uniqmers_I, | |
330 uniqmers_IIa, | |
331 uniqmers_IIb, | |
332 uniqmers_IIIa, | |
333 uniqmers_IIIb, | |
334 uniqmers_IV, | |
335 uniqmers_VI, | |
336 uniqmers_VII, | |
337 uniqmers_VIII, | |
338 uniqmers_bongori_rpoB, | |
339 uniqmers_S_enterica_rpoB, | |
340 uniqmers_Escherichia_rpoB, | |
341 uniqmers_Listeria_ss_rpoB, | |
342 uniqmers_Lmono_rpoB, | |
343 mode) | |
344 pretty_scores = [str(round(score)) for score in locus_scores] | |
345 print(f.split('/')[-1] +'\t' + '\t'.join(pretty_scores)) | |
346 else: | |
347 for f in files: | |
348 locus_scores, coverages , reads = kmer_lists( f, 27, | |
349 allmers,allmers_rpoB, | |
350 uniqmers_bongori, | |
351 uniqmers_I, | |
352 uniqmers_IIa, | |
353 uniqmers_IIb, | |
354 uniqmers_IIIa, | |
355 uniqmers_IIIb, | |
356 uniqmers_IV, | |
357 uniqmers_VI, | |
358 uniqmers_VII, | |
359 uniqmers_VIII, | |
360 uniqmers_bongori_rpoB, | |
361 uniqmers_S_enterica_rpoB, | |
362 uniqmers_Escherichia_rpoB, | |
363 uniqmers_Listeria_ss_rpoB, | |
364 uniqmers_Lmono_rpoB, | |
365 mode) | |
366 pretty_covs = [str(round(cov, 1)) for cov in coverages] | |
367 print(f.split('/')[-1] + '\t' + '\t'.join(pretty_covs)) | |
368 | |
369 if __name__ == '__main__': | |
370 main() | |
371 |