jpayne@1
|
1 #!/usr/bin/env python3
|
jpayne@1
|
2
|
jpayne@1
|
3
|
jpayne@1
|
4 import gzip
|
jpayne@1
|
5 import io
|
jpayne@1
|
6 import pickle
|
jpayne@1
|
7 import os
|
jpayne@1
|
8 import sys
|
jpayne@1
|
9
|
jpayne@1
|
10 from argparse import ArgumentParser
|
jpayne@1
|
# The version string lives in an optional sibling ``version`` module; fall
# back to a placeholder when that module (or the name inside it) is missing.
try:
    from version import SalmID_version
except ImportError:
    # Catch only the import failure: a bare ``except`` would also swallow
    # KeyboardInterrupt and SystemExit raised while importing.
    SalmID_version = "version unknown"
|
jpayne@1
|
15
|
jpayne@1
|
16
|
jpayne@1
|
def reverse_complement(sequence):
    """Return the reverse complement of an upper-case IUPAC nucleotide string.

    The sequence is walked back to front and every code is replaced by its
    complement; ambiguity codes map onto the complementary code
    (e.g. ``M`` <-> ``K``, ``V`` <-> ``B``).

    :param sequence: string made of the upper-case codes ``ACGTNMRWSYKVHDB``
    :return: the reverse-complemented sequence as a string
    :raises KeyError: for any other character, lower-case letters included
    """
    partner_of = dict(zip('ACGTNMRWSYKVHDB', 'TGCANKYWSRMBDHV'))
    flipped = []
    for base in reversed(sequence):
        flipped.append(partner_of[base])
    return "".join(flipped)
|
jpayne@1
|
21
|
jpayne@1
|
22
|
jpayne@1
|
def parse_args():
    """Build the SalmID command-line interface and parse ``sys.argv``.

    When the script is started without any argument the help text is written
    to stderr and the process exits with status 1.

    :return: ``argparse.Namespace`` with the attributes ``input_file``,
        ``extension``, ``input_dir``, ``report`` and ``mode``.  Note that the
        default of ``input_file`` is the *string* ``'None'``.
    """
    cli = ArgumentParser(description='SalmID - rapid Kmer based Salmonella identifier from sequence data')
    cli.add_argument('-v', '--version', action='version', version='%(prog)s ' + SalmID_version)

    # (flags, default, metavar, help) of every string option, in help order.
    text_options = (
        (('-i', '--input_file'), 'None', 'your_fastqgz',
         'Single fastq.gz file input, include path to file if file is not in same directory '),
        (('-e', '--extension'), '.fastq.gz', 'file_extension',
         'File extension, if specified without "--input_dir", SalmID will attempt to ID all files\n'
         ' with this extension in current directory, otherwise files in input directory'),
        (('-d', '--input_dir'), '.', 'directory',
         'Directory which contains data for identification, when not specified files in current directory will be analyzed.'),
        (('-r', '--report'), 'percentage', 'percentage, coverage or taxonomy',
         'Report either percentage ("percentage") of clade specific kmers recovered, average kmer-coverage ("cov"), or '
         'taxonomy (taxonomic species ID, plus observed mean k-mer coverages and expected coverage).'),
        (('-m', '--mode'), 'quick', 'quick or thorough',
         'Quick [quick] or thorough [thorough] mode'),
    )
    for flags, fallback, placeholder, description in text_options:
        cli.add_argument(*flags, type=str, required=False, default=fallback,
                         metavar=placeholder, help=description)

    # A bare invocation gets the usage text instead of a run on the defaults.
    if len(sys.argv) == 1:
        cli.print_help(sys.stderr)
        sys.exit(1)
    return cli.parse_args()
|
jpayne@1
|
50
|
jpayne@1
|
def get_av_read_length(file):
    """Estimate the average read length from the first reads of a FASTQ file.

    Only sequence lines (the second line of every four-line record) are
    measured, and sampling stops after 100 reads.  The mean is taken over the
    reads actually sampled, so files holding fewer than 100 reads are no
    longer under-estimated by a fixed division by 100.

    :param file: path to a FASTQ file; a name ending in ``.gz`` is read
        through gzip, anything else is opened as plain text
    :return: mean length of the sampled reads as a float, ``0.0`` when the
        file contains no read
    """
    max_reads = 100
    n_reads = 0
    total_length = 0
    if file.endswith(".gz"):
        handle = io.BufferedReader(gzip.open(file))
    else:
        handle = open(file, "r")
    # Stream the file lazily and make sure the handle is closed, even when
    # sampling stops early.
    with handle:
        for line_number, line in enumerate(handle, start=1):
            if line_number % 4 != 2:
                continue
            total_length += len(line.strip())
            n_reads += 1
            if n_reads == max_reads:
                break
    if n_reads == 0:
        return 0.0
    return total_length / n_reads
|
jpayne@1
|
67
|
jpayne@1
|
68
|
jpayne@1
|
def createKmerDict_reads(list_of_strings, kmer):
    """Count every k-mer and its reverse complement across a list of reads.

    Each read is stripped of surrounding whitespace and upper-cased once, up
    front.  The previous version tested membership with the raw k-mer but
    stored the upper-cased one, and handed the raw k-mer to
    ``reverse_complement``, so lower-case bases (or a trailing ``\\r``)
    ended in a ``KeyError``.

    :param list_of_strings: iterable of read sequences (``str``)
    :param kmer: k-mer length
    :return: dict mapping each upper-case k-mer to the number of times it was
        seen on either strand; a k-mer equal to its own reverse complement is
        counted twice per occurrence, as before
    :raises KeyError: from ``reverse_complement`` when a read contains a
        character that is not an IUPAC nucleotide code
    """
    kmer_table = {}
    for string in list_of_strings:
        sequence = string.strip().upper()
        for i in range(len(sequence) - kmer + 1):
            new_mer = sequence[i:i + kmer]
            # Register the k-mer itself and its opposite-strand counterpart.
            for strand_mer in (new_mer, reverse_complement(new_mer)):
                kmer_table[strand_mer] = kmer_table.get(strand_mer, 0) + 1
    return kmer_table
|
jpayne@1
|
85
|
jpayne@1
|
86
|
jpayne@1
|
def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode):
    """Collect the reads hitting either of two k-mer collections and k-merize them.

    For every sequence line of the FASTQ file (second line of each four-line
    record) the k-mer taken from the middle of the line is looked up in
    ``kmerDict_1`` and ``kmerDict_2``.  Matching reads are kept and, once the
    file has been read, split into k-mers with ``createKmerDict_reads``.

    The file is streamed and closed on exit; the earlier version loaded
    plain-text files completely into memory and never closed either handle.

    :param file: path to a FASTQ file; a name ending in ``.gz`` is read
        through gzip, anything else as plain text
    :param k: k-mer length
    :param kmerDict_1: container of k-mers for the first target (only
        membership tests are performed on it)
    :param kmerDict_2: container of k-mers for the second target
    :param mode: with ``'quick'`` reading stops once the accumulated line
        length of the reads matching ``kmerDict_2`` reaches 800000; any other
        value reads the whole file
    :return: tuple ``(kmer_Dict1, kmer_Dict2, mean_1, mean_2, total_reads)``:
        the k-mer count dictionaries of the reads matching each target, the
        mean count per distinct k-mer of each dictionary (``None`` when there
        is nothing to average) and the number of reads inspected
    """
    is_gzipped = file.endswith(".gz")
    reads_1 = []
    reads_2 = []
    total_coverage_2 = 0
    total_reads = 0
    if is_gzipped:
        handle = io.BufferedReader(gzip.open(file))
    else:
        handle = open(file, "r")
    with handle:
        for line_number, line in enumerate(handle, start=1):
            if line_number % 4 == 2:
                total_reads += 1
                # The offset is derived from the raw line, line terminator
                # included, exactly as before.
                start = (len(line) - k) // 2
                s1 = line[start:k + start]
                if is_gzipped:
                    # gzip yields bytes; the lookups below expect str.
                    s1 = s1.decode()
                    line = line.decode()
                if s1 in kmerDict_1:
                    reads_1.append(line)
                if s1 in kmerDict_2:
                    total_coverage_2 += len(line)
                    reads_2.append(line)
            if mode == 'quick' and total_coverage_2 >= 800000:
                break

    kmer_Dict1, mean_1 = _kmerize_with_mean(reads_1, k)
    kmer_Dict2, mean_2 = _kmerize_with_mean(reads_2, k)
    return kmer_Dict1, kmer_Dict2, mean_1, mean_2, total_reads


def _kmerize_with_mean(reads, k):
    """Return the k-mer counts of ``reads`` and their mean count (``None`` if empty)."""
    if not reads:
        return {}, None
    counts = createKmerDict_reads(reads, k)
    if not counts:
        # Reads shorter than k give no k-mer at all; avoid dividing by zero.
        return counts, None
    return counts, sum(counts.values()) / len(counts)
|
jpayne@1
|
137
|
jpayne@1
|
def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers):
    '''
    Average the observed frequency of the selected kmers over a whole clade.

    The frequencies of the kmers in ``iterable`` are summed and divided by the
    number of clade specific kmers (not by the number of selected kmers), so
    clade kmers that were not observed count as zero.
    :param iterable: sized collection (set, list or dictionary) of observed kmers
    :param kmer_dict: dictionary with kmers as keys, kmer-frequency as value
    :param clade_specific_kmers: list, dict or set of clade specific kmers
    :return: 0 when ``iterable`` is empty, otherwise the mean frequency as float
    '''
    if len(iterable) > 0:
        observed_total = sum([kmer_dict[kmer] for kmer in iterable])
        return observed_total / len(clade_specific_kmers)
    return 0
|
jpayne@1
|
149
|
jpayne@1
|
def kmer_lists(query_fastq_gz, k,
               allmers, allmers_rpoB,
               uniqmers_bongori,
               uniqmers_I,
               uniqmers_IIa,
               uniqmers_IIb,
               uniqmers_IIIa,
               uniqmers_IIIb,
               uniqmers_IV,
               uniqmers_VI,
               uniqmers_VII,
               uniqmers_VIII,
               uniqmers_bongori_rpoB,
               uniqmers_S_enterica_rpoB,
               uniqmers_Escherichia_rpoB,
               uniqmers_Listeria_ss_rpoB,
               uniqmers_Lmono_rpoB,
               mode):
    """Score one read file against every clade specific k-mer set.

    The reads are screened with ``target_read_kmerizer_multi`` using the
    ``allmers`` (invA) and ``allmers_rpoB`` collections.  For each clade the
    function then reports the percentage of its specific k-mers recovered
    from the reads and the mean coverage returned by
    ``mean_cov_selected_kmers``.

    :param query_fastq_gz: path of the read file to analyse
    :param k: k-mer length
    :param allmers: all invA k-mers, used to select invA reads
    :param allmers_rpoB: all rpoB k-mers, used to select rpoB reads
    :param uniqmers_bongori: set of invA k-mers specific to this clade; the
        same holds for ``uniqmers_I`` ... ``uniqmers_VIII``
    :param uniqmers_bongori_rpoB: set of rpoB k-mers specific to this taxon;
        the same holds for the remaining ``*_rpoB`` parameters
    :param mode: forwarded to ``target_read_kmerizer_multi``
    :return: ``(locus_scores, coverages, total_reads)``; both lists hold 15
        values ordered Listeria ss, L. monocytogenes, Escherichia,
        S. bongori (rpoB), S. enterica (rpoB), then invA bongori, I, IIa,
        IIb, IIIa, IIIb, IV, VI, VII, VIII
    :raises ZeroDivisionError: when one of the clade specific sets is empty
    """
    dict_invA, dict_rpoB, _mean_invA, _mean_rpoB, total_reads = target_read_kmerizer_multi(
        query_fastq_gz, k, allmers, allmers_rpoB, mode)
    target_mers_invA = set(dict_invA)
    target_mers_rpoB = set(dict_rpoB)

    if not target_mers_invA:
        # This check used to compare the set itself with 0 and never fired.
        # The note goes to stderr so the tab-separated report on stdout stays
        # machine readable; scoring continues because the rpoB results are
        # still meaningful (all invA values simply come out as zero).
        print('No reads found matching invA, no Salmonella in sample?', file=sys.stderr)

    # Clade sets in report order, grouped by the locus they belong to.
    rpoB_clades = (uniqmers_Listeria_ss_rpoB, uniqmers_Lmono_rpoB, uniqmers_Escherichia_rpoB,
                   uniqmers_bongori_rpoB, uniqmers_S_enterica_rpoB)
    invA_clades = (uniqmers_bongori, uniqmers_I, uniqmers_IIa, uniqmers_IIb, uniqmers_IIIa,
                   uniqmers_IIIb, uniqmers_IV, uniqmers_VI, uniqmers_VII, uniqmers_VIII)

    locus_scores = []
    coverages = []
    for clades, observed_mers, observed_counts in ((rpoB_clades, target_mers_rpoB, dict_rpoB),
                                                   (invA_clades, target_mers_invA, dict_invA)):
        for clade_mers in clades:
            recovered = clade_mers & observed_mers
            locus_scores.append((len(recovered) / len(clade_mers)) * 100)
            coverages.append(mean_cov_selected_kmers(recovered, observed_counts, clade_mers))
    return locus_scores, coverages, total_reads
|
jpayne@1
|
216
|
jpayne@1
|
def report_taxon(locus_covs, average_read_length, number_of_reads):
    """Turn per-clade coverages into an rpoB call, an invA call and a depth estimate.

    :param locus_covs: coverages in report order: positions 0-4 are the rpoB
        taxa (Listeria ss, L. monocytogenes, Escherichia, S. bongori,
        S. enterica), positions 5 onwards the invA clades
    :param average_read_length: average read length of the sample
    :param number_of_reads: number of reads of the sample
    :return: ``(rpoB, invA, expected_coverage)`` where ``rpoB`` and ``invA``
        are ``(taxon name, coverage)`` tuples and the expected coverage is
        ``average_read_length * number_of_reads`` divided by 3000000 for a
        Listeria call and by 5000000 otherwise; ``0.0`` when the coverages
        sum to less than 1
    """
    no_rpoB = ('No rpoB matches!', 0)
    no_invA = ('No invA matches!', 0)
    if sum(locus_covs) < 1:
        return no_rpoB, no_invA, 0.0

    list_taxa = ['Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.',
                 'Salmonella bongori (rpoB)', 'Salmonella enterica (rpoB)',
                 'Salmonella bongori (invA)', 'S. enterica subsp. enterica (invA)',
                 'S. enterica subsp. salamae (invA: clade a)', 'S. enterica subsp. salamae (invA: clade b)',
                 'S. enterica subsp. arizonae (invA)', 'S. enterica subsp. diarizonae (invA)',
                 'S. enterica subsp. houtenae (invA)', 'S. enterica subsp. indica (invA)',
                 'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)']

    def first_peak(values):
        # Position of the highest value; the earliest one wins on ties.
        return max(enumerate(values), key=lambda pair: pair[1])[0]

    # rpoB call
    rpoB_covs = locus_covs[0:5]
    if sum(rpoB_covs) > 0:
        best_rpoB = first_peak(locus_covs[1:5]) + 1
        all_rpoB = first_peak(rpoB_covs)
        # 'Listeria ss' is only reported when it leads and none of the more
        # specific rpoB taxa shows any coverage.
        nothing_specific = locus_covs[best_rpoB] == 0 and round(sum(locus_covs[1:5]), 1) < 1
        if all_rpoB == 0 and nothing_specific:
            rpoB = (list_taxa[0], locus_covs[0])
        else:
            rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB])
    else:
        rpoB = no_rpoB

    # invA call
    invA_covs = locus_covs[5:]
    if sum(invA_covs) > 0:
        best_invA = first_peak(invA_covs) + 5
        invA = (list_taxa[best_invA], locus_covs[best_invA])
    else:
        invA = no_invA

    genome_size = 3000000 if 'Listeria' in rpoB[0] else 5000000
    return rpoB, invA, (average_read_length * number_of_reads) / genome_size
|
jpayne@1
|
251
|
jpayne@1
|
252
|
jpayne@1
|
253
|
jpayne@1
|
def main():
    """Run SalmID from the command line and print a tab-separated report.

    The input is either the single file given with ``--input_file`` or every
    file of ``--input_dir`` whose name ends with ``--extension``.  The k-mer
    reference sets are unpickled from ``invA_mers_dict`` and
    ``rpoB_mers_dict`` located next to this script, and every input file is
    scored with ``kmer_lists`` using 27-mers.

    Depending on ``--report`` one line per file is printed to stdout:

    * ``taxonomy``: rpoB call, invA call and expected coverage
    * ``percentage``: percentage of clade specific k-mers recovered
    * anything else: mean k-mer coverage per clade
    """
    ex_dir = os.path.dirname(os.path.realpath(__file__))
    args = parse_args()
    if args.input_file != 'None':
        files = [args.input_file]
    else:
        files = [args.input_dir + '/' + f for f in os.listdir(args.input_dir)
                 if f.endswith(args.extension)]
    report = args.report
    mode = args.mode

    # NOTE: pickle can execute arbitrary code while loading.  These two files
    # are read from the script's own directory; never point this at data from
    # an untrusted source.
    with open(ex_dir + "/invA_mers_dict", "rb") as handle:
        sets_dict_invA = pickle.load(handle)
    with open(ex_dir + "/rpoB_mers_dict", "rb") as handle:
        sets_dict = pickle.load(handle)

    # Reference sets in the positional order expected by kmer_lists().
    reference_sets = [
        sets_dict_invA['allmers'],
        sets_dict['allmers'],
        sets_dict_invA['uniqmers_bongori'],
        sets_dict_invA['uniqmers_I'],
        sets_dict_invA['uniqmers_IIa'],
        sets_dict_invA['uniqmers_IIb'],
        sets_dict_invA['uniqmers_IIIa'],
        sets_dict_invA['uniqmers_IIIb'],
        sets_dict_invA['uniqmers_IV'],
        sets_dict_invA['uniqmers_VI'],
        sets_dict_invA['uniqmers_VII'],
        sets_dict_invA['uniqmers_VIII'],
        sets_dict['uniqmers_bongori'],
        sets_dict['uniqmers_S_enterica'],
        sets_dict['uniqmers_Escherichia'],
        sets_dict['uniqmers_Listeria_ss'],
        sets_dict['uniqmers_L_mono'],
    ]

    def analyse(path):
        # Single call site for the scoring step shared by all report types.
        return kmer_lists(path, 27, *reference_sets, mode)

    if report == 'taxonomy':
        print('file\trpoB\tinvA\texpected coverage')
        for f in files:
            locus_scores, coverages, reads = analyse(f)
            pretty_covs = [round(cov, 1) for cov in coverages]
            # Unpacked into fresh names so the ``report`` option string is
            # not overwritten while the loop is still running.
            rpoB, invA, expected_cov = report_taxon(pretty_covs, get_av_read_length(f), reads)
            print(f.split('/')[-1] + '\t' + rpoB[0] + '[' + str(rpoB[1]) + ']' + '\t' + invA[0] +
                  '[' + str(invA[1]) + ']' +
                  '\t' + str(round(expected_cov, 1)))
        return

    print(
        'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica' +
        '(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' +
        ' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp.IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' +
        '\tsubsp. II (clade VIII : invA)')
    for f in files:
        locus_scores, coverages, reads = analyse(f)
        if report == 'percentage':
            columns = [str(round(score)) for score in locus_scores]
        else:
            columns = [str(round(cov, 1)) for cov in coverages]
        print(f.split('/')[-1] + '\t' + '\t'.join(columns))
|
jpayne@1
|
368
|
jpayne@1
|
# Start the command-line workflow only when the file is executed as a script;
# importing it as a module has no side effect.
if __name__ == "__main__":
    main()
|
jpayne@1
|
371
|