kkonganti@0
|
1 #!/usr/bin/env python3
|
kkonganti@0
|
2
|
kkonganti@0
|
3 # Kranti Konganti
|
kkonganti@0
|
4 # 03/06/2024
|
kkonganti@0
|
5
|
kkonganti@0
|
6 import argparse
|
kkonganti@0
|
7 import glob
|
kkonganti@0
|
8 import inspect
|
kkonganti@0
|
9 import logging
|
kkonganti@0
|
10 import os
|
kkonganti@0
|
11 import pprint
|
kkonganti@0
|
12 import re
|
kkonganti@0
|
13 from collections import defaultdict
|
kkonganti@0
|
14
|
kkonganti@0
|
15
|
kkonganti@0
|
# Help formatter mixin: keep raw (pre-wrapped) help text AND append defaults.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    """Argparse help formatter combining raw text layout with default display."""
|
kkonganti@0
|
21
|
kkonganti@0
|
22
|
kkonganti@0
|
23 # Main
|
kkonganti@0
|
def main() -> None:
    """
    The successful execution of this script requires access to properly formatted
    lineages.csv file which has no more than 9 columns.

    It takes the lineages.csv file, the *_hits.csv results from `sourmash gather`
    mentioned with -smres option and a root parent directory of the
    `salmon quant` results mentioned with -sal option and generates a final
    results table with the TPM values and a .krona.tsv file for each sample
    to be used by KronaTools.
    """
    # Set logging.
    # Banner-style log format: a 55-char '=' rule above and below the
    # timestamp/level line, then the message.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    # Debug print.
    # ppp is only used by the commented-out ppp.pprint(...) calls at the bottom.
    ppp = pprint.PrettyPrinter(width=55)
    # Use this script's own filename as the program name in --help output.
    prog_name = inspect.stack()[0].filename

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-sal",
        dest="salmon_res_dir",
        default=False,
        required=True,
        help="Absolute UNIX path to the parent directory that contains the\n"
        + "`salmon quant` results. For example, if path to\n"
        + "`quant.sf` is in /hpc/john_doe/test/salmon_res/sampleA/quant.sf, then\n"
        + "use this command-line option as:\n"
        + "-sal /hpc/john_doe/test/salmon_res",
    )
    required.add_argument(
        "-lin",
        dest="lin",
        default=False,
        required=True,
        help="Absolute UNIX Path to the lineages CSV file.\n"
        + "This file should have only 9 columns.",
    )
    required.add_argument(
        "-smres",
        dest="sm_res_dir",
        default=False,
        required=True,
        help="Absolute UNIX path to the parent directory that contains the\n"
        + "filtered `sourmas gather` results. For example, if path to\n"
        + "`sampleA.csv` is in /hpc/john_doe/test/sourmash_gather/sampleA.csv,\n"
        + "then use this command-line option as:\n"
        + "-sal /hpc/john_doe/test",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="nowayout.tblsum",
        required=False,
        help="Set the output file(s) prefix for output(s) generated\n"
        + "by this program.",
    )
    # User-supplied -sf arrives as a str; it is normalized via float() below.
    parser.add_argument(
        "-sf",
        dest="scale_down_factor",
        default=float(10000),
        required=False,
        help="Set the scaling factor by which TPM values are scaled\ndown.",
    )
    parser.add_argument(
        "-smres-suffix",
        dest="sm_res_suffix",
        default="_hits.csv",
        required=False,
        help="Find the `sourmash gather` result files ending in this\nsuffix.",
    )
    parser.add_argument(
        "-failed-suffix",
        dest="failed_suffix",
        default="_FAILED.txt",
        required=False,
        help="Find the sample names which failed classification stored\n"
        + "inside the files ending in this suffix.",
    )
    # NOTE(review): no type=int here, so a user-supplied value is a str and
    # the `len(cols) < num_lin_cols` comparison below would raise TypeError —
    # confirm whether this option is ever overridden in practice.
    parser.add_argument(
        "-num-lin-cols",
        dest="num_lin_cols",
        default=int(9),
        required=False,
        help="Number of columns expected in the lineages CSV file.",
    )
    # NOTE(review): the default is a pre-compiled pattern, but a user-supplied
    # value would be a plain str lacking .match()/.findall() — confirm.
    parser.add_argument(
        "-lin-acc-regex",
        dest="lin_acc_regex",
        default=re.compile(r"\w+[\-\.]{1}[0-9]+"),
        required=False,
        help="The pattern of the lineage's accession.",
    )

    args = parser.parse_args()
    salmon_res_dir = args.salmon_res_dir
    sm_res_dir = args.sm_res_dir
    sm_res_suffix = args.sm_res_suffix
    failed_suffix = args.failed_suffix
    out_prefix = args.out_prefix
    lin = args.lin
    num_lin_cols = args.num_lin_cols
    acc_pat = args.lin_acc_regex
    scale_down = float(args.scale_down_factor)
    no_hit = "Unclassified"
    no_hit_reads = "reads mapped to the database"
    # 1e6: TPM values per sample sum to this constant by definition.
    tpm_const = float(1000000.0000000000)
    # Decimal places used by round() and in the f-string format specs below.
    round_to = 10
    all_samples = set()
    # Accumulators (keying is established by the loops below):
    #   lineage2sample      : lineage str -> {sample: [TPM, ...]}
    #   unclassified2sample : sample -> {no_hit: tpm_const} (100% unclassified)
    #   lineage2sm          : lineage str -> {sample: 1} (seen in sourmash hits)
    #   sm2passed           : "sourmash_passed" -> {sample: 1}
    #   reads_total         : sample -> {"all_tpm"/"rem_tpm"/"reads_mapped": [float, ...]}
    #   per_taxon_reads     : sample -> {lineage: [read counts]}
    #   lineages            : accession -> comma-joined lineage columns
    (
        lineage2sample,
        unclassified2sample,
        lineage2sm,
        sm2passed,
        reads_total,
        per_taxon_reads,
        lineages,
    ) = (
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(int),
    )

    # Output tables are written to the current working directory.
    salmon_comb_res = os.path.join(os.getcwd(), out_prefix + ".txt")
    # salmon_comb_res_reads_mapped = os.path.join(
    #     os.getcwd(), re.sub(".tblsum", "_reads_mapped.tblsum", out_prefix) + ".txt"
    # )
    # NOTE(review): "." in the re.sub() pattern is an unescaped regex
    # metacharacter; harmless for the default prefix but worth confirming.
    salmon_comb_res_indiv_reads_mapped = os.path.join(
        os.getcwd(),
        re.sub(".tblsum", "_indiv_reads_mapped.tblsum", out_prefix) + ".txt",
    )
    # NOTE(review): recursive=True has no effect without a "**" component in
    # the glob pattern — these globs only match one directory level.
    salmon_res_files = glob.glob(
        os.path.join(salmon_res_dir, "*", "quant.sf"), recursive=True
    )
    sample_res_files_failed = glob.glob(
        os.path.join(salmon_res_dir, "*" + failed_suffix), recursive=True
    )
    sm_res_files = glob.glob(
        os.path.join(sm_res_dir, "*" + sm_res_suffix), recursive=True
    )

    # Basic checks
    if lin and not (os.path.exists(lin) and os.path.getsize(lin) > 0):
        logging.error(
            "The lineages file,\n"
            + f"{os.path.basename(lin)} does not exist or is empty!"
        )
        exit(1)

    if salmon_res_dir:
        if not os.path.isdir(salmon_res_dir):
            logging.error("UNIX path\n" + f"{salmon_res_dir}\n" + "does not exist!")
            exit(1)
        # No quant.sf files at all: emit stub headers and exit successfully.
        if len(salmon_res_files) <= 0:
            with open(salmon_comb_res, "w") as salmon_comb_res_fh, open(
                salmon_comb_res_indiv_reads_mapped, "w"
            ) as salmon_comb_res_indiv_reads_mapped_fh:
                salmon_comb_res_fh.write(f"Sample\n{no_hit} reads in all samples\n")
                salmon_comb_res_indiv_reads_mapped_fh.write(
                    f"Sample\nNo {no_hit_reads} from all samples\n"
                )
            # Redundant after the with-block, but harmless (close() is idempotent).
            salmon_comb_res_fh.close()
            salmon_comb_res_indiv_reads_mapped_fh.close()
            exit(0)

    # Only proceed if lineages.csv exists.
    if lin and os.path.exists(lin) and os.path.getsize(lin) > 0:
        lin_fh = open(lin, "r")
        # Skip the header row.
        _ = lin_fh.readline()

        # Index lineages.csv
        for line in lin_fh:
            cols = line.strip().split(",")

            if len(cols) < num_lin_cols:
                logging.error(
                    f"The file {os.path.basename(lin)} seems to\n"
                    + "be malformed. It contains less than required 9 columns."
                )
                exit(1)

            # First occurrence of an accession wins; duplicates are skipped.
            if cols[0] in lineages.keys():
                continue
                # logging.info(
                #     f"There is a duplicate accession [{cols[0]}]"
                #     + f" in the lineages file {os.path.basename(lin)}!"
                # )
            elif acc_pat.match(cols[0]):
                # Store the remaining 8 columns as one comma-joined lineage key.
                lineages[cols[0]] = ",".join(cols[1:])

        lin_fh.close()

        # Index each samples' filtered sourmash results.
        for sm_res_file in sm_res_files:
            # Sample name = result file basename with the suffix stripped.
            sample_name = re.sub(sm_res_suffix, "", os.path.basename(sm_res_file))

            with open(sm_res_file, "r") as sm_res_fh:
                # Skip the CSV header.
                _ = sm_res_fh.readline()
                for line in sm_res_fh:
                    # Column 10 of `sourmash gather` output holds the match
                    # name; extract the accession from it.
                    acc = acc_pat.findall(line.strip().split(",")[9])

                    if len(acc) == 0:
                        logging.info(
                            f"Got empty lineage accession: {acc}"
                            + f"\nRow elements: {line.strip().split(',')}"
                        )
                        exit(1)
                    if len(acc) not in [1]:
                        logging.info(
                            f"Got more than one lineage accession: {acc}"
                            + f"\nRow elements: {line.strip().split(',')}"
                        )
                        logging.info(f"Considering first element: {acc[0]}")
                    if acc[0] not in lineages.keys():
                        logging.error(
                            f"The lineage accession {acc[0]} is not found in {os.path.basename(lin)}"
                        )
                        exit(1)
                    # Mark this lineage as observed for this sample, and the
                    # sample as having passed sourmash filtering.
                    lineage2sm[lineages[acc[0]]].setdefault(sample_name, 1)
                    sm2passed["sourmash_passed"].setdefault(sample_name, 1)
            # Redundant after the with-block, but harmless.
            sm_res_fh.close()

        # Index each samples' salmon results.
        for salmon_res_file in salmon_res_files:
            # Sample name = parent directory name minus a "_salmon_res" or
            # ".salmon" suffix (capture group 1 of the match).
            sample_name = re.match(
                r"(^.+?)((\_salmon\_res)|(\.salmon))$",
                os.path.basename(os.path.dirname(salmon_res_file)),
            )[1]
            salmon_meta_json = os.path.join(
                os.path.dirname(salmon_res_file), "aux_info", "meta_info.json"
            )

            if (
                not os.path.exists(salmon_meta_json)
                or not os.path.getsize(salmon_meta_json) > 0
            ):
                logging.error(
                    "The file\n"
                    + f"{salmon_meta_json}\ndoes not exist or is empty!\n"
                    + "Did `salmon quant` fail?"
                )
                exit(1)

            if (
                not os.path.exists(salmon_res_file)
                or not os.path.getsize(salmon_res_file) > 0
            ):
                logging.error(
                    "The file\n"
                    + f"{salmon_res_file}\ndoes not exist or is empty!\n"
                    + "Did `salmon quant` fail?"
                )
                exit(1)

            # Initiate all_tpm, rem_tpm and reads_mapped
            # Seed each list with 0.0 so sum() below is safe even when no
            # rows contribute to a bucket.
            # all_tpm
            reads_total[sample_name].setdefault("all_tpm", []).append(float(0.0))
            # rem_tpm
            reads_total[sample_name].setdefault("rem_tpm", []).append(float(0.0))
            # reads_mapped
            reads_total[sample_name].setdefault("reads_mapped", []).append(float(0.0))

            with open(salmon_res_file, "r") as salmon_res_fh:
                for line in salmon_res_fh.readlines():
                    # Skip the quant.sf header row ("Name\tLength\t...").
                    if re.match(r"^Name.+", line):
                        continue
                    cols = line.strip().split("\t")
                    ref_acc = cols[0]
                    # quant.sf columns: Name, Length, EffectiveLength, TPM, NumReads.
                    tpm = cols[3]
                    num_reads_mapped = cols[4]

                    (
                        reads_total[sample_name]
                        .setdefault("all_tpm", [])
                        .append(
                            round(float(tpm), round_to),
                        )
                    )

                    (
                        reads_total[sample_name]
                        .setdefault("reads_mapped", [])
                        .append(
                            round(float(num_reads_mapped), round_to),
                        )
                    )

                    # Only count TPM toward a lineage when sourmash also saw
                    # that lineage; everything else accrues to "rem_tpm".
                    # NOTE(review): lineages is a defaultdict(int), so an
                    # unknown ref_acc inserts lineages[ref_acc] == 0 here —
                    # confirm all quant.sf accessions appear in lineages.csv.
                    if lineages[ref_acc] in lineage2sm.keys():
                        (
                            lineage2sample[lineages[ref_acc]]
                            .setdefault(sample_name, [])
                            .append(round(float(tpm), round_to))
                        )
                        (
                            per_taxon_reads[sample_name]
                            .setdefault(lineages[ref_acc], [])
                            .append(round(float(num_reads_mapped)))
                        )
                    else:
                        (
                            reads_total[sample_name]
                            .setdefault("rem_tpm", [])
                            .append(
                                round(float(tpm), round_to),
                            )
                        )

            # Redundant after the with-block, but harmless.
            salmon_res_fh.close()

        # Index each samples' complete failure results i.e., 100% unclassified.
        for sample_res_file_failed in sample_res_files_failed:
            sample_name = re.sub(
                failed_suffix, "", os.path.basename(sample_res_file_failed)
            )
            # "".join() on a str is a no-op; sample_res_file_failed is already a path.
            with open("".join(sample_res_file_failed), "r") as no_calls_fh:
                for line in no_calls_fh.readlines():
                    # Any non-blank line marks the sample as fully unclassified.
                    if line in ["\n", "\n\r", "\r"]:
                        continue
                    unclassified2sample[sample_name].setdefault(no_hit, tpm_const)
            # Redundant after the with-block, but harmless.
            no_calls_fh.close()

        # Finally, write all results.
        for sample in sorted(reads_total.keys()) + sorted(unclassified2sample.keys()):
            all_samples.add(sample)

        # Check if sourmash results exist but salmon `quant` failed
        # and if so, set the sample to 100% Unclassified as well.
        for sample in sm2passed["sourmash_passed"].keys():
            if sample not in all_samples:
                unclassified2sample[sample].setdefault(no_hit, tpm_const)
                all_samples.add(sample)

        # Write total number of reads mapped to nowayout database.
        # with open(salmon_comb_res_reads_mapped, "w") as nowo_reads_mapped_fh:
        #     nowo_reads_mapped_fh.write(
        #         "\t".join(
        #             [
        #                 "Sample",
        #                 "All reads",
        #                 "Classified reads",
        #                 "Unclassified reads (Reads failed thresholds )",
        #             ]
        #         )
        #     )

        #     for sample in all_samples:
        #         if sample in reads_total.keys():
        #             nowo_reads_mapped_fh.write(
        #                 "\n"
        #                 + "\t".join(
        #                     [
        #                         f"\n{sample}",
        #                         f"{int(sum(reads_total[sample]['reads_mapped']))}",
        #                         f"{int(reads_total[sample]['reads_mapped'])}",
        #                         f"{int(reads_total[sample]['rem_tpm'])}",
        #                     ],
        #                 )
        #             )
        #         else:
        #             nowo_reads_mapped_fh.write(f"\n{sample}\t{int(0.0)}")
        #     nowo_reads_mapped_fh.close()

        # Write scaled down TPM values for each sample.
        with open(salmon_comb_res, "w") as salmon_comb_res_fh, open(
            salmon_comb_res_indiv_reads_mapped, "w"
        ) as salmon_comb_res_indiv_reads_mapped_fh:
            # Header rows: one column per sample (set iteration order).
            salmon_comb_res_fh.write("Lineage\t" + "\t".join(all_samples) + "\n")
            salmon_comb_res_indiv_reads_mapped_fh.write(
                "Lineage\t" + "\t".join(all_samples) + "\n"
            )

            # Write *.krona.tsv header for all samples.
            for sample in all_samples:
                # Mode "w" truncates: this header starts each Krona file fresh.
                krona_fh = open(
                    os.path.join(salmon_res_dir, sample + ".krona.tsv"), "w"
                )
                krona_fh.write(
                    "\t".join(
                        [
                            "fraction",
                            "superkingdom",
                            "phylum",
                            "class",
                            "order",
                            "family",
                            "genus",
                            "species",
                        ]
                    )
                )
                krona_fh.close()

            # Write the TPM values (TPM/scale_down) for valid lineages.
            for lineage in lineage2sm.keys():
                salmon_comb_res_fh.write(lineage)
                salmon_comb_res_indiv_reads_mapped_fh.write(lineage)

                for sample in all_samples:
                    # Re-opened in append mode per (lineage, sample) pair.
                    krona_fh = open(
                        os.path.join(salmon_res_dir, sample + ".krona.tsv"), "a"
                    )

                    if sample in unclassified2sample.keys():
                        # Fully-unclassified sample: zero for every lineage.
                        salmon_comb_res_fh.write(f"\t0.0")
                        salmon_comb_res_indiv_reads_mapped_fh.write(f"\t0")
                    elif sample in lineage2sample[lineage].keys():
                        reads = sum(per_taxon_reads[sample][lineage])
                        tpm = sum(lineage2sample[lineage][sample])
                        # TPH = scaled-down TPM.
                        tph = round(tpm / scale_down, round_to)
                        # NOTE(review): keyed by sample (not lineage) — the
                        # "hits_tpm" list lands in lineage2sample[sample];
                        # confirm this bookkeeping entry is intentional.
                        lineage2sample[sample].setdefault("hits_tpm", []).append(
                            float(tpm)
                        )

                        salmon_comb_res_fh.write(f"\t{tph}")
                        salmon_comb_res_indiv_reads_mapped_fh.write(f"\t{reads}")
                        krona_lin_row = lineage.split(",")

                        if len(krona_lin_row) > num_lin_cols - 1:
                            logging.error(
                                "Taxonomy columns are more than 8 for the following lineage:"
                                + f"{krona_lin_row}"
                            )
                            exit(1)
                        else:
                            # Krona row: fraction of total TPM, then the
                            # taxonomy columns (last lineage column dropped).
                            krona_fh.write(
                                "\n"
                                + str(round((tpm / tpm_const), round_to))
                                + "\t"
                                + "\t".join(krona_lin_row[:-1])
                            )
                    else:
                        salmon_comb_res_fh.write(f"\t0.0")
                        salmon_comb_res_indiv_reads_mapped_fh.write(f"\t0")
                    krona_fh.close()

                salmon_comb_res_fh.write("\n")
                salmon_comb_res_indiv_reads_mapped_fh.write(f"\n")

            # Finally write TPH (TPM/scale_down) for Unclassified
            # Row = Unclassified / No reads mapped to the database ...
            salmon_comb_res_fh.write(f"{no_hit}")
            salmon_comb_res_indiv_reads_mapped_fh.write(f"Total {no_hit_reads}")

            for sample in all_samples:
                krona_ufh = open(
                    os.path.join(salmon_res_dir, sample + ".krona.tsv"), "a"
                )
                # krona_ufh.write("\t")
                if sample in unclassified2sample.keys():
                    salmon_comb_res_fh.write(
                        f"\t{round((unclassified2sample[sample][no_hit] / scale_down), round_to)}"
                    )
                    salmon_comb_res_indiv_reads_mapped_fh.write(f"\t0")
                    krona_ufh.write(
                        f"\n{round((unclassified2sample[sample][no_hit] / tpm_const), round_to)}"
                    )
                else:
                    # trace_tpm: TPM mass unaccounted for by quant.sf rows
                    # (clamped at 0), added to the unclassified bucket.
                    trace_tpm = tpm_const - sum(reads_total[sample]["all_tpm"])
                    trace_tpm = float(f"{trace_tpm:.{round_to}f}")
                    if trace_tpm <= 0:
                        trace_tpm = float(0.0)
                    # NOTE(review): format spec "{round_to}f" (no leading dot)
                    # is a field *width* of 10, not a precision — compare the
                    # ".{round_to}f" used for trace_tpm above; confirm intent.
                    tph_unclassified = float(
                        f"{(sum(reads_total[sample]['rem_tpm']) + trace_tpm) / scale_down:{round_to}f}"
                    )
                    krona_unclassified = float(
                        f"{(sum(reads_total[sample]['rem_tpm']) + trace_tpm) / tpm_const:{round_to}f}"
                    )
                    salmon_comb_res_fh.write(f"\t{tph_unclassified}")
                    # Total classified reads for the sample: flatten the
                    # per-lineage read-count lists and sum.
                    salmon_comb_res_indiv_reads_mapped_fh.write(
                        f"\t{int(sum(sum(per_taxon_reads[sample].values(), [])))}"
                    )
                    krona_ufh.write(f"\n{krona_unclassified}")
                krona_ufh.write("\t" + "\t".join(["unclassified"] * (num_lin_cols - 2)))
                krona_ufh.close()

        # Redundant after the with-block, but harmless.
        salmon_comb_res_fh.close()
        salmon_comb_res_indiv_reads_mapped_fh.close()
        # ppp.pprint(lineage2sample)
        # ppp.pprint(lineage2sm)
        # ppp.pprint(reads_total)
|
kkonganti@0
|
521
|
kkonganti@0
|
# Script entry point.
if __name__ == "__main__":
    main()
|