Mercurial > repos > kkonganti > hfp_nowayout
comparison 0.5.0/bin/gen_salmon_tph_and_krona_tsv.py @ 0:97cd2f532efe
planemo upload
author | kkonganti |
---|---|
date | Mon, 31 Mar 2025 14:50:40 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:97cd2f532efe |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 # Kranti Konganti | |
4 # 03/06/2024 | |
5 | |
6 import argparse | |
7 import glob | |
8 import inspect | |
9 import logging | |
10 import os | |
11 import pprint | |
12 import re | |
13 from collections import defaultdict | |
14 | |
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    """Help formatter combining raw-text layout with default-value display.

    Multiple inheritance merges the two stock argparse formatters so that
    hand-wrapped help strings are kept verbatim *and* each option's default
    is appended to its help text.
    """
21 | |
22 | |
# Main
def main() -> None:
    """
    The successful execution of this script requires access to a properly
    formatted lineages.csv file which has no more than 9 columns.

    It takes the lineages.csv file, the *_hits.csv results from `sourmash gather`
    mentioned with -smres option and a root parent directory of the
    `salmon quant` results mentioned with -sal option and generates a final
    results table with the TPM values and a .krona.tsv file for each sample
    to be used by KronaTools.
    """
    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    prog_name = inspect.stack()[0].filename

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-sal",
        dest="salmon_res_dir",
        default=False,
        required=True,
        help="Absolute UNIX path to the parent directory that contains the\n"
        + "`salmon quant` results. For example, if path to\n"
        + "`quant.sf` is in /hpc/john_doe/test/salmon_res/sampleA/quant.sf, then\n"
        + "use this command-line option as:\n"
        + "-sal /hpc/john_doe/test/salmon_res",
    )
    required.add_argument(
        "-lin",
        dest="lin",
        default=False,
        required=True,
        help="Absolute UNIX Path to the lineages CSV file.\n"
        + "This file should have only 9 columns.",
    )
    required.add_argument(
        "-smres",
        dest="sm_res_dir",
        default=False,
        required=True,
        # BUGFIX: help previously misspelled `sourmash` and showed the wrong
        # option name (-sal) in the example.
        help="Absolute UNIX path to the parent directory that contains the\n"
        + "filtered `sourmash gather` results. For example, if path to\n"
        + "`sampleA.csv` is in /hpc/john_doe/test/sourmash_gather/sampleA.csv,\n"
        + "then use this command-line option as:\n"
        + "-smres /hpc/john_doe/test",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="nowayout.tblsum",
        required=False,
        help="Set the output file(s) prefix for output(s) generated\n"
        + "by this program.",
    )
    parser.add_argument(
        "-sf",
        dest="scale_down_factor",
        default=10000.0,
        # BUGFIX: without type=, a user-supplied value arrives as str.
        type=float,
        required=False,
        help="Set the scaling factor by which TPM values are scaled\ndown.",
    )
    parser.add_argument(
        "-smres-suffix",
        dest="sm_res_suffix",
        default="_hits.csv",
        required=False,
        help="Find the `sourmash gather` result files ending in this\nsuffix.",
    )
    parser.add_argument(
        "-failed-suffix",
        dest="failed_suffix",
        default="_FAILED.txt",
        required=False,
        help="Find the sample names which failed classification stored\n"
        + "inside the files ending in this suffix.",
    )
    parser.add_argument(
        "-num-lin-cols",
        dest="num_lin_cols",
        default=9,
        # BUGFIX: without type=int a user-supplied value is a str, and
        # `len(cols) < num_lin_cols` would raise TypeError.
        type=int,
        required=False,
        help="Number of columns expected in the lineages CSV file.",
    )
    parser.add_argument(
        "-lin-acc-regex",
        dest="lin_acc_regex",
        default=re.compile(r"\w+[\-\.]{1}[0-9]+"),
        # BUGFIX: compile a user-supplied pattern; a raw str has no .match().
        # (argparse does not apply type= to the non-str default above.)
        type=re.compile,
        required=False,
        help="The pattern of the lineage's accession.",
    )

    args = parser.parse_args()
    salmon_res_dir = args.salmon_res_dir
    sm_res_dir = args.sm_res_dir
    sm_res_suffix = args.sm_res_suffix
    failed_suffix = args.failed_suffix
    out_prefix = args.out_prefix
    lin = args.lin
    num_lin_cols = args.num_lin_cols
    acc_pat = args.lin_acc_regex
    scale_down = float(args.scale_down_factor)
    no_hit = "Unclassified"
    no_hit_reads = "reads mapped to the database"
    tpm_const = 1000000.0
    round_to = 10
    all_samples = set()

    # lineage -> sample -> [TPM, ...] for references passing sourmash.
    lineage2sample = defaultdict(defaultdict)
    # sample -> {no_hit: tpm_const} for samples that failed classification.
    unclassified2sample = defaultdict(defaultdict)
    # lineage -> sample -> 1 for lineages seen in sourmash results.
    lineage2sm = defaultdict(defaultdict)
    # "sourmash_passed" -> sample -> 1.
    sm2passed = defaultdict(defaultdict)
    # sample -> {"all_tpm": [...], "rem_tpm": [...], "reads_mapped": [...]}.
    reads_total = defaultdict(defaultdict)
    # sample -> lineage -> [mapped read counts, ...].
    per_taxon_reads = defaultdict(defaultdict)
    # accession -> lineage CSV tail. A plain dict (not defaultdict(int)) so a
    # mere lookup of a missing accession cannot insert a bogus 0 entry
    # (BUGFIX: the original `lineages[ref_acc]` membership test did that).
    lineages = {}

    salmon_comb_res = os.path.join(os.getcwd(), out_prefix + ".txt")
    salmon_comb_res_indiv_reads_mapped = os.path.join(
        os.getcwd(),
        # BUGFIX: escape the dot so an arbitrary character before `tblsum`
        # is not matched.
        re.sub(r"\.tblsum", "_indiv_reads_mapped.tblsum", out_prefix) + ".txt",
    )
    salmon_res_files = glob.glob(
        os.path.join(salmon_res_dir, "*", "quant.sf"), recursive=True
    )
    sample_res_files_failed = glob.glob(
        os.path.join(salmon_res_dir, "*" + failed_suffix), recursive=True
    )
    sm_res_files = glob.glob(
        os.path.join(sm_res_dir, "*" + sm_res_suffix), recursive=True
    )

    # Basic checks
    if lin and not (os.path.exists(lin) and os.path.getsize(lin) > 0):
        logging.error(
            "The lineages file,\n"
            + f"{os.path.basename(lin)} does not exist or is empty!"
        )
        exit(1)

    if salmon_res_dir:
        if not os.path.isdir(salmon_res_dir):
            logging.error("UNIX path\n" + f"{salmon_res_dir}\n" + "does not exist!")
            exit(1)
        if len(salmon_res_files) <= 0:
            # No `salmon quant` results at all: emit placeholder tables and
            # exit successfully so downstream aggregation still works.
            with open(salmon_comb_res, "w") as salmon_comb_res_fh, open(
                salmon_comb_res_indiv_reads_mapped, "w"
            ) as salmon_comb_res_indiv_reads_mapped_fh:
                salmon_comb_res_fh.write(f"Sample\n{no_hit} reads in all samples\n")
                salmon_comb_res_indiv_reads_mapped_fh.write(
                    f"Sample\nNo {no_hit_reads} from all samples\n"
                )
            exit(0)

    # Only proceed if lineages.csv exists.
    if lin and os.path.exists(lin) and os.path.getsize(lin) > 0:
        # Index lineages.csv: accession -> remaining lineage columns.
        with open(lin, "r") as lin_fh:
            _ = lin_fh.readline()  # Skip the header row.
            for line in lin_fh:
                cols = line.strip().split(",")

                if len(cols) < num_lin_cols:
                    logging.error(
                        f"The file {os.path.basename(lin)} seems to\n"
                        + "be malformed. It contains less than required 9 columns."
                    )
                    exit(1)

                # First occurrence of an accession wins; duplicates skipped.
                if cols[0] in lineages:
                    continue
                elif acc_pat.match(cols[0]):
                    lineages[cols[0]] = ",".join(cols[1:])

    # Index each samples' filtered sourmash results.
    for sm_res_file in sm_res_files:
        sample_name = re.sub(sm_res_suffix, "", os.path.basename(sm_res_file))

        with open(sm_res_file, "r") as sm_res_fh:
            _ = sm_res_fh.readline()  # Skip the CSV header.
            for line in sm_res_fh:
                # Column index 9 is expected to carry the matched reference
                # name; extract the accession(s) out of it.
                acc = acc_pat.findall(line.strip().split(",")[9])

                if len(acc) == 0:
                    logging.info(
                        f"Got empty lineage accession: {acc}"
                        + f"\nRow elements: {line.strip().split(',')}"
                    )
                    exit(1)
                # len(acc) == 0 already exited above, so > 1 is the only
                # remaining anomaly (original used `not in [1]`).
                if len(acc) > 1:
                    logging.info(
                        f"Got more than one lineage accession: {acc}"
                        + f"\nRow elements: {line.strip().split(',')}"
                    )
                    logging.info(f"Considering first element: {acc[0]}")
                if acc[0] not in lineages:
                    logging.error(
                        f"The lineage accession {acc[0]} is not found in {os.path.basename(lin)}"
                    )
                    exit(1)
                lineage2sm[lineages[acc[0]]].setdefault(sample_name, 1)
                sm2passed["sourmash_passed"].setdefault(sample_name, 1)

    # Index each samples' salmon results.
    for salmon_res_file in salmon_res_files:
        sample_dir = os.path.basename(os.path.dirname(salmon_res_file))
        sample_match = re.match(r"(^.+?)((\_salmon\_res)|(\.salmon))$", sample_dir)

        # BUGFIX: the original indexed the match object unconditionally and
        # raised an opaque TypeError when the directory name did not end in
        # `_salmon_res` or `.salmon`. Fail with a clear message instead.
        if sample_match is None:
            logging.error(
                "Could not parse a sample name from the directory\n"
                + f"{sample_dir}\n"
                + "Expected it to end in `_salmon_res` or `.salmon`."
            )
            exit(1)
        sample_name = sample_match[1]

        salmon_meta_json = os.path.join(
            os.path.dirname(salmon_res_file), "aux_info", "meta_info.json"
        )

        if (
            not os.path.exists(salmon_meta_json)
            or not os.path.getsize(salmon_meta_json) > 0
        ):
            logging.error(
                "The file\n"
                + f"{salmon_meta_json}\ndoes not exist or is empty!\n"
                + "Did `salmon quant` fail?"
            )
            exit(1)

        if (
            not os.path.exists(salmon_res_file)
            or not os.path.getsize(salmon_res_file) > 0
        ):
            logging.error(
                "The file\n"
                + f"{salmon_res_file}\ndoes not exist or is empty!\n"
                + "Did `salmon quant` fail?"
            )
            exit(1)

        # Seed all_tpm, rem_tpm and reads_mapped with 0.0 so later sum()
        # calls never see a missing key.
        reads_total[sample_name].setdefault("all_tpm", []).append(0.0)
        reads_total[sample_name].setdefault("rem_tpm", []).append(0.0)
        reads_total[sample_name].setdefault("reads_mapped", []).append(0.0)

        with open(salmon_res_file, "r") as salmon_res_fh:
            for line in salmon_res_fh:
                # Skip the `Name\tLength\t...` header row.
                if re.match(r"^Name.+", line):
                    continue
                cols = line.strip().split("\t")
                ref_acc = cols[0]
                tpm = float(cols[3])
                num_reads_mapped = float(cols[4])

                reads_total[sample_name].setdefault("all_tpm", []).append(
                    round(tpm, round_to)
                )
                reads_total[sample_name].setdefault("reads_mapped", []).append(
                    round(num_reads_mapped, round_to)
                )

                # .get() keeps unknown accessions out of the lineages dict
                # (BUGFIX: defaultdict lookup used to insert 0 entries).
                lineage = lineages.get(ref_acc)
                if lineage is not None and lineage in lineage2sm:
                    lineage2sample[lineage].setdefault(sample_name, []).append(
                        round(tpm, round_to)
                    )
                    per_taxon_reads[sample_name].setdefault(lineage, []).append(
                        round(num_reads_mapped)
                    )
                else:
                    # TPM of references that did not pass sourmash filtering
                    # is accounted to the Unclassified bucket.
                    reads_total[sample_name].setdefault("rem_tpm", []).append(
                        round(tpm, round_to)
                    )

    # Index each samples' complete failure results i.e., 100% unclassified.
    for sample_res_file_failed in sample_res_files_failed:
        sample_name = re.sub(
            failed_suffix, "", os.path.basename(sample_res_file_failed)
        )
        with open(sample_res_file_failed, "r") as no_calls_fh:
            for line in no_calls_fh:
                if line in ["\n", "\n\r", "\r"]:
                    continue
                unclassified2sample[sample_name].setdefault(no_hit, tpm_const)

    # Finally, collect all sample names that produced any result.
    all_samples.update(reads_total.keys())
    all_samples.update(unclassified2sample.keys())

    # Check if sourmash results exist but salmon `quant` failed
    # and if so, set the sample to 100% Unclassified as well.
    for sample in sm2passed["sourmash_passed"].keys():
        if sample not in all_samples:
            unclassified2sample[sample].setdefault(no_hit, tpm_const)
            all_samples.add(sample)

    # Fix a deterministic column order (the original iterated a set, whose
    # order varies between runs).
    sample_order = sorted(all_samples)

    # Write scaled down TPM values for each sample.
    with open(salmon_comb_res, "w") as salmon_comb_res_fh, open(
        salmon_comb_res_indiv_reads_mapped, "w"
    ) as salmon_comb_res_indiv_reads_mapped_fh:
        salmon_comb_res_fh.write("Lineage\t" + "\t".join(sample_order) + "\n")
        salmon_comb_res_indiv_reads_mapped_fh.write(
            "Lineage\t" + "\t".join(sample_order) + "\n"
        )

        # Write *.krona.tsv header for all samples.
        for sample in sample_order:
            with open(
                os.path.join(salmon_res_dir, sample + ".krona.tsv"), "w"
            ) as krona_fh:
                krona_fh.write(
                    "\t".join(
                        [
                            "fraction",
                            "superkingdom",
                            "phylum",
                            "class",
                            "order",
                            "family",
                            "genus",
                            "species",
                        ]
                    )
                )

        # Write the TPH values (TPM/scale_down) for valid lineages.
        for lineage in lineage2sm.keys():
            salmon_comb_res_fh.write(lineage)
            salmon_comb_res_indiv_reads_mapped_fh.write(lineage)

            for sample in sample_order:
                with open(
                    os.path.join(salmon_res_dir, sample + ".krona.tsv"), "a"
                ) as krona_fh:
                    if sample in unclassified2sample:
                        # Fully unclassified samples get 0 for every lineage.
                        salmon_comb_res_fh.write("\t0.0")
                        salmon_comb_res_indiv_reads_mapped_fh.write("\t0")
                    elif sample in lineage2sample[lineage]:
                        reads = sum(per_taxon_reads[sample][lineage])
                        tpm = sum(lineage2sample[lineage][sample])
                        tph = round(tpm / scale_down, round_to)
                        # NOTE: removed the original dead write
                        # `lineage2sample[sample].setdefault("hits_tpm", ...)`
                        # which keyed on sample in a lineage-keyed dict and
                        # was never read back.

                        salmon_comb_res_fh.write(f"\t{tph}")
                        salmon_comb_res_indiv_reads_mapped_fh.write(f"\t{reads}")
                        krona_lin_row = lineage.split(",")

                        if len(krona_lin_row) > num_lin_cols - 1:
                            logging.error(
                                "Taxonomy columns are more than 8 for the following lineage:"
                                + f"{krona_lin_row}"
                            )
                            exit(1)
                        else:
                            # Krona expects a fraction of the total, so scale
                            # by the TPM constant, not scale_down.
                            krona_fh.write(
                                "\n"
                                + str(round((tpm / tpm_const), round_to))
                                + "\t"
                                + "\t".join(krona_lin_row[:-1])
                            )
                    else:
                        salmon_comb_res_fh.write("\t0.0")
                        salmon_comb_res_indiv_reads_mapped_fh.write("\t0")

            salmon_comb_res_fh.write("\n")
            salmon_comb_res_indiv_reads_mapped_fh.write("\n")

        # Finally write TPH (TPM/scale_down) for Unclassified
        # Row = Unclassified / No reads mapped to the database ...
        salmon_comb_res_fh.write(f"{no_hit}")
        salmon_comb_res_indiv_reads_mapped_fh.write(f"Total {no_hit_reads}")

        for sample in sample_order:
            with open(
                os.path.join(salmon_res_dir, sample + ".krona.tsv"), "a"
            ) as krona_ufh:
                if sample in unclassified2sample:
                    salmon_comb_res_fh.write(
                        f"\t{round((unclassified2sample[sample][no_hit] / scale_down), round_to)}"
                    )
                    salmon_comb_res_indiv_reads_mapped_fh.write("\t0")
                    krona_ufh.write(
                        f"\n{round((unclassified2sample[sample][no_hit] / tpm_const), round_to)}"
                    )
                else:
                    # TPM unaccounted for by salmon (numerical residue of the
                    # per-reference TPM sum) is attributed to Unclassified.
                    trace_tpm = tpm_const - sum(reads_total[sample]["all_tpm"])
                    trace_tpm = float(f"{trace_tpm:.{round_to}f}")
                    if trace_tpm <= 0:
                        trace_tpm = 0.0
                    unclassified_tpm = sum(reads_total[sample]["rem_tpm"]) + trace_tpm
                    # BUGFIX: the original format spec `{round_to}f` set a
                    # field *width* of 10, not a precision; `.{round_to}f`
                    # matches the precision used for trace_tpm above.
                    tph_unclassified = float(
                        f"{unclassified_tpm / scale_down:.{round_to}f}"
                    )
                    krona_unclassified = float(
                        f"{unclassified_tpm / tpm_const:.{round_to}f}"
                    )
                    salmon_comb_res_fh.write(f"\t{tph_unclassified}")
                    salmon_comb_res_indiv_reads_mapped_fh.write(
                        f"\t{int(sum(sum(per_taxon_reads[sample].values(), [])))}"
                    )
                    krona_ufh.write(f"\n{krona_unclassified}")
                # Pad the Unclassified krona row to the full column count
                # (both branches).
                krona_ufh.write("\t" + "\t".join(["unclassified"] * (num_lin_cols - 2)))
521 | |
# Invoke the CLI entry point only when executed as a script (not on import).
if __name__ == "__main__":
    main()