annotate 0.5.0/bin/gen_salmon_tph_and_krona_tsv.py @ 10:74ac6f6a9526 tip

planemo upload
author kkonganti
date Tue, 01 Apr 2025 11:08:01 -0400
parents 97cd2f532efe
children
rev   line source
kkonganti@0 1 #!/usr/bin/env python3
kkonganti@0 2
kkonganti@0 3 # Kranti Konganti
kkonganti@0 4 # 03/06/2024
kkonganti@0 5
kkonganti@0 6 import argparse
kkonganti@0 7 import glob
kkonganti@0 8 import inspect
kkonganti@0 9 import logging
kkonganti@0 10 import os
kkonganti@0 11 import pprint
kkonganti@0 12 import re
kkonganti@0 13 from collections import defaultdict
kkonganti@0 14
kkonganti@0 15
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    """Help formatter combining raw-text rendering with default-value display.

    Multiple inheritance is the documented way to mix argparse formatter
    behaviors: raw text preserves the hand-wrapped help strings while the
    defaults formatter appends ``(default: ...)`` to each option.
    """
kkonganti@0 21
kkonganti@0 22
# Main
def main() -> None:
    """
    The successful execution of this script requires access to a properly
    formatted lineages.csv file which has no more than 9 columns.

    It takes the lineages.csv file, the *_hits.csv results from `sourmash gather`
    mentioned with -smres option and a root parent directory of the
    `salmon quant` results mentioned with -sal option and generates a final
    results table with the TPM values and a .krona.tsv file for each sample
    to be used by KronaTools.
    """
    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    prog_name = inspect.stack()[0].filename

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-sal",
        dest="salmon_res_dir",
        default=False,
        required=True,
        help="Absolute UNIX path to the parent directory that contains the\n"
        + "`salmon quant` results. For example, if path to\n"
        + "`quant.sf` is in /hpc/john_doe/test/salmon_res/sampleA/quant.sf, then\n"
        + "use this command-line option as:\n"
        + "-sal /hpc/john_doe/test/salmon_res",
    )
    required.add_argument(
        "-lin",
        dest="lin",
        default=False,
        required=True,
        help="Absolute UNIX Path to the lineages CSV file.\n"
        + "This file should have only 9 columns.",
    )
    required.add_argument(
        "-smres",
        dest="sm_res_dir",
        default=False,
        required=True,
        # Typos fixed: "sourmas" -> "sourmash"; example flag was "-sal".
        help="Absolute UNIX path to the parent directory that contains the\n"
        + "filtered `sourmash gather` results. For example, if path to\n"
        + "`sampleA.csv` is in /hpc/john_doe/test/sourmash_gather/sampleA.csv,\n"
        + "then use this command-line option as:\n"
        + "-smres /hpc/john_doe/test",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="nowayout.tblsum",
        required=False,
        help="Set the output file(s) prefix for output(s) generated\n"
        + "by this program.",
    )
    parser.add_argument(
        "-sf",
        dest="scale_down_factor",
        default=10000.0,
        # BUGFIX: without type=, a user-supplied value arrives as str.
        type=float,
        required=False,
        help="Set the scaling factor by which TPM values are scaled\ndown.",
    )
    parser.add_argument(
        "-smres-suffix",
        dest="sm_res_suffix",
        default="_hits.csv",
        required=False,
        help="Find the `sourmash gather` result files ending in this\nsuffix.",
    )
    parser.add_argument(
        "-failed-suffix",
        dest="failed_suffix",
        default="_FAILED.txt",
        required=False,
        help="Find the sample names which failed classification stored\n"
        + "inside the files ending in this suffix.",
    )
    parser.add_argument(
        "-num-lin-cols",
        dest="num_lin_cols",
        default=9,
        # BUGFIX: without type=int, a user-supplied value arrives as str and
        # `len(cols) < num_lin_cols` below raises TypeError.
        type=int,
        required=False,
        help="Number of columns expected in the lineages CSV file.",
    )
    parser.add_argument(
        "-lin-acc-regex",
        dest="lin_acc_regex",
        default=re.compile(r"\w+[\-\.]{1}[0-9]+"),
        # BUGFIX: compile a user-supplied pattern string; argparse does not
        # apply type= to the (already compiled) default.
        type=re.compile,
        required=False,
        help="The pattern of the lineage's accession.",
    )

    args = parser.parse_args()
    salmon_res_dir = args.salmon_res_dir
    sm_res_dir = args.sm_res_dir
    sm_res_suffix = args.sm_res_suffix
    failed_suffix = args.failed_suffix
    out_prefix = args.out_prefix
    lin = args.lin
    num_lin_cols = args.num_lin_cols
    acc_pat = args.lin_acc_regex
    scale_down = float(args.scale_down_factor)
    no_hit = "Unclassified"
    no_hit_reads = "reads mapped to the database"
    tpm_const = float(1000000.0000000000)
    round_to = 10
    all_samples = set()
    (
        lineage2sample,
        unclassified2sample,
        lineage2sm,
        sm2passed,
        reads_total,
        per_taxon_reads,
        lineages,
    ) = (
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(int),
    )

    salmon_comb_res = os.path.join(os.getcwd(), out_prefix + ".txt")
    salmon_comb_res_indiv_reads_mapped = os.path.join(
        os.getcwd(),
        # BUGFIX: escape the dot; a bare "." matched any character.
        re.sub(r"\.tblsum", "_indiv_reads_mapped.tblsum", out_prefix) + ".txt",
    )
    # NOTE(review): recursive=True only matters when the pattern contains
    # "**"; these globs match a single directory level as written.
    salmon_res_files = glob.glob(
        os.path.join(salmon_res_dir, "*", "quant.sf"), recursive=True
    )
    sample_res_files_failed = glob.glob(
        os.path.join(salmon_res_dir, "*" + failed_suffix), recursive=True
    )
    sm_res_files = glob.glob(
        os.path.join(sm_res_dir, "*" + sm_res_suffix), recursive=True
    )

    # Basic checks
    if lin and not (os.path.exists(lin) and os.path.getsize(lin) > 0):
        logging.error(
            "The lineages file,\n"
            + f"{os.path.basename(lin)} does not exist or is empty!"
        )
        exit(1)

    if salmon_res_dir:
        if not os.path.isdir(salmon_res_dir):
            logging.error("UNIX path\n" + f"{salmon_res_dir}\n" + "does not exist!")
            exit(1)
        if len(salmon_res_files) <= 0:
            # No `salmon quant` results at all: emit placeholder tables
            # so downstream tooling still finds the expected files.
            with open(salmon_comb_res, "w") as salmon_comb_res_fh, open(
                salmon_comb_res_indiv_reads_mapped, "w"
            ) as salmon_comb_res_indiv_reads_mapped_fh:
                salmon_comb_res_fh.write(f"Sample\n{no_hit} reads in all samples\n")
                salmon_comb_res_indiv_reads_mapped_fh.write(
                    f"Sample\nNo {no_hit_reads} from all samples\n"
                )
            exit(0)

    # Only proceed if lineages.csv exists (guaranteed by the check above).
    if lin and os.path.exists(lin) and os.path.getsize(lin) > 0:
        # Index lineages.csv: accession -> comma-joined taxonomy columns.
        with open(lin, "r") as lin_fh:
            _ = lin_fh.readline()  # skip header

            for line in lin_fh:
                cols = line.strip().split(",")

                if len(cols) < num_lin_cols:
                    logging.error(
                        f"The file {os.path.basename(lin)} seems to\n"
                        + "be malformed. It contains less than required 9 columns."
                    )
                    exit(1)

                if cols[0] in lineages.keys():
                    # First occurrence wins for duplicate accessions.
                    continue
                elif acc_pat.match(cols[0]):
                    lineages[cols[0]] = ",".join(cols[1:])

    # Index each samples' filtered sourmash results.
    for sm_res_file in sm_res_files:
        sample_name = re.sub(sm_res_suffix, "", os.path.basename(sm_res_file))

        with open(sm_res_file, "r") as sm_res_fh:
            _ = sm_res_fh.readline()  # skip header
            for line in sm_res_fh:
                # Column 10 of the filtered `sourmash gather` CSV holds the
                # match name from which the accession is extracted.
                acc = acc_pat.findall(line.strip().split(",")[9])

                if len(acc) == 0:
                    logging.info(
                        f"Got empty lineage accession: {acc}"
                        + f"\nRow elements: {line.strip().split(',')}"
                    )
                    exit(1)
                if len(acc) > 1:
                    logging.info(
                        f"Got more than one lineage accession: {acc}"
                        + f"\nRow elements: {line.strip().split(',')}"
                    )
                    logging.info(f"Considering first element: {acc[0]}")
                if acc[0] not in lineages.keys():
                    logging.error(
                        f"The lineage accession {acc[0]} is not found in {os.path.basename(lin)}"
                    )
                    exit(1)
                lineage2sm[lineages[acc[0]]].setdefault(sample_name, 1)
                sm2passed["sourmash_passed"].setdefault(sample_name, 1)

    # Index each samples' salmon results.
    for salmon_res_file in salmon_res_files:
        # Recover the sample name by stripping the `_salmon_res` / `.salmon`
        # suffix from the per-sample result directory name.
        # NOTE(review): a directory name that matches neither suffix makes
        # re.match return None and this subscript raise TypeError — presumably
        # the pipeline guarantees the naming; verify against the caller.
        sample_name = re.match(
            r"(^.+?)((\_salmon\_res)|(\.salmon))$",
            os.path.basename(os.path.dirname(salmon_res_file)),
        )[1]
        salmon_meta_json = os.path.join(
            os.path.dirname(salmon_res_file), "aux_info", "meta_info.json"
        )

        if (
            not os.path.exists(salmon_meta_json)
            or not os.path.getsize(salmon_meta_json) > 0
        ):
            logging.error(
                "The file\n"
                + f"{salmon_meta_json}\ndoes not exist or is empty!\n"
                + "Did `salmon quant` fail?"
            )
            exit(1)

        if (
            not os.path.exists(salmon_res_file)
            or not os.path.getsize(salmon_res_file) > 0
        ):
            logging.error(
                "The file\n"
                + f"{salmon_res_file}\ndoes not exist or is empty!\n"
                + "Did `salmon quant` fail?"
            )
            exit(1)

        # Initiate the all_tpm, rem_tpm and reads_mapped accumulators with a
        # harmless 0.0 so sum() below is safe even for empty quant files.
        reads_total[sample_name].setdefault("all_tpm", []).append(0.0)
        reads_total[sample_name].setdefault("rem_tpm", []).append(0.0)
        reads_total[sample_name].setdefault("reads_mapped", []).append(0.0)

        with open(salmon_res_file, "r") as salmon_res_fh:
            for line in salmon_res_fh:
                # Skip the `quant.sf` header row.
                if re.match(r"^Name.+", line):
                    continue
                cols = line.strip().split("\t")
                ref_acc = cols[0]
                tpm = float(cols[3])
                num_reads_mapped = float(cols[4])

                reads_total[sample_name]["all_tpm"].append(round(tpm, round_to))
                reads_total[sample_name]["reads_mapped"].append(
                    round(num_reads_mapped, round_to)
                )

                # Only lineages that also passed sourmash filtering count as
                # classified; everything else accumulates into rem_tpm.
                if lineages[ref_acc] in lineage2sm.keys():
                    lineage2sample[lineages[ref_acc]].setdefault(
                        sample_name, []
                    ).append(round(tpm, round_to))
                    per_taxon_reads[sample_name].setdefault(
                        lineages[ref_acc], []
                    ).append(round(num_reads_mapped))
                else:
                    reads_total[sample_name]["rem_tpm"].append(round(tpm, round_to))

    # Index each samples' complete failure results i.e., 100% unclassified.
    for sample_res_file_failed in sample_res_files_failed:
        sample_name = re.sub(
            failed_suffix, "", os.path.basename(sample_res_file_failed)
        )
        with open(sample_res_file_failed, "r") as no_calls_fh:
            for line in no_calls_fh:
                if line in ["\n", "\n\r", "\r"]:
                    continue
                unclassified2sample[sample_name].setdefault(no_hit, tpm_const)

    # Finally, write all results.
    for sample in sorted(reads_total.keys()) + sorted(unclassified2sample.keys()):
        all_samples.add(sample)

    # Check if sourmash results exist but salmon `quant` failed
    # and if so, set the sample to 100% Unclassified as well.
    for sample in sm2passed["sourmash_passed"].keys():
        if sample not in all_samples:
            unclassified2sample[sample].setdefault(no_hit, tpm_const)
            all_samples.add(sample)

    # Write scaled down TPM values for each sample.
    # NOTE(review): all_samples is a set; iteration order is arbitrary but
    # stable within one run, so header and row columns stay aligned.
    with open(salmon_comb_res, "w") as salmon_comb_res_fh, open(
        salmon_comb_res_indiv_reads_mapped, "w"
    ) as salmon_comb_res_indiv_reads_mapped_fh:
        salmon_comb_res_fh.write("Lineage\t" + "\t".join(all_samples) + "\n")
        salmon_comb_res_indiv_reads_mapped_fh.write(
            "Lineage\t" + "\t".join(all_samples) + "\n"
        )

        # Write *.krona.tsv header for all samples.
        for sample in all_samples:
            with open(
                os.path.join(salmon_res_dir, sample + ".krona.tsv"), "w"
            ) as krona_fh:
                krona_fh.write(
                    "\t".join(
                        [
                            "fraction",
                            "superkingdom",
                            "phylum",
                            "class",
                            "order",
                            "family",
                            "genus",
                            "species",
                        ]
                    )
                )

        # Write the TPM values (TPM/scale_down) for valid lineages.
        for lineage in lineage2sm.keys():
            salmon_comb_res_fh.write(lineage)
            salmon_comb_res_indiv_reads_mapped_fh.write(lineage)

            for sample in all_samples:
                krona_fh = open(
                    os.path.join(salmon_res_dir, sample + ".krona.tsv"), "a"
                )

                if sample in unclassified2sample.keys():
                    salmon_comb_res_fh.write("\t0.0")
                    salmon_comb_res_indiv_reads_mapped_fh.write("\t0")
                elif sample in lineage2sample[lineage].keys():
                    reads = sum(per_taxon_reads[sample][lineage])
                    tpm = sum(lineage2sample[lineage][sample])
                    tph = round(tpm / scale_down, round_to)
                    # NOTE(review): this keys lineage2sample by *sample* name,
                    # mixing sample keys into a lineage-keyed mapping, and the
                    # accumulated "hits_tpm" list is never read afterwards.
                    # Kept byte-for-byte for behavior parity — confirm intent.
                    lineage2sample[sample].setdefault("hits_tpm", []).append(
                        float(tpm)
                    )

                    salmon_comb_res_fh.write(f"\t{tph}")
                    salmon_comb_res_indiv_reads_mapped_fh.write(f"\t{reads}")
                    krona_lin_row = lineage.split(",")

                    if len(krona_lin_row) > num_lin_cols - 1:
                        logging.error(
                            "Taxonomy columns are more than 8 for the following lineage:"
                            + f"{krona_lin_row}"
                        )
                        exit(1)
                    else:
                        # Krona rows drop the last lineage column (strain).
                        krona_fh.write(
                            "\n"
                            + str(round((tpm / tpm_const), round_to))
                            + "\t"
                            + "\t".join(krona_lin_row[:-1])
                        )
                else:
                    salmon_comb_res_fh.write("\t0.0")
                    salmon_comb_res_indiv_reads_mapped_fh.write("\t0")
                krona_fh.close()

            salmon_comb_res_fh.write("\n")
            salmon_comb_res_indiv_reads_mapped_fh.write("\n")

        # Finally write TPH (TPM/scale_down) for Unclassified
        # Row = Unclassified / No reads mapped to the database ...
        salmon_comb_res_fh.write(f"{no_hit}")
        salmon_comb_res_indiv_reads_mapped_fh.write(f"Total {no_hit_reads}")

        for sample in all_samples:
            krona_ufh = open(
                os.path.join(salmon_res_dir, sample + ".krona.tsv"), "a"
            )
            if sample in unclassified2sample.keys():
                salmon_comb_res_fh.write(
                    f"\t{round((unclassified2sample[sample][no_hit] / scale_down), round_to)}"
                )
                salmon_comb_res_indiv_reads_mapped_fh.write("\t0")
                krona_ufh.write(
                    f"\n{round((unclassified2sample[sample][no_hit] / tpm_const), round_to)}"
                )
            else:
                # Residual TPM not attributed to any sourmash-passed lineage.
                trace_tpm = tpm_const - sum(reads_total[sample]["all_tpm"])
                trace_tpm = float(f"{trace_tpm:.{round_to}f}")
                if trace_tpm <= 0:
                    trace_tpm = float(0.0)
                # BUGFIX: the spec was `:{round_to}f` (field *width* 10,
                # default 6-digit precision); `.{round_to}f` matches the
                # trace_tpm line above and the clearly intended precision.
                tph_unclassified = float(
                    f"{(sum(reads_total[sample]['rem_tpm']) + trace_tpm) / scale_down:.{round_to}f}"
                )
                krona_unclassified = float(
                    f"{(sum(reads_total[sample]['rem_tpm']) + trace_tpm) / tpm_const:.{round_to}f}"
                )
                salmon_comb_res_fh.write(f"\t{tph_unclassified}")
                salmon_comb_res_indiv_reads_mapped_fh.write(
                    f"\t{int(sum(sum(per_taxon_reads[sample].values(), [])))}"
                )
                krona_ufh.write(f"\n{krona_unclassified}")
            krona_ufh.write("\t" + "\t".join(["unclassified"] * (num_lin_cols - 2)))
            krona_ufh.close()
kkonganti@0 520
kkonganti@0 521
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()