Mercurial > repos > kkonganti > hfp_nowayout
diff 0.5.0/bin/gen_salmon_tph_and_krona_tsv.py @ 0:97cd2f532efe
planemo upload
author | kkonganti |
---|---|
date | Mon, 31 Mar 2025 14:50:40 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/0.5.0/bin/gen_salmon_tph_and_krona_tsv.py Mon Mar 31 14:50:40 2025 -0400 @@ -0,0 +1,523 @@ +#!/usr/bin/env python3 + +# Kranti Konganti +# 03/06/2024 + +import argparse +import glob +import inspect +import logging +import os +import pprint +import re +from collections import defaultdict + + +# Multiple inheritence for pretty printing of help text. +class MultiArgFormatClasses( + argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter +): + pass + + +# Main +def main() -> None: + """ + The succesful execution of this script requires access to properly formatted + lineages.csv file which has no more than 9 columns. + + It takes the lineages.csv file, the *_hits.csv results from `sourmash gather` + mentioned with -smres option and and a root parent directory of the + `salmon quant` results mentioned with -sal option and generates a final + results table with the TPM values and a .krona.tsv file for each sample + to be used by KronaTools. + """ + # Set logging. + logging.basicConfig( + format="\n" + + "=" * 55 + + "\n%(asctime)s - %(levelname)s\n" + + "=" * 55 + + "\n%(message)s\n\n", + level=logging.DEBUG, + ) + + # Debug print. + ppp = pprint.PrettyPrinter(width=55) + prog_name = inspect.stack()[0].filename + + parser = argparse.ArgumentParser( + prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses + ) + + required = parser.add_argument_group("required arguments") + + required.add_argument( + "-sal", + dest="salmon_res_dir", + default=False, + required=True, + help="Absolute UNIX path to the parent directory that contains the\n" + + "`salmon quant` results. For example, if path to\n" + + "`quant.sf` is in /hpc/john_doe/test/salmon_res/sampleA/quant.sf, then\n" + + "use this command-line option as:\n" + + "-sal /hpc/john_doe/test/salmon_res", + ) + required.add_argument( + "-lin", + dest="lin", + default=False, + required=True, + help="Absolute UNIX Path to the lineages CSV file.\n" + + "This file should have only 9 columns.", + ) + required.add_argument( + "-smres", + dest="sm_res_dir", + default=False, + required=True, + help="Absolute UNIX path to the parent directory that contains the\n" + + "filtered `sourmas gather` results. For example, if path to\n" + + "`sampleA.csv` is in /hpc/john_doe/test/sourmash_gather/sampleA.csv,\n" + + "then use this command-line option as:\n" + + "-sal /hpc/john_doe/test", + ) + parser.add_argument( + "-op", + dest="out_prefix", + default="nowayout.tblsum", + required=False, + help="Set the output file(s) prefix for output(s) generated\n" + + "by this program.", + ) + parser.add_argument( + "-sf", + dest="scale_down_factor", + default=float(10000), + required=False, + help="Set the scaling factor by which TPM values are scaled\ndown.", + ) + parser.add_argument( + "-smres-suffix", + dest="sm_res_suffix", + default="_hits.csv", + required=False, + help="Find the `sourmash gather` result files ending in this\nsuffix.", + ) + parser.add_argument( + "-failed-suffix", + dest="failed_suffix", + default="_FAILED.txt", + required=False, + help="Find the sample names which failed classification stored\n" + + "inside the files ending in this suffix.", + ) + parser.add_argument( + "-num-lin-cols", + dest="num_lin_cols", + default=int(9), + required=False, + help="Number of columns expected in the lineages CSV file.", + ) + parser.add_argument( + "-lin-acc-regex", + dest="lin_acc_regex", + default=re.compile(r"\w+[\-\.]{1}[0-9]+"), + required=False, + help="The pattern of the lineage's accession.", + ) + + args = parser.parse_args() + salmon_res_dir = args.salmon_res_dir + sm_res_dir = args.sm_res_dir + sm_res_suffix = args.sm_res_suffix + failed_suffix = args.failed_suffix + out_prefix = args.out_prefix + lin = args.lin + num_lin_cols = args.num_lin_cols + acc_pat = args.lin_acc_regex + scale_down = float(args.scale_down_factor) + no_hit = "Unclassified" + no_hit_reads = "reads mapped to the database" + tpm_const = float(1000000.0000000000) + round_to = 10 + all_samples = set() + ( + lineage2sample, + unclassified2sample, + lineage2sm, + sm2passed, + reads_total, + per_taxon_reads, + lineages, + ) = ( + defaultdict(defaultdict), + defaultdict(defaultdict), + defaultdict(defaultdict), + defaultdict(defaultdict), + defaultdict(defaultdict), + defaultdict(defaultdict), + defaultdict(int), + ) + + salmon_comb_res = os.path.join(os.getcwd(), out_prefix + ".txt") + # salmon_comb_res_reads_mapped = os.path.join( + # os.getcwd(), re.sub(".tblsum", "_reads_mapped.tblsum", out_prefix) + ".txt" + # ) + salmon_comb_res_indiv_reads_mapped = os.path.join( + os.getcwd(), + re.sub(".tblsum", "_indiv_reads_mapped.tblsum", out_prefix) + ".txt", + ) + salmon_res_files = glob.glob( + os.path.join(salmon_res_dir, "*", "quant.sf"), recursive=True + ) + sample_res_files_failed = glob.glob( + os.path.join(salmon_res_dir, "*" + failed_suffix), recursive=True + ) + sm_res_files = glob.glob( + os.path.join(sm_res_dir, "*" + sm_res_suffix), recursive=True + ) + + # Basic checks + if lin and not (os.path.exists(lin) and os.path.getsize(lin) > 0): + logging.error( + "The lineages file,\n" + + f"{os.path.basename(lin)} does not exist or is empty!" + ) + exit(1) + + if salmon_res_dir: + if not os.path.isdir(salmon_res_dir): + logging.error("UNIX path\n" + f"{salmon_res_dir}\n" + "does not exist!") + exit(1) + if len(salmon_res_files) <= 0: + with open(salmon_comb_res, "w") as salmon_comb_res_fh, open( + salmon_comb_res_indiv_reads_mapped, "w" + ) as salmon_comb_res_indiv_reads_mapped_fh: + salmon_comb_res_fh.write(f"Sample\n{no_hit} reads in all samples\n") + salmon_comb_res_indiv_reads_mapped_fh.write( + f"Sample\nNo {no_hit_reads} from all samples\n" + ) + salmon_comb_res_fh.close() + salmon_comb_res_indiv_reads_mapped_fh.close() + exit(0) + + # Only proceed if lineages.csv exists. + if lin and os.path.exists(lin) and os.path.getsize(lin) > 0: + lin_fh = open(lin, "r") + _ = lin_fh.readline() + + # Index lineages.csv + for line in lin_fh: + cols = line.strip().split(",") + + if len(cols) < num_lin_cols: + logging.error( + f"The file {os.path.basename(lin)} seems to\n" + + "be malformed. It contains less than required 9 columns." + ) + exit(1) + + if cols[0] in lineages.keys(): + continue + # logging.info( + # f"There is a duplicate accession [{cols[0]}]" + # + f" in the lineages file {os.path.basename(lin)}!" + # ) + elif acc_pat.match(cols[0]): + lineages[cols[0]] = ",".join(cols[1:]) + + lin_fh.close() + + # Index each samples' filtered sourmash results. + for sm_res_file in sm_res_files: + sample_name = re.sub(sm_res_suffix, "", os.path.basename(sm_res_file)) + + with open(sm_res_file, "r") as sm_res_fh: + _ = sm_res_fh.readline() + for line in sm_res_fh: + acc = acc_pat.findall(line.strip().split(",")[9]) + + if len(acc) == 0: + logging.info( + f"Got empty lineage accession: {acc}" + + f"\nRow elements: {line.strip().split(',')}" + ) + exit(1) + if len(acc) not in [1]: + logging.info( + f"Got more than one lineage accession: {acc}" + + f"\nRow elements: {line.strip().split(',')}" + ) + logging.info(f"Considering first element: {acc[0]}") + if acc[0] not in lineages.keys(): + logging.error( + f"The lineage accession {acc[0]} is not found in {os.path.basename(lin)}" + ) + exit(1) + lineage2sm[lineages[acc[0]]].setdefault(sample_name, 1) + sm2passed["sourmash_passed"].setdefault(sample_name, 1) + sm_res_fh.close() + + # Index each samples' salmon results. + for salmon_res_file in salmon_res_files: + sample_name = re.match( + r"(^.+?)((\_salmon\_res)|(\.salmon))$", + os.path.basename(os.path.dirname(salmon_res_file)), + )[1] + salmon_meta_json = os.path.join( + os.path.dirname(salmon_res_file), "aux_info", "meta_info.json" + ) + + if ( + not os.path.exists(salmon_meta_json) + or not os.path.getsize(salmon_meta_json) > 0 + ): + logging.error( + "The file\n" + + f"{salmon_meta_json}\ndoes not exist or is empty!\n" + + "Did `salmon quant` fail?" + ) + exit(1) + + if ( + not os.path.exists(salmon_res_file) + or not os.path.getsize(salmon_res_file) > 0 + ): + logging.error( + "The file\n" + + f"{salmon_res_file}\ndoes not exist or is empty!\n" + + "Did `salmon quant` fail?" + ) + exit(1) + + # Initiate all_tpm, rem_tpm and reads_mapped + # all_tpm + reads_total[sample_name].setdefault("all_tpm", []).append(float(0.0)) + # rem_tpm + reads_total[sample_name].setdefault("rem_tpm", []).append(float(0.0)) + # reads_mapped + reads_total[sample_name].setdefault("reads_mapped", []).append(float(0.0)) + + with open(salmon_res_file, "r") as salmon_res_fh: + for line in salmon_res_fh.readlines(): + if re.match(r"^Name.+", line): + continue + cols = line.strip().split("\t") + ref_acc = cols[0] + tpm = cols[3] + num_reads_mapped = cols[4] + + ( + reads_total[sample_name] + .setdefault("all_tpm", []) + .append( + round(float(tpm), round_to), + ) + ) + + ( + reads_total[sample_name] + .setdefault("reads_mapped", []) + .append( + round(float(num_reads_mapped), round_to), + ) + ) + + if lineages[ref_acc] in lineage2sm.keys(): + ( + lineage2sample[lineages[ref_acc]] + .setdefault(sample_name, []) + .append(round(float(tpm), round_to)) + ) + ( + per_taxon_reads[sample_name] + .setdefault(lineages[ref_acc], []) + .append(round(float(num_reads_mapped))) + ) + else: + ( + reads_total[sample_name] + .setdefault("rem_tpm", []) + .append( + round(float(tpm), round_to), + ) + ) + + salmon_res_fh.close() + + # Index each samples' complete failure results i.e., 100% unclassified. + for sample_res_file_failed in sample_res_files_failed: + sample_name = re.sub( + failed_suffix, "", os.path.basename(sample_res_file_failed) + ) + with open("".join(sample_res_file_failed), "r") as no_calls_fh: + for line in no_calls_fh.readlines(): + if line in ["\n", "\n\r", "\r"]: + continue + unclassified2sample[sample_name].setdefault(no_hit, tpm_const) + no_calls_fh.close() + + # Finally, write all results. + for sample in sorted(reads_total.keys()) + sorted(unclassified2sample.keys()): + all_samples.add(sample) + + # Check if sourmash results exist but salmon `quant` failed + # and if so, set the sample to 100% Unclassified as well. + for sample in sm2passed["sourmash_passed"].keys(): + if sample not in all_samples: + unclassified2sample[sample].setdefault(no_hit, tpm_const) + all_samples.add(sample) + + # Write total number of reads mapped to nowayout database. + # with open(salmon_comb_res_reads_mapped, "w") as nowo_reads_mapped_fh: + # nowo_reads_mapped_fh.write( + # "\t".join( + # [ + # "Sample", + # "All reads", + # "Classified reads", + # "Unclassified reads (Reads failed thresholds )", + # ] + # ) + # ) + + # for sample in all_samples: + # if sample in reads_total.keys(): + # nowo_reads_mapped_fh.write( + # "\n" + # + "\t".join( + # [ + # f"\n{sample}", + # f"{int(sum(reads_total[sample]['reads_mapped']))}", + # f"{int(reads_total[sample]['reads_mapped'])}", + # f"{int(reads_total[sample]['rem_tpm'])}", + # ], + # ) + # ) + # else: + # nowo_reads_mapped_fh.write(f"\n{sample}\t{int(0.0)}") + # nowo_reads_mapped_fh.close() + + # Write scaled down TPM values for each sample. + with open(salmon_comb_res, "w") as salmon_comb_res_fh, open( + salmon_comb_res_indiv_reads_mapped, "w" + ) as salmon_comb_res_indiv_reads_mapped_fh: + salmon_comb_res_fh.write("Lineage\t" + "\t".join(all_samples) + "\n") + salmon_comb_res_indiv_reads_mapped_fh.write( + "Lineage\t" + "\t".join(all_samples) + "\n" + ) + + # Write *.krona.tsv header for all samples. + for sample in all_samples: + krona_fh = open( + os.path.join(salmon_res_dir, sample + ".krona.tsv"), "w" + ) + krona_fh.write( + "\t".join( + [ + "fraction", + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ] + ) + ) + krona_fh.close() + + # Write the TPM values (TPM/scale_down) for valid lineages. + for lineage in lineage2sm.keys(): + salmon_comb_res_fh.write(lineage) + salmon_comb_res_indiv_reads_mapped_fh.write(lineage) + + for sample in all_samples: + krona_fh = open( + os.path.join(salmon_res_dir, sample + ".krona.tsv"), "a" + ) + + if sample in unclassified2sample.keys(): + salmon_comb_res_fh.write(f"\t0.0") + salmon_comb_res_indiv_reads_mapped_fh.write(f"\t0") + elif sample in lineage2sample[lineage].keys(): + reads = sum(per_taxon_reads[sample][lineage]) + tpm = sum(lineage2sample[lineage][sample]) + tph = round(tpm / scale_down, round_to) + lineage2sample[sample].setdefault("hits_tpm", []).append( + float(tpm) + ) + + salmon_comb_res_fh.write(f"\t{tph}") + salmon_comb_res_indiv_reads_mapped_fh.write(f"\t{reads}") + krona_lin_row = lineage.split(",") + + if len(krona_lin_row) > num_lin_cols - 1: + logging.error( + "Taxonomy columns are more than 8 for the following lineage:" + + f"{krona_lin_row}" + ) + exit(1) + else: + krona_fh.write( + "\n" + + str(round((tpm / tpm_const), round_to)) + + "\t" + + "\t".join(krona_lin_row[:-1]) + ) + else: + salmon_comb_res_fh.write(f"\t0.0") + salmon_comb_res_indiv_reads_mapped_fh.write(f"\t0") + krona_fh.close() + + salmon_comb_res_fh.write("\n") + salmon_comb_res_indiv_reads_mapped_fh.write(f"\n") + + # Finally write TPH (TPM/scale_down) for Unclassified + # Row = Unclassified / No reads mapped to the database ... + salmon_comb_res_fh.write(f"{no_hit}") + salmon_comb_res_indiv_reads_mapped_fh.write(f"Total {no_hit_reads}") + + for sample in all_samples: + krona_ufh = open( + os.path.join(salmon_res_dir, sample + ".krona.tsv"), "a" + ) + # krona_ufh.write("\t") + if sample in unclassified2sample.keys(): + salmon_comb_res_fh.write( + f"\t{round((unclassified2sample[sample][no_hit] / scale_down), round_to)}" + ) + salmon_comb_res_indiv_reads_mapped_fh.write(f"\t0") + krona_ufh.write( + f"\n{round((unclassified2sample[sample][no_hit] / tpm_const), round_to)}" + ) + else: + trace_tpm = tpm_const - sum(reads_total[sample]["all_tpm"]) + trace_tpm = float(f"{trace_tpm:.{round_to}f}") + if trace_tpm <= 0: + trace_tpm = float(0.0) + tph_unclassified = float( + f"{(sum(reads_total[sample]['rem_tpm']) + trace_tpm) / scale_down:{round_to}f}" + ) + krona_unclassified = float( + f"{(sum(reads_total[sample]['rem_tpm']) + trace_tpm) / tpm_const:{round_to}f}" + ) + salmon_comb_res_fh.write(f"\t{tph_unclassified}") + salmon_comb_res_indiv_reads_mapped_fh.write( + f"\t{int(sum(sum(per_taxon_reads[sample].values(), [])))}" + ) + krona_ufh.write(f"\n{krona_unclassified}") + krona_ufh.write("\t" + "\t".join(["unclassified"] * (num_lin_cols - 2))) + krona_ufh.close() + + salmon_comb_res_fh.close() + salmon_comb_res_indiv_reads_mapped_fh.close() + # ppp.pprint(lineage2sample) + # ppp.pprint(lineage2sm) + # ppp.pprint(reads_total) + + +if __name__ == "__main__": + main()