annotate 0.5.0/bin/gen_sim_abn_table.py @ 0:97cd2f532efe

planemo upload
author kkonganti
date Mon, 31 Mar 2025 14:50:40 -0400
parents
children
rev   line source
kkonganti@0 1 #!/usr/bin/env python3
kkonganti@0 2
kkonganti@0 3 # Kranti Konganti
kkonganti@0 4
kkonganti@0 5 import argparse
kkonganti@0 6 import glob
kkonganti@0 7 import inspect
kkonganti@0 8 import logging
kkonganti@0 9 import os
kkonganti@0 10 import pprint
kkonganti@0 11 import re
kkonganti@0 12 from collections import defaultdict
kkonganti@0 13
kkonganti@0 14
kkonganti@0 15 # Multiple inheritence for pretty printing of help text.
kkonganti@0 16 class MultiArgFormatClasses(
kkonganti@0 17 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
kkonganti@0 18 ):
kkonganti@0 19 pass
kkonganti@0 20
kkonganti@0 21
kkonganti@0 22 # Main
kkonganti@0 23 def main() -> None:
kkonganti@0 24 """
kkonganti@0 25 This script will take the final taxonomic classification files and create a
kkonganti@0 26 global relative abundance type file in the current working directory. The
kkonganti@0 27 relative abundance type files should be in CSV or TSV format and should have
kkonganti@0 28 the lineage or taxonomy in first column and samples in the subsequent columns.
kkonganti@0 29 """
kkonganti@0 30 # Set logging.
kkonganti@0 31 logging.basicConfig(
kkonganti@0 32 format="\n"
kkonganti@0 33 + "=" * 55
kkonganti@0 34 + "\n%(asctime)s - %(levelname)s\n"
kkonganti@0 35 + "=" * 55
kkonganti@0 36 + "\n%(message)s\n\n",
kkonganti@0 37 level=logging.DEBUG,
kkonganti@0 38 )
kkonganti@0 39
kkonganti@0 40 # Debug print.
kkonganti@0 41 ppp = pprint.PrettyPrinter(width=55)
kkonganti@0 42 prog_name = inspect.stack()[0].filename
kkonganti@0 43
kkonganti@0 44 parser = argparse.ArgumentParser(
kkonganti@0 45 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
kkonganti@0 46 )
kkonganti@0 47
kkonganti@0 48 required = parser.add_argument_group("required arguments")
kkonganti@0 49
kkonganti@0 50 required.add_argument(
kkonganti@0 51 "-abn",
kkonganti@0 52 dest="rel_abn_dir",
kkonganti@0 53 default=False,
kkonganti@0 54 required=True,
kkonganti@0 55 help="Absolute UNIX path to the parent directory that contains the\n"
kkonganti@0 56 + "abundance type files.",
kkonganti@0 57 )
kkonganti@0 58 parser.add_argument(
kkonganti@0 59 "-op",
kkonganti@0 60 dest="out_prefix",
kkonganti@0 61 default="nowayout.tblsum",
kkonganti@0 62 required=False,
kkonganti@0 63 help="Set the output file(s) prefix for output(s) generated\nby this program.",
kkonganti@0 64 )
kkonganti@0 65 parser.add_argument(
kkonganti@0 66 "-header",
kkonganti@0 67 dest="header",
kkonganti@0 68 action="store_true",
kkonganti@0 69 default=True,
kkonganti@0 70 required=False,
kkonganti@0 71 help="Do the relative abundance files have a header.",
kkonganti@0 72 )
kkonganti@0 73 parser.add_argument(
kkonganti@0 74 "-filepat",
kkonganti@0 75 dest="file_pat",
kkonganti@0 76 default="*.lineage_summary.tsv",
kkonganti@0 77 required=False,
kkonganti@0 78 help="Files will be searched by this suffix for merged output generation\nby this program.",
kkonganti@0 79 )
kkonganti@0 80 parser.add_argument(
kkonganti@0 81 "-failedfilepat",
kkonganti@0 82 dest="failed_file_pat",
kkonganti@0 83 default="*FAILED.txt",
kkonganti@0 84 required=False,
kkonganti@0 85 help="Files will be searched by this suffix for merged output generation\nby this program.",
kkonganti@0 86 )
kkonganti@0 87 parser.add_argument(
kkonganti@0 88 "-delim",
kkonganti@0 89 dest="delim",
kkonganti@0 90 default="\t",
kkonganti@0 91 required=False,
kkonganti@0 92 help="The delimitor by which the fields are separated in the file.",
kkonganti@0 93 )
kkonganti@0 94
kkonganti@0 95 args = parser.parse_args()
kkonganti@0 96 rel_abn_dir = args.rel_abn_dir
kkonganti@0 97 is_header = args.header
kkonganti@0 98 out_prefix = args.out_prefix
kkonganti@0 99 file_pat = args.file_pat
kkonganti@0 100 failed_file_pat = args.failed_file_pat
kkonganti@0 101 delim = args.delim
kkonganti@0 102 suffix = re.sub(r"^\*", "", file_pat)
kkonganti@0 103 rel_abn_comb = os.path.join(os.getcwd(), out_prefix + ".txt")
kkonganti@0 104 rel_abn_files = glob.glob(os.path.join(rel_abn_dir, file_pat))
kkonganti@0 105 failed_rel_abn_files = glob.glob(os.path.join(rel_abn_dir, failed_file_pat))
kkonganti@0 106 empty_results = "Relative abundance results did not pass thresholds"
kkonganti@0 107 sample2lineage, seen_lineage = (defaultdict(defaultdict), defaultdict(int))
kkonganti@0 108
kkonganti@0 109 if len(rel_abn_files) == 0:
kkonganti@0 110 logging.info(
kkonganti@0 111 "Unable to find any files with .tsv extentsion.\nNow trying .csv extension."
kkonganti@0 112 )
kkonganti@0 113 rel_abn_files = glob.glob(os.path.join(rel_abn_dir, "*.csv"))
kkonganti@0 114 delim = ","
kkonganti@0 115
kkonganti@0 116 if len(failed_rel_abn_files) == 0:
kkonganti@0 117 logging.info(
kkonganti@0 118 f"Unable to find any files with patttern {failed_file_pat}.\n"
kkonganti@0 119 + "The failed samples will not appear in the final aggregate file."
kkonganti@0 120 )
kkonganti@0 121
kkonganti@0 122 if rel_abn_dir:
kkonganti@0 123 if not os.path.isdir(rel_abn_dir):
kkonganti@0 124 logging.error("UNIX path\n" + f"{rel_abn_dir}\n" + "does not exist!")
kkonganti@0 125 exit(1)
kkonganti@0 126 if len(rel_abn_files) <= 0:
kkonganti@0 127 with open(rel_abn_comb, "w") as rel_abn_comb_fh:
kkonganti@0 128 rel_abn_comb_fh.write(f"Sample\n{empty_results} in any samples\n")
kkonganti@0 129 rel_abn_comb_fh.close()
kkonganti@0 130 exit(0)
kkonganti@0 131
kkonganti@0 132 for failed_rel_abn in failed_rel_abn_files:
kkonganti@0 133 with open(failed_rel_abn, "r") as failed_fh:
kkonganti@0 134 sample2lineage[failed_fh.readline().strip()].setdefault(
kkonganti@0 135 "unclassified", []
kkonganti@0 136 ).append(float("1.0"))
kkonganti@0 137 failed_fh.close()
kkonganti@0 138
kkonganti@0 139 for rel_abn_file in rel_abn_files:
kkonganti@0 140 sample_name = re.match(r"(^.+?)\..*$", os.path.basename(rel_abn_file))[1]
kkonganti@0 141
kkonganti@0 142 with open(rel_abn_file, "r") as rel_abn_fh:
kkonganti@0 143 if is_header:
kkonganti@0 144 sample_names = rel_abn_fh.readline().strip().split(delim)[1:]
kkonganti@0 145 if len(sample_names) > 2:
kkonganti@0 146 logging.error(
kkonganti@0 147 "The individual relative abundance file has more "
kkonganti@0 148 + "\nthan 1 sample. This is rare in the context of running the "
kkonganti@0 149 + "\n nowayout Nextflow workflow."
kkonganti@0 150 )
kkonganti@0 151 exit(1)
kkonganti@0 152 elif len(sample_names) < 2:
kkonganti@0 153 sample_name = re.sub(suffix, "", os.path.basename(rel_abn_file))
kkonganti@0 154 logging.info(
kkonganti@0 155 "Seems like there is no sample name in the lineage summary file."
kkonganti@0 156 + f"\nTherefore, sample name has been extracted from file name: {sample_name}."
kkonganti@0 157 )
kkonganti@0 158 else:
kkonganti@0 159 sample_name = sample_names[0]
kkonganti@0 160
kkonganti@0 161 for line in rel_abn_fh.readlines():
kkonganti@0 162 cols = line.strip().split(delim)
kkonganti@0 163 lineage = cols[0]
kkonganti@0 164 abn = cols[1]
kkonganti@0 165 sample2lineage[sample_name].setdefault(lineage, []).append(
kkonganti@0 166 float(abn)
kkonganti@0 167 )
kkonganti@0 168 seen_lineage[lineage] = 1
kkonganti@0 169
kkonganti@0 170 with open(rel_abn_comb, "w") as rel_abn_comb_fh:
kkonganti@0 171 samples = sorted(sample2lineage.keys())
kkonganti@0 172 rel_abn_comb_fh.write(f"Lineage{delim}" + delim.join(samples) + "\n")
kkonganti@0 173
kkonganti@0 174 for lineage in sorted(seen_lineage.keys()):
kkonganti@0 175 rel_abn_comb_fh.write(lineage)
kkonganti@0 176 for sample in samples:
kkonganti@0 177 if lineage in sample2lineage[sample].keys():
kkonganti@0 178 rel_abn_comb_fh.write(
kkonganti@0 179 delim
kkonganti@0 180 + "".join(
kkonganti@0 181 [str(abn) for abn in sample2lineage[sample][lineage]]
kkonganti@0 182 )
kkonganti@0 183 )
kkonganti@0 184 else:
kkonganti@0 185 rel_abn_comb_fh.write(f"{delim}0.0")
kkonganti@0 186 rel_abn_comb_fh.write("\n")
kkonganti@0 187 rel_abn_comb_fh.close()
kkonganti@0 188
kkonganti@0 189
kkonganti@0 190 if __name__ == "__main__":
kkonganti@0 191 main()