annotate 0.5.0/bin/gen_sim_abn_table.py @ 0:3c767f9cfd88 draft default tip

planemo upload
author galaxytrakr
date Fri, 29 May 2026 13:37:56 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env python3
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
2
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
3 # Kranti Konganti
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
4
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
5 import argparse
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
6 import glob
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
7 import inspect
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
8 import logging
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
9 import os
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
10 import pprint
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
11 import re
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
12 from collections import defaultdict
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
13
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
14
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    """Argparse help formatter combining two stock formatters.

    Inherits from ``RawTextHelpFormatter`` (explicit newlines in help strings
    are kept as written) and ``ArgumentDefaultsHelpFormatter`` (each option's
    default value is appended to its help text). No behaviour is added here.
    """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
20
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
21
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
def _build_arg_parser(prog_name: str, description: str) -> argparse.ArgumentParser:
    """Create the command-line parser for this script.

    Args:
        prog_name: Program name shown in the usage line.
        description: Text shown at the top of the ``--help`` output.

    Returns:
        A parser exposing ``-abn``, ``-op``, ``-header``, ``-filepat``,
        ``-failedfilepat`` and ``-delim``.
    """
    parser = argparse.ArgumentParser(
        prog=prog_name, description=description, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-abn",
        dest="rel_abn_dir",
        default=False,
        required=True,
        help="Absolute UNIX path to the parent directory that contains the\n"
        + "abundance type files.",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="nowayout.tblsum",
        required=False,
        help="Set the output file(s) prefix for output(s) generated\nby this program.",
    )
    # NOTE(review): "store_true" combined with default=True means this flag can
    # never be switched off from the command line. Kept as-is so existing
    # invocations keep working; confirm whether a way to disable it is wanted.
    parser.add_argument(
        "-header",
        dest="header",
        action="store_true",
        default=True,
        required=False,
        help="Do the relative abundance files have a header.",
    )
    parser.add_argument(
        "-filepat",
        dest="file_pat",
        default="*.lineage_summary.tsv",
        required=False,
        help="Files will be searched by this suffix for merged output generation\nby this program.",
    )
    parser.add_argument(
        "-failedfilepat",
        dest="failed_file_pat",
        default="*FAILED.txt",
        required=False,
        help="Files will be searched by this suffix for merged output generation\nby this program.",
    )
    parser.add_argument(
        "-delim",
        dest="delim",
        default="\t",
        required=False,
        help="The delimitor by which the fields are separated in the file.",
    )
    return parser


def _strip_suffix(file_name: str, suffix: str) -> str:
    """Return ``file_name`` without a trailing literal ``suffix``.

    The suffix is compared literally (not as a regular expression) and only at
    the end of the name; the name is returned unchanged when it does not match.
    """
    if suffix and file_name.endswith(suffix):
        return file_name[: -len(suffix)]
    return file_name


# Main
def main() -> None:
    """
    This script will take the final taxonomic classification files and create a
    global relative abundance type file in the current working directory. The
    relative abundance type files should be in CSV or TSV format and should have
    the lineage or taxonomy in first column and samples in the subsequent columns.
    """
    # NOTE: the docstring above is handed to argparse as the program
    # description, so it is user-facing text; implementation notes live in
    # comments instead.
    #
    # Flow: parse arguments -> locate per-sample files -> record failed samples
    # as fully "unclassified" -> read each sample's lineage/abundance rows ->
    # write one lineage-by-sample table to "<cwd>/<out_prefix>.txt".
    # Exits with status 1 on a missing input directory, a file with too many
    # sample columns, or a malformed row; exits with status 0 after writing a
    # placeholder file when no abundance files are found.

    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    prog_name = inspect.stack()[0].filename
    args = _build_arg_parser(prog_name, main.__doc__).parse_args()

    rel_abn_dir = args.rel_abn_dir
    is_header = args.header
    file_pat = args.file_pat
    failed_file_pat = args.failed_file_pat
    delim = args.delim
    # The glob pattern minus its leading "*" is what gets stripped from a file
    # name when the sample name has to be derived from the file name.
    suffix = re.sub(r"^\*", "", file_pat)
    rel_abn_comb = os.path.join(os.getcwd(), args.out_prefix + ".txt")
    empty_results = "Relative abundance results did not pass thresholds"

    # Validate the directory before globbing so a bad path is reported
    # directly rather than after "no files found" messages.
    if rel_abn_dir and not os.path.isdir(rel_abn_dir):
        logging.error("UNIX path\n" + f"{rel_abn_dir}\n" + "does not exist!")
        raise SystemExit(1)

    rel_abn_files = glob.glob(os.path.join(rel_abn_dir, file_pat))
    failed_rel_abn_files = glob.glob(os.path.join(rel_abn_dir, failed_file_pat))

    if not rel_abn_files:
        logging.info(
            "Unable to find any files with .tsv extentsion.\nNow trying .csv extension."
        )
        rel_abn_files = glob.glob(os.path.join(rel_abn_dir, "*.csv"))
        delim = ","
        # Keep the suffix in step with the fallback pattern so that sample
        # names derived from file names do not retain ".csv".
        suffix = ".csv"

    if not failed_rel_abn_files:
        logging.info(
            f"Unable to find any files with patttern {failed_file_pat}.\n"
            + "The failed samples will not appear in the final aggregate file."
        )

    if rel_abn_dir and not rel_abn_files:
        with open(rel_abn_comb, "w") as rel_abn_comb_fh:
            rel_abn_comb_fh.write(f"Sample\n{empty_results} in any samples\n")
        raise SystemExit(0)

    # sample name -> lineage -> abundance. Repeated rows for the same
    # sample/lineage are summed so every output cell is a single number.
    sample2lineage = defaultdict(lambda: defaultdict(float))
    seen_lineage = set()

    for failed_rel_abn in failed_rel_abn_files:
        # The first line of a failed-sample file is taken as the sample name.
        with open(failed_rel_abn, "r") as failed_fh:
            failed_sample = failed_fh.readline().strip()
        if not failed_sample:
            logging.warning(
                f"No sample name found on the first line of\n{failed_rel_abn}\n"
                + "This file has been skipped."
            )
            continue
        sample2lineage[failed_sample]["unclassified"] = 1.0
        # Register the lineage so the failed sample's value is always written,
        # even when no other file contains an "unclassified" row.
        seen_lineage.add("unclassified")

    for rel_abn_file in rel_abn_files:
        base_name = os.path.basename(rel_abn_file)
        # Default sample name: everything before the first "." of the file
        # name, or the whole name when it contains no ".".
        first_dot = re.match(r"(^.+?)\..*$", base_name)
        sample_name = first_dot[1] if first_dot else base_name

        with open(rel_abn_file, "r") as rel_abn_fh:
            if is_header:
                sample_names = rel_abn_fh.readline().strip().split(delim)[1:]
                # NOTE(review): the first header cell is dropped above, so a
                # header with exactly one sample column lands in the "< 2"
                # branch and the name comes from the file name, while the
                # error text below says "more than 1 sample" for "> 2". The
                # thresholds are kept unchanged because they decide the
                # output column names; confirm the intent before altering.
                if len(sample_names) > 2:
                    logging.error(
                        "The individual relative abundance file has more "
                        + "\nthan 1 sample. This is rare in the context of running the "
                        + "\n nowayout Nextflow workflow."
                    )
                    raise SystemExit(1)
                elif len(sample_names) < 2:
                    sample_name = _strip_suffix(base_name, suffix)
                    logging.info(
                        "Seems like there is no sample name in the lineage summary file."
                        + f"\nTherefore, sample name has been extracted from file name: {sample_name}."
                    )
                else:
                    sample_name = sample_names[0]

            first_data_line = 2 if is_header else 1
            for line_no, line in enumerate(rel_abn_fh, start=first_data_line):
                record = line.strip()
                if not record:
                    # Blank or trailing empty lines carry no data.
                    continue
                cols = record.split(delim)
                try:
                    lineage, abn = cols[0], float(cols[1])
                except (IndexError, ValueError):
                    logging.error(
                        f"Malformed abundance row at line {line_no} of\n{rel_abn_file}"
                    )
                    raise SystemExit(1) from None
                sample2lineage[sample_name][lineage] += abn
                seen_lineage.add(lineage)

    samples = sorted(sample2lineage)
    with open(rel_abn_comb, "w") as rel_abn_comb_fh:
        rel_abn_comb_fh.write(f"Lineage{delim}" + delim.join(samples) + "\n")

        for lineage in sorted(seen_lineage):
            row = [lineage]
            # A lineage absent from a sample is reported as 0.0.
            row.extend(
                str(sample2lineage[sample].get(lineage, 0.0)) for sample in samples
            )
            rel_abn_comb_fh.write(delim.join(row) + "\n")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
188
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
189
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
# Build the aggregate table only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()