kkonganti@0
|
1 #!/usr/bin/env python3
|
kkonganti@0
|
2
|
kkonganti@0
|
3 # Kranti Konganti
|
kkonganti@0
|
4
|
kkonganti@0
|
5 import argparse
|
kkonganti@0
|
6 import glob
|
kkonganti@0
|
7 import inspect
|
kkonganti@0
|
8 import logging
|
kkonganti@0
|
9 import os
|
kkonganti@0
|
10 import pprint
|
kkonganti@0
|
11 import re
|
kkonganti@0
|
12 from collections import defaultdict
|
kkonganti@0
|
13
|
kkonganti@0
|
14
|
kkonganti@0
|
15 # Multiple inheritence for pretty printing of help text.
|
kkonganti@0
|
16 class MultiArgFormatClasses(
|
kkonganti@0
|
17 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
|
kkonganti@0
|
18 ):
|
kkonganti@0
|
19 pass
|
kkonganti@0
|
20
|
kkonganti@0
|
21
|
kkonganti@0
|
22 # Main
|
kkonganti@0
|
23 def main() -> None:
|
kkonganti@0
|
24 """
|
kkonganti@0
|
25 This script will take the final taxonomic classification files and create a
|
kkonganti@0
|
26 global relative abundance type file in the current working directory. The
|
kkonganti@0
|
27 relative abundance type files should be in CSV or TSV format and should have
|
kkonganti@0
|
28 the lineage or taxonomy in first column and samples in the subsequent columns.
|
kkonganti@0
|
29 """
|
kkonganti@0
|
30 # Set logging.
|
kkonganti@0
|
31 logging.basicConfig(
|
kkonganti@0
|
32 format="\n"
|
kkonganti@0
|
33 + "=" * 55
|
kkonganti@0
|
34 + "\n%(asctime)s - %(levelname)s\n"
|
kkonganti@0
|
35 + "=" * 55
|
kkonganti@0
|
36 + "\n%(message)s\n\n",
|
kkonganti@0
|
37 level=logging.DEBUG,
|
kkonganti@0
|
38 )
|
kkonganti@0
|
39
|
kkonganti@0
|
40 # Debug print.
|
kkonganti@0
|
41 ppp = pprint.PrettyPrinter(width=55)
|
kkonganti@0
|
42 prog_name = inspect.stack()[0].filename
|
kkonganti@0
|
43
|
kkonganti@0
|
44 parser = argparse.ArgumentParser(
|
kkonganti@0
|
45 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
|
kkonganti@0
|
46 )
|
kkonganti@0
|
47
|
kkonganti@0
|
48 required = parser.add_argument_group("required arguments")
|
kkonganti@0
|
49
|
kkonganti@0
|
50 required.add_argument(
|
kkonganti@0
|
51 "-abn",
|
kkonganti@0
|
52 dest="rel_abn_dir",
|
kkonganti@0
|
53 default=False,
|
kkonganti@0
|
54 required=True,
|
kkonganti@0
|
55 help="Absolute UNIX path to the parent directory that contains the\n"
|
kkonganti@0
|
56 + "abundance type files.",
|
kkonganti@0
|
57 )
|
kkonganti@0
|
58 parser.add_argument(
|
kkonganti@0
|
59 "-op",
|
kkonganti@0
|
60 dest="out_prefix",
|
kkonganti@0
|
61 default="nowayout.tblsum",
|
kkonganti@0
|
62 required=False,
|
kkonganti@0
|
63 help="Set the output file(s) prefix for output(s) generated\nby this program.",
|
kkonganti@0
|
64 )
|
kkonganti@0
|
65 parser.add_argument(
|
kkonganti@0
|
66 "-header",
|
kkonganti@0
|
67 dest="header",
|
kkonganti@0
|
68 action="store_true",
|
kkonganti@0
|
69 default=True,
|
kkonganti@0
|
70 required=False,
|
kkonganti@0
|
71 help="Do the relative abundance files have a header.",
|
kkonganti@0
|
72 )
|
kkonganti@0
|
73 parser.add_argument(
|
kkonganti@0
|
74 "-filepat",
|
kkonganti@0
|
75 dest="file_pat",
|
kkonganti@0
|
76 default="*.lineage_summary.tsv",
|
kkonganti@0
|
77 required=False,
|
kkonganti@0
|
78 help="Files will be searched by this suffix for merged output generation\nby this program.",
|
kkonganti@0
|
79 )
|
kkonganti@0
|
80 parser.add_argument(
|
kkonganti@0
|
81 "-failedfilepat",
|
kkonganti@0
|
82 dest="failed_file_pat",
|
kkonganti@0
|
83 default="*FAILED.txt",
|
kkonganti@0
|
84 required=False,
|
kkonganti@0
|
85 help="Files will be searched by this suffix for merged output generation\nby this program.",
|
kkonganti@0
|
86 )
|
kkonganti@0
|
87 parser.add_argument(
|
kkonganti@0
|
88 "-delim",
|
kkonganti@0
|
89 dest="delim",
|
kkonganti@0
|
90 default="\t",
|
kkonganti@0
|
91 required=False,
|
kkonganti@0
|
92 help="The delimitor by which the fields are separated in the file.",
|
kkonganti@0
|
93 )
|
kkonganti@0
|
94
|
kkonganti@0
|
95 args = parser.parse_args()
|
kkonganti@0
|
96 rel_abn_dir = args.rel_abn_dir
|
kkonganti@0
|
97 is_header = args.header
|
kkonganti@0
|
98 out_prefix = args.out_prefix
|
kkonganti@0
|
99 file_pat = args.file_pat
|
kkonganti@0
|
100 failed_file_pat = args.failed_file_pat
|
kkonganti@0
|
101 delim = args.delim
|
kkonganti@0
|
102 suffix = re.sub(r"^\*", "", file_pat)
|
kkonganti@0
|
103 rel_abn_comb = os.path.join(os.getcwd(), out_prefix + ".txt")
|
kkonganti@0
|
104 rel_abn_files = glob.glob(os.path.join(rel_abn_dir, file_pat))
|
kkonganti@0
|
105 failed_rel_abn_files = glob.glob(os.path.join(rel_abn_dir, failed_file_pat))
|
kkonganti@0
|
106 empty_results = "Relative abundance results did not pass thresholds"
|
kkonganti@0
|
107 sample2lineage, seen_lineage = (defaultdict(defaultdict), defaultdict(int))
|
kkonganti@0
|
108
|
kkonganti@0
|
109 if len(rel_abn_files) == 0:
|
kkonganti@0
|
110 logging.info(
|
kkonganti@0
|
111 "Unable to find any files with .tsv extentsion.\nNow trying .csv extension."
|
kkonganti@0
|
112 )
|
kkonganti@0
|
113 rel_abn_files = glob.glob(os.path.join(rel_abn_dir, "*.csv"))
|
kkonganti@0
|
114 delim = ","
|
kkonganti@0
|
115
|
kkonganti@0
|
116 if len(failed_rel_abn_files) == 0:
|
kkonganti@0
|
117 logging.info(
|
kkonganti@0
|
118 f"Unable to find any files with patttern {failed_file_pat}.\n"
|
kkonganti@0
|
119 + "The failed samples will not appear in the final aggregate file."
|
kkonganti@0
|
120 )
|
kkonganti@0
|
121
|
kkonganti@0
|
122 if rel_abn_dir:
|
kkonganti@0
|
123 if not os.path.isdir(rel_abn_dir):
|
kkonganti@0
|
124 logging.error("UNIX path\n" + f"{rel_abn_dir}\n" + "does not exist!")
|
kkonganti@0
|
125 exit(1)
|
kkonganti@0
|
126 if len(rel_abn_files) <= 0:
|
kkonganti@0
|
127 with open(rel_abn_comb, "w") as rel_abn_comb_fh:
|
kkonganti@0
|
128 rel_abn_comb_fh.write(f"Sample\n{empty_results} in any samples\n")
|
kkonganti@0
|
129 rel_abn_comb_fh.close()
|
kkonganti@0
|
130 exit(0)
|
kkonganti@0
|
131
|
kkonganti@0
|
132 for failed_rel_abn in failed_rel_abn_files:
|
kkonganti@0
|
133 with open(failed_rel_abn, "r") as failed_fh:
|
kkonganti@0
|
134 sample2lineage[failed_fh.readline().strip()].setdefault(
|
kkonganti@0
|
135 "unclassified", []
|
kkonganti@0
|
136 ).append(float("1.0"))
|
kkonganti@0
|
137 failed_fh.close()
|
kkonganti@0
|
138
|
kkonganti@0
|
139 for rel_abn_file in rel_abn_files:
|
kkonganti@0
|
140 sample_name = re.match(r"(^.+?)\..*$", os.path.basename(rel_abn_file))[1]
|
kkonganti@0
|
141
|
kkonganti@0
|
142 with open(rel_abn_file, "r") as rel_abn_fh:
|
kkonganti@0
|
143 if is_header:
|
kkonganti@0
|
144 sample_names = rel_abn_fh.readline().strip().split(delim)[1:]
|
kkonganti@0
|
145 if len(sample_names) > 2:
|
kkonganti@0
|
146 logging.error(
|
kkonganti@0
|
147 "The individual relative abundance file has more "
|
kkonganti@0
|
148 + "\nthan 1 sample. This is rare in the context of running the "
|
kkonganti@0
|
149 + "\n nowayout Nextflow workflow."
|
kkonganti@0
|
150 )
|
kkonganti@0
|
151 exit(1)
|
kkonganti@0
|
152 elif len(sample_names) < 2:
|
kkonganti@0
|
153 sample_name = re.sub(suffix, "", os.path.basename(rel_abn_file))
|
kkonganti@0
|
154 logging.info(
|
kkonganti@0
|
155 "Seems like there is no sample name in the lineage summary file."
|
kkonganti@0
|
156 + f"\nTherefore, sample name has been extracted from file name: {sample_name}."
|
kkonganti@0
|
157 )
|
kkonganti@0
|
158 else:
|
kkonganti@0
|
159 sample_name = sample_names[0]
|
kkonganti@0
|
160
|
kkonganti@0
|
161 for line in rel_abn_fh.readlines():
|
kkonganti@0
|
162 cols = line.strip().split(delim)
|
kkonganti@0
|
163 lineage = cols[0]
|
kkonganti@0
|
164 abn = cols[1]
|
kkonganti@0
|
165 sample2lineage[sample_name].setdefault(lineage, []).append(
|
kkonganti@0
|
166 float(abn)
|
kkonganti@0
|
167 )
|
kkonganti@0
|
168 seen_lineage[lineage] = 1
|
kkonganti@0
|
169
|
kkonganti@0
|
170 with open(rel_abn_comb, "w") as rel_abn_comb_fh:
|
kkonganti@0
|
171 samples = sorted(sample2lineage.keys())
|
kkonganti@0
|
172 rel_abn_comb_fh.write(f"Lineage{delim}" + delim.join(samples) + "\n")
|
kkonganti@0
|
173
|
kkonganti@0
|
174 for lineage in sorted(seen_lineage.keys()):
|
kkonganti@0
|
175 rel_abn_comb_fh.write(lineage)
|
kkonganti@0
|
176 for sample in samples:
|
kkonganti@0
|
177 if lineage in sample2lineage[sample].keys():
|
kkonganti@0
|
178 rel_abn_comb_fh.write(
|
kkonganti@0
|
179 delim
|
kkonganti@0
|
180 + "".join(
|
kkonganti@0
|
181 [str(abn) for abn in sample2lineage[sample][lineage]]
|
kkonganti@0
|
182 )
|
kkonganti@0
|
183 )
|
kkonganti@0
|
184 else:
|
kkonganti@0
|
185 rel_abn_comb_fh.write(f"{delim}0.0")
|
kkonganti@0
|
186 rel_abn_comb_fh.write("\n")
|
kkonganti@0
|
187 rel_abn_comb_fh.close()
|
kkonganti@0
|
188
|
kkonganti@0
|
189
|
kkonganti@0
|
190 if __name__ == "__main__":
|
kkonganti@0
|
191 main()
|