cfsan_bettercallsal: 0.5.0/bin/gen_salmon_res_table.py @ 1:365849f031fd ("planemo upload")
author: kkonganti
date: Mon, 05 Jun 2023 18:48:51 -0400
compared against: 0:a4b1ee4b68b1
#!/usr/bin/env python3

# Kranti Konganti

import argparse
import glob
import inspect
import json
import logging
import os
import pickle
import pprint
import re
from collections import defaultdict

import yaml

# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    pass


# Main
def main() -> None:
    """
    The successful execution of this script requires access to bettercallsal formatted
    db flat files. On raven2, they are at /hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX

    It takes the *ACC2SERO.pickle file (-pickle) and the *.reference_target.cluster_list.tsv
    file (-snp) for that particular NCBI Pathogens release from the db directory, plus a
    root parent directory of the `salmon quant` results (-sal), and generates a final
    results table with the number of reads mapped and a .json file to be used with
    MultiQC to generate a stacked bar plot.

    Using the -url option optionally adds an extra column of NCBI Pathogens Isolates
    Browser URLs, which directly link out to the NCBI Pathogens Isolates SNP viewer tool.
    """
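    # A sketch of a typical invocation; the paths and metadata file names below are
    # hypothetical examples, not locations shipped with this script:
    #
    #   gen_salmon_res_table.py \
    #       -sal /hpc/john_doe/test \
    #       -snp /hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX/<release>.reference_target.cluster_list.tsv \
    #       -pickle /hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX/<release>.ACC2SERO.pickle \
    #       -op bettercallsal.tblsum -url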
    # Set logging.
    logging.basicConfig(
        format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    # Debug print.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = inspect.stack()[0].filename

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-sal",
        dest="salmon_res_dir",
        default=False,
        required=True,
        help="Absolute UNIX path to the parent directory that contains the\n"
        + "`salmon quant` results directory. For example, if path to\n"
        + "`quant.sf` is in /hpc/john_doe/test/salmon_res/quant.sf, then\n"
        + "use this command-line option as:\n"
        + "-sal /hpc/john_doe/test",
    )
    required.add_argument(
        "-snp",
        dest="rtc",
        default=False,
        required=True,
        help="Absolute UNIX Path to the PDG SNP reference target cluster\n"
        + "metadata file. On raven2, these are located at\n"
        + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX\n"
        + "Required if -sal is on.",
    )
    required.add_argument(
        "-pickle",
        dest="acc2sero",
        default=False,
        required=True,
        help="Absolute UNIX Path to the *ACC2SERO.pickle\n"
        + "metadata file. On raven2, these are located at\n"
        + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX\n"
        + "Required if -sal is on.",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="bettercallsal.tblsum",
        required=False,
        help="Set the output file(s) prefix for output(s) generated\n" + "by this program.",
    )
    parser.add_argument(
        "-url",
        dest="show_snp_clust_info",
        default=False,
        required=False,
        action="store_true",
        help="Show SNP cluster participation information of the final genome hit.\n"
        + "This may be useful to see the relative placement of your sample in the\n"
        + "NCBI Isolates SNP Tree Viewer based on genome similarity. However, due\n"
        + "to the rapid nature of updates at the NCBI Pathogen Detection Project,\n"
        + "the placement may be in an outdated cluster.",
    )

    args = parser.parse_args()
    salmon_res_dir = args.salmon_res_dir
    out_prefix = args.out_prefix
    show_snp_clust_col = args.show_snp_clust_info
    rtc = args.rtc
    pickled_sero = args.acc2sero
    no_hit = "No genome hit"
    bcs_sal_yn_prefix = "bettercallsal_salyn"
    sal_y = "Detected"
    sal_n = "Not detected"
    ncbi_pathogens_base_url = "https://www.ncbi.nlm.nih.gov/pathogens/"

    sample2salmon, snp_clusters, multiqc_salmon_counts, seen_sero, sal_yn = (
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(int),
        defaultdict(int),
    )

    cell_colors_yml = {
        bcs_sal_yn_prefix: {sal_y: "#c8e6c9 !important;", sal_n: "#ffcdd2 !important;"}
    }

    salmon_comb_res = os.path.join(os.getcwd(), out_prefix + ".txt")
    bcs_sal_yn = re.sub(out_prefix, bcs_sal_yn_prefix + ".tblsum", salmon_comb_res)
    cell_colors_yml_file = re.sub(
        out_prefix + ".txt", bcs_sal_yn_prefix + ".cellcolors.yml", salmon_comb_res
    )
    salmon_comb_res_mqc = os.path.join(os.getcwd(), str(out_prefix).split(".")[0] + "_mqc.json")
    salmon_res_files = glob.glob(os.path.join(salmon_res_dir, "*", "quant.sf"), recursive=True)
    salmon_res_file_failed = glob.glob(os.path.join(salmon_res_dir, "BCS_NO_CALLS.txt"))
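    # Derived from the path construction above: with the default -op prefix, the
    # outputs written to the current working directory are expected to be
    # bettercallsal.tblsum.txt (per-serotype read-count table),
    # bettercallsal_salyn.tblsum.txt (detected / not detected summary),
    # bettercallsal_salyn.cellcolors.yml (cell colors for the summary table) and
    # bettercallsal_mqc.json (stacked bar plot data for MultiQC).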

    if rtc and (not os.path.exists(rtc) or not os.path.getsize(rtc) > 0):
        logging.error(
            "The reference target cluster metadata file,\n"
            + f"{os.path.basename(rtc)} does not exist or is empty!"
        )
        exit(1)

    if rtc and (not salmon_res_dir or not pickled_sero):
        logging.error("When -snp is on, -sal and -pickle are also required.")
        exit(1)

    if pickled_sero and (not os.path.exists(pickled_sero) or not os.path.getsize(pickled_sero)):
        logging.error(
            "The pickle file,\n" + f"{os.path.basename(pickled_sero)} does not exist or is empty!"
        )
        exit(1)

    if salmon_res_dir:
        if not os.path.isdir(salmon_res_dir):
            logging.error("UNIX path\n" + f"{salmon_res_dir}\n" + "does not exist!")
            exit(1)
        if len(salmon_res_files) <= 0:
            # logging.error(
            #     "Parent directory,\n"
            #     + f"{salmon_res_dir}"
            #     + "\ndoes not seem to have any directories that contain\n"
            #     + "the `quant.sf` file(s)."
            # )
            # exit(1)
            with open(salmon_comb_res, "w") as salmon_comb_res_fh:
                salmon_comb_res_fh.write(f"Sample\n{no_hit}s in any samples\n")
            salmon_comb_res_fh.close()
            exit(0)

    if rtc and os.path.exists(rtc) and os.path.getsize(rtc) > 0:

        # pdg_release = re.match(r"(^PDG\d+\.\d+)\..+", os.path.basename(rtc))[1] + "/"
        acc2sero = pickle.load(file=open(pickled_sero, "rb"))

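        # As consumed by the loop below, each tab-separated row of the
        # *.reference_target.cluster_list.tsv file must have at least 4 columns;
        # three are used here: column 1 (SNP cluster identifier), column 2
        # (Pathogen Detection target accession) and column 4 (assembly accession
        # of the reference genome, or "NULL").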
        with open(rtc, "r") as rtc_fh:

            for line in rtc_fh:
                cols = line.strip().split("\t")

                if len(cols) < 4:
                    logging.error(
                        f"The file {os.path.basename(rtc)} seems to\n"
                        + "be malformed. It contains fewer than the required 4 columns."
                    )
                    exit(1)
                elif cols[3] != "NULL":
                    snp_clusters[cols[0]].setdefault("assembly_accs", []).append(cols[3])
                    snp_clusters[cols[3]].setdefault("snp_clust_id", []).append(cols[0])
                    snp_clusters[cols[3]].setdefault("pathdb_acc_id", []).append(cols[1])
                    if len(snp_clusters[cols[3]]["snp_clust_id"]) > 1:
                        logging.error(
                            f"There is a duplicate reference accession [{cols[3]}] "
                            + f"in the metadata file {os.path.basename(rtc)}!"
                        )
                        exit(1)

            rtc_fh.close()

        for salmon_res_file in salmon_res_files:
            sample_name = re.match(
                r"(^.+?)((\_salmon\_res)|(\.salmon))$",
                os.path.basename(os.path.dirname(salmon_res_file)),
            )[1]
            salmon_meta_json = os.path.join(
                os.path.dirname(salmon_res_file), "aux_info", "meta_info.json"
            )

            if not os.path.exists(salmon_meta_json) or not os.path.getsize(salmon_meta_json) > 0:
                logging.error(
                    "The file\n"
                    + f"{salmon_meta_json}\ndoes not exist or is empty!\n"
                    + "Did `salmon quant` fail?"
                )
                exit(1)

            if not os.path.exists(salmon_res_file) or not os.path.getsize(salmon_res_file):
                logging.error(
                    "The file\n"
                    + f"{salmon_res_file}\ndoes not exist or is empty!\n"
                    + "Did `salmon quant` fail?"
                )
                exit(1)

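            # `salmon quant` writes quant.sf with the columns Name, Length,
            # EffectiveLength, TPM and NumReads; the parsing below keys on the
            # target Name (column 1) and the NumReads estimate (column 5).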
            with open(salmon_res_file, "r") as salmon_res_fh:
                for line in salmon_res_fh.readlines():
                    if re.match(r"^Name.+", line):
                        continue
                    cols = line.strip().split("\t")
                    ref_acc = "_".join(cols[0].split("_")[:2])
                    (
                        sample2salmon[sample_name]
                        .setdefault(acc2sero[cols[0]], [])
                        .append(int(round(float(cols[4]), 2)))
                    )
                    (
                        sample2salmon[sample_name]
                        .setdefault("snp_clust_ids", {})
                        .setdefault("".join(snp_clusters[ref_acc]["snp_clust_id"]), [])
                        .append("".join(snp_clusters[ref_acc]["pathdb_acc_id"]))
                    )
                    seen_sero[acc2sero[cols[0]]] = 1

            salmon_meta_json_read = json.load(open(salmon_meta_json, "r"))
            (
                sample2salmon[sample_name]
                .setdefault("tot_reads", [])
                .append(salmon_meta_json_read["num_processed"])
            )

        with open(salmon_comb_res, "w") as salmon_comb_res_fh:

            # snp_clust_col_header = (
            #     "\tSNP Cluster(s) by Genome Hit\n" if show_snp_clust_col else "\n"
            # )
            snp_clust_col_header = (
                "\tNCBI Pathogens Isolate Browser\n" if show_snp_clust_col else "\n"
            )
            serotypes = sorted(seen_sero.keys())
            formatted_serotypes = [
                re.sub(r"\,antigen_formula=", " | ", s)
                for s in [re.sub(r"serotype=", "", s) for s in serotypes]
            ]
            salmon_comb_res_fh.write(
                "Sample\t" + "\t".join(formatted_serotypes) + snp_clust_col_header
            )
            # sample_snp_relation = (
            #     ncbi_pathogens_base_url
            #     + pdg_release
            #     + "".join(snp_clusters[ref_acc]["snp_clust_id"])
            #     + "?accessions="
            # )
            sample_snp_relation = ncbi_pathogens_base_url + "isolates/#"

            if len(salmon_res_file_failed) == 1:
                with open("".join(salmon_res_file_failed), "r") as no_calls_fh:
                    for line in no_calls_fh.readlines():
                        if line in ["\n", "\n\r", "\r"]:
                            continue
                        salmon_comb_res_fh.write(line.strip())
                        sal_yn[line.strip()] += 0
                        for serotype in serotypes:
                            salmon_comb_res_fh.write("\t-")
                        salmon_comb_res_fh.write(
                            "\t-\n"
                        ) if show_snp_clust_col else salmon_comb_res_fh.write("\n")
                    no_calls_fh.close()

            for sample, counts in sorted(sample2salmon.items()):
                salmon_comb_res_fh.write(sample)
                snp_cluster_res_col = list()

                for snp_clust_id in sample2salmon[sample]["snp_clust_ids"].keys():
                    # print(snp_clust_id)
                    # print(",".join(sample2salmon[sample]["snp_clust_ids"][snp_clust_id]))
                    # ppp.pprint(sample2salmon[sample]["snp_clust_ids"])
                    # ppp.pprint(sample2salmon[sample]["snp_clust_ids"][snp_clust_id])
                    # final_url_text = ",".join(
                    #     sample2salmon[sample]["snp_clust_ids"][snp_clust_id]
                    # )
                    # final_url_text_to_show = snp_clust_id
                    # snp_cluster_res_col.append(
                    #     "".join(
                    #         [
                    #             f'<a href="',
                    #             sample_snp_relation,
                    #             ",".join(sample2salmon[sample]["snp_clust_ids"][snp_clust_id]),
                    #             f'" target="_blank">{snp_clust_id}</a>',
                    #         ]
                    #     )
                    # )
                    final_url_text_to_show = " ".join(
                        sample2salmon[sample]["snp_clust_ids"][snp_clust_id]
                    )
                    snp_cluster_res_col.append(
                        "".join(
                            [
                                '<a href="',
                                sample_snp_relation,
                                final_url_text_to_show,
                                f'" target="_blank">{final_url_text_to_show}</a>',
                            ]
                        )
                    )

                per_serotype_counts = 0
                for serotype in serotypes:

                    if serotype in sample2salmon[sample].keys():
                        # ppp.pprint(counts)
                        sample_perc_mapped = round(
                            sum(counts[serotype]) / sum(counts["tot_reads"]) * 100, 2
                        )
                        salmon_comb_res_fh.write(
                            f"\t{sum(counts[serotype])} ({sample_perc_mapped}%)"
                        )
                        multiqc_salmon_counts[sample].setdefault(
                            re.match(r"^serotype=(.+?)\,antigen_formula.*", serotype)[1],
                            sum(counts[serotype]),
                        )
                        per_serotype_counts += sum(counts[serotype])
                        sal_yn[sample] += 1
                    else:
                        salmon_comb_res_fh.write("\t-")
                        sal_yn[sample] += 0

                multiqc_salmon_counts[sample].setdefault(
                    no_hit, sum(counts["tot_reads"]) - per_serotype_counts
                )
                snp_clust_col_val = (
                    f'\t{" ".join(snp_cluster_res_col)}\n' if show_snp_clust_col else "\n"
                )
                # ppp.pprint(multiqc_salmon_counts)
                salmon_comb_res_fh.write(snp_clust_col_val)

        with open(bcs_sal_yn, "w") as bcs_sal_yn_fh:
            bcs_sal_yn_fh.write("Sample\tSalmonella Presence\tNo. of Serotypes\n")
            for sample in sal_yn.keys():
                if sal_yn[sample] > 0:
                    bcs_sal_yn_fh.write(f"{sample}\tDetected\t{sal_yn[sample]}\n")
                else:
                    bcs_sal_yn_fh.write(f"{sample}\tNot detected\t{sal_yn[sample]}\n")

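        # The cellcolors YAML maps values in the bettercallsal_salyn table to
        # background colors (light green for "Detected", light red for
        # "Not detected"); it is presumably picked up by the pipeline's MultiQC
        # configuration when the summary table is rendered.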
        with open(cell_colors_yml_file, "w") as cell_colors_fh:
            yaml.dump(cell_colors_yml, cell_colors_fh, default_flow_style=False)

        salmon_plot_json(salmon_comb_res_mqc, multiqc_salmon_counts, no_hit)

        salmon_comb_res_fh.close()
        bcs_sal_yn_fh.close()
        cell_colors_fh.close()


def salmon_plot_json(file: str, sample_salmon_counts: dict, no_hit: str) -> None:
    """
    This method will take a dictionary of salmon counts per sample
    and will dump a JSON that will be used by MultiQC.
    """

    if file is None or sample_salmon_counts is None:
        logging.error(
            "Either the output file to dump the JSON for MultiQC or the "
            + "dictionary holding the salmon counts was not passed."
        )
        exit(1)

    # Credit: http://phrogz.net/tmp/24colors.html
    # Will cycle through 20 distinct colors.
    distinct_color_palette = [
        "#FF0000",
        "#FFFF00",
        "#00EAFF",
        "#AA00FF",
        "#FF7F00",
        "#BFFF00",
        "#0095FF",
        "#FF00AA",
        "#FFD400",
        "#6AFF00",
        "#0040FF",
        "#EDB9B9",
        "#B9D7ED",
        "#E7E9B9",
        "#DCB9ED",
        "#B9EDE0",
        "#8F2323",
        "#23628F",
        "#8F6A23",
        "#6B238F",
        "#4F8F23",
    ]

    no_hit_color = "#434348"
    col_count = 0
    serotypes = set()
    salmon_counts = defaultdict(defaultdict)
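    # The keys set below follow the MultiQC custom-content convention for a bar
    # graph: top-level id / section_name / description / plot_type, a "pconfig"
    # block of plot options, per-sample counts under "data" and per-serotype
    # colors under "categories".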
    salmon_counts["id"] = "BETTERCALLSAL_SALMON_COUNTS"
    salmon_counts["section_name"] = "Salmon read counts"
    salmon_counts["description"] = (
        "This section shows the read counts from running <code>salmon</code> "
        + "in <code>--meta</code> mode using SE, merged PE or concatenated PE reads against "
        + "an on-the-fly <code>salmon</code> index generated from the genome hits "
        + "of <code>kma</code>."
    )
    salmon_counts["plot_type"] = "bargraph"
    salmon_counts["pconfig"]["id"] = "bettercallsal_salmon_counts_plot"
    salmon_counts["pconfig"]["title"] = "Salmon: Read counts"
    salmon_counts["pconfig"]["ylab"] = "Number of reads"
    salmon_counts["pconfig"]["xDecimals"] = "false"
    salmon_counts["pconfig"]["cpswitch_counts_label"] = "Number of reads (Counts)"
    salmon_counts["pconfig"]["cpswitch_percent_label"] = "Number of reads (Percentages)"

    for sample in sorted(sample_salmon_counts.keys()):
        serotypes.update(list(sample_salmon_counts[sample].keys()))
        salmon_counts["data"][sample] = sample_salmon_counts[sample]

    for serotype in sorted(serotypes):
        if serotype == no_hit:
            continue
        if col_count == len(distinct_color_palette) - 1:
            col_count = 0

        col_count += 1
        salmon_counts["categories"][serotype] = {"color": distinct_color_palette[col_count]}

    salmon_counts["categories"][no_hit] = {"color": no_hit_color}
    json.dump(salmon_counts, open(file, "w"))


if __name__ == "__main__":
    main()