kkonganti@1
|
1 #!/usr/bin/env python3
|
kkonganti@1
|
2
|
kkonganti@1
|
3 # Kranti Konganti
|
kkonganti@1
|
4
|
kkonganti@1
|
5 import argparse
|
kkonganti@1
|
6 import glob
|
kkonganti@1
|
7 import inspect
|
kkonganti@1
|
8 import json
|
kkonganti@1
|
9 import logging
|
kkonganti@1
|
10 import os
|
kkonganti@1
|
11 import pickle
|
kkonganti@1
|
12 import pprint
|
kkonganti@1
|
13 import re
|
kkonganti@1
|
14 from collections import defaultdict
|
kkonganti@1
|
15
|
kkonganti@1
|
16 import yaml
|
kkonganti@1
|
17
|
kkonganti@1
|
18
|
kkonganti@1
|
19 # Multiple inheritence for pretty printing of help text.
|
kkonganti@1
|
20 class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
|
kkonganti@1
|
21 pass
|
kkonganti@1
|
22
|
kkonganti@1
|
23
|
kkonganti@1
|
24 # Main
|
kkonganti@1
|
25 def main() -> None:
|
kkonganti@1
|
26 """
|
kkonganti@1
|
27 The succesful execution of this script requires access to bettercallsal formatted
|
kkonganti@1
|
28 db flat files. On raven2, they are at /hpc/db/bettercallsall/PDGXXXXXXXXXX.XXXXX
|
kkonganti@1
|
29
|
kkonganti@1
|
30 It takes the ACC2SERO.pickle file and *.reference_target.cluster_list.tsv file
|
kkonganti@1
|
31 for that particular NCBI Pathogens release from the db directory mentioned with
|
kkonganti@1
|
32 -db option and a root parent directory of the `salmon quant` results mentioned
|
kkonganti@1
|
33 with -sal option and generates a final results table with number of reads
|
kkonganti@1
|
34 mapped and a .json file to be used with MultiQC to generate a stacked bar plot.
|
kkonganti@1
|
35
|
kkonganti@1
|
36 Using -url option optionally adds an extra column of NCBI Pathogens Isolates
|
kkonganti@1
|
37 Browser, which directly links out to NCBI Pathogens Isolates SNP viewer tool.
|
kkonganti@1
|
38 """
|
kkonganti@1
|
39 # Set logging.
|
kkonganti@1
|
40 logging.basicConfig(
|
kkonganti@1
|
41 format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
|
kkonganti@1
|
42 level=logging.DEBUG,
|
kkonganti@1
|
43 )
|
kkonganti@1
|
44
|
kkonganti@1
|
45 # Debug print.
|
kkonganti@1
|
46 ppp = pprint.PrettyPrinter(width=55)
|
kkonganti@1
|
47 prog_name = inspect.stack()[0].filename
|
kkonganti@1
|
48
|
kkonganti@1
|
49 parser = argparse.ArgumentParser(
|
kkonganti@1
|
50 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
|
kkonganti@1
|
51 )
|
kkonganti@1
|
52
|
kkonganti@1
|
53 required = parser.add_argument_group("required arguments")
|
kkonganti@1
|
54
|
kkonganti@1
|
55 required.add_argument(
|
kkonganti@1
|
56 "-sal",
|
kkonganti@1
|
57 dest="salmon_res_dir",
|
kkonganti@1
|
58 default=False,
|
kkonganti@1
|
59 required=True,
|
kkonganti@1
|
60 help="Absolute UNIX path to the parent directory that contains the\n"
|
kkonganti@1
|
61 + "`salmon quant` results directory. For example, if path to\n"
|
kkonganti@1
|
62 + "`quant.sf` is in /hpc/john_doe/test/salmon_res/quant.sf, then\n"
|
kkonganti@1
|
63 + "use this command-line option as:\n"
|
kkonganti@1
|
64 + "-sal /hpc/john_doe/test",
|
kkonganti@1
|
65 )
|
kkonganti@1
|
66 required.add_argument(
|
kkonganti@1
|
67 "-snp",
|
kkonganti@1
|
68 dest="rtc",
|
kkonganti@1
|
69 default=False,
|
kkonganti@1
|
70 required=True,
|
kkonganti@1
|
71 help="Absolute UNIX Path to the PDG SNP reference target cluster\n"
|
kkonganti@1
|
72 + "metadata file. On raven2, these are located at\n"
|
kkonganti@1
|
73 + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX\n"
|
kkonganti@1
|
74 + "Required if -sal is on.",
|
kkonganti@1
|
75 )
|
kkonganti@1
|
76 required.add_argument(
|
kkonganti@1
|
77 "-pickle",
|
kkonganti@1
|
78 dest="acc2sero",
|
kkonganti@1
|
79 default=False,
|
kkonganti@1
|
80 required=True,
|
kkonganti@1
|
81 help="Absolute UNIX Path to the *ACC2SERO.pickle\n"
|
kkonganti@1
|
82 + "metadata file. On raven2, these are located at\n"
|
kkonganti@1
|
83 + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX\n"
|
kkonganti@1
|
84 + "Required if -sal is on.",
|
kkonganti@1
|
85 )
|
kkonganti@1
|
86 parser.add_argument(
|
kkonganti@1
|
87 "-op",
|
kkonganti@1
|
88 dest="out_prefix",
|
kkonganti@1
|
89 default="bettercallsal.tblsum",
|
kkonganti@1
|
90 required=False,
|
kkonganti@1
|
91 help="Set the output file(s) prefix for output(s) generated\n" + "by this program.",
|
kkonganti@1
|
92 )
|
kkonganti@1
|
93 parser.add_argument(
|
kkonganti@1
|
94 "-url",
|
kkonganti@1
|
95 dest="show_snp_clust_info",
|
kkonganti@1
|
96 default=False,
|
kkonganti@1
|
97 required=False,
|
kkonganti@1
|
98 action="store_true",
|
kkonganti@1
|
99 help="Show SNP cluster participation information of the final genome hit.\n"
|
kkonganti@1
|
100 + "This may be useful to see a relative placement of your sample in\n"
|
kkonganti@1
|
101 + "NCBI Isolates SNP Tree Viewer based on genome similarity but however\n"
|
kkonganti@1
|
102 + "due to rapid nature of the updates at NCBI Pathogen Detection Project,\n"
|
kkonganti@1
|
103 + "the placement may be in an outdated cluster.",
|
kkonganti@1
|
104 )
|
kkonganti@1
|
105
|
kkonganti@1
|
106 args = parser.parse_args()
|
kkonganti@1
|
107 salmon_res_dir = args.salmon_res_dir
|
kkonganti@1
|
108 out_prefix = args.out_prefix
|
kkonganti@1
|
109 show_snp_clust_col = args.show_snp_clust_info
|
kkonganti@1
|
110 rtc = args.rtc
|
kkonganti@1
|
111 pickled_sero = args.acc2sero
|
kkonganti@1
|
112 no_hit = "No genome hit"
|
kkonganti@1
|
113 bcs_sal_yn_prefix = "bettercallsal_salyn"
|
kkonganti@1
|
114 sal_y = "Detected"
|
kkonganti@1
|
115 sal_n = "Not detected"
|
kkonganti@1
|
116 ncbi_pathogens_base_url = "https://www.ncbi.nlm.nih.gov/pathogens/"
|
kkonganti@1
|
117
|
kkonganti@1
|
118 sample2salmon, snp_clusters, multiqc_salmon_counts, seen_sero, sal_yn = (
|
kkonganti@1
|
119 defaultdict(defaultdict),
|
kkonganti@1
|
120 defaultdict(defaultdict),
|
kkonganti@1
|
121 defaultdict(defaultdict),
|
kkonganti@1
|
122 defaultdict(int),
|
kkonganti@1
|
123 defaultdict(int),
|
kkonganti@1
|
124 )
|
kkonganti@1
|
125
|
kkonganti@1
|
126 cell_colors_yml = {
|
kkonganti@1
|
127 bcs_sal_yn_prefix: {sal_y: "#c8e6c9 !important;", sal_n: "#ffcdd2 !important;"}
|
kkonganti@1
|
128 }
|
kkonganti@1
|
129
|
kkonganti@1
|
130 salmon_comb_res = os.path.join(os.getcwd(), out_prefix + ".txt")
|
kkonganti@1
|
131 bcs_sal_yn = re.sub(out_prefix, bcs_sal_yn_prefix + ".tblsum", salmon_comb_res)
|
kkonganti@1
|
132 cell_colors_yml_file = re.sub(
|
kkonganti@1
|
133 out_prefix + ".txt", bcs_sal_yn_prefix + ".cellcolors.yml", salmon_comb_res
|
kkonganti@1
|
134 )
|
kkonganti@1
|
135 salmon_comb_res_mqc = os.path.join(os.getcwd(), str(out_prefix).split(".")[0] + "_mqc.json")
|
kkonganti@1
|
136 salmon_res_files = glob.glob(os.path.join(salmon_res_dir, "*", "quant.sf"), recursive=True)
|
kkonganti@1
|
137 salmon_res_file_failed = glob.glob(os.path.join(salmon_res_dir, "BCS_NO_CALLS.txt"))
|
kkonganti@1
|
138
|
kkonganti@1
|
139 if rtc and (not os.path.exists(rtc) or not os.path.getsize(rtc) > 0):
|
kkonganti@1
|
140 logging.error(
|
kkonganti@1
|
141 "The reference target cluster metadata file,\n"
|
kkonganti@1
|
142 + f"{os.path.basename(rtc)} does not exist or is empty!"
|
kkonganti@1
|
143 )
|
kkonganti@1
|
144 exit(1)
|
kkonganti@1
|
145
|
kkonganti@1
|
146 if rtc and (not salmon_res_dir or not pickled_sero):
|
kkonganti@1
|
147 logging.error("When -rtc is on, -sal and -ps are also required.")
|
kkonganti@1
|
148 exit(1)
|
kkonganti@1
|
149
|
kkonganti@1
|
150 if pickled_sero and (not os.path.exists(pickled_sero) or not os.path.getsize(pickled_sero)):
|
kkonganti@1
|
151 logging.error(
|
kkonganti@1
|
152 "The pickle file,\n" + f"{os.path.basename(pickled_sero)} does not exist or is empty!"
|
kkonganti@1
|
153 )
|
kkonganti@1
|
154 exit(1)
|
kkonganti@1
|
155
|
kkonganti@1
|
156 if salmon_res_dir:
|
kkonganti@1
|
157 if not os.path.isdir(salmon_res_dir):
|
kkonganti@1
|
158 logging.error("UNIX path\n" + f"{salmon_res_dir}\n" + "does not exist!")
|
kkonganti@1
|
159 exit(1)
|
kkonganti@1
|
160 if len(salmon_res_files) <= 0:
|
kkonganti@1
|
161 # logging.error(
|
kkonganti@1
|
162 # "Parent directory,\n"
|
kkonganti@1
|
163 # + f"{salmon_res_dir}"
|
kkonganti@1
|
164 # + "\ndoes not seem to have any directories that contain\n"
|
kkonganti@1
|
165 # + "the `quant.sf` file(s)."
|
kkonganti@1
|
166 # )
|
kkonganti@1
|
167 # exit(1)
|
kkonganti@1
|
168 with open(salmon_comb_res, "w") as salmon_comb_res_fh:
|
kkonganti@1
|
169 salmon_comb_res_fh.write(f"Sample\n{no_hit}s in any samples\n")
|
kkonganti@1
|
170 salmon_comb_res_fh.close()
|
kkonganti@1
|
171 exit(0)
|
kkonganti@1
|
172
|
kkonganti@1
|
173 if rtc and os.path.exists(rtc) and os.path.getsize(rtc) > 0:
|
kkonganti@1
|
174
|
kkonganti@1
|
175 # pdg_release = re.match(r"(^PDG\d+\.\d+)\..+", os.path.basename(rtc))[1] + "/"
|
kkonganti@1
|
176 acc2sero = pickle.load(file=open(pickled_sero, "rb"))
|
kkonganti@1
|
177
|
kkonganti@1
|
178 with open(rtc, "r") as rtc_fh:
|
kkonganti@1
|
179
|
kkonganti@1
|
180 for line in rtc_fh:
|
kkonganti@1
|
181 cols = line.strip().split("\t")
|
kkonganti@1
|
182
|
kkonganti@1
|
183 if len(cols) < 4:
|
kkonganti@1
|
184 logging.error(
|
kkonganti@1
|
185 f"The file {os.path.basename(rtc)} seems to\n"
|
kkonganti@1
|
186 + "be malformed. It contains less than required 4 columns."
|
kkonganti@1
|
187 )
|
kkonganti@1
|
188 exit(1)
|
kkonganti@1
|
189 elif cols[3] != "NULL":
|
kkonganti@1
|
190 snp_clusters[cols[0]].setdefault("assembly_accs", []).append(cols[3])
|
kkonganti@1
|
191 snp_clusters[cols[3]].setdefault("snp_clust_id", []).append(cols[0])
|
kkonganti@1
|
192 snp_clusters[cols[3]].setdefault("pathdb_acc_id", []).append(cols[1])
|
kkonganti@1
|
193 if len(snp_clusters[cols[3]]["snp_clust_id"]) > 1:
|
kkonganti@1
|
194 logging.error(
|
kkonganti@1
|
195 f"There is a duplicate reference accession [{cols[3]}]"
|
kkonganti@1
|
196 + f"in the metadata file{os.path.basename(rtc)}!"
|
kkonganti@1
|
197 )
|
kkonganti@1
|
198 exit(1)
|
kkonganti@1
|
199
|
kkonganti@1
|
200 rtc_fh.close()
|
kkonganti@1
|
201
|
kkonganti@1
|
202 for salmon_res_file in salmon_res_files:
|
kkonganti@1
|
203 sample_name = re.match(
|
kkonganti@1
|
204 r"(^.+?)((\_salmon\_res)|(\.salmon))$",
|
kkonganti@1
|
205 os.path.basename(os.path.dirname(salmon_res_file)),
|
kkonganti@1
|
206 )[1]
|
kkonganti@1
|
207 salmon_meta_json = os.path.join(
|
kkonganti@1
|
208 os.path.dirname(salmon_res_file), "aux_info", "meta_info.json"
|
kkonganti@1
|
209 )
|
kkonganti@1
|
210
|
kkonganti@1
|
211 if not os.path.exists(salmon_meta_json) or not os.path.getsize(salmon_meta_json) > 0:
|
kkonganti@1
|
212 logging.error(
|
kkonganti@1
|
213 "The file\n"
|
kkonganti@1
|
214 + f"{salmon_meta_json}\ndoes not exist or is empty!\n"
|
kkonganti@1
|
215 + "Did `salmon quant` fail?"
|
kkonganti@1
|
216 )
|
kkonganti@1
|
217 exit(1)
|
kkonganti@1
|
218
|
kkonganti@1
|
219 if not os.path.exists(salmon_res_file) or not os.path.getsize(salmon_res_file):
|
kkonganti@1
|
220 logging.error(
|
kkonganti@1
|
221 "The file\n"
|
kkonganti@1
|
222 + f"{salmon_res_file}\ndoes not exist or is empty!\n"
|
kkonganti@1
|
223 + "Did `salmon quant` fail?"
|
kkonganti@1
|
224 )
|
kkonganti@1
|
225 exit(1)
|
kkonganti@1
|
226
|
kkonganti@1
|
227 with open(salmon_res_file, "r") as salmon_res_fh:
|
kkonganti@1
|
228 for line in salmon_res_fh.readlines():
|
kkonganti@1
|
229 if re.match(r"^Name.+", line):
|
kkonganti@1
|
230 continue
|
kkonganti@1
|
231 cols = line.strip().split("\t")
|
kkonganti@1
|
232 ref_acc = "_".join(cols[0].split("_")[:2])
|
kkonganti@1
|
233 (
|
kkonganti@1
|
234 sample2salmon[sample_name]
|
kkonganti@1
|
235 .setdefault(acc2sero[cols[0]], [])
|
kkonganti@1
|
236 .append(int(round(float(cols[4]), 2)))
|
kkonganti@1
|
237 )
|
kkonganti@1
|
238 (
|
kkonganti@1
|
239 sample2salmon[sample_name]
|
kkonganti@1
|
240 .setdefault("snp_clust_ids", {})
|
kkonganti@1
|
241 .setdefault("".join(snp_clusters[ref_acc]["snp_clust_id"]), [])
|
kkonganti@1
|
242 .append("".join(snp_clusters[ref_acc]["pathdb_acc_id"]))
|
kkonganti@1
|
243 )
|
kkonganti@1
|
244 seen_sero[acc2sero[cols[0]]] = 1
|
kkonganti@1
|
245
|
kkonganti@1
|
246 salmon_meta_json_read = json.load(open(salmon_meta_json, "r"))
|
kkonganti@1
|
247 (
|
kkonganti@1
|
248 sample2salmon[sample_name]
|
kkonganti@1
|
249 .setdefault("tot_reads", [])
|
kkonganti@1
|
250 .append(salmon_meta_json_read["num_processed"])
|
kkonganti@1
|
251 )
|
kkonganti@1
|
252
|
kkonganti@1
|
253 with open(salmon_comb_res, "w") as salmon_comb_res_fh:
|
kkonganti@1
|
254
|
kkonganti@1
|
255 # snp_clust_col_header = (
|
kkonganti@1
|
256 # "\tSNP Cluster(s) by Genome Hit\n" if show_snp_clust_col else "\n"
|
kkonganti@1
|
257 # )
|
kkonganti@1
|
258 snp_clust_col_header = (
|
kkonganti@1
|
259 "\tNCBI Pathogens Isolate Browser\n" if show_snp_clust_col else "\n"
|
kkonganti@1
|
260 )
|
kkonganti@1
|
261 serotypes = sorted(seen_sero.keys())
|
kkonganti@1
|
262 formatted_serotypes = [
|
kkonganti@1
|
263 re.sub(r"\,antigen_formula=", " | ", s)
|
kkonganti@1
|
264 for s in [re.sub(r"serotype=", "", s) for s in serotypes]
|
kkonganti@1
|
265 ]
|
kkonganti@1
|
266 salmon_comb_res_fh.write(
|
kkonganti@1
|
267 "Sample\t" + "\t".join(formatted_serotypes) + snp_clust_col_header
|
kkonganti@1
|
268 )
|
kkonganti@1
|
269 # sample_snp_relation = (
|
kkonganti@1
|
270 # ncbi_pathogens_base_url
|
kkonganti@1
|
271 # + pdg_release
|
kkonganti@1
|
272 # + "".join(snp_clusters[ref_acc]["snp_clust_id"])
|
kkonganti@1
|
273 # + "?accessions="
|
kkonganti@1
|
274 # )
|
kkonganti@1
|
275 sample_snp_relation = ncbi_pathogens_base_url + "isolates/#"
|
kkonganti@1
|
276
|
kkonganti@1
|
277 if len(salmon_res_file_failed) == 1:
|
kkonganti@1
|
278 with (open("".join(salmon_res_file_failed), "r")) as no_calls_fh:
|
kkonganti@1
|
279 for line in no_calls_fh.readlines():
|
kkonganti@1
|
280 if line in ["\n", "\n\r", "\r"]:
|
kkonganti@1
|
281 continue
|
kkonganti@1
|
282 salmon_comb_res_fh.write(line.strip())
|
kkonganti@1
|
283 sal_yn[line.strip()] += 0
|
kkonganti@1
|
284 for serotype in serotypes:
|
kkonganti@1
|
285 salmon_comb_res_fh.write("\t-")
|
kkonganti@1
|
286 salmon_comb_res_fh.write(
|
kkonganti@1
|
287 "\t-\n"
|
kkonganti@1
|
288 ) if show_snp_clust_col else salmon_comb_res_fh.write("\n")
|
kkonganti@1
|
289 no_calls_fh.close()
|
kkonganti@1
|
290
|
kkonganti@1
|
291 for sample, counts in sorted(sample2salmon.items()):
|
kkonganti@1
|
292 salmon_comb_res_fh.write(sample)
|
kkonganti@1
|
293 snp_cluster_res_col = list()
|
kkonganti@1
|
294
|
kkonganti@1
|
295 for snp_clust_id in sample2salmon[sample]["snp_clust_ids"].keys():
|
kkonganti@1
|
296 # print(snp_clust_id)
|
kkonganti@1
|
297 # print(",".join(sample2salmon[sample]["snp_clust_ids"][snp_clust_id]))
|
kkonganti@1
|
298 # ppp.pprint(sample2salmon[sample]["snp_clust_ids"])
|
kkonganti@1
|
299 # ppp.pprint(sample2salmon[sample]["snp_clust_ids"][snp_clust_id])
|
kkonganti@1
|
300 # final_url_text = ",".join(
|
kkonganti@1
|
301 # sample2salmon[sample]["snp_clust_ids"][snp_clust_id]
|
kkonganti@1
|
302 # )
|
kkonganti@1
|
303 # final_url_text_to_show = snp_clust_id
|
kkonganti@1
|
304 # snp_cluster_res_col.append(
|
kkonganti@1
|
305 # "".join(
|
kkonganti@1
|
306 # [
|
kkonganti@1
|
307 # f'<a href="',
|
kkonganti@1
|
308 # sample_snp_relation,
|
kkonganti@1
|
309 # ",".join(sample2salmon[sample]["snp_clust_ids"][snp_clust_id]),
|
kkonganti@1
|
310 # f'" target="_blank">{snp_clust_id}</a>',
|
kkonganti@1
|
311 # ]
|
kkonganti@1
|
312 # )
|
kkonganti@1
|
313 # )
|
kkonganti@1
|
314 final_url_text_to_show = " ".join(
|
kkonganti@1
|
315 sample2salmon[sample]["snp_clust_ids"][snp_clust_id]
|
kkonganti@1
|
316 )
|
kkonganti@1
|
317 snp_cluster_res_col.append(
|
kkonganti@1
|
318 "".join(
|
kkonganti@1
|
319 [
|
kkonganti@1
|
320 f'<a href="',
|
kkonganti@1
|
321 sample_snp_relation,
|
kkonganti@1
|
322 final_url_text_to_show,
|
kkonganti@1
|
323 f'" target="_blank">{final_url_text_to_show}</a>',
|
kkonganti@1
|
324 ]
|
kkonganti@1
|
325 )
|
kkonganti@1
|
326 )
|
kkonganti@1
|
327
|
kkonganti@1
|
328 per_serotype_counts = 0
|
kkonganti@1
|
329 for serotype in serotypes:
|
kkonganti@1
|
330
|
kkonganti@1
|
331 if serotype in sample2salmon[sample].keys():
|
kkonganti@1
|
332 # ppp.pprint(counts)
|
kkonganti@1
|
333 sample_perc_mapped = round(
|
kkonganti@1
|
334 sum(counts[serotype]) / sum(counts["tot_reads"]) * 100, 2
|
kkonganti@1
|
335 )
|
kkonganti@1
|
336 salmon_comb_res_fh.write(
|
kkonganti@1
|
337 f"\t{sum(counts[serotype])} ({sample_perc_mapped}%)"
|
kkonganti@1
|
338 )
|
kkonganti@1
|
339 multiqc_salmon_counts[sample].setdefault(
|
kkonganti@1
|
340 re.match(r"^serotype=(.+?)\,antigen_formula.*", serotype)[1],
|
kkonganti@1
|
341 sum(counts[serotype]),
|
kkonganti@1
|
342 )
|
kkonganti@1
|
343 per_serotype_counts += sum(counts[serotype])
|
kkonganti@1
|
344 sal_yn[sample] += 1
|
kkonganti@1
|
345 else:
|
kkonganti@1
|
346 salmon_comb_res_fh.write(f"\t-")
|
kkonganti@1
|
347 sal_yn[sample] += 0
|
kkonganti@1
|
348
|
kkonganti@1
|
349 multiqc_salmon_counts[sample].setdefault(
|
kkonganti@1
|
350 no_hit, sum(counts["tot_reads"]) - per_serotype_counts
|
kkonganti@1
|
351 )
|
kkonganti@1
|
352 snp_clust_col_val = (
|
kkonganti@1
|
353 f'\t{" ".join(snp_cluster_res_col)}\n' if show_snp_clust_col else "\n"
|
kkonganti@1
|
354 )
|
kkonganti@1
|
355 # ppp.pprint(multiqc_salmon_counts)
|
kkonganti@1
|
356 salmon_comb_res_fh.write(snp_clust_col_val)
|
kkonganti@1
|
357
|
kkonganti@1
|
358 with open(bcs_sal_yn, "w") as bcs_sal_yn_fh:
|
kkonganti@1
|
359 bcs_sal_yn_fh.write("Sample\tSalmonella Presence\tNo. of Serotypes\n")
|
kkonganti@1
|
360 for sample in sal_yn.keys():
|
kkonganti@1
|
361 if sal_yn[sample] > 0:
|
kkonganti@1
|
362 bcs_sal_yn_fh.write(f"{sample}\tDetected\t{sal_yn[sample]}\n")
|
kkonganti@1
|
363 else:
|
kkonganti@1
|
364 bcs_sal_yn_fh.write(f"{sample}\tNot detected\t{sal_yn[sample]}\n")
|
kkonganti@1
|
365
|
kkonganti@1
|
366 with open(cell_colors_yml_file, "w") as cell_colors_fh:
|
kkonganti@1
|
367 yaml.dump(cell_colors_yml, cell_colors_fh, default_flow_style=False)
|
kkonganti@1
|
368
|
kkonganti@1
|
369 salmon_plot_json(salmon_comb_res_mqc, multiqc_salmon_counts, no_hit)
|
kkonganti@1
|
370
|
kkonganti@1
|
371 salmon_comb_res_fh.close()
|
kkonganti@1
|
372 bcs_sal_yn_fh.close()
|
kkonganti@1
|
373 cell_colors_fh.close()
|
kkonganti@1
|
374
|
kkonganti@1
|
375
|
kkonganti@1
|
376 def salmon_plot_json(file: None, sample_salmon_counts: None, no_hit: None) -> None:
|
kkonganti@1
|
377 """
|
kkonganti@1
|
378 This method will take a dictionary of salmon counts per sample
|
kkonganti@1
|
379 and will dump a JSON that will be used by MultiQC.
|
kkonganti@1
|
380 """
|
kkonganti@1
|
381
|
kkonganti@1
|
382 if file is None or sample_salmon_counts is None:
|
kkonganti@1
|
383 logging.error(
|
kkonganti@1
|
384 "Neither an output file to dump the JSON for MultiQC or the"
|
kkonganti@1
|
385 + "dictionary holding the salmon counts was not passed."
|
kkonganti@1
|
386 )
|
kkonganti@1
|
387
|
kkonganti@1
|
388 # Credit: http://phrogz.net/tmp/24colors.html
|
kkonganti@1
|
389 # Will cycle through 20 distinct colors.
|
kkonganti@1
|
390 distinct_color_palette = [
|
kkonganti@1
|
391 "#FF0000",
|
kkonganti@1
|
392 "#FFFF00",
|
kkonganti@1
|
393 "#00EAFF",
|
kkonganti@1
|
394 "#AA00FF",
|
kkonganti@1
|
395 "#FF7F00",
|
kkonganti@1
|
396 "#BFFF00",
|
kkonganti@1
|
397 "#0095FF",
|
kkonganti@1
|
398 "#FF00AA",
|
kkonganti@1
|
399 "#FFD400",
|
kkonganti@1
|
400 "#6AFF00",
|
kkonganti@1
|
401 "#0040FF",
|
kkonganti@1
|
402 "#EDB9B9",
|
kkonganti@1
|
403 "#B9D7ED",
|
kkonganti@1
|
404 "#E7E9B9",
|
kkonganti@1
|
405 "#DCB9ED",
|
kkonganti@1
|
406 "#B9EDE0",
|
kkonganti@1
|
407 "#8F2323",
|
kkonganti@1
|
408 "#23628F",
|
kkonganti@1
|
409 "#8F6A23",
|
kkonganti@1
|
410 "#6B238F",
|
kkonganti@1
|
411 "#4F8F23",
|
kkonganti@1
|
412 ]
|
kkonganti@1
|
413
|
kkonganti@1
|
414 no_hit_color = "#434348"
|
kkonganti@1
|
415 col_count = 0
|
kkonganti@1
|
416 serotypes = set()
|
kkonganti@1
|
417 salmon_counts = defaultdict(defaultdict)
|
kkonganti@1
|
418 salmon_counts["id"] = "BETTERCALLSAL_SALMON_COUNTS"
|
kkonganti@1
|
419 salmon_counts["section_name"] = "Salmon read counts"
|
kkonganti@1
|
420 salmon_counts["description"] = (
|
kkonganti@1
|
421 "This section shows the read counts from running <code>salmon</code> "
|
kkonganti@1
|
422 + "in <code>--meta</code> mode using SE, merged PE or concatenated PE reads against "
|
kkonganti@1
|
423 + "an on-the-fly <code>salmon</code> index generated from the genome hits "
|
kkonganti@1
|
424 + "of <code>kma</code>."
|
kkonganti@1
|
425 )
|
kkonganti@1
|
426 salmon_counts["plot_type"] = "bargraph"
|
kkonganti@1
|
427 salmon_counts["pconfig"]["id"] = "bettercallsal_salmon_counts_plot"
|
kkonganti@1
|
428 salmon_counts["pconfig"]["title"] = "Salmon: Read counts"
|
kkonganti@1
|
429 salmon_counts["pconfig"]["ylab"] = "Number of reads"
|
kkonganti@1
|
430 salmon_counts["pconfig"]["xDecimals"] = "false"
|
kkonganti@1
|
431 salmon_counts["pconfig"]["cpswitch_counts_label"] = "Number of reads (Counts)"
|
kkonganti@1
|
432 salmon_counts["pconfig"]["cpswitch_percent_label"] = "Number of reads (Percentages)"
|
kkonganti@1
|
433
|
kkonganti@1
|
434 for sample in sorted(sample_salmon_counts.keys()):
|
kkonganti@1
|
435 serotypes.update(list(sample_salmon_counts[sample].keys()))
|
kkonganti@1
|
436 salmon_counts["data"][sample] = sample_salmon_counts[sample]
|
kkonganti@1
|
437
|
kkonganti@1
|
438 for serotype in sorted(serotypes):
|
kkonganti@1
|
439 if serotype == no_hit:
|
kkonganti@1
|
440 continue
|
kkonganti@1
|
441 if col_count == len(distinct_color_palette) - 1:
|
kkonganti@1
|
442 col_count = 0
|
kkonganti@1
|
443
|
kkonganti@1
|
444 col_count += 1
|
kkonganti@1
|
445 salmon_counts["categories"][serotype] = {"color": distinct_color_palette[col_count]}
|
kkonganti@1
|
446
|
kkonganti@1
|
447 salmon_counts["categories"][no_hit] = {"color": no_hit_color}
|
kkonganti@1
|
448 json.dump(salmon_counts, open(file, "w"))
|
kkonganti@1
|
449
|
kkonganti@1
|
450
|
kkonganti@1
|
451 if __name__ == "__main__":
|
kkonganti@1
|
452 main()
|