Mercurial > repos > galaxytrakr > hfp_bettercallsal_awsbatch
comparison 1.0.0/bin/gen_salmon_res_table.py @ 0:801b85b03a17 draft default tip
planemo upload
| author | galaxytrakr |
|---|---|
| date | Thu, 28 May 2026 20:31:42 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:801b85b03a17 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 # Kranti Konganti | |
| 4 | |
| 5 import argparse | |
| 6 import glob | |
| 7 import inspect | |
| 8 import json | |
| 9 import logging | |
| 10 import os | |
| 11 import pickle | |
| 12 import pprint | |
| 13 import re | |
| 14 from collections import defaultdict | |
| 15 | |
| 16 import yaml | |
| 17 | |
| 18 | |
| 19 # Multiple inheritence for pretty printing of help text. | |
| 20 class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): | |
| 21 pass | |
| 22 | |
| 23 | |
| 24 # Main | |
| 25 def main() -> None: | |
| 26 """ | |
| 27 The succesful execution of this script requires access to bettercallsal formatted | |
| 28 db flat files. On raven2, they are at /hpc/db/bettercallsall/PDGXXXXXXXXXX.XXXXX | |
| 29 | |
| 30 It takes the ACC2SERO.pickle file and *.reference_target.cluster_list.tsv file | |
| 31 for that particular NCBI Pathogens release from the db directory mentioned with | |
| 32 -db option and a root parent directory of the `salmon quant` results mentioned | |
| 33 with -sal option and generates a final results table with number of reads | |
| 34 mapped and a .json file to be used with MultiQC to generate a stacked bar plot. | |
| 35 | |
| 36 Using -url option optionally adds an extra column of NCBI Pathogens Isolates | |
| 37 Browser, which directly links out to NCBI Pathogens Isolates SNP viewer tool. | |
| 38 """ | |
| 39 # Set logging. | |
| 40 logging.basicConfig( | |
| 41 format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n", | |
| 42 level=logging.DEBUG, | |
| 43 ) | |
| 44 | |
| 45 # Debug print. | |
| 46 ppp = pprint.PrettyPrinter(width=55) | |
| 47 prog_name = inspect.stack()[0].filename | |
| 48 | |
| 49 parser = argparse.ArgumentParser( | |
| 50 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses | |
| 51 ) | |
| 52 | |
| 53 required = parser.add_argument_group("required arguments") | |
| 54 | |
| 55 required.add_argument( | |
| 56 "-sal", | |
| 57 dest="salmon_res_dir", | |
| 58 default=False, | |
| 59 required=True, | |
| 60 help="Absolute UNIX path to the parent directory that contains the\n" | |
| 61 + "`salmon quant` results directory. For example, if path to\n" | |
| 62 + "`quant.sf` is in /hpc/john_doe/test/salmon_res/quant.sf, then\n" | |
| 63 + "use this command-line option as:\n" | |
| 64 + "-sal /hpc/john_doe/test", | |
| 65 ) | |
| 66 required.add_argument( | |
| 67 "-snp", | |
| 68 dest="rtc", | |
| 69 default=False, | |
| 70 required=True, | |
| 71 help="Absolute UNIX Path to the PDG SNP reference target cluster\n" | |
| 72 + "metadata file. On raven2, these are located at\n" | |
| 73 + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX\n" | |
| 74 + "Required if -sal is on.", | |
| 75 ) | |
| 76 required.add_argument( | |
| 77 "-pickle", | |
| 78 dest="acc2sero", | |
| 79 default=False, | |
| 80 required=True, | |
| 81 help="Absolute UNIX Path to the *ACC2SERO.pickle\n" | |
| 82 + "metadata file. On raven2, these are located at\n" | |
| 83 + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX\n" | |
| 84 + "Required if -sal is on.", | |
| 85 ) | |
| 86 parser.add_argument( | |
| 87 "-op", | |
| 88 dest="out_prefix", | |
| 89 default="bettercallsal.tblsum", | |
| 90 required=False, | |
| 91 help="Set the output file(s) prefix for output(s) generated\n" + "by this program.", | |
| 92 ) | |
| 93 parser.add_argument( | |
| 94 "-url", | |
| 95 dest="show_snp_clust_info", | |
| 96 default=False, | |
| 97 required=False, | |
| 98 action="store_true", | |
| 99 help="Show SNP cluster participation information of the final genome hit.\n" | |
| 100 + "This may be useful to see a relative placement of your sample in\n" | |
| 101 + "NCBI Isolates SNP Tree Viewer based on genome similarity but however\n" | |
| 102 + "due to rapid nature of the updates at NCBI Pathogen Detection Project,\n" | |
| 103 + "the placement may be in an outdated cluster.", | |
| 104 ) | |
| 105 | |
| 106 args = parser.parse_args() | |
| 107 salmon_res_dir = args.salmon_res_dir | |
| 108 out_prefix = args.out_prefix | |
| 109 show_snp_clust_col = args.show_snp_clust_info | |
| 110 rtc = args.rtc | |
| 111 pickled_sero = args.acc2sero | |
| 112 no_hit = "No genome hit" | |
| 113 no_presence = "Salmonella presence not detected" | |
| 114 bcs_sal_yn_prefix = "bettercallsal_salyn" | |
| 115 sal_y = "Detected" | |
| 116 sal_n = "Not detected" | |
| 117 null_value = "NULL" | |
| 118 assm_pat = re.compile(r"GC[AF]\_[0-9]+\.*[0-9]*") | |
| 119 ncbi_pathogens_base_url = "https://www.ncbi.nlm.nih.gov/pathogens/" | |
| 120 ncbi_pathogens_genome_base = "https://www.ncbi.nlm.nih.gov/datasets/genome/" | |
| 121 | |
| 122 sample2salmon, snp_clusters, multiqc_salmon_counts, seen_sero, sal_yn = ( | |
| 123 defaultdict(defaultdict), | |
| 124 defaultdict(defaultdict), | |
| 125 defaultdict(defaultdict), | |
| 126 defaultdict(int), | |
| 127 defaultdict(int), | |
| 128 ) | |
| 129 | |
| 130 cell_colors_yml = { | |
| 131 bcs_sal_yn_prefix: {sal_y: "#c8e6c9 !important;", sal_n: "#ffcdd2 !important;"} | |
| 132 } | |
| 133 | |
| 134 salmon_comb_res = os.path.join(os.getcwd(), out_prefix + ".txt") | |
| 135 bcs_sal_yn = re.sub(out_prefix, bcs_sal_yn_prefix + ".tblsum", salmon_comb_res) | |
| 136 cell_colors_yml_file = re.sub( | |
| 137 out_prefix + ".txt", bcs_sal_yn_prefix + ".cellcolors.yml", salmon_comb_res | |
| 138 ) | |
| 139 salmon_comb_res_mqc = os.path.join(os.getcwd(), str(out_prefix).split(".")[0] + "_mqc.json") | |
| 140 salmon_res_files = glob.glob(os.path.join(salmon_res_dir, "*", "quant.sf"), recursive=True) | |
| 141 salmon_res_file_failed = glob.glob(os.path.join(salmon_res_dir, "BCS_NO_CALLS.txt")) | |
| 142 | |
| 143 if rtc and (not os.path.exists(rtc) or not os.path.getsize(rtc) > 0): | |
| 144 logging.error( | |
| 145 "The reference target cluster metadata file,\n" | |
| 146 + f"{os.path.basename(rtc)} does not exist or is empty!" | |
| 147 ) | |
| 148 exit(1) | |
| 149 | |
| 150 if rtc and (not salmon_res_dir or not pickled_sero): | |
| 151 logging.error("When -rtc is on, -sal and -ps are also required.") | |
| 152 exit(1) | |
| 153 | |
| 154 if pickled_sero and (not os.path.exists(pickled_sero) or not os.path.getsize(pickled_sero)): | |
| 155 logging.error( | |
| 156 "The pickle file,\n" + f"{os.path.basename(pickled_sero)} does not exist or is empty!" | |
| 157 ) | |
| 158 exit(1) | |
| 159 | |
| 160 if salmon_res_dir: | |
| 161 if not os.path.isdir(salmon_res_dir): | |
| 162 logging.error("UNIX path\n" + f"{salmon_res_dir}\n" + "does not exist!") | |
| 163 exit(1) | |
| 164 if len(salmon_res_files) <= 0: | |
| 165 # logging.error( | |
| 166 # "Parent directory,\n" | |
| 167 # + f"{salmon_res_dir}" | |
| 168 # + "\ndoes not seem to have any directories that contain\n" | |
| 169 # + "the `quant.sf` file(s)." | |
| 170 # ) | |
| 171 # exit(1) | |
| 172 with open(salmon_comb_res, "w") as salmon_comb_res_fh: | |
| 173 salmon_comb_res_fh.write(f"Sample\n{no_hit}s in any samples\n") | |
| 174 salmon_comb_res_fh.close() | |
| 175 | |
| 176 with open(bcs_sal_yn, "w") as bcs_sal_yn_fh: | |
| 177 bcs_sal_yn_fh.write(f"Sample\n{no_presence} in any samples\n") | |
| 178 bcs_sal_yn_fh.close() | |
| 179 | |
| 180 exit(0) | |
| 181 | |
| 182 if rtc and os.path.exists(rtc) and os.path.getsize(rtc) > 0: | |
| 183 | |
| 184 # pdg_release = re.match(r"(^PDG\d+\.\d+)\..+", os.path.basename(rtc))[1] + "/" | |
| 185 acc2sero = pickle.load(file=open(pickled_sero, "rb")) | |
| 186 | |
| 187 with open(rtc, "r") as rtc_fh: | |
| 188 | |
| 189 for line in rtc_fh: | |
| 190 cols = line.strip().split("\t") | |
| 191 | |
| 192 if len(cols) < 4: | |
| 193 logging.error( | |
| 194 f"The file {os.path.basename(rtc)} seems to\n" | |
| 195 + "be malformed. It contains less than required 4 columns." | |
| 196 ) | |
| 197 exit(1) | |
| 198 elif cols[3] != null_value: | |
| 199 snp_clusters[cols[0]].setdefault("assembly_accs", []).append(cols[3]) | |
| 200 snp_clusters[cols[3]].setdefault("snp_clust_id", []).append(cols[0]) | |
| 201 snp_clusters[cols[3]].setdefault("pathdb_acc_id", []).append(cols[1]) | |
| 202 if len(snp_clusters[cols[3]]["snp_clust_id"]) > 1: | |
| 203 logging.error( | |
| 204 f"There is a duplicate reference accession [{cols[3]}]" | |
| 205 + f"in the metadata file{os.path.basename(rtc)}!" | |
| 206 ) | |
| 207 exit(1) | |
| 208 | |
| 209 rtc_fh.close() | |
| 210 | |
| 211 for salmon_res_file in salmon_res_files: | |
| 212 sample_name = re.match( | |
| 213 r"(^.+?)((\_salmon\_res)|(\.salmon))$", | |
| 214 os.path.basename(os.path.dirname(salmon_res_file)), | |
| 215 )[1] | |
| 216 salmon_meta_json = os.path.join( | |
| 217 os.path.dirname(salmon_res_file), "aux_info", "meta_info.json" | |
| 218 ) | |
| 219 | |
| 220 if not os.path.exists(salmon_meta_json) or not os.path.getsize(salmon_meta_json) > 0: | |
| 221 logging.error( | |
| 222 "The file\n" | |
| 223 + f"{salmon_meta_json}\ndoes not exist or is empty!\n" | |
| 224 + "Did `salmon quant` fail?" | |
| 225 ) | |
| 226 exit(1) | |
| 227 | |
| 228 if not os.path.exists(salmon_res_file) or not os.path.getsize(salmon_res_file): | |
| 229 logging.error( | |
| 230 "The file\n" | |
| 231 + f"{salmon_res_file}\ndoes not exist or is empty!\n" | |
| 232 + "Did `salmon quant` fail?" | |
| 233 ) | |
| 234 exit(1) | |
| 235 | |
| 236 with open(salmon_res_file, "r") as salmon_res_fh: | |
| 237 for line in salmon_res_fh.readlines(): | |
| 238 if re.match(r"^Name.+", line): | |
| 239 continue | |
| 240 cols = line.strip().split("\t") | |
| 241 ref_acc = "_".join(cols[0].split("_")[:2]) | |
| 242 | |
| 243 if ref_acc not in snp_clusters.keys(): | |
| 244 snp_clusters[ref_acc]["snp_clust_id"] = ref_acc | |
| 245 snp_clusters[ref_acc]["pathdb_acc_id"] = ref_acc | |
| 246 | |
| 247 ( | |
| 248 sample2salmon[sample_name] | |
| 249 .setdefault(acc2sero[cols[0]], []) | |
| 250 .append(int(round(float(cols[4]), 2))) | |
| 251 ) | |
| 252 ( | |
| 253 sample2salmon[sample_name] | |
| 254 .setdefault("snp_clust_ids", {}) | |
| 255 .setdefault("".join(snp_clusters[ref_acc]["snp_clust_id"]), []) | |
| 256 .append("".join(snp_clusters[ref_acc]["pathdb_acc_id"])) | |
| 257 ) | |
| 258 seen_sero[acc2sero[cols[0]]] = 1 | |
| 259 | |
| 260 salmon_meta_json_read = json.load(open(salmon_meta_json, "r")) | |
| 261 ( | |
| 262 sample2salmon[sample_name] | |
| 263 .setdefault("tot_reads", []) | |
| 264 .append(salmon_meta_json_read["num_processed"]) | |
| 265 ) | |
| 266 | |
| 267 with open(salmon_comb_res, "w") as salmon_comb_res_fh: | |
| 268 | |
| 269 # snp_clust_col_header = ( | |
| 270 # "\tSNP Cluster(s) by Genome Hit\n" if show_snp_clust_col else "\n" | |
| 271 # ) | |
| 272 snp_clust_col_header = ( | |
| 273 "\tNCBI Pathogens Isolate Browser\n" if show_snp_clust_col else "\n" | |
| 274 ) | |
| 275 serotypes = sorted(seen_sero.keys()) | |
| 276 formatted_serotypes = [ | |
| 277 re.sub(r"\,antigen_formula=", " | ", s) | |
| 278 for s in [re.sub(r"serotype=", "", s) for s in serotypes] | |
| 279 ] | |
| 280 salmon_comb_res_fh.write( | |
| 281 "Sample\t" + "\t".join(formatted_serotypes) + snp_clust_col_header | |
| 282 ) | |
| 283 # sample_snp_relation = ( | |
| 284 # ncbi_pathogens_base_url | |
| 285 # + pdg_release | |
| 286 # + "".join(snp_clusters[ref_acc]["snp_clust_id"]) | |
| 287 # + "?accessions=" | |
| 288 # ) | |
| 289 if len(salmon_res_file_failed) == 1: | |
| 290 with (open("".join(salmon_res_file_failed), "r")) as no_calls_fh: | |
| 291 for line in no_calls_fh.readlines(): | |
| 292 if line in ["\n", "\n\r", "\r"]: | |
| 293 continue | |
| 294 salmon_comb_res_fh.write(line.strip()) | |
| 295 sal_yn[line.strip()] += 0 | |
| 296 for serotype in serotypes: | |
| 297 salmon_comb_res_fh.write("\t-") | |
| 298 salmon_comb_res_fh.write( | |
| 299 "\t-\n" | |
| 300 ) if show_snp_clust_col else salmon_comb_res_fh.write("\n") | |
| 301 no_calls_fh.close() | |
| 302 | |
| 303 for sample, counts in sorted(sample2salmon.items()): | |
| 304 salmon_comb_res_fh.write(sample) | |
| 305 snp_cluster_res_col = list() | |
| 306 | |
| 307 for snp_clust_id in sample2salmon[sample]["snp_clust_ids"].keys(): | |
| 308 # print(snp_clust_id) | |
| 309 # print(",".join(sample2salmon[sample]["snp_clust_ids"][snp_clust_id])) | |
| 310 # ppp.pprint(sample2salmon[sample]["snp_clust_ids"]) | |
| 311 # ppp.pprint(sample2salmon[sample]["snp_clust_ids"][snp_clust_id]) | |
| 312 # final_url_text = ",".join( | |
| 313 # sample2salmon[sample]["snp_clust_ids"][snp_clust_id] | |
| 314 # ) | |
| 315 # final_url_text_to_show = snp_clust_id | |
| 316 # snp_cluster_res_col.append( | |
| 317 # "".join( | |
| 318 # [ | |
| 319 # f'<a href="', | |
| 320 # sample_snp_relation, | |
| 321 # ",".join(sample2salmon[sample]["snp_clust_ids"][snp_clust_id]), | |
| 322 # f'" target="_blank">{snp_clust_id}</a>', | |
| 323 # ] | |
| 324 # ) | |
| 325 # ) | |
| 326 # ppp.pprint(sample2salmon[sample]) | |
| 327 for pathdbacc in sample2salmon[sample]["snp_clust_ids"][snp_clust_id]: | |
| 328 # final_url_text_to_show = " ".join( | |
| 329 # sample2salmon[sample]["snp_clust_ids"][snp_clust_id] | |
| 330 # ) | |
| 331 sample_snp_relation = ( | |
| 332 ncbi_pathogens_genome_base | |
| 333 if assm_pat.match(pathdbacc) | |
| 334 else ncbi_pathogens_base_url + "isolates/#" | |
| 335 ) | |
| 336 | |
| 337 snp_cluster_res_col.append( | |
| 338 "".join( | |
| 339 [ | |
| 340 f'<a href="', | |
| 341 sample_snp_relation, | |
| 342 pathdbacc, | |
| 343 f'" target="_blank">{pathdbacc}</a>', | |
| 344 ] | |
| 345 ) | |
| 346 ) | |
| 347 | |
| 348 per_serotype_counts = 0 | |
| 349 for serotype in serotypes: | |
| 350 | |
| 351 if serotype in sample2salmon[sample].keys(): | |
| 352 # ppp.pprint(counts) | |
| 353 sample_perc_mapped = round( | |
| 354 sum(counts[serotype]) / sum(counts["tot_reads"]) * 100, 2 | |
| 355 ) | |
| 356 salmon_comb_res_fh.write( | |
| 357 f"\t{sum(counts[serotype])} ({sample_perc_mapped}%)" | |
| 358 ) | |
| 359 multiqc_salmon_counts[sample].setdefault( | |
| 360 re.match(r"^serotype=(.+?)\,antigen_formula.*", serotype)[1], | |
| 361 sum(counts[serotype]), | |
| 362 ) | |
| 363 per_serotype_counts += sum(counts[serotype]) | |
| 364 sal_yn[sample] += 1 | |
| 365 else: | |
| 366 salmon_comb_res_fh.write(f"\t-") | |
| 367 sal_yn[sample] += 0 | |
| 368 | |
| 369 multiqc_salmon_counts[sample].setdefault( | |
| 370 no_hit, sum(counts["tot_reads"]) - per_serotype_counts | |
| 371 ) | |
| 372 snp_clust_col_val = ( | |
| 373 f'\t{" ".join(snp_cluster_res_col)}\n' if show_snp_clust_col else "\n" | |
| 374 ) | |
| 375 # ppp.pprint(multiqc_salmon_counts) | |
| 376 salmon_comb_res_fh.write(snp_clust_col_val) | |
| 377 | |
| 378 with open(bcs_sal_yn, "w") as bcs_sal_yn_fh: | |
| 379 bcs_sal_yn_fh.write("Sample\tSalmonella Presence\tNo. of Serotypes\n") | |
| 380 for sample in sal_yn.keys(): | |
| 381 if sal_yn[sample] > 0: | |
| 382 bcs_sal_yn_fh.write(f"{sample}\tDetected\t{sal_yn[sample]}\n") | |
| 383 else: | |
| 384 bcs_sal_yn_fh.write(f"{sample}\tNot detected\t{sal_yn[sample]}\n") | |
| 385 | |
| 386 with open(cell_colors_yml_file, "w") as cell_colors_fh: | |
| 387 yaml.dump(cell_colors_yml, cell_colors_fh, default_flow_style=False) | |
| 388 | |
| 389 salmon_plot_json(salmon_comb_res_mqc, multiqc_salmon_counts, no_hit) | |
| 390 | |
| 391 salmon_comb_res_fh.close() | |
| 392 bcs_sal_yn_fh.close() | |
| 393 cell_colors_fh.close() | |
| 394 | |
| 395 | |
| 396 def salmon_plot_json(file: None, sample_salmon_counts: None, no_hit: None) -> None: | |
| 397 """ | |
| 398 This method will take a dictionary of salmon counts per sample | |
| 399 and will dump a JSON that will be used by MultiQC. | |
| 400 """ | |
| 401 | |
| 402 if file is None or sample_salmon_counts is None: | |
| 403 logging.error( | |
| 404 "Neither an output file to dump the JSON for MultiQC or the" | |
| 405 + "dictionary holding the salmon counts was not passed." | |
| 406 ) | |
| 407 | |
| 408 # Credit: http://phrogz.net/tmp/24colors.html | |
| 409 # Will cycle through 20 distinct colors. | |
| 410 distinct_color_palette = [ | |
| 411 "#FF0000", | |
| 412 "#FFFF00", | |
| 413 "#00EAFF", | |
| 414 "#AA00FF", | |
| 415 "#FF7F00", | |
| 416 "#BFFF00", | |
| 417 "#0095FF", | |
| 418 "#FF00AA", | |
| 419 "#FFD400", | |
| 420 "#6AFF00", | |
| 421 "#0040FF", | |
| 422 "#EDB9B9", | |
| 423 "#B9D7ED", | |
| 424 "#E7E9B9", | |
| 425 "#DCB9ED", | |
| 426 "#B9EDE0", | |
| 427 "#8F2323", | |
| 428 "#23628F", | |
| 429 "#8F6A23", | |
| 430 "#6B238F", | |
| 431 "#4F8F23", | |
| 432 ] | |
| 433 | |
| 434 # Credit: https://mokole.com/palette.html | |
| 435 # Will use this palette if we run out ouf | |
| 436 # 20 serotypes. More than 50 serotypes | |
| 437 # per run is probably rare but if not, | |
| 438 # will cycle through about 45. | |
| 439 distinct_color_palette2 = [ | |
| 440 "#2F4F4F", # darkslategray | |
| 441 "#556B2F", # darkolivegreen | |
| 442 "#A0522D", # sienna | |
| 443 "#2E8B57", # seagreen | |
| 444 "#006400", # darkgreen | |
| 445 "#8B0000", # darkred | |
| 446 "#808000", # olive | |
| 447 "#BC8F8F", # rosybrown | |
| 448 "#663399", # rebeccapurple | |
| 449 "#B8860B", # darkgoldenrod | |
| 450 "#4682B4", # steelblue | |
| 451 "#000080", # navy | |
| 452 "#D2691E", # chocolate | |
| 453 "#9ACD32", # yellowgreen | |
| 454 "#20B2AA", # lightseagreen | |
| 455 "#CD5C5C", # indianred | |
| 456 "#8FBC8F", # darkseagreen | |
| 457 "#800080", # purple | |
| 458 "#B03060", # maroon3 | |
| 459 "#FF8C00", # darkorange | |
| 460 "#FFD700", # gold | |
| 461 "#FFFF00", # yellow | |
| 462 "#DEB887", # burlywood | |
| 463 "#00FF00", # lime | |
| 464 "#BA55D3", # mediumorchid | |
| 465 "#00FA9A", # mediumspringgreen | |
| 466 "#4169E1", # royalblue | |
| 467 "#E9967A", # darksalmon | |
| 468 "#DC143C", # crimson | |
| 469 "#00FFFF", # aqua | |
| 470 "#F4A460", # sandybrown | |
| 471 "#9370DB", # mediumpurple | |
| 472 "#0000FF", # blue | |
| 473 "#ADFF2F", # greenyellow | |
| 474 "#FF6347", # tomato | |
| 475 "#D8BFD8", # thistle | |
| 476 "#FF00FF", # fuchsia | |
| 477 "#DB7093", # palevioletred | |
| 478 "#F0E68C", # khaki | |
| 479 "#6495ED", # cornflower | |
| 480 "#DDA0DD", # plum | |
| 481 "#EE82EE", # violet | |
| 482 "#7FFFD4", # aquamarine | |
| 483 "#FAFAD2", # lightgoldenrod | |
| 484 "#FF69B4", # hotpink | |
| 485 "#FFB6C1", # lightpink | |
| 486 ] | |
| 487 | |
| 488 no_hit_color = "#434348" | |
| 489 col_count = 0 | |
| 490 serotypes = set() | |
| 491 salmon_counts = defaultdict(defaultdict) | |
| 492 salmon_counts["id"] = "BETTERCALLSAL_SALMON_COUNTS" | |
| 493 salmon_counts["section_name"] = "Salmon read counts" | |
| 494 salmon_counts["description"] = ( | |
| 495 "This section shows the read counts from running <code>salmon</code> " | |
| 496 + "in <code>--meta</code> mode using SE, merged PE or concatenated PE reads against " | |
| 497 + "an on-the-fly <code>salmon</code> index generated from the genome hits " | |
| 498 + "of <code>kma</code>." | |
| 499 ) | |
| 500 salmon_counts["plot_type"] = "bargraph" | |
| 501 salmon_counts["pconfig"]["id"] = "bettercallsal_salmon_counts_plot" | |
| 502 salmon_counts["pconfig"]["title"] = "Salmon: Read counts" | |
| 503 salmon_counts["pconfig"]["ylab"] = "Number of reads" | |
| 504 salmon_counts["pconfig"]["xDecimals"] = "false" | |
| 505 salmon_counts["pconfig"]["cpswitch_counts_label"] = "Number of reads (Counts)" | |
| 506 salmon_counts["pconfig"]["cpswitch_percent_label"] = "Number of reads (Percentages)" | |
| 507 | |
| 508 for sample in sorted(sample_salmon_counts.keys()): | |
| 509 serotypes.update(list(sample_salmon_counts[sample].keys())) | |
| 510 salmon_counts["data"][sample] = sample_salmon_counts[sample] | |
| 511 | |
| 512 if len(serotypes) > len(distinct_color_palette): | |
| 513 distinct_color_palette = distinct_color_palette2 | |
| 514 | |
| 515 for serotype in sorted(serotypes): | |
| 516 if serotype == no_hit: | |
| 517 continue | |
| 518 if col_count == len(distinct_color_palette) - 1: | |
| 519 col_count = 0 | |
| 520 | |
| 521 col_count += 1 | |
| 522 salmon_counts["categories"][serotype] = {"color": distinct_color_palette[col_count]} | |
| 523 | |
| 524 salmon_counts["categories"][no_hit] = {"color": no_hit_color} | |
| 525 json.dump(salmon_counts, open(file, "w")) | |
| 526 | |
| 527 | |
| 528 if __name__ == "__main__": | |
| 529 main() |
