cfsan_bettercallsal: 0.7.0/bin/gen_salmon_res

annotate 0.7.0/bin/gen_salmon_res_table.py @ 21:4ce0e079377d tip

planemo upload

author	kkonganti
date	Mon, 15 Jul 2024 12:01:00 -0400
parents	0e7a0053e4a6
children

rev	line source
kkonganti@17	1 #!/usr/bin/env python3
kkonganti@17	2
kkonganti@17	3 # Kranti Konganti
kkonganti@17	4
kkonganti@17	5 import argparse
kkonganti@17	6 import glob
kkonganti@17	7 import inspect
kkonganti@17	8 import json
kkonganti@17	9 import logging
kkonganti@17	10 import os
kkonganti@17	11 import pickle
kkonganti@17	12 import pprint
kkonganti@17	13 import re
kkonganti@17	14 from collections import defaultdict
kkonganti@17	15
kkonganti@17	16 import yaml
kkonganti@17	17
kkonganti@17	18
# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter combining raw (newline-preserving) help text with argument defaults."""
kkonganti@17	22
kkonganti@17	23
# Main
def main() -> None:
    """
    The succesful execution of this script requires access to bettercallsal formatted
    db flat files. On raven2, they are at /hpc/db/bettercallsall/PDGXXXXXXXXXX.XXXXX

    It takes the ACC2SERO.pickle file and *.reference_target.cluster_list.tsv file
    for that particular NCBI Pathogens release from the db directory mentioned with
    -db option and a root parent directory of the `salmon quant` results mentioned
    with -sal option and generates a final results table with number of reads
    mapped and a .json file to be used with MultiQC to generate a stacked bar plot.

    Using -url option optionally adds an extra column of NCBI Pathogens Isolates
    Browser, which directly links out to NCBI Pathogens Isolates SNP viewer tool.
    """
    # NOTE: the docstring above is also passed to argparse as the `--help`
    # description (`description=main.__doc__`), so its wording is user-facing
    # and is kept verbatim.

    # Set logging.
    logging.basicConfig(
        format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    prog_name = inspect.stack()[0].filename

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-sal",
        dest="salmon_res_dir",
        default=False,
        required=True,
        help="Absolute UNIX path to the parent directory that contains the\n"
        + "`salmon quant` results directory. For example, if path to\n"
        + "`quant.sf` is in /hpc/john_doe/test/salmon_res/quant.sf, then\n"
        + "use this command-line option as:\n"
        + "-sal /hpc/john_doe/test",
    )
    required.add_argument(
        "-snp",
        dest="rtc",
        default=False,
        required=True,
        help="Absolute UNIX Path to the PDG SNP reference target cluster\n"
        + "metadata file. On raven2, these are located at\n"
        + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX\n"
        + "Required if -sal is on.",
    )
    required.add_argument(
        "-pickle",
        dest="acc2sero",
        default=False,
        required=True,
        help="Absolute UNIX Path to the *ACC2SERO.pickle\n"
        + "metadata file. On raven2, these are located at\n"
        + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX\n"
        + "Required if -sal is on.",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="bettercallsal.tblsum",
        required=False,
        help="Set the output file(s) prefix for output(s) generated\n" + "by this program.",
    )
    parser.add_argument(
        "-url",
        dest="show_snp_clust_info",
        default=False,
        required=False,
        action="store_true",
        help="Show SNP cluster participation information of the final genome hit.\n"
        + "This may be useful to see a relative placement of your sample in\n"
        + "NCBI Isolates SNP Tree Viewer based on genome similarity but however\n"
        + "due to rapid nature of the updates at NCBI Pathogen Detection Project,\n"
        + "the placement may be in an outdated cluster.",
    )

    args = parser.parse_args()
    salmon_res_dir = args.salmon_res_dir
    out_prefix = args.out_prefix
    show_snp_clust_col = args.show_snp_clust_info
    rtc = args.rtc
    pickled_sero = args.acc2sero
    no_hit = "No genome hit"
    no_presence = "Salmonella presence not detected"
    bcs_sal_yn_prefix = "bettercallsal_salyn"
    sal_y = "Detected"
    sal_n = "Not detected"
    null_value = "NULL"
    assm_pat = re.compile(r"GC[AF]\_[0-9]+\.[0-9]")
    # Directory names of `salmon quant` results end in `_salmon_res` or
    # `.salmon`; group 1 is the sample name. The alternation must be a bare
    # `|` (an escaped `\|` only matches a literal pipe character).
    sample_dir_pat = re.compile(r"(^.+?)((\_salmon\_res)|(\.salmon))$")
    ncbi_pathogens_base_url = "https://www.ncbi.nlm.nih.gov/pathogens/"
    ncbi_pathogens_genome_base = "https://www.ncbi.nlm.nih.gov/datasets/genome/"

    sample2salmon, snp_clusters, multiqc_salmon_counts, seen_sero, sal_yn = (
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(defaultdict),
        defaultdict(int),
        defaultdict(int),
    )

    cell_colors_yml = {
        bcs_sal_yn_prefix: {sal_y: "#c8e6c9 !important;", sal_n: "#ffcdd2 !important;"}
    }

    # All outputs go to the current working directory. The paths are joined
    # directly instead of being derived with `re.sub(out_prefix, ...)`, which
    # treated the prefix as a regex and could also rewrite directory names.
    out_dir = os.getcwd()
    salmon_comb_res = os.path.join(out_dir, out_prefix + ".txt")
    bcs_sal_yn = os.path.join(out_dir, bcs_sal_yn_prefix + ".tblsum.txt")
    cell_colors_yml_file = os.path.join(out_dir, bcs_sal_yn_prefix + ".cellcolors.yml")
    salmon_comb_res_mqc = os.path.join(out_dir, str(out_prefix).split(".")[0] + "_mqc.json")
    salmon_res_files = glob.glob(os.path.join(salmon_res_dir, "*", "quant.sf"), recursive=True)
    salmon_res_file_failed = glob.glob(os.path.join(salmon_res_dir, "BCS_NO_CALLS.txt"))

    if rtc and (not os.path.exists(rtc) or not os.path.getsize(rtc) > 0):
        logging.error(
            "The reference target cluster metadata file,\n"
            + f"{os.path.basename(rtc)} does not exist or is empty!"
        )
        raise SystemExit(1)

    if rtc and (not salmon_res_dir or not pickled_sero):
        logging.error("When -snp is on, -sal and -pickle are also required.")
        raise SystemExit(1)

    if pickled_sero and (not os.path.exists(pickled_sero) or not os.path.getsize(pickled_sero)):
        logging.error(
            "The pickle file,\n" + f"{os.path.basename(pickled_sero)} does not exist or is empty!"
        )
        raise SystemExit(1)

    if salmon_res_dir:
        if not os.path.isdir(salmon_res_dir):
            logging.error("UNIX path\n" + f"{salmon_res_dir}\n" + "does not exist!")
            raise SystemExit(1)
        if len(salmon_res_files) <= 0:
            # No `quant.sf` anywhere is not treated as an error: placeholder
            # tables are written and the script exits successfully.
            with open(salmon_comb_res, "w") as salmon_comb_res_fh:
                salmon_comb_res_fh.write(f"Sample\n{no_hit}s in any samples\n")

            with open(bcs_sal_yn, "w") as bcs_sal_yn_fh:
                bcs_sal_yn_fh.write(f"Sample\n{no_presence} in any samples\n")

            raise SystemExit(0)

    # Nothing further is done without a usable reference target cluster file.
    if not (rtc and os.path.exists(rtc) and os.path.getsize(rtc) > 0):
        return

    # NOTE(security): unpickling can execute arbitrary code. Only point
    # -pickle at trusted bettercallsal database files.
    with open(pickled_sero, "rb") as pickled_sero_fh:
        acc2sero = pickle.load(pickled_sero_fh)

    # Index the cluster metadata both by column 1 (-> list of column 4 values)
    # and by column 4 (-> column 1 and column 2 values).
    with open(rtc, "r") as rtc_fh:
        for line in rtc_fh:
            cols = line.strip().split("\t")

            if len(cols) < 4:
                logging.error(
                    f"The file {os.path.basename(rtc)} seems to\n"
                    + "be malformed. It contains less than required 4 columns."
                )
                raise SystemExit(1)
            elif cols[3] != null_value:
                snp_clusters[cols[0]].setdefault("assembly_accs", []).append(cols[3])
                snp_clusters[cols[3]].setdefault("snp_clust_id", []).append(cols[0])
                snp_clusters[cols[3]].setdefault("pathdb_acc_id", []).append(cols[1])
                if len(snp_clusters[cols[3]]["snp_clust_id"]) > 1:
                    logging.error(
                        f"There is a duplicate reference accession [{cols[3]}] "
                        + f"in the metadata file {os.path.basename(rtc)}!"
                    )
                    raise SystemExit(1)

    for salmon_res_file in salmon_res_files:
        sample_dir = os.path.basename(os.path.dirname(salmon_res_file))
        sample_match = sample_dir_pat.match(sample_dir)
        if sample_match is None:
            logging.error(
                "Cannot infer the sample name from the directory\n"
                + f"{sample_dir}\n"
                + "Expected a name ending in `_salmon_res` or `.salmon`."
            )
            raise SystemExit(1)
        sample_name = sample_match[1]

        salmon_meta_json = os.path.join(
            os.path.dirname(salmon_res_file), "aux_info", "meta_info.json"
        )

        if not os.path.exists(salmon_meta_json) or not os.path.getsize(salmon_meta_json) > 0:
            logging.error(
                "The file\n"
                + f"{salmon_meta_json}\ndoes not exist or is empty!\n"
                + "Did `salmon quant` fail?"
            )
            raise SystemExit(1)

        if not os.path.exists(salmon_res_file) or not os.path.getsize(salmon_res_file):
            logging.error(
                "The file\n"
                + f"{salmon_res_file}\ndoes not exist or is empty!\n"
                + "Did `salmon quant` fail?"
            )
            raise SystemExit(1)

        with open(salmon_res_file, "r") as salmon_res_fh:
            for line in salmon_res_fh:
                # Skip the header row.
                if re.match(r"^Name.+", line):
                    continue
                cols = line.strip().split("\t")
                # The first two `_`-separated tokens of column 1 are the key
                # used to look the hit up in the cluster metadata.
                ref_acc = "_".join(cols[0].split("_")[:2])

                # Hits absent from the cluster metadata stand in for themselves.
                if ref_acc not in snp_clusters:
                    snp_clusters[ref_acc]["snp_clust_id"] = ref_acc
                    snp_clusters[ref_acc]["pathdb_acc_id"] = ref_acc

                serotype_key = acc2sero[cols[0]]
                # Column 5 is rounded to 2 decimals and then truncated to int.
                (
                    sample2salmon[sample_name]
                    .setdefault(serotype_key, [])
                    .append(int(round(float(cols[4]), 2)))
                )
                (
                    sample2salmon[sample_name]
                    .setdefault("snp_clust_ids", {})
                    .setdefault("".join(snp_clusters[ref_acc]["snp_clust_id"]), [])
                    .append("".join(snp_clusters[ref_acc]["pathdb_acc_id"]))
                )
                seen_sero[serotype_key] = 1

        with open(salmon_meta_json, "r") as salmon_meta_fh:
            salmon_meta_json_read = json.load(salmon_meta_fh)
        (
            sample2salmon[sample_name]
            .setdefault("tot_reads", [])
            .append(salmon_meta_json_read["num_processed"])
        )

    with open(salmon_comb_res, "w") as salmon_comb_res_fh:
        snp_clust_col_header = (
            "\tNCBI Pathogens Isolate Browser\n" if show_snp_clust_col else "\n"
        )
        serotypes = sorted(seen_sero.keys())
        # Column titles: drop the `serotype=` tag and show the antigen formula
        # after a plain " | " separator.
        formatted_serotypes = [
            s.replace("serotype=", "").replace(",antigen_formula=", " | ") for s in serotypes
        ]
        salmon_comb_res_fh.write(
            "Sample\t" + "\t".join(formatted_serotypes) + snp_clust_col_header
        )

        # Names listed in BCS_NO_CALLS.txt get a row of "-" and a zero
        # serotype count.
        if len(salmon_res_file_failed) == 1:
            with open(salmon_res_file_failed[0], "r") as no_calls_fh:
                for line in no_calls_fh:
                    failed_sample = line.strip()
                    if not failed_sample:
                        continue
                    salmon_comb_res_fh.write(failed_sample)
                    sal_yn[failed_sample] += 0
                    for _ in serotypes:
                        salmon_comb_res_fh.write("\t-")
                    salmon_comb_res_fh.write("\t-\n" if show_snp_clust_col else "\n")

        for sample, counts in sorted(sample2salmon.items()):
            salmon_comb_res_fh.write(sample)
            snp_cluster_res_col = []

            # `.get` guards against a quant.sf that had no data rows.
            for pathdb_accs in counts.get("snp_clust_ids", {}).values():
                for pathdbacc in pathdb_accs:
                    # IDs matching `assm_pat` link to the genome base URL,
                    # everything else to the isolates browser.
                    sample_snp_relation = (
                        ncbi_pathogens_genome_base
                        if assm_pat.match(pathdbacc)
                        else ncbi_pathogens_base_url + "isolates/#"
                    )
                    snp_cluster_res_col.append(
                        f'<a href="{sample_snp_relation}{pathdbacc}"'
                        + f' target="_blank">{pathdbacc}</a>'
                    )

            total_reads = sum(counts["tot_reads"])
            per_serotype_counts = 0
            for serotype in serotypes:
                if serotype in counts:
                    serotype_reads = sum(counts[serotype])
                    # Avoid ZeroDivisionError when salmon processed no reads.
                    sample_perc_mapped = (
                        round(serotype_reads / total_reads * 100, 2) if total_reads else 0.0
                    )
                    salmon_comb_res_fh.write(f"\t{serotype_reads} ({sample_perc_mapped}%)")
                    multiqc_salmon_counts[sample].setdefault(
                        re.match(r"^serotype=(.+?)\,antigen_formula.*", serotype)[1],
                        serotype_reads,
                    )
                    per_serotype_counts += serotype_reads
                    sal_yn[sample] += 1
                else:
                    salmon_comb_res_fh.write("\t-")
                    sal_yn[sample] += 0

            multiqc_salmon_counts[sample].setdefault(no_hit, total_reads - per_serotype_counts)
            snp_clust_col_val = (
                f'\t{" ".join(snp_cluster_res_col)}\n' if show_snp_clust_col else "\n"
            )
            salmon_comb_res_fh.write(snp_clust_col_val)

    # Per-sample presence/absence summary table.
    with open(bcs_sal_yn, "w") as bcs_sal_yn_fh:
        bcs_sal_yn_fh.write("Sample\tSalmonella Presence\tNo. of Serotypes\n")
        for sample, serotype_count in sal_yn.items():
            presence = sal_y if serotype_count > 0 else sal_n
            bcs_sal_yn_fh.write(f"{sample}\t{presence}\t{serotype_count}\n")

    with open(cell_colors_yml_file, "w") as cell_colors_fh:
        yaml.dump(cell_colors_yml, cell_colors_fh, default_flow_style=False)

    salmon_plot_json(salmon_comb_res_mqc, multiqc_salmon_counts, no_hit)
kkonganti@17	394
kkonganti@17	395
def salmon_plot_json(
    file: "str | None", sample_salmon_counts: "dict | None", no_hit: "str | None"
) -> None:
    """
    This method will take a dictionary of salmon counts per sample
    and will dump a JSON that will be used by MultiQC.

    Args:
        file: Path of the JSON file to write.
        sample_salmon_counts: Mapping of sample name to a mapping of
            category name to read count.
        no_hit: Name of the category holding reads without a hit; it is
            always given the fixed `no_hit_color`.

    Raises:
        SystemExit: With code 1 when `file` or `sample_salmon_counts` is None.
    """

    if file is None or sample_salmon_counts is None:
        logging.error(
            "Either an output file to dump the JSON for MultiQC or the "
            + "dictionary holding the salmon counts was not passed."
        )
        # Nothing can be written without both arguments; stop here instead of
        # failing later with an unrelated AttributeError/TypeError.
        raise SystemExit(1)

    # Credit: http://phrogz.net/tmp/24colors.html
    # Will cycle through 20 distinct colors.
    distinct_color_palette = [
        "#FF0000",
        "#FFFF00",
        "#00EAFF",
        "#AA00FF",
        "#FF7F00",
        "#BFFF00",
        "#0095FF",
        "#FF00AA",
        "#FFD400",
        "#6AFF00",
        "#0040FF",
        "#EDB9B9",
        "#B9D7ED",
        "#E7E9B9",
        "#DCB9ED",
        "#B9EDE0",
        "#8F2323",
        "#23628F",
        "#8F6A23",
        "#6B238F",
        "#4F8F23",
    ]

    # Credit: https://mokole.com/palette.html
    # Will use this palette if we run out of
    # 20 serotypes. More than 50 serotypes
    # per run is probably rare but if not,
    # will cycle through about 45.
    distinct_color_palette2 = [
        "#2F4F4F",  # darkslategray
        "#556B2F",  # darkolivegreen
        "#A0522D",  # sienna
        "#2E8B57",  # seagreen
        "#006400",  # darkgreen
        "#8B0000",  # darkred
        "#808000",  # olive
        "#BC8F8F",  # rosybrown
        "#663399",  # rebeccapurple
        "#B8860B",  # darkgoldenrod
        "#4682B4",  # steelblue
        "#000080",  # navy
        "#D2691E",  # chocolate
        "#9ACD32",  # yellowgreen
        "#20B2AA",  # lightseagreen
        "#CD5C5C",  # indianred
        "#8FBC8F",  # darkseagreen
        "#800080",  # purple
        "#B03060",  # maroon3
        "#FF8C00",  # darkorange
        "#FFD700",  # gold
        "#FFFF00",  # yellow
        "#DEB887",  # burlywood
        "#00FF00",  # lime
        "#BA55D3",  # mediumorchid
        "#00FA9A",  # mediumspringgreen
        "#4169E1",  # royalblue
        "#E9967A",  # darksalmon
        "#DC143C",  # crimson
        "#00FFFF",  # aqua
        "#F4A460",  # sandybrown
        "#9370DB",  # mediumpurple
        "#0000FF",  # blue
        "#ADFF2F",  # greenyellow
        "#FF6347",  # tomato
        "#D8BFD8",  # thistle
        "#FF00FF",  # fuchsia
        "#DB7093",  # palevioletred
        "#F0E68C",  # khaki
        "#6495ED",  # cornflower
        "#DDA0DD",  # plum
        "#EE82EE",  # violet
        "#7FFFD4",  # aquamarine
        "#FAFAD2",  # lightgoldenrod
        "#FF69B4",  # hotpink
        "#FFB6C1",  # lightpink
    ]

    no_hit_color = "#434348"
    col_count = 0
    serotypes = set()
    salmon_counts = defaultdict(defaultdict)
    salmon_counts["id"] = "BETTERCALLSAL_SALMON_COUNTS"
    salmon_counts["section_name"] = "Salmon read counts"
    salmon_counts["description"] = (
        "This section shows the read counts from running <code>salmon</code> "
        + "in <code>--meta</code> mode using SE, merged PE or concatenated PE reads against "
        + "an on-the-fly <code>salmon</code> index generated from the genome hits "
        + "of <code>kma</code>."
    )
    salmon_counts["plot_type"] = "bargraph"
    salmon_counts["pconfig"]["id"] = "bettercallsal_salmon_counts_plot"
    salmon_counts["pconfig"]["title"] = "Salmon: Read counts"
    salmon_counts["pconfig"]["ylab"] = "Number of reads"
    salmon_counts["pconfig"]["xDecimals"] = "false"
    salmon_counts["pconfig"]["cpswitch_counts_label"] = "Number of reads (Counts)"
    salmon_counts["pconfig"]["cpswitch_percent_label"] = "Number of reads (Percentages)"

    # Copy per-sample counts in sorted sample order and collect every
    # category name seen across samples.
    for sample in sorted(sample_salmon_counts.keys()):
        serotypes.update(sample_salmon_counts[sample].keys())
        salmon_counts["data"][sample] = sample_salmon_counts[sample]

    # Switch to the larger palette when there are more categories than colors.
    if len(serotypes) > len(distinct_color_palette):
        distinct_color_palette = distinct_color_palette2

    for serotype in sorted(serotypes):
        if serotype == no_hit:
            continue
        # Wrap around at the end of the palette. The index is advanced before
        # use, so palette entry 0 is never assigned.
        if col_count == len(distinct_color_palette) - 1:
            col_count = 0

        col_count += 1
        salmon_counts["categories"][serotype] = {"color": distinct_color_palette[col_count]}

    salmon_counts["categories"][no_hit] = {"color": no_hit_color}

    # Use a context manager so the JSON is flushed and the handle closed.
    with open(file, "w") as mqc_json_fh:
        json.dump(salmon_counts, mqc_json_fh)
kkonganti@17	526
kkonganti@17	527
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()

Mercurial > repos > kkonganti > cfsan_bettercallsal

annotate 0.7.0/bin/gen_salmon_res_table.py @ 21:4ce0e079377d tip