cfsan_bettercallsal: 0.5.0/bin/gen_salmon_res

annotate 0.5.0/bin/gen_salmon_res_table.py @ 1:365849f031fd

"planemo upload"

author	kkonganti
date	Mon, 05 Jun 2023 18:48:51 -0400
parents
children

rev	line source
kkonganti@1	1 #!/usr/bin/env python3
kkonganti@1	2
kkonganti@1	3 # Kranti Konganti
kkonganti@1	4
kkonganti@1	5 import argparse
kkonganti@1	6 import glob
kkonganti@1	7 import inspect
kkonganti@1	8 import json
kkonganti@1	9 import logging
kkonganti@1	10 import os
kkonganti@1	11 import pickle
kkonganti@1	12 import pprint
kkonganti@1	13 import re
kkonganti@1	14 from collections import defaultdict
kkonganti@1	15
kkonganti@1	16 import yaml
kkonganti@1	17
kkonganti@1	18
kkonganti@1	19 # Multiple inheritence for pretty printing of help text.
kkonganti@1	20 class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
kkonganti@1	21 pass
kkonganti@1	22
kkonganti@1	23
kkonganti@1	24 # Main
kkonganti@1	25 def main() -> None:
kkonganti@1	26 """
kkonganti@1	27 The succesful execution of this script requires access to bettercallsal formatted
kkonganti@1	28 db flat files. On raven2, they are at /hpc/db/bettercallsall/PDGXXXXXXXXXX.XXXXX
kkonganti@1	29
kkonganti@1	30 It takes the ACC2SERO.pickle file and *.reference_target.cluster_list.tsv file
kkonganti@1	31 for that particular NCBI Pathogens release from the db directory mentioned with
kkonganti@1	32 -db option and a root parent directory of the `salmon quant` results mentioned
kkonganti@1	33 with -sal option and generates a final results table with number of reads
kkonganti@1	34 mapped and a .json file to be used with MultiQC to generate a stacked bar plot.
kkonganti@1	35
kkonganti@1	36 Using -url option optionally adds an extra column of NCBI Pathogens Isolates
kkonganti@1	37 Browser, which directly links out to NCBI Pathogens Isolates SNP viewer tool.
kkonganti@1	38 """
kkonganti@1	39 # Set logging.
kkonganti@1	40 logging.basicConfig(
kkonganti@1	41 format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
kkonganti@1	42 level=logging.DEBUG,
kkonganti@1	43 )
kkonganti@1	44
kkonganti@1	45 # Debug print.
kkonganti@1	46 ppp = pprint.PrettyPrinter(width=55)
kkonganti@1	47 prog_name = inspect.stack()[0].filename
kkonganti@1	48
kkonganti@1	49 parser = argparse.ArgumentParser(
kkonganti@1	50 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
kkonganti@1	51 )
kkonganti@1	52
kkonganti@1	53 required = parser.add_argument_group("required arguments")
kkonganti@1	54
kkonganti@1	55 required.add_argument(
kkonganti@1	56 "-sal",
kkonganti@1	57 dest="salmon_res_dir",
kkonganti@1	58 default=False,
kkonganti@1	59 required=True,
kkonganti@1	60 help="Absolute UNIX path to the parent directory that contains the\n"
kkonganti@1	61 + "`salmon quant` results directory. For example, if path to\n"
kkonganti@1	62 + "`quant.sf` is in /hpc/john_doe/test/salmon_res/quant.sf, then\n"
kkonganti@1	63 + "use this command-line option as:\n"
kkonganti@1	64 + "-sal /hpc/john_doe/test",
kkonganti@1	65 )
kkonganti@1	66 required.add_argument(
kkonganti@1	67 "-snp",
kkonganti@1	68 dest="rtc",
kkonganti@1	69 default=False,
kkonganti@1	70 required=True,
kkonganti@1	71 help="Absolute UNIX Path to the PDG SNP reference target cluster\n"
kkonganti@1	72 + "metadata file. On raven2, these are located at\n"
kkonganti@1	73 + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX\n"
kkonganti@1	74 + "Required if -sal is on.",
kkonganti@1	75 )
kkonganti@1	76 required.add_argument(
kkonganti@1	77 "-pickle",
kkonganti@1	78 dest="acc2sero",
kkonganti@1	79 default=False,
kkonganti@1	80 required=True,
kkonganti@1	81 help="Absolute UNIX Path to the *ACC2SERO.pickle\n"
kkonganti@1	82 + "metadata file. On raven2, these are located at\n"
kkonganti@1	83 + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX\n"
kkonganti@1	84 + "Required if -sal is on.",
kkonganti@1	85 )
kkonganti@1	86 parser.add_argument(
kkonganti@1	87 "-op",
kkonganti@1	88 dest="out_prefix",
kkonganti@1	89 default="bettercallsal.tblsum",
kkonganti@1	90 required=False,
kkonganti@1	91 help="Set the output file(s) prefix for output(s) generated\n" + "by this program.",
kkonganti@1	92 )
kkonganti@1	93 parser.add_argument(
kkonganti@1	94 "-url",
kkonganti@1	95 dest="show_snp_clust_info",
kkonganti@1	96 default=False,
kkonganti@1	97 required=False,
kkonganti@1	98 action="store_true",
kkonganti@1	99 help="Show SNP cluster participation information of the final genome hit.\n"
kkonganti@1	100 + "This may be useful to see a relative placement of your sample in\n"
kkonganti@1	101 + "NCBI Isolates SNP Tree Viewer based on genome similarity but however\n"
kkonganti@1	102 + "due to rapid nature of the updates at NCBI Pathogen Detection Project,\n"
kkonganti@1	103 + "the placement may be in an outdated cluster.",
kkonganti@1	104 )
kkonganti@1	105
kkonganti@1	106 args = parser.parse_args()
kkonganti@1	107 salmon_res_dir = args.salmon_res_dir
kkonganti@1	108 out_prefix = args.out_prefix
kkonganti@1	109 show_snp_clust_col = args.show_snp_clust_info
kkonganti@1	110 rtc = args.rtc
kkonganti@1	111 pickled_sero = args.acc2sero
kkonganti@1	112 no_hit = "No genome hit"
kkonganti@1	113 bcs_sal_yn_prefix = "bettercallsal_salyn"
kkonganti@1	114 sal_y = "Detected"
kkonganti@1	115 sal_n = "Not detected"
kkonganti@1	116 ncbi_pathogens_base_url = "https://www.ncbi.nlm.nih.gov/pathogens/"
kkonganti@1	117
kkonganti@1	118 sample2salmon, snp_clusters, multiqc_salmon_counts, seen_sero, sal_yn = (
kkonganti@1	119 defaultdict(defaultdict),
kkonganti@1	120 defaultdict(defaultdict),
kkonganti@1	121 defaultdict(defaultdict),
kkonganti@1	122 defaultdict(int),
kkonganti@1	123 defaultdict(int),
kkonganti@1	124 )
kkonganti@1	125
kkonganti@1	126 cell_colors_yml = {
kkonganti@1	127 bcs_sal_yn_prefix: {sal_y: "#c8e6c9 !important;", sal_n: "#ffcdd2 !important;"}
kkonganti@1	128 }
kkonganti@1	129
kkonganti@1	130 salmon_comb_res = os.path.join(os.getcwd(), out_prefix + ".txt")
kkonganti@1	131 bcs_sal_yn = re.sub(out_prefix, bcs_sal_yn_prefix + ".tblsum", salmon_comb_res)
kkonganti@1	132 cell_colors_yml_file = re.sub(
kkonganti@1	133 out_prefix + ".txt", bcs_sal_yn_prefix + ".cellcolors.yml", salmon_comb_res
kkonganti@1	134 )
kkonganti@1	135 salmon_comb_res_mqc = os.path.join(os.getcwd(), str(out_prefix).split(".")[0] + "_mqc.json")
kkonganti@1	136 salmon_res_files = glob.glob(os.path.join(salmon_res_dir, "*", "quant.sf"), recursive=True)
kkonganti@1	137 salmon_res_file_failed = glob.glob(os.path.join(salmon_res_dir, "BCS_NO_CALLS.txt"))
kkonganti@1	138
kkonganti@1	139 if rtc and (not os.path.exists(rtc) or not os.path.getsize(rtc) > 0):
kkonganti@1	140 logging.error(
kkonganti@1	141 "The reference target cluster metadata file,\n"
kkonganti@1	142 + f"{os.path.basename(rtc)} does not exist or is empty!"
kkonganti@1	143 )
kkonganti@1	144 exit(1)
kkonganti@1	145
kkonganti@1	146 if rtc and (not salmon_res_dir or not pickled_sero):
kkonganti@1	147 logging.error("When -rtc is on, -sal and -ps are also required.")
kkonganti@1	148 exit(1)
kkonganti@1	149
kkonganti@1	150 if pickled_sero and (not os.path.exists(pickled_sero) or not os.path.getsize(pickled_sero)):
kkonganti@1	151 logging.error(
kkonganti@1	152 "The pickle file,\n" + f"{os.path.basename(pickled_sero)} does not exist or is empty!"
kkonganti@1	153 )
kkonganti@1	154 exit(1)
kkonganti@1	155
kkonganti@1	156 if salmon_res_dir:
kkonganti@1	157 if not os.path.isdir(salmon_res_dir):
kkonganti@1	158 logging.error("UNIX path\n" + f"{salmon_res_dir}\n" + "does not exist!")
kkonganti@1	159 exit(1)
kkonganti@1	160 if len(salmon_res_files) <= 0:
kkonganti@1	161 # logging.error(
kkonganti@1	162 # "Parent directory,\n"
kkonganti@1	163 # + f"{salmon_res_dir}"
kkonganti@1	164 # + "\ndoes not seem to have any directories that contain\n"
kkonganti@1	165 # + "the `quant.sf` file(s)."
kkonganti@1	166 # )
kkonganti@1	167 # exit(1)
kkonganti@1	168 with open(salmon_comb_res, "w") as salmon_comb_res_fh:
kkonganti@1	169 salmon_comb_res_fh.write(f"Sample\n{no_hit}s in any samples\n")
kkonganti@1	170 salmon_comb_res_fh.close()
kkonganti@1	171 exit(0)
kkonganti@1	172
kkonganti@1	173 if rtc and os.path.exists(rtc) and os.path.getsize(rtc) > 0:
kkonganti@1	174
kkonganti@1	175 # pdg_release = re.match(r"(^PDG\d+\.\d+)\..+", os.path.basename(rtc))[1] + "/"
kkonganti@1	176 acc2sero = pickle.load(file=open(pickled_sero, "rb"))
kkonganti@1	177
kkonganti@1	178 with open(rtc, "r") as rtc_fh:
kkonganti@1	179
kkonganti@1	180 for line in rtc_fh:
kkonganti@1	181 cols = line.strip().split("\t")
kkonganti@1	182
kkonganti@1	183 if len(cols) < 4:
kkonganti@1	184 logging.error(
kkonganti@1	185 f"The file {os.path.basename(rtc)} seems to\n"
kkonganti@1	186 + "be malformed. It contains less than required 4 columns."
kkonganti@1	187 )
kkonganti@1	188 exit(1)
kkonganti@1	189 elif cols[3] != "NULL":
kkonganti@1	190 snp_clusters[cols[0]].setdefault("assembly_accs", []).append(cols[3])
kkonganti@1	191 snp_clusters[cols[3]].setdefault("snp_clust_id", []).append(cols[0])
kkonganti@1	192 snp_clusters[cols[3]].setdefault("pathdb_acc_id", []).append(cols[1])
kkonganti@1	193 if len(snp_clusters[cols[3]]["snp_clust_id"]) > 1:
kkonganti@1	194 logging.error(
kkonganti@1	195 f"There is a duplicate reference accession [{cols[3]}]"
kkonganti@1	196 + f"in the metadata file{os.path.basename(rtc)}!"
kkonganti@1	197 )
kkonganti@1	198 exit(1)
kkonganti@1	199
kkonganti@1	200 rtc_fh.close()
kkonganti@1	201
kkonganti@1	202 for salmon_res_file in salmon_res_files:
kkonganti@1	203 sample_name = re.match(
kkonganti@1	204 r"(^.+?)((\_salmon\_res)\|(\.salmon))$",
kkonganti@1	205 os.path.basename(os.path.dirname(salmon_res_file)),
kkonganti@1	206 )[1]
kkonganti@1	207 salmon_meta_json = os.path.join(
kkonganti@1	208 os.path.dirname(salmon_res_file), "aux_info", "meta_info.json"
kkonganti@1	209 )
kkonganti@1	210
kkonganti@1	211 if not os.path.exists(salmon_meta_json) or not os.path.getsize(salmon_meta_json) > 0:
kkonganti@1	212 logging.error(
kkonganti@1	213 "The file\n"
kkonganti@1	214 + f"{salmon_meta_json}\ndoes not exist or is empty!\n"
kkonganti@1	215 + "Did `salmon quant` fail?"
kkonganti@1	216 )
kkonganti@1	217 exit(1)
kkonganti@1	218
kkonganti@1	219 if not os.path.exists(salmon_res_file) or not os.path.getsize(salmon_res_file):
kkonganti@1	220 logging.error(
kkonganti@1	221 "The file\n"
kkonganti@1	222 + f"{salmon_res_file}\ndoes not exist or is empty!\n"
kkonganti@1	223 + "Did `salmon quant` fail?"
kkonganti@1	224 )
kkonganti@1	225 exit(1)
kkonganti@1	226
kkonganti@1	227 with open(salmon_res_file, "r") as salmon_res_fh:
kkonganti@1	228 for line in salmon_res_fh.readlines():
kkonganti@1	229 if re.match(r"^Name.+", line):
kkonganti@1	230 continue
kkonganti@1	231 cols = line.strip().split("\t")
kkonganti@1	232 ref_acc = "_".join(cols[0].split("_")[:2])
kkonganti@1	233 (
kkonganti@1	234 sample2salmon[sample_name]
kkonganti@1	235 .setdefault(acc2sero[cols[0]], [])
kkonganti@1	236 .append(int(round(float(cols[4]), 2)))
kkonganti@1	237 )
kkonganti@1	238 (
kkonganti@1	239 sample2salmon[sample_name]
kkonganti@1	240 .setdefault("snp_clust_ids", {})
kkonganti@1	241 .setdefault("".join(snp_clusters[ref_acc]["snp_clust_id"]), [])
kkonganti@1	242 .append("".join(snp_clusters[ref_acc]["pathdb_acc_id"]))
kkonganti@1	243 )
kkonganti@1	244 seen_sero[acc2sero[cols[0]]] = 1
kkonganti@1	245
kkonganti@1	246 salmon_meta_json_read = json.load(open(salmon_meta_json, "r"))
kkonganti@1	247 (
kkonganti@1	248 sample2salmon[sample_name]
kkonganti@1	249 .setdefault("tot_reads", [])
kkonganti@1	250 .append(salmon_meta_json_read["num_processed"])
kkonganti@1	251 )
kkonganti@1	252
kkonganti@1	253 with open(salmon_comb_res, "w") as salmon_comb_res_fh:
kkonganti@1	254
kkonganti@1	255 # snp_clust_col_header = (
kkonganti@1	256 # "\tSNP Cluster(s) by Genome Hit\n" if show_snp_clust_col else "\n"
kkonganti@1	257 # )
kkonganti@1	258 snp_clust_col_header = (
kkonganti@1	259 "\tNCBI Pathogens Isolate Browser\n" if show_snp_clust_col else "\n"
kkonganti@1	260 )
kkonganti@1	261 serotypes = sorted(seen_sero.keys())
kkonganti@1	262 formatted_serotypes = [
kkonganti@1	263 re.sub(r"\,antigen_formula=", " \| ", s)
kkonganti@1	264 for s in [re.sub(r"serotype=", "", s) for s in serotypes]
kkonganti@1	265 ]
kkonganti@1	266 salmon_comb_res_fh.write(
kkonganti@1	267 "Sample\t" + "\t".join(formatted_serotypes) + snp_clust_col_header
kkonganti@1	268 )
kkonganti@1	269 # sample_snp_relation = (
kkonganti@1	270 # ncbi_pathogens_base_url
kkonganti@1	271 # + pdg_release
kkonganti@1	272 # + "".join(snp_clusters[ref_acc]["snp_clust_id"])
kkonganti@1	273 # + "?accessions="
kkonganti@1	274 # )
kkonganti@1	275 sample_snp_relation = ncbi_pathogens_base_url + "isolates/#"
kkonganti@1	276
kkonganti@1	277 if len(salmon_res_file_failed) == 1:
kkonganti@1	278 with (open("".join(salmon_res_file_failed), "r")) as no_calls_fh:
kkonganti@1	279 for line in no_calls_fh.readlines():
kkonganti@1	280 if line in ["\n", "\n\r", "\r"]:
kkonganti@1	281 continue
kkonganti@1	282 salmon_comb_res_fh.write(line.strip())
kkonganti@1	283 sal_yn[line.strip()] += 0
kkonganti@1	284 for serotype in serotypes:
kkonganti@1	285 salmon_comb_res_fh.write("\t-")
kkonganti@1	286 salmon_comb_res_fh.write(
kkonganti@1	287 "\t-\n"
kkonganti@1	288 ) if show_snp_clust_col else salmon_comb_res_fh.write("\n")
kkonganti@1	289 no_calls_fh.close()
kkonganti@1	290
kkonganti@1	291 for sample, counts in sorted(sample2salmon.items()):
kkonganti@1	292 salmon_comb_res_fh.write(sample)
kkonganti@1	293 snp_cluster_res_col = list()
kkonganti@1	294
kkonganti@1	295 for snp_clust_id in sample2salmon[sample]["snp_clust_ids"].keys():
kkonganti@1	296 # print(snp_clust_id)
kkonganti@1	297 # print(",".join(sample2salmon[sample]["snp_clust_ids"][snp_clust_id]))
kkonganti@1	298 # ppp.pprint(sample2salmon[sample]["snp_clust_ids"])
kkonganti@1	299 # ppp.pprint(sample2salmon[sample]["snp_clust_ids"][snp_clust_id])
kkonganti@1	300 # final_url_text = ",".join(
kkonganti@1	301 # sample2salmon[sample]["snp_clust_ids"][snp_clust_id]
kkonganti@1	302 # )
kkonganti@1	303 # final_url_text_to_show = snp_clust_id
kkonganti@1	304 # snp_cluster_res_col.append(
kkonganti@1	305 # "".join(
kkonganti@1	306 # [
kkonganti@1	307 # f'<a href="',
kkonganti@1	308 # sample_snp_relation,
kkonganti@1	309 # ",".join(sample2salmon[sample]["snp_clust_ids"][snp_clust_id]),
kkonganti@1	310 # f'" target="_blank">{snp_clust_id}</a>',
kkonganti@1	311 # ]
kkonganti@1	312 # )
kkonganti@1	313 # )
kkonganti@1	314 final_url_text_to_show = " ".join(
kkonganti@1	315 sample2salmon[sample]["snp_clust_ids"][snp_clust_id]
kkonganti@1	316 )
kkonganti@1	317 snp_cluster_res_col.append(
kkonganti@1	318 "".join(
kkonganti@1	319 [
kkonganti@1	320 f'<a href="',
kkonganti@1	321 sample_snp_relation,
kkonganti@1	322 final_url_text_to_show,
kkonganti@1	323 f'" target="_blank">{final_url_text_to_show}</a>',
kkonganti@1	324 ]
kkonganti@1	325 )
kkonganti@1	326 )
kkonganti@1	327
kkonganti@1	328 per_serotype_counts = 0
kkonganti@1	329 for serotype in serotypes:
kkonganti@1	330
kkonganti@1	331 if serotype in sample2salmon[sample].keys():
kkonganti@1	332 # ppp.pprint(counts)
kkonganti@1	333 sample_perc_mapped = round(
kkonganti@1	334 sum(counts[serotype]) / sum(counts["tot_reads"]) * 100, 2
kkonganti@1	335 )
kkonganti@1	336 salmon_comb_res_fh.write(
kkonganti@1	337 f"\t{sum(counts[serotype])} ({sample_perc_mapped}%)"
kkonganti@1	338 )
kkonganti@1	339 multiqc_salmon_counts[sample].setdefault(
kkonganti@1	340 re.match(r"^serotype=(.+?)\,antigen_formula.*", serotype)[1],
kkonganti@1	341 sum(counts[serotype]),
kkonganti@1	342 )
kkonganti@1	343 per_serotype_counts += sum(counts[serotype])
kkonganti@1	344 sal_yn[sample] += 1
kkonganti@1	345 else:
kkonganti@1	346 salmon_comb_res_fh.write(f"\t-")
kkonganti@1	347 sal_yn[sample] += 0
kkonganti@1	348
kkonganti@1	349 multiqc_salmon_counts[sample].setdefault(
kkonganti@1	350 no_hit, sum(counts["tot_reads"]) - per_serotype_counts
kkonganti@1	351 )
kkonganti@1	352 snp_clust_col_val = (
kkonganti@1	353 f'\t{" ".join(snp_cluster_res_col)}\n' if show_snp_clust_col else "\n"
kkonganti@1	354 )
kkonganti@1	355 # ppp.pprint(multiqc_salmon_counts)
kkonganti@1	356 salmon_comb_res_fh.write(snp_clust_col_val)
kkonganti@1	357
kkonganti@1	358 with open(bcs_sal_yn, "w") as bcs_sal_yn_fh:
kkonganti@1	359 bcs_sal_yn_fh.write("Sample\tSalmonella Presence\tNo. of Serotypes\n")
kkonganti@1	360 for sample in sal_yn.keys():
kkonganti@1	361 if sal_yn[sample] > 0:
kkonganti@1	362 bcs_sal_yn_fh.write(f"{sample}\tDetected\t{sal_yn[sample]}\n")
kkonganti@1	363 else:
kkonganti@1	364 bcs_sal_yn_fh.write(f"{sample}\tNot detected\t{sal_yn[sample]}\n")
kkonganti@1	365
kkonganti@1	366 with open(cell_colors_yml_file, "w") as cell_colors_fh:
kkonganti@1	367 yaml.dump(cell_colors_yml, cell_colors_fh, default_flow_style=False)
kkonganti@1	368
kkonganti@1	369 salmon_plot_json(salmon_comb_res_mqc, multiqc_salmon_counts, no_hit)
kkonganti@1	370
kkonganti@1	371 salmon_comb_res_fh.close()
kkonganti@1	372 bcs_sal_yn_fh.close()
kkonganti@1	373 cell_colors_fh.close()
kkonganti@1	374
kkonganti@1	375
kkonganti@1	376 def salmon_plot_json(file: None, sample_salmon_counts: None, no_hit: None) -> None:
kkonganti@1	377 """
kkonganti@1	378 This method will take a dictionary of salmon counts per sample
kkonganti@1	379 and will dump a JSON that will be used by MultiQC.
kkonganti@1	380 """
kkonganti@1	381
kkonganti@1	382 if file is None or sample_salmon_counts is None:
kkonganti@1	383 logging.error(
kkonganti@1	384 "Neither an output file to dump the JSON for MultiQC or the"
kkonganti@1	385 + "dictionary holding the salmon counts was not passed."
kkonganti@1	386 )
kkonganti@1	387
kkonganti@1	388 # Credit: http://phrogz.net/tmp/24colors.html
kkonganti@1	389 # Will cycle through 20 distinct colors.
kkonganti@1	390 distinct_color_palette = [
kkonganti@1	391 "#FF0000",
kkonganti@1	392 "#FFFF00",
kkonganti@1	393 "#00EAFF",
kkonganti@1	394 "#AA00FF",
kkonganti@1	395 "#FF7F00",
kkonganti@1	396 "#BFFF00",
kkonganti@1	397 "#0095FF",
kkonganti@1	398 "#FF00AA",
kkonganti@1	399 "#FFD400",
kkonganti@1	400 "#6AFF00",
kkonganti@1	401 "#0040FF",
kkonganti@1	402 "#EDB9B9",
kkonganti@1	403 "#B9D7ED",
kkonganti@1	404 "#E7E9B9",
kkonganti@1	405 "#DCB9ED",
kkonganti@1	406 "#B9EDE0",
kkonganti@1	407 "#8F2323",
kkonganti@1	408 "#23628F",
kkonganti@1	409 "#8F6A23",
kkonganti@1	410 "#6B238F",
kkonganti@1	411 "#4F8F23",
kkonganti@1	412 ]
kkonganti@1	413
kkonganti@1	414 no_hit_color = "#434348"
kkonganti@1	415 col_count = 0
kkonganti@1	416 serotypes = set()
kkonganti@1	417 salmon_counts = defaultdict(defaultdict)
kkonganti@1	418 salmon_counts["id"] = "BETTERCALLSAL_SALMON_COUNTS"
kkonganti@1	419 salmon_counts["section_name"] = "Salmon read counts"
kkonganti@1	420 salmon_counts["description"] = (
kkonganti@1	421 "This section shows the read counts from running <code>salmon</code> "
kkonganti@1	422 + "in <code>--meta</code> mode using SE, merged PE or concatenated PE reads against "
kkonganti@1	423 + "an on-the-fly <code>salmon</code> index generated from the genome hits "
kkonganti@1	424 + "of <code>kma</code>."
kkonganti@1	425 )
kkonganti@1	426 salmon_counts["plot_type"] = "bargraph"
kkonganti@1	427 salmon_counts["pconfig"]["id"] = "bettercallsal_salmon_counts_plot"
kkonganti@1	428 salmon_counts["pconfig"]["title"] = "Salmon: Read counts"
kkonganti@1	429 salmon_counts["pconfig"]["ylab"] = "Number of reads"
kkonganti@1	430 salmon_counts["pconfig"]["xDecimals"] = "false"
kkonganti@1	431 salmon_counts["pconfig"]["cpswitch_counts_label"] = "Number of reads (Counts)"
kkonganti@1	432 salmon_counts["pconfig"]["cpswitch_percent_label"] = "Number of reads (Percentages)"
kkonganti@1	433
kkonganti@1	434 for sample in sorted(sample_salmon_counts.keys()):
kkonganti@1	435 serotypes.update(list(sample_salmon_counts[sample].keys()))
kkonganti@1	436 salmon_counts["data"][sample] = sample_salmon_counts[sample]
kkonganti@1	437
kkonganti@1	438 for serotype in sorted(serotypes):
kkonganti@1	439 if serotype == no_hit:
kkonganti@1	440 continue
kkonganti@1	441 if col_count == len(distinct_color_palette) - 1:
kkonganti@1	442 col_count = 0
kkonganti@1	443
kkonganti@1	444 col_count += 1
kkonganti@1	445 salmon_counts["categories"][serotype] = {"color": distinct_color_palette[col_count]}
kkonganti@1	446
kkonganti@1	447 salmon_counts["categories"][no_hit] = {"color": no_hit_color}
kkonganti@1	448 json.dump(salmon_counts, open(file, "w"))
kkonganti@1	449
kkonganti@1	450
kkonganti@1	451 if __name__ == "__main__":
kkonganti@1	452 main()

Mercurial > repos > kkonganti > cfsan_bettercallsal

annotate 0.5.0/bin/gen_salmon_res_table.py @ 1:365849f031fd