kkonganti@17: #!/usr/bin/env python3 kkonganti@17: kkonganti@17: # Kranti Konganti kkonganti@17: kkonganti@17: import argparse kkonganti@17: import glob kkonganti@17: import inspect kkonganti@17: import json kkonganti@17: import logging kkonganti@17: import os kkonganti@17: import pickle kkonganti@17: import pprint kkonganti@17: import re kkonganti@17: from collections import defaultdict kkonganti@17: kkonganti@17: import yaml kkonganti@17: kkonganti@17: kkonganti@17: # Multiple inheritence for pretty printing of help text. kkonganti@17: class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): kkonganti@17: pass kkonganti@17: kkonganti@17: kkonganti@17: # Main kkonganti@17: def main() -> None: kkonganti@17: """ kkonganti@17: The succesful execution of this script requires access to bettercallsal formatted kkonganti@17: db flat files. On raven2, they are at /hpc/db/bettercallsall/PDGXXXXXXXXXX.XXXXX kkonganti@17: kkonganti@17: It takes the ACC2SERO.pickle file and *.reference_target.cluster_list.tsv file kkonganti@17: for that particular NCBI Pathogens release from the db directory mentioned with kkonganti@17: -db option and a root parent directory of the `salmon quant` results mentioned kkonganti@17: with -sal option and generates a final results table with number of reads kkonganti@17: mapped and a .json file to be used with MultiQC to generate a stacked bar plot. kkonganti@17: kkonganti@17: Using -url option optionally adds an extra column of NCBI Pathogens Isolates kkonganti@17: Browser, which directly links out to NCBI Pathogens Isolates SNP viewer tool. kkonganti@17: """ kkonganti@17: # Set logging. kkonganti@17: logging.basicConfig( kkonganti@17: format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n", kkonganti@17: level=logging.DEBUG, kkonganti@17: ) kkonganti@17: kkonganti@17: # Debug print. kkonganti@17: ppp = pprint.PrettyPrinter(width=55) kkonganti@17: prog_name = inspect.stack()[0].filename kkonganti@17: kkonganti@17: parser = argparse.ArgumentParser( kkonganti@17: prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses kkonganti@17: ) kkonganti@17: kkonganti@17: required = parser.add_argument_group("required arguments") kkonganti@17: kkonganti@17: required.add_argument( kkonganti@17: "-sal", kkonganti@17: dest="salmon_res_dir", kkonganti@17: default=False, kkonganti@17: required=True, kkonganti@17: help="Absolute UNIX path to the parent directory that contains the\n" kkonganti@17: + "`salmon quant` results directory. For example, if path to\n" kkonganti@17: + "`quant.sf` is in /hpc/john_doe/test/salmon_res/quant.sf, then\n" kkonganti@17: + "use this command-line option as:\n" kkonganti@17: + "-sal /hpc/john_doe/test", kkonganti@17: ) kkonganti@17: required.add_argument( kkonganti@17: "-snp", kkonganti@17: dest="rtc", kkonganti@17: default=False, kkonganti@17: required=True, kkonganti@17: help="Absolute UNIX Path to the PDG SNP reference target cluster\n" kkonganti@17: + "metadata file. On raven2, these are located at\n" kkonganti@17: + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX\n" kkonganti@17: + "Required if -sal is on.", kkonganti@17: ) kkonganti@17: required.add_argument( kkonganti@17: "-pickle", kkonganti@17: dest="acc2sero", kkonganti@17: default=False, kkonganti@17: required=True, kkonganti@17: help="Absolute UNIX Path to the *ACC2SERO.pickle\n" kkonganti@17: + "metadata file. On raven2, these are located at\n" kkonganti@17: + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX\n" kkonganti@17: + "Required if -sal is on.", kkonganti@17: ) kkonganti@17: parser.add_argument( kkonganti@17: "-op", kkonganti@17: dest="out_prefix", kkonganti@17: default="bettercallsal.tblsum", kkonganti@17: required=False, kkonganti@17: help="Set the output file(s) prefix for output(s) generated\n" + "by this program.", kkonganti@17: ) kkonganti@17: parser.add_argument( kkonganti@17: "-url", kkonganti@17: dest="show_snp_clust_info", kkonganti@17: default=False, kkonganti@17: required=False, kkonganti@17: action="store_true", kkonganti@17: help="Show SNP cluster participation information of the final genome hit.\n" kkonganti@17: + "This may be useful to see a relative placement of your sample in\n" kkonganti@17: + "NCBI Isolates SNP Tree Viewer based on genome similarity but however\n" kkonganti@17: + "due to rapid nature of the updates at NCBI Pathogen Detection Project,\n" kkonganti@17: + "the placement may be in an outdated cluster.", kkonganti@17: ) kkonganti@17: kkonganti@17: args = parser.parse_args() kkonganti@17: salmon_res_dir = args.salmon_res_dir kkonganti@17: out_prefix = args.out_prefix kkonganti@17: show_snp_clust_col = args.show_snp_clust_info kkonganti@17: rtc = args.rtc kkonganti@17: pickled_sero = args.acc2sero kkonganti@17: no_hit = "No genome hit" kkonganti@17: no_presence = "Salmonella presence not detected" kkonganti@17: bcs_sal_yn_prefix = "bettercallsal_salyn" kkonganti@17: sal_y = "Detected" kkonganti@17: sal_n = "Not detected" kkonganti@17: null_value = "NULL" kkonganti@17: assm_pat = re.compile(r"GC[AF]\_[0-9]+\.*[0-9]*") kkonganti@17: ncbi_pathogens_base_url = "https://www.ncbi.nlm.nih.gov/pathogens/" kkonganti@17: ncbi_pathogens_genome_base = "https://www.ncbi.nlm.nih.gov/datasets/genome/" kkonganti@17: kkonganti@17: sample2salmon, snp_clusters, multiqc_salmon_counts, seen_sero, sal_yn = ( kkonganti@17: defaultdict(defaultdict), kkonganti@17: defaultdict(defaultdict), kkonganti@17: defaultdict(defaultdict), kkonganti@17: defaultdict(int), kkonganti@17: defaultdict(int), kkonganti@17: ) kkonganti@17: kkonganti@17: cell_colors_yml = { kkonganti@17: bcs_sal_yn_prefix: {sal_y: "#c8e6c9 !important;", sal_n: "#ffcdd2 !important;"} kkonganti@17: } kkonganti@17: kkonganti@17: salmon_comb_res = os.path.join(os.getcwd(), out_prefix + ".txt") kkonganti@17: bcs_sal_yn = re.sub(out_prefix, bcs_sal_yn_prefix + ".tblsum", salmon_comb_res) kkonganti@17: cell_colors_yml_file = re.sub( kkonganti@17: out_prefix + ".txt", bcs_sal_yn_prefix + ".cellcolors.yml", salmon_comb_res kkonganti@17: ) kkonganti@17: salmon_comb_res_mqc = os.path.join(os.getcwd(), str(out_prefix).split(".")[0] + "_mqc.json") kkonganti@17: salmon_res_files = glob.glob(os.path.join(salmon_res_dir, "*", "quant.sf"), recursive=True) kkonganti@17: salmon_res_file_failed = glob.glob(os.path.join(salmon_res_dir, "BCS_NO_CALLS.txt")) kkonganti@17: kkonganti@17: if rtc and (not os.path.exists(rtc) or not os.path.getsize(rtc) > 0): kkonganti@17: logging.error( kkonganti@17: "The reference target cluster metadata file,\n" kkonganti@17: + f"{os.path.basename(rtc)} does not exist or is empty!" kkonganti@17: ) kkonganti@17: exit(1) kkonganti@17: kkonganti@17: if rtc and (not salmon_res_dir or not pickled_sero): kkonganti@17: logging.error("When -rtc is on, -sal and -ps are also required.") kkonganti@17: exit(1) kkonganti@17: kkonganti@17: if pickled_sero and (not os.path.exists(pickled_sero) or not os.path.getsize(pickled_sero)): kkonganti@17: logging.error( kkonganti@17: "The pickle file,\n" + f"{os.path.basename(pickled_sero)} does not exist or is empty!" kkonganti@17: ) kkonganti@17: exit(1) kkonganti@17: kkonganti@17: if salmon_res_dir: kkonganti@17: if not os.path.isdir(salmon_res_dir): kkonganti@17: logging.error("UNIX path\n" + f"{salmon_res_dir}\n" + "does not exist!") kkonganti@17: exit(1) kkonganti@17: if len(salmon_res_files) <= 0: kkonganti@17: # logging.error( kkonganti@17: # "Parent directory,\n" kkonganti@17: # + f"{salmon_res_dir}" kkonganti@17: # + "\ndoes not seem to have any directories that contain\n" kkonganti@17: # + "the `quant.sf` file(s)." kkonganti@17: # ) kkonganti@17: # exit(1) kkonganti@17: with open(salmon_comb_res, "w") as salmon_comb_res_fh: kkonganti@17: salmon_comb_res_fh.write(f"Sample\n{no_hit}s in any samples\n") kkonganti@17: salmon_comb_res_fh.close() kkonganti@17: kkonganti@17: with open(bcs_sal_yn, "w") as bcs_sal_yn_fh: kkonganti@17: bcs_sal_yn_fh.write(f"Sample\n{no_presence} in any samples\n") kkonganti@17: bcs_sal_yn_fh.close() kkonganti@17: kkonganti@17: exit(0) kkonganti@17: kkonganti@17: if rtc and os.path.exists(rtc) and os.path.getsize(rtc) > 0: kkonganti@17: kkonganti@17: # pdg_release = re.match(r"(^PDG\d+\.\d+)\..+", os.path.basename(rtc))[1] + "/" kkonganti@17: acc2sero = pickle.load(file=open(pickled_sero, "rb")) kkonganti@17: kkonganti@17: with open(rtc, "r") as rtc_fh: kkonganti@17: kkonganti@17: for line in rtc_fh: kkonganti@17: cols = line.strip().split("\t") kkonganti@17: kkonganti@17: if len(cols) < 4: kkonganti@17: logging.error( kkonganti@17: f"The file {os.path.basename(rtc)} seems to\n" kkonganti@17: + "be malformed. It contains less than required 4 columns." kkonganti@17: ) kkonganti@17: exit(1) kkonganti@17: elif cols[3] != null_value: kkonganti@17: snp_clusters[cols[0]].setdefault("assembly_accs", []).append(cols[3]) kkonganti@17: snp_clusters[cols[3]].setdefault("snp_clust_id", []).append(cols[0]) kkonganti@17: snp_clusters[cols[3]].setdefault("pathdb_acc_id", []).append(cols[1]) kkonganti@17: if len(snp_clusters[cols[3]]["snp_clust_id"]) > 1: kkonganti@17: logging.error( kkonganti@17: f"There is a duplicate reference accession [{cols[3]}]" kkonganti@17: + f"in the metadata file{os.path.basename(rtc)}!" kkonganti@17: ) kkonganti@17: exit(1) kkonganti@17: kkonganti@17: rtc_fh.close() kkonganti@17: kkonganti@17: for salmon_res_file in salmon_res_files: kkonganti@17: sample_name = re.match( kkonganti@17: r"(^.+?)((\_salmon\_res)|(\.salmon))$", kkonganti@17: os.path.basename(os.path.dirname(salmon_res_file)), kkonganti@17: )[1] kkonganti@17: salmon_meta_json = os.path.join( kkonganti@17: os.path.dirname(salmon_res_file), "aux_info", "meta_info.json" kkonganti@17: ) kkonganti@17: kkonganti@17: if not os.path.exists(salmon_meta_json) or not os.path.getsize(salmon_meta_json) > 0: kkonganti@17: logging.error( kkonganti@17: "The file\n" kkonganti@17: + f"{salmon_meta_json}\ndoes not exist or is empty!\n" kkonganti@17: + "Did `salmon quant` fail?" kkonganti@17: ) kkonganti@17: exit(1) kkonganti@17: kkonganti@17: if not os.path.exists(salmon_res_file) or not os.path.getsize(salmon_res_file): kkonganti@17: logging.error( kkonganti@17: "The file\n" kkonganti@17: + f"{salmon_res_file}\ndoes not exist or is empty!\n" kkonganti@17: + "Did `salmon quant` fail?" kkonganti@17: ) kkonganti@17: exit(1) kkonganti@17: kkonganti@17: with open(salmon_res_file, "r") as salmon_res_fh: kkonganti@17: for line in salmon_res_fh.readlines(): kkonganti@17: if re.match(r"^Name.+", line): kkonganti@17: continue kkonganti@17: cols = line.strip().split("\t") kkonganti@17: ref_acc = "_".join(cols[0].split("_")[:2]) kkonganti@17: kkonganti@17: if ref_acc not in snp_clusters.keys(): kkonganti@17: snp_clusters[ref_acc]["snp_clust_id"] = ref_acc kkonganti@17: snp_clusters[ref_acc]["pathdb_acc_id"] = ref_acc kkonganti@17: kkonganti@17: ( kkonganti@17: sample2salmon[sample_name] kkonganti@17: .setdefault(acc2sero[cols[0]], []) kkonganti@17: .append(int(round(float(cols[4]), 2))) kkonganti@17: ) kkonganti@17: ( kkonganti@17: sample2salmon[sample_name] kkonganti@17: .setdefault("snp_clust_ids", {}) kkonganti@17: .setdefault("".join(snp_clusters[ref_acc]["snp_clust_id"]), []) kkonganti@17: .append("".join(snp_clusters[ref_acc]["pathdb_acc_id"])) kkonganti@17: ) kkonganti@17: seen_sero[acc2sero[cols[0]]] = 1 kkonganti@17: kkonganti@17: salmon_meta_json_read = json.load(open(salmon_meta_json, "r")) kkonganti@17: ( kkonganti@17: sample2salmon[sample_name] kkonganti@17: .setdefault("tot_reads", []) kkonganti@17: .append(salmon_meta_json_read["num_processed"]) kkonganti@17: ) kkonganti@17: kkonganti@17: with open(salmon_comb_res, "w") as salmon_comb_res_fh: kkonganti@17: kkonganti@17: # snp_clust_col_header = ( kkonganti@17: # "\tSNP Cluster(s) by Genome Hit\n" if show_snp_clust_col else "\n" kkonganti@17: # ) kkonganti@17: snp_clust_col_header = ( kkonganti@17: "\tNCBI Pathogens Isolate Browser\n" if show_snp_clust_col else "\n" kkonganti@17: ) kkonganti@17: serotypes = sorted(seen_sero.keys()) kkonganti@17: formatted_serotypes = [ kkonganti@17: re.sub(r"\,antigen_formula=", " | ", s) kkonganti@17: for s in [re.sub(r"serotype=", "", s) for s in serotypes] kkonganti@17: ] kkonganti@17: salmon_comb_res_fh.write( kkonganti@17: "Sample\t" + "\t".join(formatted_serotypes) + snp_clust_col_header kkonganti@17: ) kkonganti@17: # sample_snp_relation = ( kkonganti@17: # ncbi_pathogens_base_url kkonganti@17: # + pdg_release kkonganti@17: # + "".join(snp_clusters[ref_acc]["snp_clust_id"]) kkonganti@17: # + "?accessions=" kkonganti@17: # ) kkonganti@17: if len(salmon_res_file_failed) == 1: kkonganti@17: with (open("".join(salmon_res_file_failed), "r")) as no_calls_fh: kkonganti@17: for line in no_calls_fh.readlines(): kkonganti@17: if line in ["\n", "\n\r", "\r"]: kkonganti@17: continue kkonganti@17: salmon_comb_res_fh.write(line.strip()) kkonganti@17: sal_yn[line.strip()] += 0 kkonganti@17: for serotype in serotypes: kkonganti@17: salmon_comb_res_fh.write("\t-") kkonganti@17: salmon_comb_res_fh.write( kkonganti@17: "\t-\n" kkonganti@17: ) if show_snp_clust_col else salmon_comb_res_fh.write("\n") kkonganti@17: no_calls_fh.close() kkonganti@17: kkonganti@17: for sample, counts in sorted(sample2salmon.items()): kkonganti@17: salmon_comb_res_fh.write(sample) kkonganti@17: snp_cluster_res_col = list() kkonganti@17: kkonganti@17: for snp_clust_id in sample2salmon[sample]["snp_clust_ids"].keys(): kkonganti@17: # print(snp_clust_id) kkonganti@17: # print(",".join(sample2salmon[sample]["snp_clust_ids"][snp_clust_id])) kkonganti@17: # ppp.pprint(sample2salmon[sample]["snp_clust_ids"]) kkonganti@17: # ppp.pprint(sample2salmon[sample]["snp_clust_ids"][snp_clust_id]) kkonganti@17: # final_url_text = ",".join( kkonganti@17: # sample2salmon[sample]["snp_clust_ids"][snp_clust_id] kkonganti@17: # ) kkonganti@17: # final_url_text_to_show = snp_clust_id kkonganti@17: # snp_cluster_res_col.append( kkonganti@17: # "".join( kkonganti@17: # [ kkonganti@17: # f'{snp_clust_id}', kkonganti@17: # ] kkonganti@17: # ) kkonganti@17: # ) kkonganti@17: # ppp.pprint(sample2salmon[sample]) kkonganti@17: for pathdbacc in sample2salmon[sample]["snp_clust_ids"][snp_clust_id]: kkonganti@17: # final_url_text_to_show = " ".join( kkonganti@17: # sample2salmon[sample]["snp_clust_ids"][snp_clust_id] kkonganti@17: # ) kkonganti@17: sample_snp_relation = ( kkonganti@17: ncbi_pathogens_genome_base kkonganti@17: if assm_pat.match(pathdbacc) kkonganti@17: else ncbi_pathogens_base_url + "isolates/#" kkonganti@17: ) kkonganti@17: kkonganti@17: snp_cluster_res_col.append( kkonganti@17: "".join( kkonganti@17: [ kkonganti@17: f'{pathdbacc}', kkonganti@17: ] kkonganti@17: ) kkonganti@17: ) kkonganti@17: kkonganti@17: per_serotype_counts = 0 kkonganti@17: for serotype in serotypes: kkonganti@17: kkonganti@17: if serotype in sample2salmon[sample].keys(): kkonganti@17: # ppp.pprint(counts) kkonganti@17: sample_perc_mapped = round( kkonganti@17: sum(counts[serotype]) / sum(counts["tot_reads"]) * 100, 2 kkonganti@17: ) kkonganti@17: salmon_comb_res_fh.write( kkonganti@17: f"\t{sum(counts[serotype])} ({sample_perc_mapped}%)" kkonganti@17: ) kkonganti@17: multiqc_salmon_counts[sample].setdefault( kkonganti@17: re.match(r"^serotype=(.+?)\,antigen_formula.*", serotype)[1], kkonganti@17: sum(counts[serotype]), kkonganti@17: ) kkonganti@17: per_serotype_counts += sum(counts[serotype]) kkonganti@17: sal_yn[sample] += 1 kkonganti@17: else: kkonganti@17: salmon_comb_res_fh.write(f"\t-") kkonganti@17: sal_yn[sample] += 0 kkonganti@17: kkonganti@17: multiqc_salmon_counts[sample].setdefault( kkonganti@17: no_hit, sum(counts["tot_reads"]) - per_serotype_counts kkonganti@17: ) kkonganti@17: snp_clust_col_val = ( kkonganti@17: f'\t{" ".join(snp_cluster_res_col)}\n' if show_snp_clust_col else "\n" kkonganti@17: ) kkonganti@17: # ppp.pprint(multiqc_salmon_counts) kkonganti@17: salmon_comb_res_fh.write(snp_clust_col_val) kkonganti@17: kkonganti@17: with open(bcs_sal_yn, "w") as bcs_sal_yn_fh: kkonganti@17: bcs_sal_yn_fh.write("Sample\tSalmonella Presence\tNo. of Serotypes\n") kkonganti@17: for sample in sal_yn.keys(): kkonganti@17: if sal_yn[sample] > 0: kkonganti@17: bcs_sal_yn_fh.write(f"{sample}\tDetected\t{sal_yn[sample]}\n") kkonganti@17: else: kkonganti@17: bcs_sal_yn_fh.write(f"{sample}\tNot detected\t{sal_yn[sample]}\n") kkonganti@17: kkonganti@17: with open(cell_colors_yml_file, "w") as cell_colors_fh: kkonganti@17: yaml.dump(cell_colors_yml, cell_colors_fh, default_flow_style=False) kkonganti@17: kkonganti@17: salmon_plot_json(salmon_comb_res_mqc, multiqc_salmon_counts, no_hit) kkonganti@17: kkonganti@17: salmon_comb_res_fh.close() kkonganti@17: bcs_sal_yn_fh.close() kkonganti@17: cell_colors_fh.close() kkonganti@17: kkonganti@17: kkonganti@17: def salmon_plot_json(file: None, sample_salmon_counts: None, no_hit: None) -> None: kkonganti@17: """ kkonganti@17: This method will take a dictionary of salmon counts per sample kkonganti@17: and will dump a JSON that will be used by MultiQC. kkonganti@17: """ kkonganti@17: kkonganti@17: if file is None or sample_salmon_counts is None: kkonganti@17: logging.error( kkonganti@17: "Neither an output file to dump the JSON for MultiQC or the" kkonganti@17: + "dictionary holding the salmon counts was not passed." kkonganti@17: ) kkonganti@17: kkonganti@17: # Credit: http://phrogz.net/tmp/24colors.html kkonganti@17: # Will cycle through 20 distinct colors. kkonganti@17: distinct_color_palette = [ kkonganti@17: "#FF0000", kkonganti@17: "#FFFF00", kkonganti@17: "#00EAFF", kkonganti@17: "#AA00FF", kkonganti@17: "#FF7F00", kkonganti@17: "#BFFF00", kkonganti@17: "#0095FF", kkonganti@17: "#FF00AA", kkonganti@17: "#FFD400", kkonganti@17: "#6AFF00", kkonganti@17: "#0040FF", kkonganti@17: "#EDB9B9", kkonganti@17: "#B9D7ED", kkonganti@17: "#E7E9B9", kkonganti@17: "#DCB9ED", kkonganti@17: "#B9EDE0", kkonganti@17: "#8F2323", kkonganti@17: "#23628F", kkonganti@17: "#8F6A23", kkonganti@17: "#6B238F", kkonganti@17: "#4F8F23", kkonganti@17: ] kkonganti@17: kkonganti@17: # Credit: https://mokole.com/palette.html kkonganti@17: # Will use this palette if we run out ouf kkonganti@17: # 20 serotypes. More than 50 serotypes kkonganti@17: # per run is probably rare but if not, kkonganti@17: # will cycle through about 45. kkonganti@17: distinct_color_palette2 = [ kkonganti@17: "#2F4F4F", # darkslategray kkonganti@17: "#556B2F", # darkolivegreen kkonganti@17: "#A0522D", # sienna kkonganti@17: "#2E8B57", # seagreen kkonganti@17: "#006400", # darkgreen kkonganti@17: "#8B0000", # darkred kkonganti@17: "#808000", # olive kkonganti@17: "#BC8F8F", # rosybrown kkonganti@17: "#663399", # rebeccapurple kkonganti@17: "#B8860B", # darkgoldenrod kkonganti@17: "#4682B4", # steelblue kkonganti@17: "#000080", # navy kkonganti@17: "#D2691E", # chocolate kkonganti@17: "#9ACD32", # yellowgreen kkonganti@17: "#20B2AA", # lightseagreen kkonganti@17: "#CD5C5C", # indianred kkonganti@17: "#8FBC8F", # darkseagreen kkonganti@17: "#800080", # purple kkonganti@17: "#B03060", # maroon3 kkonganti@17: "#FF8C00", # darkorange kkonganti@17: "#FFD700", # gold kkonganti@17: "#FFFF00", # yellow kkonganti@17: "#DEB887", # burlywood kkonganti@17: "#00FF00", # lime kkonganti@17: "#BA55D3", # mediumorchid kkonganti@17: "#00FA9A", # mediumspringgreen kkonganti@17: "#4169E1", # royalblue kkonganti@17: "#E9967A", # darksalmon kkonganti@17: "#DC143C", # crimson kkonganti@17: "#00FFFF", # aqua kkonganti@17: "#F4A460", # sandybrown kkonganti@17: "#9370DB", # mediumpurple kkonganti@17: "#0000FF", # blue kkonganti@17: "#ADFF2F", # greenyellow kkonganti@17: "#FF6347", # tomato kkonganti@17: "#D8BFD8", # thistle kkonganti@17: "#FF00FF", # fuchsia kkonganti@17: "#DB7093", # palevioletred kkonganti@17: "#F0E68C", # khaki kkonganti@17: "#6495ED", # cornflower kkonganti@17: "#DDA0DD", # plum kkonganti@17: "#EE82EE", # violet kkonganti@17: "#7FFFD4", # aquamarine kkonganti@17: "#FAFAD2", # lightgoldenrod kkonganti@17: "#FF69B4", # hotpink kkonganti@17: "#FFB6C1", # lightpink kkonganti@17: ] kkonganti@17: kkonganti@17: no_hit_color = "#434348" kkonganti@17: col_count = 0 kkonganti@17: serotypes = set() kkonganti@17: salmon_counts = defaultdict(defaultdict) kkonganti@17: salmon_counts["id"] = "BETTERCALLSAL_SALMON_COUNTS" kkonganti@17: salmon_counts["section_name"] = "Salmon read counts" kkonganti@17: salmon_counts["description"] = ( kkonganti@17: "This section shows the read counts from running salmon " kkonganti@17: + "in --meta mode using SE, merged PE or concatenated PE reads against " kkonganti@17: + "an on-the-fly salmon index generated from the genome hits " kkonganti@17: + "of kma." kkonganti@17: ) kkonganti@17: salmon_counts["plot_type"] = "bargraph" kkonganti@17: salmon_counts["pconfig"]["id"] = "bettercallsal_salmon_counts_plot" kkonganti@17: salmon_counts["pconfig"]["title"] = "Salmon: Read counts" kkonganti@17: salmon_counts["pconfig"]["ylab"] = "Number of reads" kkonganti@17: salmon_counts["pconfig"]["xDecimals"] = "false" kkonganti@17: salmon_counts["pconfig"]["cpswitch_counts_label"] = "Number of reads (Counts)" kkonganti@17: salmon_counts["pconfig"]["cpswitch_percent_label"] = "Number of reads (Percentages)" kkonganti@17: kkonganti@17: for sample in sorted(sample_salmon_counts.keys()): kkonganti@17: serotypes.update(list(sample_salmon_counts[sample].keys())) kkonganti@17: salmon_counts["data"][sample] = sample_salmon_counts[sample] kkonganti@17: kkonganti@17: if len(serotypes) > len(distinct_color_palette): kkonganti@17: distinct_color_palette = distinct_color_palette2 kkonganti@17: kkonganti@17: for serotype in sorted(serotypes): kkonganti@17: if serotype == no_hit: kkonganti@17: continue kkonganti@17: if col_count == len(distinct_color_palette) - 1: kkonganti@17: col_count = 0 kkonganti@17: kkonganti@17: col_count += 1 kkonganti@17: salmon_counts["categories"][serotype] = {"color": distinct_color_palette[col_count]} kkonganti@17: kkonganti@17: salmon_counts["categories"][no_hit] = {"color": no_hit_color} kkonganti@17: json.dump(salmon_counts, open(file, "w")) kkonganti@17: kkonganti@17: kkonganti@17: if __name__ == "__main__": kkonganti@17: main()