annotate 0.5.0/bin/sourmash_sim_matrix.py @ 1:365849f031fd

"planemo upload"
author kkonganti
date Mon, 05 Jun 2023 18:48:51 -0400
#!/usr/bin/env python3

# Kranti Konganti

import os
import argparse
import inspect
import logging
import re
import pickle
import pprint
import json
from collections import defaultdict

# Set logging.
logging.basicConfig(
    format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
    level=logging.DEBUG,
)

# Debug print.
ppp = pprint.PrettyPrinter(width=50, indent=4)

# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    pass


def main() -> None:
    """
    This script works only in the context of the `bettercallsal` Nextflow workflow.
    It takes:
        1. A CSV file containing a similarity or dissimilarity matrix from
           `sourmash compare`, where the header row contains the names.
        2. Indexed NCBI Pathogen Detection metadata in pickle format, which is
           used to convert accessions to serotype names in the final distance
           matrix output.
    """
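
    # Example invocation (a sketch only; the file names below are hypothetical
    # placeholders for `sourmash compare` outputs and the bettercallsal metadata
    # pickle, not files guaranteed to exist):
    #
    #   sourmash_sim_matrix.py \
    #       -csv bcs_sourmash_compare.csv \
    #       -pickle PDG000000002.2876_ACC2SERO.pickle \
    #       -labels bcs_sourmash_compare.labels.txt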

    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-csv",
        dest="mat",
        default=False,
        required=True,
        help="Absolute UNIX path to .csv file containing the similarity\n"
        + "or dissimilarity matrix from `sourmash compare`.",
    )
    required.add_argument(
        "-pickle",
        dest="acc2sero",
        default=False,
        required=True,
        help="Absolute UNIX path to the *ACC2SERO.pickle\n"
        + "metadata file. On raven2, these are located at\n"
        + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX/",
    )
    required.add_argument(
        "-labels",
        dest="labels",
        default=False,
        required=True,
        help="Absolute UNIX path to the *.labels.txt\n"
        + "file from `sourmash compare`. The accessions\n"
        + "will be renamed to serotype names.",
    )

    args = parser.parse_args()
    csv = args.mat
    labels = args.labels
    pickled_sero = args.acc2sero
    row_names = list()
    distance_mat = defaultdict(defaultdict)
    out_csv = os.path.join(os.getcwd(), "bcs_sourmash_matrix.tblsum.txt")
    out_json = os.path.join(os.getcwd(), "bcs_sourmash_matrix_mqc.json")

    # Prepare dictionary to be dumped as JSON (MultiQC custom content).
    distance_mat["id"] = "BETTERCALLSAL_CONTAINMENT_INDEX"
    distance_mat["section_name"] = "Containment index"
    distance_mat["description"] = (
        "This section shows the containment index between a sample and the genomes "
        + "by running <code>sourmash gather</code> "
        + "using the <code>--containment</code> option."
    )
    distance_mat["plot_type"] = "heatmap"
    distance_mat["pconfig"]["id"] = "bettercallsal_containment_index_heatmap"
    distance_mat["pconfig"]["title"] = "Sourmash: containment index"
    distance_mat["pconfig"]["xTitle"] = "Samples"
    distance_mat["pconfig"]["yTitle"] = "Isolates (Genome assemblies)"
    distance_mat["pconfig"]["ycats_samples"] = "False"
    distance_mat["pconfig"]["xcats_samples"] = "False"
    distance_mat["pconfig"]["square"] = "False"
    distance_mat["pconfig"]["min"] = "0.0"
    distance_mat["pconfig"]["max"] = "1.0"
    distance_mat["data"]["data"] = list()
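
    # The dictionary above, together with "xcats", "ycats" and "data" filled in
    # below, is written out as MultiQC custom-content JSON. A rough sketch of the
    # assumed output shape (values are illustrative only, not from a real run):
    #
    #   {
    #     "id": "BETTERCALLSAL_CONTAINMENT_INDEX",
    #     "plot_type": "heatmap",
    #     "pconfig": {"id": "...", "title": "...", "min": "0.0", "max": "1.0", ...},
    #     "xcats": ["Enteritidis | GCA_000008105.1", ...],
    #     "ycats": ["sample_1", ...],
    #     "data": {"data": [["0.87765", "0.01234", ...], ...]}
    #   }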

    if pickled_sero and (not os.path.exists(pickled_sero) or not os.path.getsize(pickled_sero)):
        logging.error(
            "The pickle file,\n" + f"{os.path.basename(pickled_sero)} does not exist or is empty!"
        )
        exit(1)
    else:
        acc2sero = pickle.load(file=open(pickled_sero, "rb"))

    if csv and (not os.path.exists(csv) or not os.path.getsize(csv) > 0):
        logging.error("File,\n" + f"{csv}\ndoes not exist " + "or is empty!")
        exit(0)

    if labels and (not os.path.exists(labels) or not os.path.getsize(labels) > 0):
        logging.error("File,\n" + f"{labels}\ndoes not exist " + "or is empty!")
        exit(0)

    # Collect the sample (non-accession) labels from `sourmash compare`.
    # These become the row names of the final matrix.
    with open(labels, "r") as labels_fh:
        for line in labels_fh:
            line = line.strip()
            if line not in acc2sero.keys():
                row_names.append(line)

    with open(out_csv, "w") as csv_out_fh:
        with open(csv, "r") as csv_in_fh:
            # Split the header into accession columns (genome assemblies) and
            # sample columns.
            header = csv_in_fh.readline().strip().split(",")
            acc_cols = [idx for idx, col in enumerate(header) if col in acc2sero.keys()]
            sample_cols = [idx for idx, col in enumerate(header) if col not in acc2sero.keys()]

            # Rename accession columns to serotype names.
            col_names = [
                re.sub(r"serotype=|\,antigen_formula=.*?\|", "", s)
                for s in [acc2sero[col] + f"| | {col}" for col in header if col in acc2sero.keys()]
            ]
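
            # For illustration only: assuming an acc2sero value of roughly the form
            # "serotype=Enteritidis,antigen_formula=9:g,m:-" (a hypothetical entry,
            # not one taken from a real pickle) and a header accession such as
            # GCA_000008105.1, the re.sub() above reduces
            # "serotype=Enteritidis,antigen_formula=9:g,m:-| | GCA_000008105.1"
            # to "Enteritidis | GCA_000008105.1".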

            distance_mat["xcats"] = col_names
            csv_out_fh.write("\t".join(["Sample"] + col_names) + "\n")
            line_num = 0

            # Keep the matrix rows that map to samples (per sample_cols), round
            # their values to 5 decimal places, and write them out; rows for
            # genome accessions are skipped.
            for line in csv_in_fh:
                if line_num not in sample_cols:
                    continue
                else:
                    heatmap_rows = [
                        str(round(float(line.strip().split(",")[col]), 5)) for col in acc_cols
                    ]
                    distance_mat["data"]["data"].append(heatmap_rows)
                    csv_out_fh.write("\t".join([row_names[line_num]] + heatmap_rows) + "\n")
                    line_num += 1

    distance_mat["ycats"] = row_names
    json.dump(distance_mat, open(out_json, "w"))


if __name__ == "__main__":
    main()