annotate 1.0.0/bin/sourmash_sim_matrix.py @ 0:801b85b03a17 draft default tip

planemo upload
author galaxytrakr
date Thu, 28 May 2026 20:31:42 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env python3
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
2
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
3 # Kranti Konganti
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
4
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
5 import os
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
6 import argparse
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
7 import inspect
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
8 import logging
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
9 import re
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
10 import pickle
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
11 import pprint
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
12 import json
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
13 from collections import defaultdict
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
14
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
15 # Set logging.
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
16 logging.basicConfig(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
17 format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
18 level=logging.DEBUG,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
19 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
20
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
21 # Debug print.
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
22 ppp = pprint.PrettyPrinter(width=50, indent=4)
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
23
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
24 # Multiple inheritence for pretty printing of help text.
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
25 class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
26 pass
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
27
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
28
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
29 def main() -> None:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
30 """
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
31 This script works only in the context of `bettercallsal` Nextflow workflow.
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
32 It takes:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
33 1. A CSV file containing a similarity matrix or dissimilarity matrix where
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
34 the header row contains the names.
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
35 3. It takes indexed NCBI Pathogen metadata in pickle format and converts
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
36 accessions to serotype names in the final distance matrix output.
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
37 """
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
38
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
39 prog_name = os.path.basename(inspect.stack()[0].filename)
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
40
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
41 parser = argparse.ArgumentParser(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
42 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
43 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
44
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
45 required = parser.add_argument_group("required arguments")
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
46
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
47 required.add_argument(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
48 "-csv",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
49 dest="mat",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
50 default=False,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
51 required=True,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
52 help="Absolute UNIX path to .csv file containing similarity\n"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
53 + "or dissimilarity matrix from `sourmash compare`.",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
54 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
55 required.add_argument(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
56 "-pickle",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
57 dest="acc2sero",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
58 default=False,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
59 required=True,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
60 help="Absolute UNIX Path to the *ACC2SERO.pickle\n"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
61 + "metadata file. On raven2, these are located at\n"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
62 + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX/",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
63 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
64 required.add_argument(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
65 "-labels",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
66 dest="labels",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
67 default=False,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
68 required=True,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
69 help="Absolute UNIX Path to the *.labels.txt\n"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
70 + "file from `sourmash compare`. The accessions\n"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
71 + "will be renanamed to serotype names.",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
72 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
73
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
74 args = parser.parse_args()
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
75 csv = args.mat
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
76 labels = args.labels
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
77 pickled_sero = args.acc2sero
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
78 row_names = list()
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
79 distance_mat = defaultdict(defaultdict)
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
80 out_csv = os.path.join(os.getcwd(), "bcs_sourmash_matrix.tblsum.txt")
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
81 out_json = os.path.join(os.getcwd(), "bcs_sourmash_matrix_mqc.json")
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
82
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
83 # Prepare dictionary to be dumped as JSON.
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
84 distance_mat["id"] = "BETTERCALLSAL_CONTAINMENT_INDEX"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
85 distance_mat["section_name"] = "Containment index"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
86 distance_mat["description"] = (
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
87 "This section shows the containment index between a sample and the genomes"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
88 + "by running <code>sourmash gather</code> "
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
89 + "using <code>--containment</code> option."
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
90 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
91 distance_mat["plot_type"] = "heatmap"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
92 distance_mat["pconfig"]["id"] = "bettercallsal_containment_index_heatmap"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
93 distance_mat["pconfig"]["title"] = "Sourmash: containment index"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
94 distance_mat["pconfig"]["xTitle"] = "Samples"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
95 distance_mat["pconfig"]["yTitle"] = "Isolates (Genome assemblies)"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
96 distance_mat["pconfig"]["ycats_samples"] = "False"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
97 distance_mat["pconfig"]["xcats_samples"] = "False"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
98 distance_mat["pconfig"]["square"] = "False"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
99 distance_mat["pconfig"]["min"] = "0.0"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
100 distance_mat["pconfig"]["max"] = "1.0"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
101 distance_mat["data"]["data"] = list()
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
102
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
103 if pickled_sero and (not os.path.exists(pickled_sero) or not os.path.getsize(pickled_sero)):
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
104 logging.error(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
105 "The pickle file,\n" + f"{os.path.basename(pickled_sero)} does not exist or is empty!"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
106 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
107 exit(1)
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
108 else:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
109 acc2sero = pickle.load(file=open(pickled_sero, "rb"))
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
110
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
111 if csv and (not os.path.exists(csv) or not os.path.getsize(csv) > 0):
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
112 logging.error("File,\n" + f"{csv}\ndoes not exist " + "or is empty!")
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
113 exit(0)
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
114
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
115 if labels and (not os.path.exists(labels) or not os.path.getsize(labels) > 0):
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
116 logging.error("File,\n" + f"{labels}\ndoes not exist " + "or is empty!")
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
117 exit(0)
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
118
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
119 # with open(out_labels, "w") as out_labels_fh:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
120 with open(labels, "r") as labels_fh:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
121 for line in labels_fh:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
122 line = line.strip()
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
123 if line not in acc2sero.keys():
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
124 row_names.append(line)
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
125
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
126 labels_fh.close()
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
127
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
128 with open(out_csv, "w") as csv_out_fh:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
129 with open(csv, "r") as csv_in_fh:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
130 header = csv_in_fh.readline().strip().split(",")
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
131 acc_cols = [idx for idx, col in enumerate(header) if col in acc2sero.keys()]
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
132 sample_cols = [idx for idx, col in enumerate(header) if col not in acc2sero.keys()]
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
133
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
134 col_names = [
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
135 re.sub(r"serotype=|\,antigen_formula=.*?\|", "", s)
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
136 for s in [acc2sero[col] + f"| | {col}" for col in header if col in acc2sero.keys()]
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
137 ]
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
138
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
139 distance_mat["xcats"] = col_names
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
140 csv_out_fh.write("\t".join(["Sample"] + col_names) + "\n")
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
141 line_num = 0
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
142
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
143 for line in csv_in_fh:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
144 if line_num not in sample_cols:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
145 continue
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
146 else:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
147
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
148 heatmap_rows = [
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
149 str(round(float(line.strip().split(",")[col]), 5)) for col in acc_cols
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
150 ]
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
151 # distance_mat["data"]["hmdata"].append(heatmap_rows)
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
152 # distance_mat["data"][row_names[line_num]] = heatmap_rows
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
153 distance_mat["data"]["data"].append(heatmap_rows)
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
154 # distance_mat["data"][row_names[line_num]] = dict(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
155 # [(col_names[idx], val) for idx, val in enumerate(heatmap_rows)]
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
156 # )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
157 csv_out_fh.write("\t".join([row_names[line_num]] + heatmap_rows) + "\n")
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
158 line_num += 1
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
159 csv_in_fh.close()
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
160 csv_out_fh.close()
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
161
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
162 distance_mat["ycats"] = row_names
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
163 json.dump(distance_mat, open(out_json, "w"))
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
164
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
165
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
166 if __name__ == "__main__":
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
167 main()