kkonganti@1
|
1 #!/usr/bin/env python3
|
kkonganti@1
|
2
|
kkonganti@1
|
3 # Kranti Konganti
|
kkonganti@1
|
4
|
kkonganti@1
|
5 import os
|
kkonganti@1
|
6 import argparse
|
kkonganti@1
|
7 import inspect
|
kkonganti@1
|
8 import logging
|
kkonganti@1
|
9 import re
|
kkonganti@1
|
10 import pickle
|
kkonganti@1
|
11 import pprint
|
kkonganti@1
|
12 import json
|
kkonganti@1
|
13 from collections import defaultdict
|
kkonganti@1
|
14
|
kkonganti@1
|
15 # Set logging.
|
kkonganti@1
|
16 logging.basicConfig(
|
kkonganti@1
|
17 format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
|
kkonganti@1
|
18 level=logging.DEBUG,
|
kkonganti@1
|
19 )
|
kkonganti@1
|
20
|
kkonganti@1
|
21 # Debug print.
|
kkonganti@1
|
22 ppp = pprint.PrettyPrinter(width=50, indent=4)
|
kkonganti@1
|
23
|
kkonganti@1
|
24 # Multiple inheritence for pretty printing of help text.
|
kkonganti@1
|
25 class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
|
kkonganti@1
|
26 pass
|
kkonganti@1
|
27
|
kkonganti@1
|
28
|
kkonganti@1
|
29 def main() -> None:
|
kkonganti@1
|
30 """
|
kkonganti@1
|
31 This script works only in the context of `bettercallsal` Nextflow workflow.
|
kkonganti@1
|
32 It takes:
|
kkonganti@1
|
33 1. A CSV file containing a similarity matrix or dissimilarity matrix where
|
kkonganti@1
|
34 the header row contains the names.
|
kkonganti@1
|
35 3. It takes indexed NCBI Pathogen metadata in pickle format and converts
|
kkonganti@1
|
36 accessions to serotype names in the final distance matrix output.
|
kkonganti@1
|
37 """
|
kkonganti@1
|
38
|
kkonganti@1
|
39 prog_name = os.path.basename(inspect.stack()[0].filename)
|
kkonganti@1
|
40
|
kkonganti@1
|
41 parser = argparse.ArgumentParser(
|
kkonganti@1
|
42 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
|
kkonganti@1
|
43 )
|
kkonganti@1
|
44
|
kkonganti@1
|
45 required = parser.add_argument_group("required arguments")
|
kkonganti@1
|
46
|
kkonganti@1
|
47 required.add_argument(
|
kkonganti@1
|
48 "-csv",
|
kkonganti@1
|
49 dest="mat",
|
kkonganti@1
|
50 default=False,
|
kkonganti@1
|
51 required=True,
|
kkonganti@1
|
52 help="Absolute UNIX path to .csv file containing similarity\n"
|
kkonganti@1
|
53 + "or dissimilarity matrix from `sourmash compare`.",
|
kkonganti@1
|
54 )
|
kkonganti@1
|
55 required.add_argument(
|
kkonganti@1
|
56 "-pickle",
|
kkonganti@1
|
57 dest="acc2sero",
|
kkonganti@1
|
58 default=False,
|
kkonganti@1
|
59 required=True,
|
kkonganti@1
|
60 help="Absolute UNIX Path to the *ACC2SERO.pickle\n"
|
kkonganti@1
|
61 + "metadata file. On raven2, these are located at\n"
|
kkonganti@1
|
62 + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX/",
|
kkonganti@1
|
63 )
|
kkonganti@1
|
64 required.add_argument(
|
kkonganti@1
|
65 "-labels",
|
kkonganti@1
|
66 dest="labels",
|
kkonganti@1
|
67 default=False,
|
kkonganti@1
|
68 required=True,
|
kkonganti@1
|
69 help="Absolute UNIX Path to the *.labels.txt\n"
|
kkonganti@1
|
70 + "file from `sourmash compare`. The accessions\n"
|
kkonganti@1
|
71 + "will be renanamed to serotype names.",
|
kkonganti@1
|
72 )
|
kkonganti@1
|
73
|
kkonganti@1
|
74 args = parser.parse_args()
|
kkonganti@1
|
75 csv = args.mat
|
kkonganti@1
|
76 labels = args.labels
|
kkonganti@1
|
77 pickled_sero = args.acc2sero
|
kkonganti@1
|
78 row_names = list()
|
kkonganti@1
|
79 distance_mat = defaultdict(defaultdict)
|
kkonganti@1
|
80 out_csv = os.path.join(os.getcwd(), "bcs_sourmash_matrix.tblsum.txt")
|
kkonganti@1
|
81 out_json = os.path.join(os.getcwd(), "bcs_sourmash_matrix_mqc.json")
|
kkonganti@1
|
82
|
kkonganti@1
|
83 # Prepare dictionary to be dumped as JSON.
|
kkonganti@1
|
84 distance_mat["id"] = "BETTERCALLSAL_CONTAINMENT_INDEX"
|
kkonganti@1
|
85 distance_mat["section_name"] = "Containment index"
|
kkonganti@1
|
86 distance_mat["description"] = (
|
kkonganti@1
|
87 "This section shows the containment index between a sample and the genomes"
|
kkonganti@1
|
88 + "by running <code>sourmash gather</code> "
|
kkonganti@1
|
89 + "using <code>--containment</code> option."
|
kkonganti@1
|
90 )
|
kkonganti@1
|
91 distance_mat["plot_type"] = "heatmap"
|
kkonganti@1
|
92 distance_mat["pconfig"]["id"] = "bettercallsal_containment_index_heatmap"
|
kkonganti@1
|
93 distance_mat["pconfig"]["title"] = "Sourmash: containment index"
|
kkonganti@1
|
94 distance_mat["pconfig"]["xTitle"] = "Samples"
|
kkonganti@1
|
95 distance_mat["pconfig"]["yTitle"] = "Isolates (Genome assemblies)"
|
kkonganti@1
|
96 distance_mat["pconfig"]["ycats_samples"] = "False"
|
kkonganti@1
|
97 distance_mat["pconfig"]["xcats_samples"] = "False"
|
kkonganti@1
|
98 distance_mat["pconfig"]["square"] = "False"
|
kkonganti@1
|
99 distance_mat["pconfig"]["min"] = "0.0"
|
kkonganti@1
|
100 distance_mat["pconfig"]["max"] = "1.0"
|
kkonganti@1
|
101 distance_mat["data"]["data"] = list()
|
kkonganti@1
|
102
|
kkonganti@1
|
103 if pickled_sero and (not os.path.exists(pickled_sero) or not os.path.getsize(pickled_sero)):
|
kkonganti@1
|
104 logging.error(
|
kkonganti@1
|
105 "The pickle file,\n" + f"{os.path.basename(pickled_sero)} does not exist or is empty!"
|
kkonganti@1
|
106 )
|
kkonganti@1
|
107 exit(1)
|
kkonganti@1
|
108 else:
|
kkonganti@1
|
109 acc2sero = pickle.load(file=open(pickled_sero, "rb"))
|
kkonganti@1
|
110
|
kkonganti@1
|
111 if csv and (not os.path.exists(csv) or not os.path.getsize(csv) > 0):
|
kkonganti@1
|
112 logging.error("File,\n" + f"{csv}\ndoes not exist " + "or is empty!")
|
kkonganti@1
|
113 exit(0)
|
kkonganti@1
|
114
|
kkonganti@1
|
115 if labels and (not os.path.exists(labels) or not os.path.getsize(labels) > 0):
|
kkonganti@1
|
116 logging.error("File,\n" + f"{labels}\ndoes not exist " + "or is empty!")
|
kkonganti@1
|
117 exit(0)
|
kkonganti@1
|
118
|
kkonganti@1
|
119 # with open(out_labels, "w") as out_labels_fh:
|
kkonganti@1
|
120 with open(labels, "r") as labels_fh:
|
kkonganti@1
|
121 for line in labels_fh:
|
kkonganti@1
|
122 line = line.strip()
|
kkonganti@1
|
123 if line not in acc2sero.keys():
|
kkonganti@1
|
124 row_names.append(line)
|
kkonganti@1
|
125
|
kkonganti@1
|
126 labels_fh.close()
|
kkonganti@1
|
127
|
kkonganti@1
|
128 with open(out_csv, "w") as csv_out_fh:
|
kkonganti@1
|
129 with open(csv, "r") as csv_in_fh:
|
kkonganti@1
|
130 header = csv_in_fh.readline().strip().split(",")
|
kkonganti@1
|
131 acc_cols = [idx for idx, col in enumerate(header) if col in acc2sero.keys()]
|
kkonganti@1
|
132 sample_cols = [idx for idx, col in enumerate(header) if col not in acc2sero.keys()]
|
kkonganti@1
|
133
|
kkonganti@1
|
134 col_names = [
|
kkonganti@1
|
135 re.sub(r"serotype=|\,antigen_formula=.*?\|", "", s)
|
kkonganti@1
|
136 for s in [acc2sero[col] + f"| | {col}" for col in header if col in acc2sero.keys()]
|
kkonganti@1
|
137 ]
|
kkonganti@1
|
138
|
kkonganti@1
|
139 distance_mat["xcats"] = col_names
|
kkonganti@1
|
140 csv_out_fh.write("\t".join(["Sample"] + col_names) + "\n")
|
kkonganti@1
|
141 line_num = 0
|
kkonganti@1
|
142
|
kkonganti@1
|
143 for line in csv_in_fh:
|
kkonganti@1
|
144 if line_num not in sample_cols:
|
kkonganti@1
|
145 continue
|
kkonganti@1
|
146 else:
|
kkonganti@1
|
147
|
kkonganti@1
|
148 heatmap_rows = [
|
kkonganti@1
|
149 str(round(float(line.strip().split(",")[col]), 5)) for col in acc_cols
|
kkonganti@1
|
150 ]
|
kkonganti@1
|
151 # distance_mat["data"]["hmdata"].append(heatmap_rows)
|
kkonganti@1
|
152 # distance_mat["data"][row_names[line_num]] = heatmap_rows
|
kkonganti@1
|
153 distance_mat["data"]["data"].append(heatmap_rows)
|
kkonganti@1
|
154 # distance_mat["data"][row_names[line_num]] = dict(
|
kkonganti@1
|
155 # [(col_names[idx], val) for idx, val in enumerate(heatmap_rows)]
|
kkonganti@1
|
156 # )
|
kkonganti@1
|
157 csv_out_fh.write("\t".join([row_names[line_num]] + heatmap_rows) + "\n")
|
kkonganti@1
|
158 line_num += 1
|
kkonganti@1
|
159 csv_in_fh.close()
|
kkonganti@1
|
160 csv_out_fh.close()
|
kkonganti@1
|
161
|
kkonganti@1
|
162 distance_mat["ycats"] = row_names
|
kkonganti@1
|
163 json.dump(distance_mat, open(out_json, "w"))
|
kkonganti@1
|
164
|
kkonganti@1
|
165
|
kkonganti@1
|
166 if __name__ == "__main__":
|
kkonganti@1
|
167 main()
|