#!/usr/bin/env python3

# Kranti Konganti

import argparse
import inspect
import json
import logging
import os
import pickle
import pprint
import re
from collections import defaultdict

# Set logging.
logging.basicConfig(
    format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
    level=logging.DEBUG,
)

# Debug print.
ppp = pprint.PrettyPrinter(width=50, indent=4)

# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    pass


def main() -> None:
    """
    This script works only in the context of `bettercallsal` Nextflow workflow.
    It takes:
        1. A CSV file containing a similarity matrix or dissimilarity matrix where
            the header row contains the names.
        2. It takes indexed NCBI Pathogen metadata in pickle format and converts
            accessions to serotype names in the final distance matrix output.
    """

    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-csv",
        dest="mat",
        default=False,
        required=True,
        help="Absolute UNIX path to .csv file containing similarity\n"
        + "or dissimilarity matrix from `sourmash compare`.",
    )
    required.add_argument(
        "-pickle",
        dest="acc2sero",
        default=False,
        required=True,
        help="Absolute UNIX Path to the *ACC2SERO.pickle\n"
        + "metadata file. On raven2, these are located at\n"
        + "/hpc/db/bettercallsal/PDGXXXXXXXXXX.XXXXX/",
    )
    required.add_argument(
        "-labels",
        dest="labels",
        default=False,
        required=True,
        help="Absolute UNIX Path to the *.labels.txt\n"
        + "file from `sourmash compare`. The accessions\n"
        + "will be renamed to serotype names.",
    )

    args = parser.parse_args()
    csv = args.mat
    labels = args.labels
    pickled_sero = args.acc2sero
    row_names = list()
    distance_mat = defaultdict(defaultdict)
    out_csv = os.path.join(os.getcwd(), "bcs_sourmash_matrix.tblsum.txt")
    out_json = os.path.join(os.getcwd(), "bcs_sourmash_matrix_mqc.json")

    # Prepare the dictionary that will be dumped as a MultiQC
    # custom-content JSON (heatmap plot).
    distance_mat["id"] = "BETTERCALLSAL_CONTAINMENT_INDEX"
    distance_mat["section_name"] = "Containment index"
    distance_mat["description"] = (
        "This section shows the containment index between a sample and the genomes "
        + "by running sourmash gather "
        + "using --containment option."
    )
    distance_mat["plot_type"] = "heatmap"
    distance_mat["pconfig"]["id"] = "bettercallsal_containment_index_heatmap"
    distance_mat["pconfig"]["title"] = "Sourmash: containment index"
    distance_mat["pconfig"]["xTitle"] = "Samples"
    distance_mat["pconfig"]["yTitle"] = "Isolates (Genome assemblies)"
    distance_mat["pconfig"]["ycats_samples"] = "False"
    distance_mat["pconfig"]["xcats_samples"] = "False"
    distance_mat["pconfig"]["square"] = "False"
    distance_mat["pconfig"]["min"] = "0.0"
    distance_mat["pconfig"]["max"] = "1.0"
    distance_mat["data"]["data"] = list()

    # The accession-to-serotype metadata is mandatory: bail out hard if it
    # is missing or empty, otherwise load it (context manager so the file
    # handle is always released).
    if pickled_sero and (not os.path.exists(pickled_sero) or not os.path.getsize(pickled_sero)):
        logging.error(
            "The pickle file,\n" + f"{os.path.basename(pickled_sero)} does not exist or is empty!"
        )
        exit(1)
    else:
        with open(pickled_sero, "rb") as pickle_fh:
            acc2sero = pickle.load(pickle_fh)

    # NOTE(review): these two paths exit with status 0 (success) on a
    # missing/empty input, unlike the pickle check above which exits 1.
    # Presumably intentional so the Nextflow workflow does not fail when a
    # sample produced no sourmash results — confirm before changing.
    if csv and (not os.path.exists(csv) or not os.path.getsize(csv) > 0):
        logging.error("File,\n" + f"{csv}\ndoes not exist " + "or is empty!")
        exit(0)

    if labels and (not os.path.exists(labels) or not os.path.getsize(labels) > 0):
        logging.error("File,\n" + f"{labels}\ndoes not exist " + "or is empty!")
        exit(0)

    # Collect row (sample) names: any label that is NOT a known accession
    # in the pickled metadata is treated as a sample name.
    with open(labels, "r") as labels_fh:
        for line in labels_fh:
            line = line.strip()
            if line not in acc2sero:
                row_names.append(line)

    with open(out_csv, "w") as csv_out_fh, open(csv, "r") as csv_in_fh:
        header = csv_in_fh.readline().strip().split(",")
        # Column indices of genome accessions vs. query samples in the
        # square matrix emitted by `sourmash compare`.
        acc_cols = [idx for idx, col in enumerate(header) if col in acc2sero]
        sample_cols = [idx for idx, col in enumerate(header) if col not in acc2sero]

        # Rename accession columns to "<serotype>| | <accession>" by
        # stripping the "serotype=" prefix and the ",antigen_formula=...|"
        # portion from the pickled metadata value.
        col_names = [
            re.sub(r"serotype=|\,antigen_formula=.*?\|", "", s)
            for s in [acc2sero[col] + f"| | {col}" for col in header if col in acc2sero]
        ]

        distance_mat["xcats"] = col_names
        csv_out_fh.write("\t".join(["Sample"] + col_names) + "\n")
        line_num = 0

        # Keep only the matrix rows that belong to samples, reporting each
        # sample's containment against every genome accession column.
        # NOTE(review): line_num doubles as both the matrix row index and
        # the index into row_names, and only increments on sample rows —
        # this assumes sample rows come first in the matrix (i.e. sample
        # signatures were listed before genome signatures). Confirm against
        # the workflow's `sourmash compare` invocation.
        for line in csv_in_fh:
            if line_num not in sample_cols:
                continue
            # Round containment values to 5 decimal places for the table.
            heatmap_rows = [
                str(round(float(line.strip().split(",")[col]), 5)) for col in acc_cols
            ]
            distance_mat["data"]["data"].append(heatmap_rows)
            csv_out_fh.write("\t".join([row_names[line_num]] + heatmap_rows) + "\n")
            line_num += 1

    distance_mat["ycats"] = row_names
    with open(out_json, "w") as json_out_fh:
        json.dump(distance_mat, json_out_fh)


if __name__ == "__main__":
    main()