#!/usr/bin/env python3

# Kranti Konganti

import argparse
import glob
import inspect
import logging
import os
import pickle
import pprint
import re
from collections import defaultdict


# Multiple inheritence for pretty printing of help text.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    pass


# Main
def main() -> None:
    """
    This script works only in the context of a Nextflow workflow.
    It takes:
        1. A pickle file containing a dictionary object where genome accession
            is the key and the computed serotype is the value.
        OR
        1. It takes a pickle file containing a nested dictionary, where genome accession
            is the key and the metadata is a dictionary associated with that key.
        2. A file with `mash screen` results.
        3. A directory containing genomes' FASTA in gzipped format where the
            FASTA file contains 2 lines: one FASTA header followed by
            genome Sequence.
    and then generates a concatenated FASTA file of top N unique `mash screen`
    genome hits as requested.
    """

    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    # Debug print.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    parser.add_argument(
        "-s",
        dest="sero_snp_metadata",
        default=False,
        required=False,
        help="Absolute UNIX path to metadata text file with the field separator, | "
        + "\nand 5 fields: serotype|asm_lvl|asm_url|snp_cluster_id"
        + "\nEx: serotype=Derby,antigen_formula=4:f,g:-|Scaffold|402440|ftp://...\n|PDS000096654.2\n"
        + "Mentioning this option will create a pickle file for the\nprovided metadata and exits.",
    )
    parser.add_argument(
        "-fs",
        dest="force_write_pick",
        action="store_true",
        required=False,
        help="By default, when -s flag is on, the pickle file named *.ACC2SERO.pickle\n"
        + "is written to CWD. If the file exists, the program will not overwrite\n"
        + "and exit. Use -fs option to overwrite.",
    )
    parser.add_argument(
        "-m",
        dest="mash_screen_res",
        default=False,
        required=False,
        help="Absolute UNIX path to `mash screen` results file.",
    )
    parser.add_argument(
        "-ms",
        dest="mash_screen_res_suffix",
        default=".screened",
        required=False,
        help="Suffix of the `mash screen` result file.",
    )
    parser.add_argument(
        "-ps",
        dest="pickled_sero",
        default=False,
        required=False,
        help="Absolute UNIX Path to serialized metadata object in a pickle file.\n"
        + "You can create the pickle file of the metadata using -s option.\n"
        + "Required if -m is on.",
    )
    parser.add_argument(
        "-gd",
        dest="genomes_dir",
        default=False,
        required=False,
        help="Absolute UNIX path to a directory containing\n"
        + "gzipped genome FASTA files.\n"
        + "Required if -m is on.",
    )
    parser.add_argument(
        "-gds",
        dest="genomes_dir_suffix",
        default="_scaffolded_genomic.fna.gz",
        required=False,
        help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
    )
    parser.add_argument(
        "-n",
        dest="num_uniq_hits",
        default=10,
        required=False,
        help="This many number of serotype genomes' accessions are returned.",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="MASH_SCREEN",
        required=False,
        help="Set the output file prefix for .fna.gz and .txt files.",
    )

    args = parser.parse_args()
    num_uniq_hits = int(args.num_uniq_hits)
    mash_screen_res = args.mash_screen_res
    mash_screen_res_suffix = args.mash_screen_res_suffix
    pickle_sero = args.sero_snp_metadata
    pickled_sero = args.pickled_sero
    f_write_pick = args.force_write_pick
    genomes_dir = args.genomes_dir
    genomes_dir_suffix = args.genomes_dir_suffix
    out_prefix = args.out_prefix
    # Metadata keys required (in nested-dict pickle mode) to emit a METADATA
    # row; the values are short labels used only as documentation here.
    req_metadata = {
        "mlst_sequence_type": "ST",
        "epi_type": "ET",
        "host": "HO",
        "host_disease": "HD",
        "isolation_source": "IS",
        "outbreak": "OU",
        "source_type": "SOT",
        "strain": "GS",
    }
    target_acc_key = "target_acc"
    ncbi_path_heading = "NCBI Pathogen Isolates Browser"
    # NOTE(review): ncbi_path_uri is never referenced below in the visible
    # source — the popup column was presumably meant to be an HTML link built
    # from it (verify against upstream history before removing).
    ncbi_path_uri = "https://www.ncbi.nlm.nih.gov/pathogens/isolates/#"
    mash_genomes_gz = os.path.join(
        os.getcwd(), out_prefix + "_TOP_" + str(num_uniq_hits) + "_UNIQUE_HITS.fna.gz"
    )
    # Fix: escape the dots so "." is not a regex wildcard.
    mash_uniq_hits_txt = os.path.join(
        os.getcwd(), re.sub(r"\.fna\.gz", ".txt", os.path.basename(mash_genomes_gz))
    )
    mash_uniq_accs_txt = os.path.join(
        os.getcwd(), re.sub(r"\.fna\.gz", "_ACCS.txt", os.path.basename(mash_genomes_gz))
    )
    mash_popup_info_txt = os.path.join(
        os.getcwd(), re.sub(r"\.fna\.gz", "_POPUP.txt", os.path.basename(mash_genomes_gz))
    )

    # Refuse to clobber a pre-existing concatenated FASTA result.
    if mash_screen_res and os.path.exists(mash_genomes_gz):
        logging.error(
            "A concatenated genome FASTA file,\n"
            + f"{os.path.basename(mash_genomes_gz)} already exists in:\n"
            + f"{os.getcwd()}\n"
            + "Please remove or move it as we will not "
            + "overwrite it."
        )
        exit(1)

    # The .txt companion file IS regenerated each run.
    if os.path.exists(mash_uniq_hits_txt) and os.path.getsize(mash_uniq_hits_txt) > 0:
        os.remove(mash_uniq_hits_txt)

    if mash_screen_res and (not genomes_dir or not pickled_sero):
        logging.error("When -m is on, -ps and -gd are also required.")
        exit(1)

    if genomes_dir:
        if not os.path.isdir(genomes_dir):
            logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
            exit(1)
        if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
            logging.error(
                "Genomes directory"
                + f"{genomes_dir}"
                + "\ndoes not seem to have any\n"
                + f"files ending with suffix: {genomes_dir_suffix}"
            )
            exit(1)

    # Mode 1 (-s): parse the pipe-delimited metadata file into an
    # accession -> serotype dict, pickle it to CWD, and exit.
    if pickle_sero and os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0:
        acc2serotype = defaultdict()
        init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".ACC2SERO.pickle")

        if (
            os.path.exists(init_pickled_sero)
            and os.path.getsize(init_pickled_sero)
            and not f_write_pick
        ):
            logging.error(
                f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n"
                + "Use -fs to force overwrite it."
            )
            exit(1)

        with open(pickle_sero, "r") as sero_snp_meta:
            for line in sero_snp_meta:
                cols = line.strip().split("|")

                # Fix: validate the column count BEFORE indexing cols[3];
                # previously a short row raised IndexError instead of this
                # intended error message.
                if not 4 <= len(cols) <= 5:
                    logging.error(
                        f"The metadata file {pickle_sero} is malformed.\n"
                        + f"Expected 4-5 columns. Got {len(cols)} columns.\n"
                    )
                    exit(1)

                url_cols = cols[3].split("/")

                # Short URLs carry the accession in field 4, long FTP-style
                # URLs in field 10 (1-based).
                if not len(url_cols) > 5:
                    acc = url_cols[3]
                else:
                    acc = url_cols[9]

                if not re.match(r"^GC[AF]\_\d+\.\d+$", acc):
                    logging.error(
                        f"Did not find accession in either field number 4\n"
                        + "or field number 10 of column 4."
                    )
                    exit(1)

                acc2serotype[acc] = cols[0]

        with open(init_pickled_sero, "wb") as write_pickled_sero:
            pickle.dump(file=write_pickled_sero, obj=acc2serotype)

        logging.info(
            f"Created the pickle file for\n{os.path.basename(pickle_sero)}.\n"
            + "This was the only requested function."
        )
        exit(0)
    elif pickle_sero and not (
        os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0
    ):
        logging.error(
            "Requested to create pickle from metadata, but\n"
            + f"the file, {os.path.basename(pickle_sero)} is empty or\ndoes not exist!"
        )
        exit(1)

    # Mode 2 (-m): read `mash screen` results, pick top N unique hits, and
    # concatenate the corresponding gzipped genomes.
    if mash_screen_res and os.path.exists(mash_screen_res):
        if os.path.getsize(mash_screen_res) > 0:
            seen_uniq_hits = 0
            # Fix: load the pickle through a context manager so the file
            # handle is not leaked.
            with open(pickled_sero, "rb") as pickled_sero_fh:
                unpickled_acc2serotype = pickle.load(file=pickled_sero_fh)

            with open(mash_screen_res, "r") as msh_res:
                # identity -> [accessions]; identity is kept as the raw
                # string from column 1, so the later sort is lexicographic.
                mash_hits = defaultdict()
                seen_mash_sero = defaultdict()

                for line in msh_res:
                    cols = line.strip().split("\t")

                    if len(cols) < 5:
                        logging.error(
                            f"The file {os.path.basename(mash_screen_res)} seems to\n"
                            + "be malformed. It contains less than required 5-6 columns."
                        )
                        exit(1)

                    # Fix: test the search result before calling .group();
                    # previously a non-matching column 5 raised
                    # AttributeError, making the error branch unreachable.
                    acc_match = re.search(r"GC[AF].*?" + genomes_dir_suffix, cols[4])

                    if acc_match:
                        mash_hit_acc = re.sub(genomes_dir_suffix, "", acc_match.group())
                        mash_hits.setdefault(cols[0], []).append(mash_hit_acc)
                    else:
                        logging.error(
                            "Did not find an assembly accession in column\n"
                            + f"number 5. Found {cols[4]} instead. Cannot proceed!"
                        )
                        exit(1)
        elif os.path.getsize(mash_screen_res) == 0:
            # Empty screen results: record the sample as failed and exit OK.
            # Fix: str.rstrip() strips a character SET, not a suffix, and
            # could eat trailing characters of the sample name; remove the
            # suffix explicitly instead.
            failed_sample_name = os.path.basename(mash_screen_res)
            if mash_screen_res_suffix and failed_sample_name.endswith(
                mash_screen_res_suffix
            ):
                failed_sample_name = failed_sample_name[
                    : -len(mash_screen_res_suffix)
                ]
            with open(
                os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
            ) as failed_sample_fh:
                failed_sample_fh.write(f"{failed_sample_name}\n")
            exit(0)

        # ppp.pprint(mash_hits)
        msh_out_txt = open(mash_uniq_hits_txt, "w")
        msh_out_accs_txt = open(mash_uniq_accs_txt, "w")
        mash_out_pop_txt = open(mash_popup_info_txt, "w")
        wrote_header_pop = False
        wrote_header_acc = False

        with open(mash_genomes_gz, "wb") as msh_out_gz:
            # Highest (lexicographic) identity first.
            for _, (ident, acc_list) in enumerate(
                sorted(mash_hits.items(), reverse=True)
            ):
                for acc in acc_list:
                    if seen_uniq_hits >= num_uniq_hits:
                        break
                    if isinstance(unpickled_acc2serotype[acc], dict):
                        # Nested-dict pickle: emit iTOL-style POPUP_INFO and
                        # METADATA rows.
                        if target_acc_key in unpickled_acc2serotype[acc].keys():
                            if not wrote_header_pop:
                                mash_out_pop_txt.write(
                                    "POPUP_INFO\nSEPARATOR COMMA\nDATA\n"
                                )
                                wrote_header_pop = True

                            pdt = "".join(unpickled_acc2serotype[acc][target_acc_key])

                            popup_line = ",".join(
                                [
                                    acc,
                                    ncbi_path_heading,
                                    f'{pdt}',
                                ]
                            )
                            mash_out_pop_txt.write(popup_line + "\n")

                        # Count the hit only when every required metadata
                        # field is present for this accession.
                        if all(
                            k in unpickled_acc2serotype[acc].keys()
                            for k in req_metadata.keys()
                        ):
                            if not wrote_header_acc:
                                msh_out_txt.write(
                                    "METADATA\nSEPARATOR COMMA\nFIELD_LABELS,"
                                )
                                msh_out_txt.write(
                                    f"{','.join([str(key).upper() for key in req_metadata.keys()])}\nDATA\n"
                                )
                                wrote_header_acc = True

                            # Commas are stripped from values because comma
                            # is the field separator of the output format.
                            metadata_line = ",".join(
                                [
                                    re.sub(
                                        ",",
                                        "",
                                        "|".join(unpickled_acc2serotype[acc][m]),
                                    )
                                    for m in req_metadata.keys()
                                ],
                            )

                            msh_out_txt.write(f"{acc.strip()},{metadata_line}\n")
                            msh_out_accs_txt.write(
                                f"{os.path.join(genomes_dir, acc + genomes_dir_suffix)}\n"
                            )
                            seen_mash_sero[acc] = 1
                            seen_uniq_hits += 1
                    elif not isinstance(unpickled_acc2serotype[acc], dict):
                        # Flat pickle: one row per previously-unseen serotype.
                        if unpickled_acc2serotype[acc] not in seen_mash_sero.keys():
                            seen_mash_sero[unpickled_acc2serotype[acc]] = 1
                            seen_uniq_hits += 1
                            # print(acc.strip() + '\t' + ident + '\t' + unpickled_acc2serotype[acc], file=sys.stdout)
                            msh_out_txt.write(
                                f"{acc.strip()}\t{unpickled_acc2serotype[acc]}\t{ident}\n"
                            )
                    # Concatenate this hit's gzipped genome (gzip members
                    # concatenate into a valid gzip stream).
                    with open(
                        os.path.join(genomes_dir, acc + genomes_dir_suffix),
                        "rb",
                    ) as msh_in_gz:
                        msh_out_gz.writelines(msh_in_gz.readlines())

        msh_out_txt.close()
        msh_out_accs_txt.close()
        # Fix: this handle was never closed before.
        mash_out_pop_txt.close()
        logging.info(
            f"File {os.path.basename(mash_genomes_gz)}\n"
            + f"written in:\n{os.getcwd()}\nDone! Bye!"
        )
        exit(0)


if __name__ == "__main__":
    main()