#!/usr/bin/env python3

# Kranti Konganti

import argparse
import glob
import inspect
import logging
import os
import pickle
import pprint
import re
from collections import defaultdict


# Multiple inheritence for pretty printing of help text.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    pass


# Main
def main() -> None:
    """
    This script works only in the context of a Nextflow workflow.
    It takes:
        1. A pickle file containing a dictionary object where genome accession
            is the key and the computed serotype is the value.
        OR
        1. It takes a pickle file containing a nested dictionary, where genome accession
            is the key and the metadata is a dictionary associated with that key.
        2. A file with `mash screen` results.
        3. A directory containing genomes' FASTA in gzipped format where the
            FASTA file contains 2 lines: one FASTA header followed by
            genome Sequence.
    and then generates a concatenated FASTA file of top N unique `mash screen`
    genome hits as requested.
    """

    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    # Debug print.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    parser.add_argument(
        "-s",
        dest="sero_snp_metadata",
        default=False,
        required=False,
        help="Absolute UNIX path to metadata text file with the field separator, | "
        + "\nand 5 fields: serotype|asm_lvl|asm_url|snp_cluster_id"
        + "\nEx: serotype=Derby,antigen_formula=4:f,g:-|Scaffold|402440|ftp://...\n|PDS000096654.2\n"
        + "Mentioning this option will create a pickle file for the\nprovided metadata and exits.",
    )
    parser.add_argument(
        "-fs",
        dest="force_write_pick",
        action="store_true",
        required=False,
        help="By default, when -s flag is on, the pickle file named *.ACC2SERO.pickle\n"
        + "is written to CWD. If the file exists, the program will not overwrite\n"
        + "and exit. Use -fs option to overwrite.",
    )
    parser.add_argument(
        "-m",
        dest="mash_screen_res",
        default=False,
        required=False,
        help="Absolute UNIX path to `mash screen` results file.",
    )
    parser.add_argument(
        "-ms",
        dest="mash_screen_res_suffix",
        default=".screened",
        required=False,
        help="Suffix of the `mash screen` result file.",
    )
    parser.add_argument(
        "-ps",
        dest="pickled_sero",
        default=False,
        required=False,
        help="Absolute UNIX Path to serialized metadata object in a pickle file.\n"
        + "You can create the pickle file of the metadata using -s option.\n"
        + "Required if -m is on.",
    )
    parser.add_argument(
        "-gd",
        dest="genomes_dir",
        default=False,
        required=False,
        help="Absolute UNIX path to a directory containing\n"
        + "gzipped genome FASTA files.\n"
        + "Required if -m is on.",
    )
    parser.add_argument(
        "-gds",
        dest="genomes_dir_suffix",
        default="_scaffolded_genomic.fna.gz",
        required=False,
        help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
    )
    parser.add_argument(
        "-n",
        dest="num_uniq_hits",
        default=10,
        required=False,
        help="This many number of serotype genomes' accessions are returned.",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="MASH_SCREEN",
        required=False,
        help="Set the output file prefix for .fna.gz and .txt files.",
    )

    args = parser.parse_args()
    num_uniq_hits = int(args.num_uniq_hits)
    mash_screen_res = args.mash_screen_res
    mash_screen_res_suffix = args.mash_screen_res_suffix
    pickle_sero = args.sero_snp_metadata
    pickled_sero = args.pickled_sero
    f_write_pick = args.force_write_pick
    genomes_dir = args.genomes_dir
    genomes_dir_suffix = args.genomes_dir_suffix
    out_prefix = args.out_prefix
    # Metadata keys required (in nested-dict pickle mode) to emit a METADATA
    # row; the values are short labels used only as documentation here.
    req_metadata = {
        "mlst_sequence_type": "ST",
        "epi_type": "ET",
        "host": "HO",
        "host_disease": "HD",
        "isolation_source": "IS",
        "outbreak": "OU",
        "source_type": "SOT",
        "strain": "GS",
    }
    target_acc_key = "target_acc"
    ncbi_path_heading = "NCBI Pathogen Isolates Browser"
    # NOTE(review): ncbi_path_uri is never referenced below in the visible
    # source — the popup column was presumably meant to be an HTML link built
    # from it (verify against upstream history before removing).
    ncbi_path_uri = "https://www.ncbi.nlm.nih.gov/pathogens/isolates/#"
    mash_genomes_gz = os.path.join(
        os.getcwd(), out_prefix + "_TOP_" + str(num_uniq_hits) + "_UNIQUE_HITS.fna.gz"
    )
    # Fix: escape the dots so "." is not a regex wildcard.
    mash_uniq_hits_txt = os.path.join(
        os.getcwd(), re.sub(r"\.fna\.gz", ".txt", os.path.basename(mash_genomes_gz))
    )
    mash_uniq_accs_txt = os.path.join(
        os.getcwd(), re.sub(r"\.fna\.gz", "_ACCS.txt", os.path.basename(mash_genomes_gz))
    )
    mash_popup_info_txt = os.path.join(
        os.getcwd(), re.sub(r"\.fna\.gz", "_POPUP.txt", os.path.basename(mash_genomes_gz))
    )

    # Refuse to clobber a pre-existing concatenated FASTA result.
    if mash_screen_res and os.path.exists(mash_genomes_gz):
        logging.error(
            "A concatenated genome FASTA file,\n"
            + f"{os.path.basename(mash_genomes_gz)} already exists in:\n"
            + f"{os.getcwd()}\n"
            + "Please remove or move it as we will not "
            + "overwrite it."
        )
        exit(1)

    # The .txt companion file IS regenerated each run.
    if os.path.exists(mash_uniq_hits_txt) and os.path.getsize(mash_uniq_hits_txt) > 0:
        os.remove(mash_uniq_hits_txt)

    if mash_screen_res and (not genomes_dir or not pickled_sero):
        logging.error("When -m is on, -ps and -gd are also required.")
        exit(1)

    if genomes_dir:
        if not os.path.isdir(genomes_dir):
            logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
            exit(1)
        if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
            logging.error(
                "Genomes directory"
                + f"{genomes_dir}"
                + "\ndoes not seem to have any\n"
                + f"files ending with suffix: {genomes_dir_suffix}"
            )
            exit(1)

    # Mode 1 (-s): parse the pipe-delimited metadata file into an
    # accession -> serotype dict, pickle it to CWD, and exit.
    if pickle_sero and os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0:
        acc2serotype = defaultdict()
        init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".ACC2SERO.pickle")

        if (
            os.path.exists(init_pickled_sero)
            and os.path.getsize(init_pickled_sero)
            and not f_write_pick
        ):
            logging.error(
                f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n"
                + "Use -fs to force overwrite it."
            )
            exit(1)

        with open(pickle_sero, "r") as sero_snp_meta:
            for line in sero_snp_meta:
                cols = line.strip().split("|")

                # Fix: validate the column count BEFORE indexing cols[3];
                # previously a short row raised IndexError instead of this
                # intended error message.
                if not 4 <= len(cols) <= 5:
                    logging.error(
                        f"The metadata file {pickle_sero} is malformed.\n"
                        + f"Expected 4-5 columns. Got {len(cols)} columns.\n"
                    )
                    exit(1)

                url_cols = cols[3].split("/")

                # Short URLs carry the accession in field 4, long FTP-style
                # URLs in field 10 (1-based).
                if not len(url_cols) > 5:
                    acc = url_cols[3]
                else:
                    acc = url_cols[9]

                if not re.match(r"^GC[AF]\_\d+\.\d+$", acc):
                    logging.error(
                        f"Did not find accession in either field number 4\n"
                        + "or field number 10 of column 4."
                    )
                    exit(1)

                acc2serotype[acc] = cols[0]

        with open(init_pickled_sero, "wb") as write_pickled_sero:
            pickle.dump(file=write_pickled_sero, obj=acc2serotype)

        logging.info(
            f"Created the pickle file for\n{os.path.basename(pickle_sero)}.\n"
            + "This was the only requested function."
        )
        exit(0)
    elif pickle_sero and not (
        os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0
    ):
        logging.error(
            "Requested to create pickle from metadata, but\n"
            + f"the file, {os.path.basename(pickle_sero)} is empty or\ndoes not exist!"
        )
        exit(1)

    # Mode 2 (-m): read `mash screen` results, pick top N unique hits, and
    # concatenate the corresponding gzipped genomes.
    if mash_screen_res and os.path.exists(mash_screen_res):
        if os.path.getsize(mash_screen_res) > 0:
            seen_uniq_hits = 0
            # Fix: load the pickle through a context manager so the file
            # handle is not leaked.
            with open(pickled_sero, "rb") as pickled_sero_fh:
                unpickled_acc2serotype = pickle.load(file=pickled_sero_fh)

            with open(mash_screen_res, "r") as msh_res:
                # identity -> [accessions]; identity is kept as the raw
                # string from column 1, so the later sort is lexicographic.
                mash_hits = defaultdict()
                seen_mash_sero = defaultdict()

                for line in msh_res:
                    cols = line.strip().split("\t")

                    if len(cols) < 5:
                        logging.error(
                            f"The file {os.path.basename(mash_screen_res)} seems to\n"
                            + "be malformed. It contains less than required 5-6 columns."
                        )
                        exit(1)

                    # Fix: test the search result before calling .group();
                    # previously a non-matching column 5 raised
                    # AttributeError, making the error branch unreachable.
                    acc_match = re.search(r"GC[AF].*?" + genomes_dir_suffix, cols[4])

                    if acc_match:
                        mash_hit_acc = re.sub(genomes_dir_suffix, "", acc_match.group())
                        mash_hits.setdefault(cols[0], []).append(mash_hit_acc)
                    else:
                        logging.error(
                            "Did not find an assembly accession in column\n"
                            + f"number 5. Found {cols[4]} instead. Cannot proceed!"
                        )
                        exit(1)
        elif os.path.getsize(mash_screen_res) == 0:
            # Empty screen results: record the sample as failed and exit OK.
            # Fix: str.rstrip() strips a character SET, not a suffix, and
            # could eat trailing characters of the sample name; remove the
            # suffix explicitly instead.
            failed_sample_name = os.path.basename(mash_screen_res)
            if mash_screen_res_suffix and failed_sample_name.endswith(
                mash_screen_res_suffix
            ):
                failed_sample_name = failed_sample_name[
                    : -len(mash_screen_res_suffix)
                ]
            with open(
                os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
            ) as failed_sample_fh:
                failed_sample_fh.write(f"{failed_sample_name}\n")
            exit(0)

        # ppp.pprint(mash_hits)
        msh_out_txt = open(mash_uniq_hits_txt, "w")
        msh_out_accs_txt = open(mash_uniq_accs_txt, "w")
        mash_out_pop_txt = open(mash_popup_info_txt, "w")
        wrote_header_pop = False
        wrote_header_acc = False

        with open(mash_genomes_gz, "wb") as msh_out_gz:
            # Highest (lexicographic) identity first.
            for _, (ident, acc_list) in enumerate(
                sorted(mash_hits.items(), reverse=True)
            ):
                for acc in acc_list:
                    if seen_uniq_hits >= num_uniq_hits:
                        break
                    if isinstance(unpickled_acc2serotype[acc], dict):
                        # Nested-dict pickle: emit iTOL-style POPUP_INFO and
                        # METADATA rows.
                        if target_acc_key in unpickled_acc2serotype[acc].keys():
                            if not wrote_header_pop:
                                mash_out_pop_txt.write(
                                    "POPUP_INFO\nSEPARATOR COMMA\nDATA\n"
                                )
                                wrote_header_pop = True

                            pdt = "".join(unpickled_acc2serotype[acc][target_acc_key])

                            popup_line = ",".join(
                                [
                                    acc,
                                    ncbi_path_heading,
                                    f'{pdt}',
                                ]
                            )
                            mash_out_pop_txt.write(popup_line + "\n")

                        # Count the hit only when every required metadata
                        # field is present for this accession.
                        if all(
                            k in unpickled_acc2serotype[acc].keys()
                            for k in req_metadata.keys()
                        ):
                            if not wrote_header_acc:
                                msh_out_txt.write(
                                    "METADATA\nSEPARATOR COMMA\nFIELD_LABELS,"
                                )
                                msh_out_txt.write(
                                    f"{','.join([str(key).upper() for key in req_metadata.keys()])}\nDATA\n"
                                )
                                wrote_header_acc = True

                            # Commas are stripped from values because comma
                            # is the field separator of the output format.
                            metadata_line = ",".join(
                                [
                                    re.sub(
                                        ",",
                                        "",
                                        "|".join(unpickled_acc2serotype[acc][m]),
                                    )
                                    for m in req_metadata.keys()
                                ],
                            )

                            msh_out_txt.write(f"{acc.strip()},{metadata_line}\n")
                            msh_out_accs_txt.write(
                                f"{os.path.join(genomes_dir, acc + genomes_dir_suffix)}\n"
                            )
                            seen_mash_sero[acc] = 1
                            seen_uniq_hits += 1
                    elif not isinstance(unpickled_acc2serotype[acc], dict):
                        # Flat pickle: one row per previously-unseen serotype.
                        if unpickled_acc2serotype[acc] not in seen_mash_sero.keys():
                            seen_mash_sero[unpickled_acc2serotype[acc]] = 1
                            seen_uniq_hits += 1
                            # print(acc.strip() + '\t' + ident + '\t' + unpickled_acc2serotype[acc], file=sys.stdout)
                            msh_out_txt.write(
                                f"{acc.strip()}\t{unpickled_acc2serotype[acc]}\t{ident}\n"
                            )
                    # Concatenate this hit's gzipped genome (gzip members
                    # concatenate into a valid gzip stream).
                    with open(
                        os.path.join(genomes_dir, acc + genomes_dir_suffix),
                        "rb",
                    ) as msh_in_gz:
                        msh_out_gz.writelines(msh_in_gz.readlines())

        msh_out_txt.close()
        msh_out_accs_txt.close()
        # Fix: this handle was never closed before.
        mash_out_pop_txt.close()
        logging.info(
            f"File {os.path.basename(mash_genomes_gz)}\n"
            + f"written in:\n{os.getcwd()}\nDone! Bye!"
        )
        exit(0)


if __name__ == "__main__":
    main()