annotate 0.5.0/bin/get_top_unique_mash_hit_genomes.py @ 1:365849f031fd

"planemo upload"
author kkonganti
date Mon, 05 Jun 2023 18:48:51 -0400
parents
children
rev   line source
kkonganti@1 1 #!/usr/bin/env python3
kkonganti@1 2
kkonganti@1 3 # Kranti Konganti
kkonganti@1 4
kkonganti@1 5 import os
kkonganti@1 6 import glob
kkonganti@1 7 import pickle
kkonganti@1 8 import argparse
kkonganti@1 9 import inspect
kkonganti@1 10 import logging
kkonganti@1 11 import re
kkonganti@1 12 import pprint
kkonganti@1 13 from collections import defaultdict
kkonganti@1 14
kkonganti@1 15 # Multiple inheritence for pretty printing of help text.
kkonganti@1 16 class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
kkonganti@1 17 pass
kkonganti@1 18
kkonganti@1 19
kkonganti@1 20 # Main
kkonganti@1 21 def main() -> None:
kkonganti@1 22 """
kkonganti@1 23 This script works only in the context of `bettercallsal` Nextflow workflow.
kkonganti@1 24 It takes:
kkonganti@1 25 1. A pickle file containing a dictionary object where genome accession
kkonganti@1 26 is the key and the computed serotype is the value.
kkonganti@1 27 2. A file with `mash screen` results run against the Salmonella SNP
kkonganti@1 28 Cluster genomes' sketch.
kkonganti@1 29 3. A directory containing genomes' FASTA in gzipped format where the
kkonganti@1 30 FASTA file contains 2 lines: one FASTA header followed by
kkonganti@1 31 genome Sequence.
kkonganti@1 32 and then generates a concatenated FASTA file of top N unique `mash screen`
kkonganti@1 33 genome hits as requested.
kkonganti@1 34 """
kkonganti@1 35
kkonganti@1 36 # Set logging.
kkonganti@1 37 logging.basicConfig(
kkonganti@1 38 format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
kkonganti@1 39 level=logging.DEBUG,
kkonganti@1 40 )
kkonganti@1 41
kkonganti@1 42 # Debug print.
kkonganti@1 43 ppp = pprint.PrettyPrinter(width=55)
kkonganti@1 44 prog_name = os.path.basename(inspect.stack()[0].filename)
kkonganti@1 45
kkonganti@1 46 parser = argparse.ArgumentParser(
kkonganti@1 47 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
kkonganti@1 48 )
kkonganti@1 49
kkonganti@1 50 parser.add_argument(
kkonganti@1 51 "-s",
kkonganti@1 52 dest="sero_snp_metadata",
kkonganti@1 53 default=False,
kkonganti@1 54 required=False,
kkonganti@1 55 help="Absolute UNIX path to metadata text file with the field separator, | "
kkonganti@1 56 + "\nand 5 fields: serotype|asm_lvl|asm_url|snp_cluster_id"
kkonganti@1 57 + "\nEx: serotype=Derby,antigen_formula=4:f,g:-|Scaffold|402440|ftp://...\n|PDS000096654.2\n"
kkonganti@1 58 + "Mentioning this option will create a pickle file for the\nprovided metadata and exits.",
kkonganti@1 59 )
kkonganti@1 60 parser.add_argument(
kkonganti@1 61 "-fs",
kkonganti@1 62 dest="force_write_pick",
kkonganti@1 63 action="store_true",
kkonganti@1 64 required=False,
kkonganti@1 65 help="By default, when -s flag is on, the pickle file named *.ACC2SERO.pickle\n"
kkonganti@1 66 + "is written to CWD. If the file exists, the program will not overwrite\n"
kkonganti@1 67 + "and exit. Use -fs option to overwrite.",
kkonganti@1 68 )
kkonganti@1 69 parser.add_argument(
kkonganti@1 70 "-m",
kkonganti@1 71 dest="mash_screen_res",
kkonganti@1 72 default=False,
kkonganti@1 73 required=False,
kkonganti@1 74 help="Absolute UNIX path to `mash screen` results file.",
kkonganti@1 75 )
kkonganti@1 76 parser.add_argument(
kkonganti@1 77 "-ms",
kkonganti@1 78 dest="mash_screen_res_suffix",
kkonganti@1 79 default=".screened",
kkonganti@1 80 required=False,
kkonganti@1 81 help="Suffix of the `mash screen` result file.",
kkonganti@1 82 )
kkonganti@1 83 parser.add_argument(
kkonganti@1 84 "-ps",
kkonganti@1 85 dest="pickled_sero",
kkonganti@1 86 default=False,
kkonganti@1 87 required=False,
kkonganti@1 88 help="Absolute UNIX Path to serialized metadata object in a pickle file.\n"
kkonganti@1 89 + "You can create the pickle file of the metadata using -s option.\n"
kkonganti@1 90 + "Required if -m is on.",
kkonganti@1 91 )
kkonganti@1 92 parser.add_argument(
kkonganti@1 93 "-gd",
kkonganti@1 94 dest="genomes_dir",
kkonganti@1 95 default=False,
kkonganti@1 96 required=False,
kkonganti@1 97 help="Absolute UNIX path to a directory containing\n"
kkonganti@1 98 + "gzipped genome FASTA files.\n"
kkonganti@1 99 + "Required if -m is on.",
kkonganti@1 100 )
kkonganti@1 101 parser.add_argument(
kkonganti@1 102 "-gds",
kkonganti@1 103 dest="genomes_dir_suffix",
kkonganti@1 104 default="_scaffolded_genomic.fna.gz",
kkonganti@1 105 required=False,
kkonganti@1 106 help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
kkonganti@1 107 )
kkonganti@1 108 parser.add_argument(
kkonganti@1 109 "-n",
kkonganti@1 110 dest="num_uniq_hits",
kkonganti@1 111 default=10,
kkonganti@1 112 help="This many number of serotype genomes' accessions are " + "\nreturned.",
kkonganti@1 113 )
kkonganti@1 114 parser.add_argument(
kkonganti@1 115 "-op",
kkonganti@1 116 dest="out_prefix",
kkonganti@1 117 default="MASH_SCREEN",
kkonganti@1 118 help="Set the output file prefix for .fna.gz and .txt files.",
kkonganti@1 119 )
kkonganti@1 120 # required = parser.add_argument_group('required arguments')
kkonganti@1 121
kkonganti@1 122 args = parser.parse_args()
kkonganti@1 123 num_uniq_hits = int(args.num_uniq_hits)
kkonganti@1 124 mash_screen_res = args.mash_screen_res
kkonganti@1 125 mash_screen_res_suffix = args.mash_screen_res_suffix
kkonganti@1 126 pickle_sero = args.sero_snp_metadata
kkonganti@1 127 pickled_sero = args.pickled_sero
kkonganti@1 128 f_write_pick = args.force_write_pick
kkonganti@1 129 genomes_dir = args.genomes_dir
kkonganti@1 130 genomes_dir_suffix = args.genomes_dir_suffix
kkonganti@1 131 out_prefix = args.out_prefix
kkonganti@1 132 mash_genomes_gz = os.path.join(
kkonganti@1 133 os.getcwd(), out_prefix + "_TOP_" + str(num_uniq_hits) + "_UNIQUE_HITS.fna.gz"
kkonganti@1 134 )
kkonganti@1 135 mash_uniq_hits_txt = os.path.join(
kkonganti@1 136 os.getcwd(), re.sub(".fna.gz", ".txt", os.path.basename(mash_genomes_gz))
kkonganti@1 137 )
kkonganti@1 138
kkonganti@1 139 if mash_screen_res and os.path.exists(mash_genomes_gz):
kkonganti@1 140 logging.error(
kkonganti@1 141 "A concatenated genome FASTA file,\n"
kkonganti@1 142 + f"{os.path.basename(mash_genomes_gz)} already exists in:\n"
kkonganti@1 143 + f"{os.getcwd()}\n"
kkonganti@1 144 + "Please remove or move it as we will not "
kkonganti@1 145 + "overwrite it."
kkonganti@1 146 )
kkonganti@1 147 exit(1)
kkonganti@1 148
kkonganti@1 149 if os.path.exists(mash_uniq_hits_txt) and os.path.getsize(mash_uniq_hits_txt) > 0:
kkonganti@1 150 os.remove(mash_uniq_hits_txt)
kkonganti@1 151
kkonganti@1 152 if mash_screen_res and (not genomes_dir or not pickled_sero):
kkonganti@1 153 logging.error("When -m is on, -ps and -gd are also required.")
kkonganti@1 154 exit(1)
kkonganti@1 155
kkonganti@1 156 if genomes_dir:
kkonganti@1 157 if not os.path.isdir(genomes_dir):
kkonganti@1 158 logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
kkonganti@1 159 exit(1)
kkonganti@1 160 if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
kkonganti@1 161 logging.error(
kkonganti@1 162 "Genomes directory"
kkonganti@1 163 + f"{genomes_dir}"
kkonganti@1 164 + "\ndoes not seem to have any\n"
kkonganti@1 165 + f"files ending with suffix: {genomes_dir_suffix}"
kkonganti@1 166 )
kkonganti@1 167 exit(1)
kkonganti@1 168
kkonganti@1 169 if pickle_sero and os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0:
kkonganti@1 170 acc2serotype = defaultdict()
kkonganti@1 171 init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".ACC2SERO.pickle")
kkonganti@1 172
kkonganti@1 173 if (
kkonganti@1 174 os.path.exists(init_pickled_sero)
kkonganti@1 175 and os.path.getsize(init_pickled_sero)
kkonganti@1 176 and not f_write_pick
kkonganti@1 177 ):
kkonganti@1 178 logging.error(
kkonganti@1 179 f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n"
kkonganti@1 180 + "Use -fs to force overwrite it."
kkonganti@1 181 )
kkonganti@1 182 exit(1)
kkonganti@1 183
kkonganti@1 184 with open(pickle_sero, "r") as sero_snp_meta:
kkonganti@1 185 for line in sero_snp_meta:
kkonganti@1 186 cols = line.strip().split("|")
kkonganti@1 187 url_cols = cols[3].split("/")
kkonganti@1 188
kkonganti@1 189 if not 4 <= len(cols) <= 5:
kkonganti@1 190 logging.error(
kkonganti@1 191 f"The metadata file {pickle_sero} is malformed.\n"
kkonganti@1 192 + f"Expected 4-5 columns. Got {len(cols)} columns.\n"
kkonganti@1 193 )
kkonganti@1 194 exit(1)
kkonganti@1 195
kkonganti@1 196 if not len(url_cols) > 5:
kkonganti@1 197 acc = url_cols[3]
kkonganti@1 198 else:
kkonganti@1 199 acc = url_cols[9]
kkonganti@1 200
kkonganti@1 201 if not re.match(r"^GC[AF]\_\d+\.\d+$", acc):
kkonganti@1 202 logging.error(
kkonganti@1 203 f"Did not find accession in either field number 4\n"
kkonganti@1 204 + "or field number 10 of column 4."
kkonganti@1 205 )
kkonganti@1 206 exit(1)
kkonganti@1 207
kkonganti@1 208 acc2serotype[acc] = cols[0]
kkonganti@1 209
kkonganti@1 210 with open(init_pickled_sero, "wb") as write_pickled_sero:
kkonganti@1 211 pickle.dump(file=write_pickled_sero, obj=acc2serotype)
kkonganti@1 212
kkonganti@1 213 logging.info(
kkonganti@1 214 f"Created the pickle file for\n{os.path.basename(pickle_sero)}.\n"
kkonganti@1 215 + "This was the only requested function."
kkonganti@1 216 )
kkonganti@1 217 sero_snp_meta.close()
kkonganti@1 218 write_pickled_sero.close()
kkonganti@1 219 exit(0)
kkonganti@1 220 elif pickle_sero and not (os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0):
kkonganti@1 221
kkonganti@1 222 logging.error(
kkonganti@1 223 "Requested to create pickle from metadata, but\n"
kkonganti@1 224 + f"the file, {os.path.basename(pickle_sero)} is empty or\ndoes not exist!"
kkonganti@1 225 )
kkonganti@1 226 exit(1)
kkonganti@1 227
kkonganti@1 228 if mash_screen_res and os.path.exists(mash_screen_res):
kkonganti@1 229 if os.path.getsize(mash_screen_res) > 0:
kkonganti@1 230
kkonganti@1 231 seen_uniq_hits = 0
kkonganti@1 232 unpickled_acc2serotype = pickle.load(file=open(pickled_sero, "rb"))
kkonganti@1 233
kkonganti@1 234 with open(mash_screen_res, "r") as msh_res:
kkonganti@1 235 mash_hits = defaultdict()
kkonganti@1 236 seen_mash_sero = defaultdict()
kkonganti@1 237
kkonganti@1 238 for line in msh_res:
kkonganti@1 239 cols = line.strip().split("\t")
kkonganti@1 240
kkonganti@1 241 if len(cols) < 5:
kkonganti@1 242 logging.error(
kkonganti@1 243 f"The file {os.path.basename(mash_screen_res)} seems to\n"
kkonganti@1 244 + "be malformed. It contains less than required 5-6 columns."
kkonganti@1 245 )
kkonganti@1 246 exit(1)
kkonganti@1 247
kkonganti@1 248 mash_hit_acc = re.sub(
kkonganti@1 249 genomes_dir_suffix,
kkonganti@1 250 "",
kkonganti@1 251 str((re.search(r"GC[AF].*?" + genomes_dir_suffix, cols[4])).group()),
kkonganti@1 252 )
kkonganti@1 253
kkonganti@1 254 if mash_hit_acc:
kkonganti@1 255 mash_hits.setdefault(cols[0], []).append(mash_hit_acc)
kkonganti@1 256 else:
kkonganti@1 257 logging.error(
kkonganti@1 258 "Did not find an assembly accession in column\n"
kkonganti@1 259 + f"number 5. Found {cols[4]} instead. Cannot proceed!"
kkonganti@1 260 )
kkonganti@1 261 exit(1)
kkonganti@1 262 msh_res.close()
kkonganti@1 263 elif os.path.getsize(mash_screen_res) == 0:
kkonganti@1 264 failed_sample_name = os.path.basename(mash_screen_res).rstrip(mash_screen_res_suffix)
kkonganti@1 265 with open(
kkonganti@1 266 os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
kkonganti@1 267 ) as failed_sample_fh:
kkonganti@1 268 failed_sample_fh.write(f"{failed_sample_name}\n")
kkonganti@1 269 failed_sample_fh.close()
kkonganti@1 270 exit(0)
kkonganti@1 271
kkonganti@1 272 # ppp.pprint(mash_hits)
kkonganti@1 273 msh_out_txt = open(mash_uniq_hits_txt, "w")
kkonganti@1 274 with open(mash_genomes_gz, "wb") as msh_out_gz:
kkonganti@1 275 for _, (ident, acc_list) in enumerate(sorted(mash_hits.items(), reverse=True)):
kkonganti@1 276 for acc in acc_list:
kkonganti@1 277 if seen_uniq_hits >= num_uniq_hits:
kkonganti@1 278 break
kkonganti@1 279 if unpickled_acc2serotype[acc] not in seen_mash_sero.keys():
kkonganti@1 280 seen_mash_sero[unpickled_acc2serotype[acc]] = 1
kkonganti@1 281 seen_uniq_hits += 1
kkonganti@1 282 # print(acc.strip() + '\t' + ident + '\t' + unpickled_acc2serotype[acc], file=sys.stdout)
kkonganti@1 283 msh_out_txt.write(
kkonganti@1 284 f"{acc.strip()}\t{unpickled_acc2serotype[acc]}\t{ident}\n"
kkonganti@1 285 )
kkonganti@1 286 with open(
kkonganti@1 287 os.path.join(genomes_dir, acc + genomes_dir_suffix), "rb"
kkonganti@1 288 ) as msh_in_gz:
kkonganti@1 289 msh_out_gz.writelines(msh_in_gz.readlines())
kkonganti@1 290 msh_in_gz.close()
kkonganti@1 291 msh_out_gz.close()
kkonganti@1 292 msh_out_txt.close()
kkonganti@1 293 logging.info(
kkonganti@1 294 f"File {os.path.basename(mash_genomes_gz)}\n"
kkonganti@1 295 + f"written in:\n{os.getcwd()}\nDone! Bye!"
kkonganti@1 296 )
kkonganti@1 297 exit(0)
kkonganti@1 298
kkonganti@1 299
kkonganti@1 300 if __name__ == "__main__":
kkonganti@1 301 main()