annotate 0.2.0/bin/get_top_unique_mash_hit_genomes.py @ 17:b571995ddb51

planemo upload
author kkonganti
date Mon, 15 Jul 2024 19:01:29 -0400
parents a5f31c44f8c9
children
rev   line source
kkonganti@11 1 #!/usr/bin/env python3
kkonganti@11 2
kkonganti@11 3 # Kranti Konganti
kkonganti@11 4
kkonganti@11 5 import argparse
kkonganti@11 6 import glob
kkonganti@11 7 import inspect
kkonganti@11 8 import logging
kkonganti@11 9 import os
kkonganti@11 10 import pickle
kkonganti@11 11 import pprint
kkonganti@11 12 import re
kkonganti@11 13 from collections import defaultdict
kkonganti@11 14
kkonganti@11 15
kkonganti@11 16 # Multiple inheritence for pretty printing of help text.
kkonganti@11 17 class MultiArgFormatClasses(
kkonganti@11 18 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
kkonganti@11 19 ):
kkonganti@11 20 pass
kkonganti@11 21
kkonganti@11 22
kkonganti@11 23 # Main
kkonganti@11 24 def main() -> None:
kkonganti@11 25 """
kkonganti@11 26 This script works only in the context of a Nextflow workflow.
kkonganti@11 27 It takes:
kkonganti@11 28 1. A pickle file containing a dictionary object where genome accession
kkonganti@11 29 is the key and the computed serotype is the value.
kkonganti@11 30 OR
kkonganti@11 31 1. It takes a pickle file containing a nested dictionary, where genome accession
kkonganti@11 32 is the key and the metadata is a dictionary associated with that key.
kkonganti@11 33 2. A file with `mash screen` results.
kkonganti@11 34 3. A directory containing genomes' FASTA in gzipped format where the
kkonganti@11 35 FASTA file contains 2 lines: one FASTA header followed by
kkonganti@11 36 genome Sequence.
kkonganti@11 37 and then generates a concatenated FASTA file of top N unique `mash screen`
kkonganti@11 38 genome hits as requested.
kkonganti@11 39 """
kkonganti@11 40
kkonganti@11 41 # Set logging.
kkonganti@11 42 logging.basicConfig(
kkonganti@11 43 format="\n"
kkonganti@11 44 + "=" * 55
kkonganti@11 45 + "\n%(asctime)s - %(levelname)s\n"
kkonganti@11 46 + "=" * 55
kkonganti@11 47 + "\n%(message)s\n\n",
kkonganti@11 48 level=logging.DEBUG,
kkonganti@11 49 )
kkonganti@11 50
kkonganti@11 51 # Debug print.
kkonganti@11 52 ppp = pprint.PrettyPrinter(width=55)
kkonganti@11 53 prog_name = os.path.basename(inspect.stack()[0].filename)
kkonganti@11 54
kkonganti@11 55 parser = argparse.ArgumentParser(
kkonganti@11 56 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
kkonganti@11 57 )
kkonganti@11 58
kkonganti@11 59 parser.add_argument(
kkonganti@11 60 "-s",
kkonganti@11 61 dest="sero_snp_metadata",
kkonganti@11 62 default=False,
kkonganti@11 63 required=False,
kkonganti@11 64 help="Absolute UNIX path to metadata text file with the field separator, | "
kkonganti@11 65 + "\nand 5 fields: serotype|asm_lvl|asm_url|snp_cluster_id"
kkonganti@11 66 + "\nEx: serotype=Derby,antigen_formula=4:f,g:-|Scaffold|402440|ftp://...\n|PDS000096654.2\n"
kkonganti@11 67 + "Mentioning this option will create a pickle file for the\nprovided metadata and exits.",
kkonganti@11 68 )
kkonganti@11 69 parser.add_argument(
kkonganti@11 70 "-fs",
kkonganti@11 71 dest="force_write_pick",
kkonganti@11 72 action="store_true",
kkonganti@11 73 required=False,
kkonganti@11 74 help="By default, when -s flag is on, the pickle file named *.ACC2SERO.pickle\n"
kkonganti@11 75 + "is written to CWD. If the file exists, the program will not overwrite\n"
kkonganti@11 76 + "and exit. Use -fs option to overwrite.",
kkonganti@11 77 )
kkonganti@11 78 parser.add_argument(
kkonganti@11 79 "-m",
kkonganti@11 80 dest="mash_screen_res",
kkonganti@11 81 default=False,
kkonganti@11 82 required=False,
kkonganti@11 83 help="Absolute UNIX path to `mash screen` results file.",
kkonganti@11 84 )
kkonganti@11 85 parser.add_argument(
kkonganti@11 86 "-ms",
kkonganti@11 87 dest="mash_screen_res_suffix",
kkonganti@11 88 default=".screened",
kkonganti@11 89 required=False,
kkonganti@11 90 help="Suffix of the `mash screen` result file.",
kkonganti@11 91 )
kkonganti@11 92 parser.add_argument(
kkonganti@11 93 "-ps",
kkonganti@11 94 dest="pickled_sero",
kkonganti@11 95 default=False,
kkonganti@11 96 required=False,
kkonganti@11 97 help="Absolute UNIX Path to serialized metadata object in a pickle file.\n"
kkonganti@11 98 + "You can create the pickle file of the metadata using -s option.\n"
kkonganti@11 99 + "Required if -m is on.",
kkonganti@11 100 )
kkonganti@11 101 parser.add_argument(
kkonganti@11 102 "-gd",
kkonganti@11 103 dest="genomes_dir",
kkonganti@11 104 default=False,
kkonganti@11 105 required=False,
kkonganti@11 106 help="Absolute UNIX path to a directory containing\n"
kkonganti@11 107 + "gzipped genome FASTA files.\n"
kkonganti@11 108 + "Required if -m is on.",
kkonganti@11 109 )
kkonganti@11 110 parser.add_argument(
kkonganti@11 111 "-gds",
kkonganti@11 112 dest="genomes_dir_suffix",
kkonganti@11 113 default="_scaffolded_genomic.fna.gz",
kkonganti@11 114 required=False,
kkonganti@11 115 help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
kkonganti@11 116 )
kkonganti@11 117 parser.add_argument(
kkonganti@11 118 "-n",
kkonganti@11 119 dest="num_uniq_hits",
kkonganti@11 120 default=10,
kkonganti@11 121 required=False,
kkonganti@11 122 help="This many number of serotype genomes' accessions are returned.",
kkonganti@11 123 )
kkonganti@11 124 parser.add_argument(
kkonganti@11 125 "-op",
kkonganti@11 126 dest="out_prefix",
kkonganti@11 127 default="MASH_SCREEN",
kkonganti@11 128 required=False,
kkonganti@11 129 help="Set the output file prefix for .fna.gz and .txt files.",
kkonganti@11 130 )
kkonganti@11 131 # required = parser.add_argument_group('required arguments')
kkonganti@11 132
kkonganti@11 133 args = parser.parse_args()
kkonganti@11 134 num_uniq_hits = int(args.num_uniq_hits)
kkonganti@11 135 mash_screen_res = args.mash_screen_res
kkonganti@11 136 mash_screen_res_suffix = args.mash_screen_res_suffix
kkonganti@11 137 pickle_sero = args.sero_snp_metadata
kkonganti@11 138 pickled_sero = args.pickled_sero
kkonganti@11 139 f_write_pick = args.force_write_pick
kkonganti@11 140 genomes_dir = args.genomes_dir
kkonganti@11 141 genomes_dir_suffix = args.genomes_dir_suffix
kkonganti@11 142 out_prefix = args.out_prefix
kkonganti@11 143 req_metadata = {
kkonganti@11 144 "mlst_sequence_type": "ST",
kkonganti@11 145 "epi_type": "ET",
kkonganti@11 146 "host": "HO",
kkonganti@11 147 "host_disease": "HD",
kkonganti@11 148 "isolation_source": "IS",
kkonganti@11 149 "outbreak": "OU",
kkonganti@11 150 "source_type": "SOT",
kkonganti@11 151 "strain": "GS",
kkonganti@11 152 }
kkonganti@11 153 target_acc_key = "target_acc"
kkonganti@11 154 ncbi_path_heading = "NCBI Pathogen Isolates Browser"
kkonganti@11 155 ncbi_path_uri = "https://www.ncbi.nlm.nih.gov/pathogens/isolates/#"
kkonganti@11 156 mash_genomes_gz = os.path.join(
kkonganti@11 157 os.getcwd(), out_prefix + "_TOP_" + str(num_uniq_hits) + "_UNIQUE_HITS.fna.gz"
kkonganti@11 158 )
kkonganti@11 159 mash_uniq_hits_txt = os.path.join(
kkonganti@11 160 os.getcwd(), re.sub(".fna.gz", ".txt", os.path.basename(mash_genomes_gz))
kkonganti@11 161 )
kkonganti@11 162 mash_uniq_accs_txt = os.path.join(
kkonganti@11 163 os.getcwd(), re.sub(".fna.gz", "_ACCS.txt", os.path.basename(mash_genomes_gz))
kkonganti@11 164 )
kkonganti@11 165 mash_popup_info_txt = os.path.join(
kkonganti@11 166 os.getcwd(), re.sub(".fna.gz", "_POPUP.txt", os.path.basename(mash_genomes_gz))
kkonganti@11 167 )
kkonganti@11 168
kkonganti@11 169 if mash_screen_res and os.path.exists(mash_genomes_gz):
kkonganti@11 170 logging.error(
kkonganti@11 171 "A concatenated genome FASTA file,\n"
kkonganti@11 172 + f"{os.path.basename(mash_genomes_gz)} already exists in:\n"
kkonganti@11 173 + f"{os.getcwd()}\n"
kkonganti@11 174 + "Please remove or move it as we will not "
kkonganti@11 175 + "overwrite it."
kkonganti@11 176 )
kkonganti@11 177 exit(1)
kkonganti@11 178
kkonganti@11 179 if os.path.exists(mash_uniq_hits_txt) and os.path.getsize(mash_uniq_hits_txt) > 0:
kkonganti@11 180 os.remove(mash_uniq_hits_txt)
kkonganti@11 181
kkonganti@11 182 if mash_screen_res and (not genomes_dir or not pickled_sero):
kkonganti@11 183 logging.error("When -m is on, -ps and -gd are also required.")
kkonganti@11 184 exit(1)
kkonganti@11 185
kkonganti@11 186 if genomes_dir:
kkonganti@11 187 if not os.path.isdir(genomes_dir):
kkonganti@11 188 logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
kkonganti@11 189 exit(1)
kkonganti@11 190 if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
kkonganti@11 191 logging.error(
kkonganti@11 192 "Genomes directory"
kkonganti@11 193 + f"{genomes_dir}"
kkonganti@11 194 + "\ndoes not seem to have any\n"
kkonganti@11 195 + f"files ending with suffix: {genomes_dir_suffix}"
kkonganti@11 196 )
kkonganti@11 197 exit(1)
kkonganti@11 198
kkonganti@11 199 if pickle_sero and os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0:
kkonganti@11 200 acc2serotype = defaultdict()
kkonganti@11 201 init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".ACC2SERO.pickle")
kkonganti@11 202
kkonganti@11 203 if (
kkonganti@11 204 os.path.exists(init_pickled_sero)
kkonganti@11 205 and os.path.getsize(init_pickled_sero)
kkonganti@11 206 and not f_write_pick
kkonganti@11 207 ):
kkonganti@11 208 logging.error(
kkonganti@11 209 f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n"
kkonganti@11 210 + "Use -fs to force overwrite it."
kkonganti@11 211 )
kkonganti@11 212 exit(1)
kkonganti@11 213
kkonganti@11 214 with open(pickle_sero, "r") as sero_snp_meta:
kkonganti@11 215 for line in sero_snp_meta:
kkonganti@11 216 cols = line.strip().split("|")
kkonganti@11 217 url_cols = cols[3].split("/")
kkonganti@11 218
kkonganti@11 219 if not 4 <= len(cols) <= 5:
kkonganti@11 220 logging.error(
kkonganti@11 221 f"The metadata file {pickle_sero} is malformed.\n"
kkonganti@11 222 + f"Expected 4-5 columns. Got {len(cols)} columns.\n"
kkonganti@11 223 )
kkonganti@11 224 exit(1)
kkonganti@11 225
kkonganti@11 226 if not len(url_cols) > 5:
kkonganti@11 227 acc = url_cols[3]
kkonganti@11 228 else:
kkonganti@11 229 acc = url_cols[9]
kkonganti@11 230
kkonganti@11 231 if not re.match(r"^GC[AF]\_\d+\.\d+$", acc):
kkonganti@11 232 logging.error(
kkonganti@11 233 f"Did not find accession in either field number 4\n"
kkonganti@11 234 + "or field number 10 of column 4."
kkonganti@11 235 )
kkonganti@11 236 exit(1)
kkonganti@11 237
kkonganti@11 238 acc2serotype[acc] = cols[0]
kkonganti@11 239
kkonganti@11 240 with open(init_pickled_sero, "wb") as write_pickled_sero:
kkonganti@11 241 pickle.dump(file=write_pickled_sero, obj=acc2serotype)
kkonganti@11 242
kkonganti@11 243 logging.info(
kkonganti@11 244 f"Created the pickle file for\n{os.path.basename(pickle_sero)}.\n"
kkonganti@11 245 + "This was the only requested function."
kkonganti@11 246 )
kkonganti@11 247 sero_snp_meta.close()
kkonganti@11 248 write_pickled_sero.close()
kkonganti@11 249 exit(0)
kkonganti@11 250 elif pickle_sero and not (
kkonganti@11 251 os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0
kkonganti@11 252 ):
kkonganti@11 253 logging.error(
kkonganti@11 254 "Requested to create pickle from metadata, but\n"
kkonganti@11 255 + f"the file, {os.path.basename(pickle_sero)} is empty or\ndoes not exist!"
kkonganti@11 256 )
kkonganti@11 257 exit(1)
kkonganti@11 258
kkonganti@11 259 if mash_screen_res and os.path.exists(mash_screen_res):
kkonganti@11 260 if os.path.getsize(mash_screen_res) > 0:
kkonganti@11 261 seen_uniq_hits = 0
kkonganti@11 262 unpickled_acc2serotype = pickle.load(file=open(pickled_sero, "rb"))
kkonganti@11 263
kkonganti@11 264 with open(mash_screen_res, "r") as msh_res:
kkonganti@11 265 mash_hits = defaultdict()
kkonganti@11 266 seen_mash_sero = defaultdict()
kkonganti@11 267
kkonganti@11 268 for line in msh_res:
kkonganti@11 269 cols = line.strip().split("\t")
kkonganti@11 270
kkonganti@11 271 if len(cols) < 5:
kkonganti@11 272 logging.error(
kkonganti@11 273 f"The file {os.path.basename(mash_screen_res)} seems to\n"
kkonganti@11 274 + "be malformed. It contains less than required 5-6 columns."
kkonganti@11 275 )
kkonganti@11 276 exit(1)
kkonganti@11 277
kkonganti@11 278 mash_hit_acc = re.sub(
kkonganti@11 279 genomes_dir_suffix,
kkonganti@11 280 "",
kkonganti@11 281 str(
kkonganti@11 282 (
kkonganti@11 283 re.search(r"GC[AF].*?" + genomes_dir_suffix, cols[4])
kkonganti@11 284 ).group()
kkonganti@11 285 ),
kkonganti@11 286 )
kkonganti@11 287
kkonganti@11 288 if mash_hit_acc:
kkonganti@11 289 mash_hits.setdefault(cols[0], []).append(mash_hit_acc)
kkonganti@11 290 else:
kkonganti@11 291 logging.error(
kkonganti@11 292 "Did not find an assembly accession in column\n"
kkonganti@11 293 + f"number 5. Found {cols[4]} instead. Cannot proceed!"
kkonganti@11 294 )
kkonganti@11 295 exit(1)
kkonganti@11 296 msh_res.close()
kkonganti@11 297 elif os.path.getsize(mash_screen_res) == 0:
kkonganti@11 298 failed_sample_name = os.path.basename(mash_screen_res).rstrip(
kkonganti@11 299 mash_screen_res_suffix
kkonganti@11 300 )
kkonganti@11 301 with open(
kkonganti@11 302 os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
kkonganti@11 303 ) as failed_sample_fh:
kkonganti@11 304 failed_sample_fh.write(f"{failed_sample_name}\n")
kkonganti@11 305 failed_sample_fh.close()
kkonganti@11 306 exit(0)
kkonganti@11 307
kkonganti@11 308 # ppp.pprint(mash_hits)
kkonganti@11 309 msh_out_txt = open(mash_uniq_hits_txt, "w")
kkonganti@11 310 msh_out_accs_txt = open(mash_uniq_accs_txt, "w")
kkonganti@11 311 mash_out_pop_txt = open(mash_popup_info_txt, "w")
kkonganti@11 312 wrote_header_pop = False
kkonganti@11 313 wrote_header_acc = False
kkonganti@11 314
kkonganti@11 315 with open(mash_genomes_gz, "wb") as msh_out_gz:
kkonganti@11 316 for _, (ident, acc_list) in enumerate(
kkonganti@11 317 sorted(mash_hits.items(), reverse=True)
kkonganti@11 318 ):
kkonganti@11 319 for acc in acc_list:
kkonganti@11 320 if seen_uniq_hits >= num_uniq_hits:
kkonganti@11 321 break
kkonganti@11 322 if isinstance(unpickled_acc2serotype[acc], dict):
kkonganti@11 323 if target_acc_key in unpickled_acc2serotype[acc].keys():
kkonganti@11 324 if not wrote_header_pop:
kkonganti@11 325 mash_out_pop_txt.write(
kkonganti@11 326 "POPUP_INFO\nSEPARATOR COMMA\nDATA\n"
kkonganti@11 327 )
kkonganti@11 328 wrote_header_pop = True
kkonganti@11 329
kkonganti@11 330 pdt = "".join(unpickled_acc2serotype[acc][target_acc_key])
kkonganti@11 331
kkonganti@11 332 popup_line = ",".join(
kkonganti@11 333 [
kkonganti@11 334 acc,
kkonganti@11 335 ncbi_path_heading,
kkonganti@11 336 f'<a target="_blank" href="{ncbi_path_uri + pdt}">{pdt}</a>',
kkonganti@11 337 ]
kkonganti@11 338 )
kkonganti@11 339 mash_out_pop_txt.write(popup_line + "\n")
kkonganti@11 340
kkonganti@11 341 if all(
kkonganti@11 342 k in unpickled_acc2serotype[acc].keys()
kkonganti@11 343 for k in req_metadata.keys()
kkonganti@11 344 ):
kkonganti@11 345 if not wrote_header_acc:
kkonganti@11 346 msh_out_txt.write(
kkonganti@11 347 "METADATA\nSEPARATOR COMMA\nFIELD_LABELS,"
kkonganti@11 348 )
kkonganti@11 349 msh_out_txt.write(
kkonganti@11 350 f"{','.join([str(key).upper() for key in req_metadata.keys()])}\nDATA\n"
kkonganti@11 351 )
kkonganti@11 352 wrote_header_acc = True
kkonganti@11 353
kkonganti@11 354 metadata_line = ",".join(
kkonganti@11 355 [
kkonganti@11 356 re.sub(
kkonganti@11 357 ",",
kkonganti@11 358 "",
kkonganti@11 359 "|".join(unpickled_acc2serotype[acc][m]),
kkonganti@11 360 )
kkonganti@11 361 for m in req_metadata.keys()
kkonganti@11 362 ],
kkonganti@11 363 )
kkonganti@11 364
kkonganti@11 365 msh_out_txt.write(f"{acc.strip()},{metadata_line}\n")
kkonganti@11 366 msh_out_accs_txt.write(
kkonganti@11 367 f"{os.path.join(genomes_dir, acc + genomes_dir_suffix)}\n"
kkonganti@11 368 )
kkonganti@11 369 seen_mash_sero[acc] = 1
kkonganti@11 370 seen_uniq_hits += 1
kkonganti@11 371 elif not isinstance(unpickled_acc2serotype[acc], dict):
kkonganti@11 372 if unpickled_acc2serotype[acc] not in seen_mash_sero.keys():
kkonganti@11 373 seen_mash_sero[unpickled_acc2serotype[acc]] = 1
kkonganti@11 374 seen_uniq_hits += 1
kkonganti@11 375 # print(acc.strip() + '\t' + ident + '\t' + unpickled_acc2serotype[acc], file=sys.stdout)
kkonganti@11 376 msh_out_txt.write(
kkonganti@11 377 f"{acc.strip()}\t{unpickled_acc2serotype[acc]}\t{ident}\n"
kkonganti@11 378 )
kkonganti@11 379 with open(
kkonganti@11 380 os.path.join(genomes_dir, acc + genomes_dir_suffix),
kkonganti@11 381 "rb",
kkonganti@11 382 ) as msh_in_gz:
kkonganti@11 383 msh_out_gz.writelines(msh_in_gz.readlines())
kkonganti@11 384 msh_in_gz.close()
kkonganti@11 385 msh_out_gz.close()
kkonganti@11 386 msh_out_txt.close()
kkonganti@11 387 msh_out_accs_txt.close()
kkonganti@11 388 logging.info(
kkonganti@11 389 f"File {os.path.basename(mash_genomes_gz)}\n"
kkonganti@11 390 + f"written in:\n{os.getcwd()}\nDone! Bye!"
kkonganti@11 391 )
kkonganti@11 392 exit(0)
kkonganti@11 393
kkonganti@11 394
kkonganti@11 395 if __name__ == "__main__":
kkonganti@11 396 main()