#!/usr/bin/env python3

# Kranti Konganti

import argparse
import inspect
import logging
import os
import pickle
import pprint
import re
from collections import defaultdict


# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    pass


# Main
def main() -> None:
    """
    This script works only in the context of the `cronology_db` Nextflow workflow.
    It takes a UNIX path to a directory containing the following files:
        1. PDG metadata file (Ex: `PDG000000043.204.metadata.tsv`)
        2. PDG SNP Cluster metadata file (Ex: `PDG000000043.204.reference_target.cluster_list.tsv`)
        3. A list of possibly downloadable assembly accessions (one per line) parsed
           out from the metadata file.
    It then generates a pickled file with the relevant metadata columns specified
    via the -cols option.
    """
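    # Example invocation (illustrative paths; the actual inputs are staged by the
    # cronology_db Nextflow workflow):
    #   index_pdg_metadata.py -pdg_dir /path/to/PDG_release_dir -mlst mlst.tsv -fs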

    # Set logging.
    logging.basicConfig(
        format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    # Debug print.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-pdg_dir",
        dest="pdg_dir",
        default=False,
        required=True,
        help="Absolute UNIX path to the directory containing the following files.\nEx:"
        + "\n1. PDG000000043.204.metadata.tsv"
        + "\n2. PDG000000043.204.reference_target.cluster_list.tsv"
        + "\n3. A file of assembly accessions, one per line, parsed out from"
        + "\n   the metadata file.",
    )
    parser.add_argument(
        "-mlst",
        dest="mlst_results",
        required=False,
        help="Absolute UNIX path to the MLST results file.\nIf an MLST result exists for an accession, it"
        + "\nwill be included in the index.",
    )
    parser.add_argument(
        "-pdg_meta_pat",
        dest="pdg_meta_pat",
        default=r"PDG\d+\.\d+\.metadata\.tsv",
        required=False,
        help="The pattern to be used to search for the PDG metadata\nfile.",
    )
    parser.add_argument(
        "-pdg_snp_meta_pat",
        dest="pdg_snp_meta_pat",
        default=r"PDG\d+\.\d+\.reference_target\.cluster_list\.tsv",
        required=False,
        help="The pattern to be used to search for the PDG SNP Cluster metadata\nfile.",
    )
    parser.add_argument(
        "-pdg_accs_filename_pat",
        dest="pdg_accs_fn_pat",
        default="accs_all.txt",
        required=False,
        help="The filename to look for where all the parsed GC[AF] accessions are stored,\n"
        + "one per line.",
    )
    parser.add_argument(
        "-cols",
        dest="metadata_cols",
        default="epi_type,collected_by,collection_date,host,"
        + "\nhost_disease,isolation_source,outbreak,sample_name,scientific_name,serovar,"
        + "\nsource_type,strain,computed_types,target_acc",
        required=False,
        help="The data in these metadata columns will be indexed for each\nisolate.",
    )
    parser.add_argument(
        "-fs",
        dest="force_write_pick",
        action="store_true",
        required=False,
        help="By default, the pickle file named *.IDXD_PDG_METAD.pickle"
        + "\nis written to the CWD. If the file exists, the program will not"
        + "\noverwrite it and will exit. Use the -fs option to force an overwrite.",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="IDXD_PDG_METAD",
        help="Set the output file prefix for the indexed PDG metadata.",
    )
    parser.add_argument(
        "-pfs",
        dest="pdg_meta_fs",
        default="\t",
        help="Change the field separator of the PDG metadata file.",
    )

    args = parser.parse_args()
    pdg_dir = os.path.abspath(args.pdg_dir)
    mcols = args.metadata_cols
    f_write_pick = args.force_write_pick
    out_prefix = args.out_prefix
    pdg_meta_fs = args.pdg_meta_fs
    mlst_res = args.mlst_results
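    # NCBI assembly accessions look like GCA_000001405.1 (GenBank) or
    # GCF_000001405.1 (RefSeq); acc_pat below matches this format.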
    acc_pat = re.compile(r"^GC[AF]_\d+\.?\d*")
    # Validate the entire -cols value: word characters and commas (newlines occur
    # in the multi-line default and are stripped before splitting).
    mcols_pat = re.compile(r"^[\w,\n]+$")
    pdg_meta_pat = re.compile(args.pdg_meta_pat)
    pdg_snp_meta_pat = re.compile(args.pdg_snp_meta_pat)
    pdg_accs_fn_pat = re.compile(args.pdg_accs_fn_pat)
    target_acc_col = 41
    acc_col = 9
    num_accs_check = list()
    mlst_sts = dict()
    acceptable_num_mlst_cols = 10
    mlst_st_col = 2
    mlst_acc_col = 0
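    # Assumption: the MLST results file follows the 10-column layout emitted by
    # MLST callers such as the `mlst` tool (FILE, SCHEME, ST, plus seven allele
    # calls); acceptable_num_mlst_cols and mlst_st_col encode that layout.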

    # Basic checks

    if os.path.exists(pdg_dir) and os.path.isdir(pdg_dir):
        pdg_meta_file = [f for f in os.listdir(pdg_dir) if pdg_meta_pat.match(f)]
        pdg_snp_meta_file = [f for f in os.listdir(pdg_dir) if pdg_snp_meta_pat.match(f)]
        pdg_acc_all = [f for f in os.listdir(pdg_dir) if pdg_accs_fn_pat.match(f)]
        req_files = [len(pdg_meta_file), len(pdg_snp_meta_file), len(pdg_acc_all)]
        if any(x > 1 for x in req_files):
            logging.error(
                f"Directory {os.path.basename(pdg_dir)} contains"
                + "\nmultiple files matching the search pattern."
            )
            exit(1)
        elif any(x == 0 for x in req_files):
            logging.error(
                f"Directory {os.path.basename(pdg_dir)} does not contain"
                + "\nany files matching the following file patterns:"
                + f"\n{pdg_meta_pat.pattern}"
                + f"\n{pdg_snp_meta_pat.pattern}"
                + f"\n{pdg_accs_fn_pat.pattern}"
            )
            exit(1)
        pdg_meta_file = os.path.join(pdg_dir, "".join(pdg_meta_file))
        pdg_snp_meta_file = os.path.join(pdg_dir, "".join(pdg_snp_meta_file))
        pdg_acc_all = os.path.join(pdg_dir, "".join(pdg_acc_all))
    else:
        logging.error(f"Directory path {pdg_dir} does not exist.")
        exit(1)

    if mlst_res and not (os.path.exists(mlst_res) and os.path.getsize(mlst_res) > 0):
        logging.error(
            f"Requested to index MLST results, but the file {os.path.basename(mlst_res)}"
            + "\ndoes not exist or the file is empty."
        )
        exit(1)
    elif mlst_res:
        with open(mlst_res, "r") as mlst_res_fh:
            header = mlst_res_fh.readline()
            mlst_res_has_10_cols = False

            for line in mlst_res_fh:
                cols = line.strip().split("\t")
                acc = acc_pat.findall(cols[mlst_acc_col])
                if len(acc) > 1:
                    logging.error(f"Found more than 1 accession in column:\n{cols[mlst_acc_col]}\n")
                    exit(1)
                else:
                    acc = "".join(acc)
                if len(cols) == acceptable_num_mlst_cols and re.match(r"\d+|-", cols[mlst_st_col]):
                    mlst_res_has_10_cols = True
                    if re.match(r"-", cols[mlst_st_col]):
                        mlst_sts[acc] = "NULL"
                    else:
                        mlst_sts[acc] = cols[mlst_st_col]

            if not mlst_res_has_10_cols:
                logging.error(
                    "Requested to incorporate MLST STs, but the file"
                    + f"\n{os.path.basename(mlst_res)}"
                    + "\ndoes not have 10 columns in all rows."
                )
                exit(1)

    with open(pdg_acc_all, "r") as pdg_acc_all_fh:
        for a in pdg_acc_all_fh.readlines():
            num_accs_check.append(a.strip())

    if not mcols_pat.match(mcols):
        logging.error(
            "Supplied columns' names should only be"
            + "\nalphanumeric (including _) separated by a comma."
        )
        exit(1)
    else:
        mcols = re.sub("\n", "", mcols).split(",")

    if (
        pdg_snp_meta_file
        and os.path.exists(pdg_snp_meta_file)
        and os.path.getsize(pdg_snp_meta_file) > 0
    ):
        acc2snp = defaultdict()
        acc2meta = defaultdict(defaultdict)
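        # Index shapes built below:
        #   acc2snp:  assembly accession -> SNP Cluster id (column 1 of the cluster list).
        #   acc2meta: assembly accession -> {metadata column -> [values]}.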
        init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".pickle")

        if (
            os.path.exists(init_pickled_sero)
            and os.path.getsize(init_pickled_sero)
            and not f_write_pick
        ):
            logging.error(
                f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n"
                + "Use -fs to force overwrite it."
            )
            exit(1)

        with open(pdg_snp_meta_file, "r") as snp_meta:
            header = snp_meta.readline()
            skipped_acc2snp = set()
            for line in snp_meta:
                cols = line.strip().split(pdg_meta_fs)
                if len(cols) != 4:
                    logging.error(
                        f"The metadata file {pdg_snp_meta_file} is malformed.\n"
                        + f"Expected 4 columns. Got {len(cols)} columns.\n"
                    )
                    exit(1)

                if re.match("NULL", cols[3]):
                    skipped_acc2snp.add(f"Isolate {cols[1]} has no genome accession: {cols[3]}")
                elif not acc_pat.match(cols[3]):
                    logging.error(
                        "Did not find an accession in either field number 4\n"
                        + "or field number 10 of column 4."
                        + f"\nLine: {line}"
                    )
                    exit(1)
                elif not re.match("NULL", cols[3]):
                    acc2snp[cols[3]] = cols[0]

        if len(skipped_acc2snp) > 0:
            logging.info(
                f"While indexing\n{os.path.basename(pdg_snp_meta_file)},"
                + "\nthese isolates were skipped:\n\n"
                + "\n".join(skipped_acc2snp)
            )

        with open(pdg_meta_file, "r") as pdg_meta:
            header = pdg_meta.readline().strip().split(pdg_meta_fs)
            user_req_cols = [mcol_i for mcol_i, mcol in enumerate(header) if mcol in mcols]
            cols_not_found = [mcol for mcol in mcols if mcol not in header]
            null_wgs_accs = set()
            if len(cols_not_found) > 0:
                logging.error(
                    "The following columns do not exist in the"
                    + f"\nmetadata file [ {os.path.basename(pdg_meta_file)} ]:\n"
                    + "\n".join(cols_not_found)
                )
                exit(1)

            for line in pdg_meta.readlines():
                cols = line.strip().split(pdg_meta_fs)
                pdg_assm_acc = cols[acc_col]
                if not acc_pat.match(pdg_assm_acc):
                    null_wgs_accs.add(
                        f"Isolate {cols[target_acc_col]} has no genome accession: {pdg_assm_acc}"
                    )
                    continue

                if pdg_assm_acc in mlst_sts:
                    acc2meta[pdg_assm_acc].setdefault("mlst_sequence_type", []).append(
                        str(mlst_sts[pdg_assm_acc])
                    )

                # Each column's data is stored as a list since an accession can
                # appear in multiple metadata rows.
                for col in user_req_cols:
                    acc2meta[pdg_assm_acc].setdefault(header[col], []).append(str(cols[col]))

            if len(null_wgs_accs) > 0:
                logging.info(
                    f"While indexing\n{os.path.basename(pdg_meta_file)},"
                    + "\nthese isolates were skipped:\n\n"
                    + "\n".join(null_wgs_accs)
                )

        with open(init_pickled_sero, "wb") as write_pickled_sero:
            pickle.dump(file=write_pickled_sero, obj=acc2meta)
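
        # Downstream consumers can restore the index with pickle.load, e.g.:
        #   with open(init_pickled_sero, "rb") as fh:
        #       acc2meta = pickle.load(fh)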

        if len(num_accs_check) != len(acc2meta.keys()):
            logging.error(
                "Failed the accession count check."
                + f"\nExpected {len(num_accs_check)} accessions but got {len(acc2meta.keys())}."
            )
            exit(1)
        else:
            logging.info(
                f"Number of valid accessions: {len(num_accs_check)}"
                + f"\nNumber of accessions indexed: {len(acc2meta.keys())}"
                + f"\nNumber of accessions participating in any of the SNP Clusters: {len(acc2snp.keys())}"
                + f"\n\nCreated the pickle file for\n{os.path.basename(pdg_meta_file)}."
                + "\nThis was the only requested function."
            )

        # Files opened with `with` are closed automatically; no explicit close()
        # calls are needed here.
        exit(0)
    elif pdg_meta_file and not (
        os.path.exists(pdg_meta_file) and os.path.getsize(pdg_meta_file) > 0
    ):
        logging.error(
            "Requested to create a pickle from metadata, but\n"
            + f"the file {os.path.basename(pdg_meta_file)} is empty or\ndoes not exist!"
        )
        exit(1)


if __name__ == "__main__":
    main()