Mercurial > repos > galaxytrakr > hfp_bettercallsal_awsbatch
comparison 1.0.0/bin/index_pdg_metadata.py @ 0:801b85b03a17 draft default tip
planemo upload
| author | galaxytrakr |
|---|---|
| date | Thu, 28 May 2026 20:31:42 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:801b85b03a17 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 # Kranti Konganti | |
| 4 | |
| 5 import argparse | |
| 6 import inspect | |
| 7 import logging | |
| 8 import os | |
| 9 import pickle | |
| 10 import pprint | |
| 11 import re | |
| 12 from collections import defaultdict | |
| 13 | |
| 14 | |
| 15 # Multiple inheritence for pretty printing of help text. | |
| 16 class MultiArgFormatClasses( | |
| 17 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter | |
| 18 ): | |
| 19 pass | |
| 20 | |
| 21 | |
| 22 # Main | |
| 23 def main() -> None: | |
| 24 """ | |
| 25 This script works only in the context of `bettercallsal_db` Nextflow workflow. | |
| 26 It takes an UNIX path to directory containing the following files: | |
| 27 1. PDG metadata file (Ex: `PDG000000043.204.metadata.tsv`) | |
| 28 2. PDG SNP Cluster metadata file (Ex: `PDG000000043.204.reference_target.cluster_list.tsv`) | |
| 29 3. A list of possibly downloadable assembly accessions (one per line) from the metadata file. | |
| 30 and then generates a pickled file with relevant metadata columns mentioned with the -cols option. | |
| 31 """ | |
| 32 | |
| 33 # Set logging. | |
| 34 logging.basicConfig( | |
| 35 format="\n" | |
| 36 + "=" * 55 | |
| 37 + "\n%(asctime)s - %(levelname)s\n" | |
| 38 + "=" * 55 | |
| 39 + "\n%(message)s\n\n", | |
| 40 level=logging.DEBUG, | |
| 41 ) | |
| 42 | |
| 43 # Debug print. | |
| 44 ppp = pprint.PrettyPrinter(width=55) | |
| 45 prog_name = os.path.basename(inspect.stack()[0].filename) | |
| 46 | |
| 47 parser = argparse.ArgumentParser( | |
| 48 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses | |
| 49 ) | |
| 50 | |
| 51 required = parser.add_argument_group("required arguments") | |
| 52 | |
| 53 required.add_argument( | |
| 54 "-pdg_dir", | |
| 55 dest="pdg_dir", | |
| 56 default=False, | |
| 57 required=True, | |
| 58 help="Absolute UNIX path to directory containing the following files.\nEx:" | |
| 59 + "\n1. PDG000000043.204.metadata.tsv" | |
| 60 + "\n2. PDG000000043.204.reference_target.cluster_list.tsv" | |
| 61 + "\n3. A file of assembly accessions, one per line parsed out from" | |
| 62 + "\n the metadata file.", | |
| 63 ) | |
| 64 parser.add_argument( | |
| 65 "-mlst", | |
| 66 dest="mlst_results", | |
| 67 required=False, | |
| 68 help="Absolute UNIX path to MLST results file\nIf MLST results exists for a accession, they" | |
| 69 + "\nwill be included in the index.", | |
| 70 ) | |
| 71 parser.add_argument( | |
| 72 "-pdg_meta_pat", | |
| 73 dest="pdg_meta_pat", | |
| 74 default="PDG\d+\.\d+\.metadata\.tsv", | |
| 75 required=False, | |
| 76 help="The pattern to be used to search for PDG metadata\nfile.", | |
| 77 ) | |
| 78 parser.add_argument( | |
| 79 "-pdg_snp_meta_pat", | |
| 80 dest="pdg_snp_meta_pat", | |
| 81 default="PDG\d+\.\d+\.reference\_target\.cluster\_list\.tsv", | |
| 82 required=False, | |
| 83 help="The pattern to be used to search for PDG SNP Cluster metadata\nfile.", | |
| 84 ) | |
| 85 parser.add_argument( | |
| 86 "-pdg_accs_filename_pat", | |
| 87 dest="pdg_accs_fn_pat", | |
| 88 default="accs_all.txt", | |
| 89 required=False, | |
| 90 help="The filename to look for where all the parsed GC[AF] accessions are stored,\n" | |
| 91 + "one per line.", | |
| 92 ) | |
| 93 parser.add_argument( | |
| 94 "-cols", | |
| 95 dest="metadata_cols", | |
| 96 default="epi_type,collected_by,collection_date,host," | |
| 97 + "\nhost_disease,isolation_source,outbreak,sample_name,scientific_name,serovar," | |
| 98 + "\nsource_type,strain,computed_types,target_acc,geo_loc_name", | |
| 99 required=False, | |
| 100 help="The data in these metadata columns will be indexed for each\nisolate.", | |
| 101 ) | |
| 102 parser.add_argument( | |
| 103 "-fs", | |
| 104 dest="force_write_pick", | |
| 105 action="store_true", | |
| 106 required=False, | |
| 107 help="By default, when -s flag is on, the pickle file named *.IDXD_PDG_METAD.pickle" | |
| 108 + "\nis written to CWD. If the file exists, the program will not overwrite" | |
| 109 + "\nand exit. Use -fs option to overwrite.", | |
| 110 ) | |
| 111 parser.add_argument( | |
| 112 "-op", | |
| 113 dest="out_prefix", | |
| 114 default="IDXD_PDG_METAD", | |
| 115 help="Set the output file prefix for indexed PDG metadata.", | |
| 116 ) | |
| 117 parser.add_argument( | |
| 118 "-pfs", | |
| 119 dest="pdg_meta_fs", | |
| 120 default="\t", | |
| 121 help="Change the field separator of the PDG metadata file.", | |
| 122 ) | |
| 123 | |
| 124 args = parser.parse_args() | |
| 125 pdg_dir = os.path.abspath(args.pdg_dir) | |
| 126 mcols = args.metadata_cols | |
| 127 f_write_pick = args.force_write_pick | |
| 128 out_prefix = args.out_prefix | |
| 129 pdg_meta_fs = args.pdg_meta_fs | |
| 130 mlst_res = args.mlst_results | |
| 131 acc_pat = re.compile(r"^GC[AF]\_\d+\.?\d*") | |
| 132 mcols_pat = re.compile(r"[\w+\,]") | |
| 133 pdg_meta_pat = re.compile(f"{args.pdg_meta_pat}") | |
| 134 pdg_snp_meta_pat = re.compile(f"{args.pdg_snp_meta_pat}") | |
| 135 pdg_accs_fn_pat = re.compile(f"{args.pdg_accs_fn_pat}") | |
| 136 target_acc_col = 41 | |
| 137 acc_col = 9 | |
| 138 num_accs_check = list() | |
| 139 mlst_sts = dict() | |
| 140 acceptable_num_mlst_cols = 10 | |
| 141 mlst_st_col = 2 | |
| 142 mlst_acc_col = 0 | |
| 143 | |
| 144 # Basic checks | |
| 145 | |
| 146 if os.path.exists(pdg_dir) and os.path.isdir(pdg_dir): | |
| 147 pdg_meta_file = [f for f in os.listdir(pdg_dir) if pdg_meta_pat.match(f)] | |
| 148 pdg_snp_meta_file = [ | |
| 149 f for f in os.listdir(pdg_dir) if pdg_snp_meta_pat.match(f) | |
| 150 ] | |
| 151 pdg_acc_all = [f for f in os.listdir(pdg_dir) if pdg_accs_fn_pat.match(f)] | |
| 152 req_files = [len(pdg_meta_file), len(pdg_snp_meta_file), len(pdg_acc_all)] | |
| 153 if any(x > 1 for x in req_files): | |
| 154 logging.error( | |
| 155 f"Directory {os.path.basename(pdg_dir)} contains" | |
| 156 + "\ncontains mulitple files matching the search pattern." | |
| 157 ) | |
| 158 exit(1) | |
| 159 elif any(x == 0 for x in req_files): | |
| 160 logging.error( | |
| 161 f"Directory {os.path.basename(pdg_dir)} does not contain" | |
| 162 + "\nany files matching the following file patterns:" | |
| 163 + f"\n{pdg_meta_pat.pattern}" | |
| 164 + f"\n{pdg_snp_meta_pat.pattern}" | |
| 165 + f"\n{pdg_accs_fn_pat.pattern}" | |
| 166 ) | |
| 167 exit(1) | |
| 168 pdg_meta_file = os.path.join(pdg_dir, "".join(pdg_meta_file)) | |
| 169 pdg_snp_meta_file = os.path.join(pdg_dir, "".join(pdg_snp_meta_file)) | |
| 170 pdg_acc_all = os.path.join(pdg_dir, "".join(pdg_acc_all)) | |
| 171 else: | |
| 172 logging.error(f"Directory path {pdg_dir} does not exist.") | |
| 173 exit(1) | |
| 174 | |
| 175 if mlst_res and not (os.path.exists(mlst_res) or os.path.getsize(mlst_res) > 0): | |
| 176 logging.error( | |
| 177 f"Requested to index MLST results. but the file {os.path.basename(mlst_res)}" | |
| 178 + "does not exist or the file is empty." | |
| 179 ) | |
| 180 exit(1) | |
| 181 elif mlst_res: | |
| 182 with open(mlst_res, "r") as mlst_res_fh: | |
| 183 header = mlst_res_fh.readline() | |
| 184 mlst_res_has_10_cols = False | |
| 185 | |
| 186 for line in mlst_res_fh: | |
| 187 cols = line.strip().split("\t") | |
| 188 acc = acc_pat.findall(cols[mlst_acc_col]) | |
| 189 if len(acc) > 1: | |
| 190 logging.error( | |
| 191 f"Found more than 1 accession in column:\ncols[mlst_acc_col]\n" | |
| 192 ) | |
| 193 exit(1) | |
| 194 else: | |
| 195 acc = "".join(acc) | |
| 196 if len(cols) == acceptable_num_mlst_cols and re.match( | |
| 197 r"\d+|\-", cols[mlst_st_col] | |
| 198 ): | |
| 199 mlst_res_has_10_cols = True | |
| 200 if re.match(r"\-", cols[mlst_st_col]): | |
| 201 mlst_sts[acc] = "NULL" | |
| 202 else: | |
| 203 mlst_sts[acc] = cols[mlst_st_col] | |
| 204 | |
| 205 if not mlst_res_has_10_cols: | |
| 206 logging.error( | |
| 207 "Requested to incorporate MLST ST's but file" | |
| 208 + f"\n{os.path.basename(mlst_res)}" | |
| 209 + "\ndoes not have 10 columns in all rows." | |
| 210 ) | |
| 211 exit(1) | |
| 212 | |
| 213 mlst_res_fh.close() | |
| 214 | |
| 215 with open(pdg_acc_all, "r") as pdg_acc_all_fh: | |
| 216 for a in pdg_acc_all_fh.readlines(): | |
| 217 num_accs_check.append(a.strip()) | |
| 218 | |
| 219 if not mcols_pat.match(mcols): | |
| 220 logging.error( | |
| 221 f"Supplied columns' names should only be" | |
| 222 + "\nalphanumeric (including _) separated by a comma." | |
| 223 ) | |
| 224 exit(1) | |
| 225 else: | |
| 226 mcols = re.sub("\n", "", mcols).split(",") | |
| 227 | |
| 228 if ( | |
| 229 pdg_snp_meta_file | |
| 230 and os.path.exists(pdg_snp_meta_file) | |
| 231 and os.path.getsize(pdg_snp_meta_file) > 0 | |
| 232 ): | |
| 233 acc2snp = defaultdict() | |
| 234 acc2meta = defaultdict(defaultdict) | |
| 235 init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".pickle") | |
| 236 | |
| 237 if ( | |
| 238 os.path.exists(init_pickled_sero) | |
| 239 and os.path.getsize(init_pickled_sero) | |
| 240 and not f_write_pick | |
| 241 ): | |
| 242 logging.error( | |
| 243 f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n" | |
| 244 + "Use -fs to force overwrite it." | |
| 245 ) | |
| 246 exit(1) | |
| 247 | |
| 248 with open(pdg_snp_meta_file, "r") as snp_meta: | |
| 249 header = snp_meta.readline() | |
| 250 skipped_acc2snp = set() | |
| 251 for line in snp_meta: | |
| 252 cols = line.strip().split(pdg_meta_fs) | |
| 253 if not 4 <= len(cols) < 5: | |
| 254 logging.error( | |
| 255 f"The metadata file {pdg_snp_meta_file} is malformed.\n" | |
| 256 + f"Expected 4 columns. Got {len(cols)} columns.\n" | |
| 257 ) | |
| 258 exit(1) | |
| 259 | |
| 260 if re.match("NULL", cols[3]): | |
| 261 skipped_acc2snp.add( | |
| 262 f"Isolate {cols[1]} has no genome accession: {cols[3]}" | |
| 263 ) | |
| 264 elif not acc_pat.match(cols[3]): | |
| 265 logging.error( | |
| 266 f"Did not find accession in either field number 4\n" | |
| 267 + "or field number 10 of column 4." | |
| 268 + f"\nLine: {line}" | |
| 269 ) | |
| 270 exit(1) | |
| 271 elif not re.match("NULL", cols[3]): | |
| 272 acc2snp[cols[3]] = cols[0] | |
| 273 | |
| 274 if len(skipped_acc2snp) > 0: | |
| 275 logging.info( | |
| 276 f"While indexing\n{os.path.basename(pdg_snp_meta_file)}," | |
| 277 + "\nthese isolates were skipped:\n\n" | |
| 278 + "\n".join(skipped_acc2snp) | |
| 279 ) | |
| 280 | |
| 281 with open(pdg_meta_file, "r") as pdg_meta: | |
| 282 header = pdg_meta.readline().strip().split(pdg_meta_fs) | |
| 283 user_req_cols = [ | |
| 284 mcol_i for mcol_i, mcol in enumerate(header) if mcol in mcols | |
| 285 ] | |
| 286 cols_not_found = [mcol for mcol in mcols if mcol not in header] | |
| 287 null_wgs_accs = set() | |
| 288 if len(cols_not_found) > 0: | |
| 289 logging.error( | |
| 290 f"The following columns do not exist in the" | |
| 291 + f"\nmetadata file [ {os.path.basename(pdg_meta_file)} ]:\n" | |
| 292 + "".join(cols_not_found) | |
| 293 ) | |
| 294 exit(1) | |
| 295 | |
| 296 for line in pdg_meta.readlines(): | |
| 297 cols = line.strip().split(pdg_meta_fs) | |
| 298 pdg_assm_acc = cols[acc_col] | |
| 299 if not acc_pat.match(pdg_assm_acc): | |
| 300 null_wgs_accs.add( | |
| 301 f"Isolate {cols[target_acc_col]} has no genome accession: {pdg_assm_acc}" | |
| 302 ) | |
| 303 continue | |
| 304 | |
| 305 if pdg_assm_acc in mlst_sts.keys(): | |
| 306 acc2meta[pdg_assm_acc].setdefault("mlst_sequence_type", []).append( | |
| 307 str(mlst_sts[pdg_assm_acc]) | |
| 308 ) | |
| 309 | |
| 310 for col in user_req_cols: | |
| 311 acc2meta[pdg_assm_acc].setdefault(header[col], []).append( | |
| 312 str(cols[col]) | |
| 313 ) | |
| 314 | |
| 315 if len(null_wgs_accs) > 0: | |
| 316 logging.info( | |
| 317 f"While indexing\n{os.path.basename(pdg_meta_file)}," | |
| 318 + "\nthese isolates were skipped:\n\n" | |
| 319 + "\n".join(null_wgs_accs) | |
| 320 ) | |
| 321 | |
| 322 with open(init_pickled_sero, "wb") as write_pickled_sero: | |
| 323 pickle.dump(file=write_pickled_sero, obj=acc2meta) | |
| 324 | |
| 325 if len(num_accs_check) != len(acc2meta.keys()): | |
| 326 logging.error( | |
| 327 "Failed the accession count check." | |
| 328 + f"\nExpected {len(num_accs_check)} accessions but got {len(acc2meta.keys())}." | |
| 329 ) | |
| 330 exit(1) | |
| 331 else: | |
| 332 logging.info( | |
| 333 f"Number of valid accessions: {len(num_accs_check)}" | |
| 334 + f"\nNumber of accessions indexed: {len(acc2meta.keys())}" | |
| 335 + f"\nNumber of accessions participating in any of the SNP Clusters: {len(acc2snp.keys())}" | |
| 336 + f"\n\nCreated the pickle file for\n{os.path.basename(pdg_meta_file)}." | |
| 337 + "\nThis was the only requested function." | |
| 338 ) | |
| 339 | |
| 340 snp_meta.close() | |
| 341 write_pickled_sero.close() | |
| 342 exit(0) | |
| 343 elif pdg_meta_file and not ( | |
| 344 os.path.exists(pdg_meta_file) and os.path.getsize(pdg_meta_file) > 0 | |
| 345 ): | |
| 346 logging.error( | |
| 347 "Requested to create pickle from metadata, but\n" | |
| 348 + f"the file, {os.path.basename(pdg_meta_file)} is empty or\ndoes not exist!" | |
| 349 ) | |
| 350 exit(1) | |
| 351 | |
| 352 pdg_acc_all_fh.close() | |
| 353 snp_meta.close() | |
| 354 pdg_meta.close() | |
| 355 write_pickled_sero.close() | |
| 356 | |
| 357 | |
| 358 if __name__ == "__main__": | |
| 359 main() |
