|
0
|
1 #!/usr/bin/env python3
|
|
|
2
|
|
|
3 # Kranti Konganti
|
|
|
4
|
|
|
5 import argparse
|
|
|
6 import inspect
|
|
|
7 import logging
|
|
|
8 import os
|
|
|
9 import pickle
|
|
|
10 import pprint
|
|
|
11 import re
|
|
|
12 from collections import defaultdict
|
|
|
13
|
|
|
14
|
|
|
15 # Multiple inheritence for pretty printing of help text.
|
|
|
16 class MultiArgFormatClasses(
|
|
|
17 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
|
|
|
18 ):
|
|
|
19 pass
|
|
|
20
|
|
|
21
|
|
|
22 # Main
|
|
|
23 def main() -> None:
|
|
|
24 """
|
|
|
25 This script works only in the context of `bettercallsal_db` Nextflow workflow.
|
|
|
26 It takes an UNIX path to directory containing the following files:
|
|
|
27 1. PDG metadata file (Ex: `PDG000000043.204.metadata.tsv`)
|
|
|
28 2. PDG SNP Cluster metadata file (Ex: `PDG000000043.204.reference_target.cluster_list.tsv`)
|
|
|
29 3. A list of possibly downloadable assembly accessions (one per line) from the metadata file.
|
|
|
30 and then generates a pickled file with relevant metadata columns mentioned with the -cols option.
|
|
|
31 """
|
|
|
32
|
|
|
33 # Set logging.
|
|
|
34 logging.basicConfig(
|
|
|
35 format="\n"
|
|
|
36 + "=" * 55
|
|
|
37 + "\n%(asctime)s - %(levelname)s\n"
|
|
|
38 + "=" * 55
|
|
|
39 + "\n%(message)s\n\n",
|
|
|
40 level=logging.DEBUG,
|
|
|
41 )
|
|
|
42
|
|
|
43 # Debug print.
|
|
|
44 ppp = pprint.PrettyPrinter(width=55)
|
|
|
45 prog_name = os.path.basename(inspect.stack()[0].filename)
|
|
|
46
|
|
|
47 parser = argparse.ArgumentParser(
|
|
|
48 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
|
|
|
49 )
|
|
|
50
|
|
|
51 required = parser.add_argument_group("required arguments")
|
|
|
52
|
|
|
53 required.add_argument(
|
|
|
54 "-pdg_dir",
|
|
|
55 dest="pdg_dir",
|
|
|
56 default=False,
|
|
|
57 required=True,
|
|
|
58 help="Absolute UNIX path to directory containing the following files.\nEx:"
|
|
|
59 + "\n1. PDG000000043.204.metadata.tsv"
|
|
|
60 + "\n2. PDG000000043.204.reference_target.cluster_list.tsv"
|
|
|
61 + "\n3. A file of assembly accessions, one per line parsed out from"
|
|
|
62 + "\n the metadata file.",
|
|
|
63 )
|
|
|
64 parser.add_argument(
|
|
|
65 "-mlst",
|
|
|
66 dest="mlst_results",
|
|
|
67 required=False,
|
|
|
68 help="Absolute UNIX path to MLST results file\nIf MLST results exists for a accession, they"
|
|
|
69 + "\nwill be included in the index.",
|
|
|
70 )
|
|
|
71 parser.add_argument(
|
|
|
72 "-pdg_meta_pat",
|
|
|
73 dest="pdg_meta_pat",
|
|
|
74 default="PDG\d+\.\d+\.metadata\.tsv",
|
|
|
75 required=False,
|
|
|
76 help="The pattern to be used to search for PDG metadata\nfile.",
|
|
|
77 )
|
|
|
78 parser.add_argument(
|
|
|
79 "-pdg_snp_meta_pat",
|
|
|
80 dest="pdg_snp_meta_pat",
|
|
|
81 default="PDG\d+\.\d+\.reference\_target\.cluster\_list\.tsv",
|
|
|
82 required=False,
|
|
|
83 help="The pattern to be used to search for PDG SNP Cluster metadata\nfile.",
|
|
|
84 )
|
|
|
85 parser.add_argument(
|
|
|
86 "-pdg_accs_filename_pat",
|
|
|
87 dest="pdg_accs_fn_pat",
|
|
|
88 default="accs_all.txt",
|
|
|
89 required=False,
|
|
|
90 help="The filename to look for where all the parsed GC[AF] accessions are stored,\n"
|
|
|
91 + "one per line.",
|
|
|
92 )
|
|
|
93 parser.add_argument(
|
|
|
94 "-cols",
|
|
|
95 dest="metadata_cols",
|
|
|
96 default="epi_type,collected_by,collection_date,host,"
|
|
|
97 + "\nhost_disease,isolation_source,outbreak,sample_name,scientific_name,serovar,"
|
|
|
98 + "\nsource_type,strain,computed_types,target_acc,geo_loc_name",
|
|
|
99 required=False,
|
|
|
100 help="The data in these metadata columns will be indexed for each\nisolate.",
|
|
|
101 )
|
|
|
102 parser.add_argument(
|
|
|
103 "-fs",
|
|
|
104 dest="force_write_pick",
|
|
|
105 action="store_true",
|
|
|
106 required=False,
|
|
|
107 help="By default, when -s flag is on, the pickle file named *.IDXD_PDG_METAD.pickle"
|
|
|
108 + "\nis written to CWD. If the file exists, the program will not overwrite"
|
|
|
109 + "\nand exit. Use -fs option to overwrite.",
|
|
|
110 )
|
|
|
111 parser.add_argument(
|
|
|
112 "-op",
|
|
|
113 dest="out_prefix",
|
|
|
114 default="IDXD_PDG_METAD",
|
|
|
115 help="Set the output file prefix for indexed PDG metadata.",
|
|
|
116 )
|
|
|
117 parser.add_argument(
|
|
|
118 "-pfs",
|
|
|
119 dest="pdg_meta_fs",
|
|
|
120 default="\t",
|
|
|
121 help="Change the field separator of the PDG metadata file.",
|
|
|
122 )
|
|
|
123
|
|
|
124 args = parser.parse_args()
|
|
|
125 pdg_dir = os.path.abspath(args.pdg_dir)
|
|
|
126 mcols = args.metadata_cols
|
|
|
127 f_write_pick = args.force_write_pick
|
|
|
128 out_prefix = args.out_prefix
|
|
|
129 pdg_meta_fs = args.pdg_meta_fs
|
|
|
130 mlst_res = args.mlst_results
|
|
|
131 acc_pat = re.compile(r"^GC[AF]\_\d+\.?\d*")
|
|
|
132 mcols_pat = re.compile(r"[\w+\,]")
|
|
|
133 pdg_meta_pat = re.compile(f"{args.pdg_meta_pat}")
|
|
|
134 pdg_snp_meta_pat = re.compile(f"{args.pdg_snp_meta_pat}")
|
|
|
135 pdg_accs_fn_pat = re.compile(f"{args.pdg_accs_fn_pat}")
|
|
|
136 target_acc_col = 41
|
|
|
137 acc_col = 9
|
|
|
138 num_accs_check = list()
|
|
|
139 mlst_sts = dict()
|
|
|
140 acceptable_num_mlst_cols = 10
|
|
|
141 mlst_st_col = 2
|
|
|
142 mlst_acc_col = 0
|
|
|
143
|
|
|
144 # Basic checks
|
|
|
145
|
|
|
146 if os.path.exists(pdg_dir) and os.path.isdir(pdg_dir):
|
|
|
147 pdg_meta_file = [f for f in os.listdir(pdg_dir) if pdg_meta_pat.match(f)]
|
|
|
148 pdg_snp_meta_file = [
|
|
|
149 f for f in os.listdir(pdg_dir) if pdg_snp_meta_pat.match(f)
|
|
|
150 ]
|
|
|
151 pdg_acc_all = [f for f in os.listdir(pdg_dir) if pdg_accs_fn_pat.match(f)]
|
|
|
152 req_files = [len(pdg_meta_file), len(pdg_snp_meta_file), len(pdg_acc_all)]
|
|
|
153 if any(x > 1 for x in req_files):
|
|
|
154 logging.error(
|
|
|
155 f"Directory {os.path.basename(pdg_dir)} contains"
|
|
|
156 + "\ncontains mulitple files matching the search pattern."
|
|
|
157 )
|
|
|
158 exit(1)
|
|
|
159 elif any(x == 0 for x in req_files):
|
|
|
160 logging.error(
|
|
|
161 f"Directory {os.path.basename(pdg_dir)} does not contain"
|
|
|
162 + "\nany files matching the following file patterns:"
|
|
|
163 + f"\n{pdg_meta_pat.pattern}"
|
|
|
164 + f"\n{pdg_snp_meta_pat.pattern}"
|
|
|
165 + f"\n{pdg_accs_fn_pat.pattern}"
|
|
|
166 )
|
|
|
167 exit(1)
|
|
|
168 pdg_meta_file = os.path.join(pdg_dir, "".join(pdg_meta_file))
|
|
|
169 pdg_snp_meta_file = os.path.join(pdg_dir, "".join(pdg_snp_meta_file))
|
|
|
170 pdg_acc_all = os.path.join(pdg_dir, "".join(pdg_acc_all))
|
|
|
171 else:
|
|
|
172 logging.error(f"Directory path {pdg_dir} does not exist.")
|
|
|
173 exit(1)
|
|
|
174
|
|
|
175 if mlst_res and not (os.path.exists(mlst_res) or os.path.getsize(mlst_res) > 0):
|
|
|
176 logging.error(
|
|
|
177 f"Requested to index MLST results. but the file {os.path.basename(mlst_res)}"
|
|
|
178 + "does not exist or the file is empty."
|
|
|
179 )
|
|
|
180 exit(1)
|
|
|
181 elif mlst_res:
|
|
|
182 with open(mlst_res, "r") as mlst_res_fh:
|
|
|
183 header = mlst_res_fh.readline()
|
|
|
184 mlst_res_has_10_cols = False
|
|
|
185
|
|
|
186 for line in mlst_res_fh:
|
|
|
187 cols = line.strip().split("\t")
|
|
|
188 acc = acc_pat.findall(cols[mlst_acc_col])
|
|
|
189 if len(acc) > 1:
|
|
|
190 logging.error(
|
|
|
191 f"Found more than 1 accession in column:\ncols[mlst_acc_col]\n"
|
|
|
192 )
|
|
|
193 exit(1)
|
|
|
194 else:
|
|
|
195 acc = "".join(acc)
|
|
|
196 if len(cols) == acceptable_num_mlst_cols and re.match(
|
|
|
197 r"\d+|\-", cols[mlst_st_col]
|
|
|
198 ):
|
|
|
199 mlst_res_has_10_cols = True
|
|
|
200 if re.match(r"\-", cols[mlst_st_col]):
|
|
|
201 mlst_sts[acc] = "NULL"
|
|
|
202 else:
|
|
|
203 mlst_sts[acc] = cols[mlst_st_col]
|
|
|
204
|
|
|
205 if not mlst_res_has_10_cols:
|
|
|
206 logging.error(
|
|
|
207 "Requested to incorporate MLST ST's but file"
|
|
|
208 + f"\n{os.path.basename(mlst_res)}"
|
|
|
209 + "\ndoes not have 10 columns in all rows."
|
|
|
210 )
|
|
|
211 exit(1)
|
|
|
212
|
|
|
213 mlst_res_fh.close()
|
|
|
214
|
|
|
215 with open(pdg_acc_all, "r") as pdg_acc_all_fh:
|
|
|
216 for a in pdg_acc_all_fh.readlines():
|
|
|
217 num_accs_check.append(a.strip())
|
|
|
218
|
|
|
219 if not mcols_pat.match(mcols):
|
|
|
220 logging.error(
|
|
|
221 f"Supplied columns' names should only be"
|
|
|
222 + "\nalphanumeric (including _) separated by a comma."
|
|
|
223 )
|
|
|
224 exit(1)
|
|
|
225 else:
|
|
|
226 mcols = re.sub("\n", "", mcols).split(",")
|
|
|
227
|
|
|
228 if (
|
|
|
229 pdg_snp_meta_file
|
|
|
230 and os.path.exists(pdg_snp_meta_file)
|
|
|
231 and os.path.getsize(pdg_snp_meta_file) > 0
|
|
|
232 ):
|
|
|
233 acc2snp = defaultdict()
|
|
|
234 acc2meta = defaultdict(defaultdict)
|
|
|
235 init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".pickle")
|
|
|
236
|
|
|
237 if (
|
|
|
238 os.path.exists(init_pickled_sero)
|
|
|
239 and os.path.getsize(init_pickled_sero)
|
|
|
240 and not f_write_pick
|
|
|
241 ):
|
|
|
242 logging.error(
|
|
|
243 f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n"
|
|
|
244 + "Use -fs to force overwrite it."
|
|
|
245 )
|
|
|
246 exit(1)
|
|
|
247
|
|
|
248 with open(pdg_snp_meta_file, "r") as snp_meta:
|
|
|
249 header = snp_meta.readline()
|
|
|
250 skipped_acc2snp = set()
|
|
|
251 for line in snp_meta:
|
|
|
252 cols = line.strip().split(pdg_meta_fs)
|
|
|
253 if not 4 <= len(cols) < 5:
|
|
|
254 logging.error(
|
|
|
255 f"The metadata file {pdg_snp_meta_file} is malformed.\n"
|
|
|
256 + f"Expected 4 columns. Got {len(cols)} columns.\n"
|
|
|
257 )
|
|
|
258 exit(1)
|
|
|
259
|
|
|
260 if re.match("NULL", cols[3]):
|
|
|
261 skipped_acc2snp.add(
|
|
|
262 f"Isolate {cols[1]} has no genome accession: {cols[3]}"
|
|
|
263 )
|
|
|
264 elif not acc_pat.match(cols[3]):
|
|
|
265 logging.error(
|
|
|
266 f"Did not find accession in either field number 4\n"
|
|
|
267 + "or field number 10 of column 4."
|
|
|
268 + f"\nLine: {line}"
|
|
|
269 )
|
|
|
270 exit(1)
|
|
|
271 elif not re.match("NULL", cols[3]):
|
|
|
272 acc2snp[cols[3]] = cols[0]
|
|
|
273
|
|
|
274 if len(skipped_acc2snp) > 0:
|
|
|
275 logging.info(
|
|
|
276 f"While indexing\n{os.path.basename(pdg_snp_meta_file)},"
|
|
|
277 + "\nthese isolates were skipped:\n\n"
|
|
|
278 + "\n".join(skipped_acc2snp)
|
|
|
279 )
|
|
|
280
|
|
|
281 with open(pdg_meta_file, "r") as pdg_meta:
|
|
|
282 header = pdg_meta.readline().strip().split(pdg_meta_fs)
|
|
|
283 user_req_cols = [
|
|
|
284 mcol_i for mcol_i, mcol in enumerate(header) if mcol in mcols
|
|
|
285 ]
|
|
|
286 cols_not_found = [mcol for mcol in mcols if mcol not in header]
|
|
|
287 null_wgs_accs = set()
|
|
|
288 if len(cols_not_found) > 0:
|
|
|
289 logging.error(
|
|
|
290 f"The following columns do not exist in the"
|
|
|
291 + f"\nmetadata file [ {os.path.basename(pdg_meta_file)} ]:\n"
|
|
|
292 + "".join(cols_not_found)
|
|
|
293 )
|
|
|
294 exit(1)
|
|
|
295
|
|
|
296 for line in pdg_meta.readlines():
|
|
|
297 cols = line.strip().split(pdg_meta_fs)
|
|
|
298 pdg_assm_acc = cols[acc_col]
|
|
|
299 if not acc_pat.match(pdg_assm_acc):
|
|
|
300 null_wgs_accs.add(
|
|
|
301 f"Isolate {cols[target_acc_col]} has no genome accession: {pdg_assm_acc}"
|
|
|
302 )
|
|
|
303 continue
|
|
|
304
|
|
|
305 if pdg_assm_acc in mlst_sts.keys():
|
|
|
306 acc2meta[pdg_assm_acc].setdefault("mlst_sequence_type", []).append(
|
|
|
307 str(mlst_sts[pdg_assm_acc])
|
|
|
308 )
|
|
|
309
|
|
|
310 for col in user_req_cols:
|
|
|
311 acc2meta[pdg_assm_acc].setdefault(header[col], []).append(
|
|
|
312 str(cols[col])
|
|
|
313 )
|
|
|
314
|
|
|
315 if len(null_wgs_accs) > 0:
|
|
|
316 logging.info(
|
|
|
317 f"While indexing\n{os.path.basename(pdg_meta_file)},"
|
|
|
318 + "\nthese isolates were skipped:\n\n"
|
|
|
319 + "\n".join(null_wgs_accs)
|
|
|
320 )
|
|
|
321
|
|
|
322 with open(init_pickled_sero, "wb") as write_pickled_sero:
|
|
|
323 pickle.dump(file=write_pickled_sero, obj=acc2meta)
|
|
|
324
|
|
|
325 if len(num_accs_check) != len(acc2meta.keys()):
|
|
|
326 logging.error(
|
|
|
327 "Failed the accession count check."
|
|
|
328 + f"\nExpected {len(num_accs_check)} accessions but got {len(acc2meta.keys())}."
|
|
|
329 )
|
|
|
330 exit(1)
|
|
|
331 else:
|
|
|
332 logging.info(
|
|
|
333 f"Number of valid accessions: {len(num_accs_check)}"
|
|
|
334 + f"\nNumber of accessions indexed: {len(acc2meta.keys())}"
|
|
|
335 + f"\nNumber of accessions participating in any of the SNP Clusters: {len(acc2snp.keys())}"
|
|
|
336 + f"\n\nCreated the pickle file for\n{os.path.basename(pdg_meta_file)}."
|
|
|
337 + "\nThis was the only requested function."
|
|
|
338 )
|
|
|
339
|
|
|
340 snp_meta.close()
|
|
|
341 write_pickled_sero.close()
|
|
|
342 exit(0)
|
|
|
343 elif pdg_meta_file and not (
|
|
|
344 os.path.exists(pdg_meta_file) and os.path.getsize(pdg_meta_file) > 0
|
|
|
345 ):
|
|
|
346 logging.error(
|
|
|
347 "Requested to create pickle from metadata, but\n"
|
|
|
348 + f"the file, {os.path.basename(pdg_meta_file)} is empty or\ndoes not exist!"
|
|
|
349 )
|
|
|
350 exit(1)
|
|
|
351
|
|
|
352 pdg_acc_all_fh.close()
|
|
|
353 snp_meta.close()
|
|
|
354 pdg_meta.close()
|
|
|
355 write_pickled_sero.close()
|
|
|
356
|
|
|
357
|
|
|
358 if __name__ == "__main__":
|
|
|
359 main()
|