comparison 0.2.0/bin/index_pdg_metadata.py @ 11:a5f31c44f8c9

planemo upload

author: kkonganti
date:   Mon, 15 Jul 2024 16:11:44 -0400
comparing 10:ddf7a172bf30 with 11:a5f31c44f8c9

#!/usr/bin/env python3

# Kranti Konganti

import argparse
import inspect
import logging
import os
import pickle
import pprint
import re
from collections import defaultdict


# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    pass


# Main
def main() -> None:
    """
    This script works only in the context of the `cronology_db` Nextflow workflow.
    It takes a UNIX path to a directory containing the following files:
        1. PDG metadata file (Ex: `PDG000000043.204.metadata.tsv`)
        2. PDG SNP Cluster metadata file (Ex: `PDG000000043.204.reference_target.cluster_list.tsv`)
        3. A list of possibly downloadable assembly accessions (one per line) from the metadata file.
    and then generates a pickled file with the relevant metadata columns mentioned with the -cols option.
    """

    # Set logging.
    logging.basicConfig(
        format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    # Debug print.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-pdg_dir",
        dest="pdg_dir",
        default=False,
        required=True,
        help="Absolute UNIX path to the directory containing the following files.\nEx:"
        + "\n1. PDG000000043.204.metadata.tsv"
        + "\n2. PDG000000043.204.reference_target.cluster_list.tsv"
        + "\n3. A file of assembly accessions, one per line, parsed out from"
        + "\n   the metadata file.",
    )
    parser.add_argument(
        "-mlst",
        dest="mlst_results",
        required=False,
        help="Absolute UNIX path to the MLST results file.\nIf MLST results exist for an accession, they"
        + "\nwill be included in the index.",
    )
    parser.add_argument(
        "-pdg_meta_pat",
        dest="pdg_meta_pat",
        default=r"PDG\d+\.\d+\.metadata\.tsv",
        required=False,
        help="The pattern to be used to search for the PDG metadata\nfile.",
    )
    parser.add_argument(
        "-pdg_snp_meta_pat",
        dest="pdg_snp_meta_pat",
        default=r"PDG\d+\.\d+\.reference_target\.cluster_list\.tsv",
        required=False,
        help="The pattern to be used to search for the PDG SNP Cluster metadata\nfile.",
    )
    parser.add_argument(
        "-pdg_accs_filename_pat",
        dest="pdg_accs_fn_pat",
        default="accs_all.txt",
        required=False,
        help="The filename to look for where all the parsed GC[AF] accessions are stored,\n"
        + "one per line.",
    )
    parser.add_argument(
        "-cols",
        dest="metadata_cols",
        default="epi_type,collected_by,collection_date,host,"
        + "\nhost_disease,isolation_source,outbreak,sample_name,scientific_name,serovar,"
        + "\nsource_type,strain,computed_types,target_acc",
        required=False,
        help="The data in these metadata columns will be indexed for each\nisolate.",
    )
    parser.add_argument(
        "-fs",
        dest="force_write_pick",
        action="store_true",
        required=False,
        help="By default, when the -s flag is on, the pickle file named *.IDXD_PDG_METAD.pickle"
        + "\nis written to the CWD. If the file exists, the program will not overwrite it"
        + "\nand will exit. Use the -fs option to overwrite.",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="IDXD_PDG_METAD",
        help="Set the output file prefix for the indexed PDG metadata.",
    )
    parser.add_argument(
        "-pfs",
        dest="pdg_meta_fs",
        default="\t",
        help="Change the field separator of the PDG metadata file.",
    )

    args = parser.parse_args()
    pdg_dir = os.path.abspath(args.pdg_dir)
    mcols = args.metadata_cols
    f_write_pick = args.force_write_pick
    out_prefix = args.out_prefix
    pdg_meta_fs = args.pdg_meta_fs
    mlst_res = args.mlst_results
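    # GC[AF] matches both GenBank (GCA_) and RefSeq (GCF_) assembly
    # accessions, with an optional version suffix (e.g. GCA_000008865.2).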
    acc_pat = re.compile(r"^GC[AF]_\d+\.?\d*")
    # Validate the full -cols string: word characters and commas only
    # (embedded newlines from the default value are stripped later).
    mcols_pat = re.compile(r"^[\w,\n]+$")
    pdg_meta_pat = re.compile(args.pdg_meta_pat)
    pdg_snp_meta_pat = re.compile(args.pdg_snp_meta_pat)
    pdg_accs_fn_pat = re.compile(args.pdg_accs_fn_pat)
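    # The two offsets below are 0-based field indices into the PDG metadata
    # TSV: acc_col = 9 reads the assembly accession and target_acc_col = 41
    # reads the target (isolate) accession used in log messages.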
    target_acc_col = 41
    acc_col = 9
    num_accs_check = list()
    mlst_sts = dict()
    acceptable_num_mlst_cols = 10
    mlst_st_col = 2
    mlst_acc_col = 0

    # Basic checks

    if os.path.exists(pdg_dir) and os.path.isdir(pdg_dir):
        pdg_meta_file = [f for f in os.listdir(pdg_dir) if pdg_meta_pat.match(f)]
        pdg_snp_meta_file = [f for f in os.listdir(pdg_dir) if pdg_snp_meta_pat.match(f)]
        pdg_acc_all = [f for f in os.listdir(pdg_dir) if pdg_accs_fn_pat.match(f)]
        req_files = [len(pdg_meta_file), len(pdg_snp_meta_file), len(pdg_acc_all)]
        if any(x > 1 for x in req_files):
            logging.error(
                f"Directory {os.path.basename(pdg_dir)} contains"
                + "\nmultiple files matching the search pattern."
            )
            exit(1)
        elif any(x == 0 for x in req_files):
            logging.error(
                f"Directory {os.path.basename(pdg_dir)} does not contain"
                + "\nany files matching the following file patterns:"
                + f"\n{pdg_meta_pat.pattern}"
                + f"\n{pdg_snp_meta_pat.pattern}"
                + f"\n{pdg_accs_fn_pat.pattern}"
            )
            exit(1)
        pdg_meta_file = os.path.join(pdg_dir, "".join(pdg_meta_file))
        pdg_snp_meta_file = os.path.join(pdg_dir, "".join(pdg_snp_meta_file))
        pdg_acc_all = os.path.join(pdg_dir, "".join(pdg_acc_all))
    else:
        logging.error(f"Directory path {pdg_dir} does not exist.")
        exit(1)

    if mlst_res and (not os.path.exists(mlst_res) or os.path.getsize(mlst_res) == 0):
        logging.error(
            f"Requested to index MLST results, but the file {os.path.basename(mlst_res)}"
            + "\ndoes not exist or the file is empty."
        )
        exit(1)
    elif mlst_res:
        with open(mlst_res, "r") as mlst_res_fh:
            header = mlst_res_fh.readline()
            mlst_res_has_10_cols = False

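            # Rows are expected to carry 10 tab-separated columns (see
            # acceptable_num_mlst_cols): accession/filename, scheme, ST, then
            # one column per typed locus -- consistent with a 7-gene MLST
            # scheme, though no specific MLST tool is assumed here.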
            for line in mlst_res_fh:
                cols = line.strip().split("\t")
                acc = acc_pat.findall(cols[mlst_acc_col])
                if len(acc) > 1:
                    logging.error(f"Found more than 1 accession in column:\n{cols[mlst_acc_col]}\n")
                    exit(1)
                else:
                    acc = "".join(acc)
                if len(cols) == acceptable_num_mlst_cols and re.match(r"\d+|\-", cols[mlst_st_col]):
                    mlst_res_has_10_cols = True
                    if re.match(r"\-", cols[mlst_st_col]):
                        mlst_sts[acc] = "NULL"
                    else:
                        mlst_sts[acc] = cols[mlst_st_col]

            if not mlst_res_has_10_cols:
                logging.error(
                    "Requested to incorporate MLST STs, but the file"
                    + f"\n{os.path.basename(mlst_res)}"
                    + "\ndoes not have 10 columns in all rows."
                )
                exit(1)

    with open(pdg_acc_all, "r") as pdg_acc_all_fh:
        for a in pdg_acc_all_fh.readlines():
            num_accs_check.append(a.strip())

    if not mcols_pat.match(mcols):
        logging.error(
            "Supplied column names should only be"
            + "\nalphanumeric (including _), separated by commas."
        )
        exit(1)
    else:
        mcols = re.sub("\n", "", mcols).split(",")

    if (
        pdg_snp_meta_file
        and os.path.exists(pdg_snp_meta_file)
        and os.path.getsize(pdg_snp_meta_file) > 0
    ):
        acc2snp = defaultdict()
        acc2meta = defaultdict(defaultdict)
        init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".pickle")

        if (
            os.path.exists(init_pickled_sero)
            and os.path.getsize(init_pickled_sero)
            and not f_write_pick
        ):
            logging.error(
                f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n"
                + "Use -fs to force overwrite it."
            )
            exit(1)

        with open(pdg_snp_meta_file, "r") as snp_meta:
            header = snp_meta.readline()
            skipped_acc2snp = set()
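            # Each data row is parsed as 4 fields: the SNP cluster accession
            # (cols[0]), the isolate's target accession (cols[1]), and the
            # genome (assembly) accession (cols[3]), which may literally be
            # the string "NULL" when no assembly is available.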
            for line in snp_meta:
                cols = line.strip().split(pdg_meta_fs)
                if len(cols) != 4:
                    logging.error(
                        f"The metadata file {pdg_snp_meta_file} is malformed.\n"
                        + f"Expected 4 columns. Got {len(cols)} columns.\n"
                    )
                    exit(1)

                if re.match("NULL", cols[3]):
                    skipped_acc2snp.add(f"Isolate {cols[1]} has no genome accession: {cols[3]}")
                elif not acc_pat.match(cols[3]):
                    logging.error(
                        "Did not find an accession in either field number 4\n"
                        + "or field number 10 of column 4."
                        + f"\nLine: {line}"
                    )
                    exit(1)
                elif not re.match("NULL", cols[3]):
                    acc2snp[cols[3]] = cols[0]

            if len(skipped_acc2snp) > 0:
                logging.info(
                    f"While indexing\n{os.path.basename(pdg_snp_meta_file)},"
                    + "\nthese isolates were skipped:\n\n"
                    + "\n".join(skipped_acc2snp)
                )

        with open(pdg_meta_file, "r") as pdg_meta:
            header = pdg_meta.readline().strip().split(pdg_meta_fs)
            user_req_cols = [mcol_i for mcol_i, mcol in enumerate(header) if mcol in mcols]
            cols_not_found = [mcol for mcol in mcols if mcol not in header]
            null_wgs_accs = set()
            if len(cols_not_found) > 0:
                logging.error(
                    "The following columns do not exist in the"
                    + f"\nmetadata file [ {os.path.basename(pdg_meta_file)} ]:\n"
                    + "\n".join(cols_not_found)
                )
                exit(1)

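            # acc2meta maps assembly accession -> {column name: [values]};
            # values are kept in lists because setdefault(...).append(...)
            # tolerates a column appearing more than once per accession.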
            for line in pdg_meta.readlines():
                cols = line.strip().split(pdg_meta_fs)
                pdg_assm_acc = cols[acc_col]
                if not acc_pat.match(pdg_assm_acc):
                    null_wgs_accs.add(
                        f"Isolate {cols[target_acc_col]} has no genome accession: {pdg_assm_acc}"
                    )
                    continue

                if pdg_assm_acc in mlst_sts:
                    acc2meta[pdg_assm_acc].setdefault("mlst_sequence_type", []).append(
                        str(mlst_sts[pdg_assm_acc])
                    )

                for col in user_req_cols:
                    acc2meta[pdg_assm_acc].setdefault(header[col], []).append(str(cols[col]))

            if len(null_wgs_accs) > 0:
                logging.info(
                    f"While indexing\n{os.path.basename(pdg_meta_file)},"
                    + "\nthese isolates were skipped:\n\n"
                    + "\n".join(null_wgs_accs)
                )

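        # Persist the index to the CWD; downstream workflow steps can restore
        # it with pickle.load() on the resulting *.pickle file.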
        with open(init_pickled_sero, "wb") as write_pickled_sero:
            pickle.dump(file=write_pickled_sero, obj=acc2meta)

        if len(num_accs_check) != len(acc2meta.keys()):
            logging.error(
                "Failed the accession count check."
                + f"\nExpected {len(num_accs_check)} accessions but got {len(acc2meta.keys())}."
            )
            exit(1)
        else:
            logging.info(
                f"Number of valid accessions: {len(num_accs_check)}"
                + f"\nNumber of accessions indexed: {len(acc2meta.keys())}"
                + f"\nNumber of accessions participating in any of the SNP Clusters: {len(acc2snp.keys())}"
                + f"\n\nCreated the pickle file for\n{os.path.basename(pdg_meta_file)}."
                + "\nThis was the only requested function."
            )

        exit(0)
    elif pdg_meta_file and not (
        os.path.exists(pdg_meta_file) and os.path.getsize(pdg_meta_file) > 0
    ):
        logging.error(
            "Requested to create a pickle from metadata, but\n"
            + f"the file {os.path.basename(pdg_meta_file)} is empty or\ndoes not exist!"
        )
        exit(1)


if __name__ == "__main__":
    main()
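
# Example invocation (paths and PDG release numbers are illustrative, not
# taken from the workflow itself):
#
#   index_pdg_metadata.py -pdg_dir /path/to/PDG000000043.204 \
#       -mlst /path/to/mlst.tsv -fs
#
# The pickled index can then be restored downstream with:
#
#   import pickle
#   with open("IDXD_PDG_METAD.pickle", "rb") as fh:
#       acc2meta = pickle.load(fh)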