Mercurial repository: kkonganti > repos > cfsan_bettercallsal
File: 0.7.0/bin/get_top_unique_mash_hit_genomes.py
Changeset 17:0e7a0053e4a6 ("planemo upload"), compared against parent changeset 16:b90e5a7a3d4f.
Author: kkonganti
Date: Mon, 15 Jul 2024 10:42:02 -0400
1 #!/usr/bin/env python3 | |
2 | |
3 # Kranti Konganti | |
4 | |
5 import argparse | |
6 import glob | |
7 import inspect | |
8 import logging | |
9 import os | |
10 import pickle | |
11 import pprint | |
12 import re | |
13 import subprocess | |
14 from collections import defaultdict | |
15 | |
16 | |
# Combine two argparse formatters via multiple inheritance so help text is
# rendered verbatim (raw) while argument defaults are still appended.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter preserving raw text layout and showing argument defaults."""
21 | |
# Main
def main() -> None:
    """
    This script works only in the context of a Nextflow workflow.
    It takes:
        1. A pickle file containing a dictionary object where genome accession
            is the key and the computed serotype is the value.
        OR
        1. It takes a pickle file containing a nested dictionary, where genome accession
            is the key and the metadata is a dictionary associated with that key.
        2. A file with `mash screen` results.
        3. A directory containing genomes' FASTA in gzipped format where the
            FASTA file contains 2 lines: one FASTA header followed by
            genome Sequence.
    and then generates a concatenated FASTA file of top N unique `mash screen`
    genome hits as requested.

    In addition:
        1. User can skip `mash screen` hits that originate from the supplied
            bio project accessions.
            For -skip option to work, ncbi-datasets should be available in $PATH.
    """

    # Set logging.
    logging.basicConfig(
        format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    # Debug print (kept for the commented-out debug statements below).
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    parser.add_argument(
        "-s",
        dest="sero_snp_metadata",
        default=False,
        required=False,
        help="Absolute UNIX path to metadata text file with the field separator, | "
        + "\nand 5 fields: serotype|asm_lvl|asm_url|snp_cluster_id"
        + "\nEx: serotype=Derby,antigen_formula=4:f,g:-|Scaffold|402440|ftp://...\n|PDS000096654.2\n"
        + "Mentioning this option will create a pickle file for the\nprovided metadata and exits.",
    )
    parser.add_argument(
        "-fs",
        dest="force_write_pick",
        action="store_true",
        required=False,
        help="By default, when -s flag is on, the pickle file named *.ACC2SERO.pickle\n"
        + "is written to CWD. If the file exists, the program will not overwrite\n"
        + "and exit. Use -fs option to overwrite.",
    )
    parser.add_argument(
        "-m",
        dest="mash_screen_res",
        default=False,
        required=False,
        help="Absolute UNIX path to `mash screen` results file.",
    )
    parser.add_argument(
        "-ms",
        dest="mash_screen_res_suffix",
        default=".screened",
        required=False,
        help="Suffix of the `mash screen` result file.",
    )
    parser.add_argument(
        "-ps",
        dest="pickled_sero",
        default=False,
        required=False,
        help="Absolute UNIX Path to serialized metadata object in a pickle file.\n"
        + "You can create the pickle file of the metadata using -s option.\n"
        + "Required if -m is on.",
    )
    parser.add_argument(
        "-gd",
        dest="genomes_dir",
        default=False,
        required=False,
        help="Absolute UNIX path to a directory containing\n"
        + "gzipped genome FASTA files.\n"
        + "Required if -m is on.",
    )
    parser.add_argument(
        "-gds",
        dest="genomes_dir_suffix",
        default="_scaffolded_genomic.fna.gz",
        required=False,
        help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
    )
    parser.add_argument(
        "-n",
        dest="num_uniq_hits",
        default=10,
        required=False,
        help="This many number of serotype genomes' accessions are returned.",
    )
    parser.add_argument(
        "-skip",
        dest="skip_accs",
        default=str(""),
        required=False,
        help="Skip all hits which belong to the following bioproject accession(s).\n"
        + "A comma separated list of more than one bioproject.",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="MASH_SCREEN",
        required=False,
        help="Set the output file prefix for .fna.gz and .txt files.",
    )

    args = parser.parse_args()
    num_uniq_hits = int(args.num_uniq_hits)
    mash_screen_res = args.mash_screen_res
    mash_screen_res_suffix = args.mash_screen_res_suffix
    pickle_sero = args.sero_snp_metadata
    pickled_sero = args.pickled_sero
    f_write_pick = args.force_write_pick
    genomes_dir = args.genomes_dir
    genomes_dir_suffix = args.genomes_dir_suffix
    out_prefix = args.out_prefix
    skip_accs = args.skip_accs
    skip_accs_list = list()
    # FIX: old pattern `(?:,PRJNA\d+){0,1}` allowed at most two accessions even
    # though -skip documents "a comma separated list"; allow any number.
    skip_check = re.compile(r"PRJNA\d+(?:,PRJNA\d+)*")
    # Metadata fields required (keys) and their short labels (values) for the
    # nested-dictionary (per-accession metadata) pickle format.
    req_metadata = {
        "mlst_sequence_type": "ST",
        "epi_type": "ET",
        "host": "HO",
        "host_disease": "HD",
        "isolation_source": "IS",
        "outbreak": "OU",
        "source_type": "SOT",
        "strain": "GS",
    }
    target_acc_key = "target_acc"
    ncbi_path_heading = "NCBI Pathogen Isolates Browser"
    ncbi_path_uri = "https://www.ncbi.nlm.nih.gov/pathogens/isolates/#"
    mash_genomes_gz = os.path.join(
        os.getcwd(), out_prefix + "_TOP_" + str(num_uniq_hits) + "_UNIQUE_HITS.fna.gz"
    )
    # FIX: escape the dots and anchor at end-of-string so ".fna.gz" is treated
    # as a literal extension, not "any char + fna + any char + gz".
    mash_uniq_hits_txt = os.path.join(
        os.getcwd(), re.sub(r"\.fna\.gz$", ".txt", os.path.basename(mash_genomes_gz))
    )
    mash_uniq_accs_txt = os.path.join(
        os.getcwd(), re.sub(r"\.fna\.gz$", "_ACCS.txt", os.path.basename(mash_genomes_gz))
    )
    mash_popup_info_txt = os.path.join(
        os.getcwd(), re.sub(r"\.fna\.gz$", "_POPUP.txt", os.path.basename(mash_genomes_gz))
    )

    # Refuse to clobber a pre-existing concatenated genomes file.
    if mash_screen_res and os.path.exists(mash_genomes_gz):
        logging.error(
            "A concatenated genome FASTA file,\n"
            + f"{os.path.basename(mash_genomes_gz)} already exists in:\n"
            + f"{os.getcwd()}\n"
            + "Please remove or move it as we will not "
            + "overwrite it."
        )
        exit(1)

    # The per-hit text report, however, is regenerated on every run.
    if os.path.exists(mash_uniq_hits_txt) and os.path.getsize(mash_uniq_hits_txt) > 0:
        os.remove(mash_uniq_hits_txt)

    if mash_screen_res and (not genomes_dir or not pickled_sero):
        logging.error("When -m is on, -ps and -gd are also required.")
        exit(1)

    if skip_accs and not skip_check.fullmatch(skip_accs):
        logging.error(
            "Supplied bio project accessions are not valid!\n"
            + "Valid options:\n\t-skip PRJNA766315\n\t-skip PRJNA766315,PRJNA675435"
        )
        exit(1)
    elif skip_accs:
        # Resolve bio project accession(s) to assembly accessions via NCBI's
        # `datasets` piped into `dataformat` (both must be in $PATH).
        datasets_cmd = "datasets summary genome accession --as-json-lines --report ids_only".split()
        datasets_cmd.append(skip_accs)
        dataformat_cmd = "dataformat tsv genome --fields accession --elide-header".split()
        try:
            accs_query = subprocess.run(datasets_cmd, capture_output=True, check=True)
            try:
                skip_accs_list = (
                    subprocess.check_output(dataformat_cmd, input=accs_query.stdout)
                    .decode("utf-8")
                    .split("\n")
                )
            except subprocess.CalledProcessError as e:
                # FIX: was `dataformat_cmd.join(' ')` — list has no .join(),
                # so the error handler itself raised AttributeError.
                logging.error(f"Query failed\n\t{' '.join(dataformat_cmd)}\nError:\n\t{e}")
                exit(1)
        except subprocess.CalledProcessError as e:
            # FIX: same list/.join() inversion as above.
            logging.error(f"Query failed\n\t{' '.join(datasets_cmd)}\nError:\n\t{e}")
            exit(1)

    # Drop the empty strings produced by splitting dataformat's output.
    if len(skip_accs_list) > 0:
        filter_these_hits = list(filter(bool, skip_accs_list))
    else:
        filter_these_hits = list()

    if genomes_dir:
        if not os.path.isdir(genomes_dir):
            logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
            exit(1)
        if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
            logging.error(
                "Genomes directory\n"
                + f"{genomes_dir}"
                + "\ndoes not seem to have any\n"
                + f"files ending with suffix: {genomes_dir_suffix}"
            )
            exit(1)

    # -s mode: parse the pipe-separated metadata file, build an
    # accession -> serotype mapping, pickle it, and exit.
    if pickle_sero and os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0:
        acc2serotype = defaultdict()
        init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".ACC2SERO.pickle")

        if (
            os.path.exists(init_pickled_sero)
            and os.path.getsize(init_pickled_sero)
            and not f_write_pick
        ):
            logging.error(
                f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n"
                + "Use -fs to force overwrite it."
            )
            exit(1)

        with open(pickle_sero, "r") as sero_snp_meta:
            for line in sero_snp_meta:
                cols = line.strip().split("|")

                # FIX: validate the column count BEFORE indexing cols[3];
                # the original indexed first, so malformed short rows raised
                # IndexError before this diagnostic could ever run.
                if not 4 <= len(cols) <= 5:
                    logging.error(
                        f"The metadata file {pickle_sero} is malformed.\n"
                        + f"Expected 4-5 columns. Got {len(cols)} columns.\n"
                    )
                    exit(1)

                url_cols = cols[3].split("/")

                # The assembly accession lives in URL path field 4 (short
                # URLs) or field 10 (full FTP URLs).
                if not len(url_cols) > 5:
                    acc = url_cols[3]
                else:
                    acc = url_cols[9]

                if not re.match(r"^GC[AF]\_\d+\.\d+$", acc):
                    logging.error(
                        "Did not find accession in either field number 4\n"
                        + "or field number 10 of column 4."
                    )
                    exit(1)

                acc2serotype[acc] = cols[0]

        with open(init_pickled_sero, "wb") as write_pickled_sero:
            pickle.dump(file=write_pickled_sero, obj=acc2serotype)

        logging.info(
            f"Created the pickle file for\n{os.path.basename(pickle_sero)}.\n"
            + "This was the only requested function."
        )
        exit(0)
    elif pickle_sero and not (os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0):
        logging.error(
            "Requested to create pickle from metadata, but\n"
            + f"the file, {os.path.basename(pickle_sero)} is empty or\ndoes not exist!"
        )
        exit(1)

    # -m mode: parse `mash screen` results into identity -> [accessions].
    if mash_screen_res and os.path.exists(mash_screen_res):
        if os.path.getsize(mash_screen_res) > 0:
            seen_uniq_hits = 0
            # FIX: close the pickle handle; the original used
            # pickle.load(file=open(...)) and leaked it.
            with open(pickled_sero, "rb") as pickled_sero_fh:
                unpickled_acc2serotype = pickle.load(file=pickled_sero_fh)

            with open(mash_screen_res, "r") as msh_res:
                mash_hits = defaultdict()
                seen_mash_sero = defaultdict()

                for line in msh_res:
                    cols = line.strip().split("\t")

                    if len(cols) < 5:
                        logging.error(
                            f"The file {os.path.basename(mash_screen_res)} seems to\n"
                            + "be malformed. It contains less than required 5-6 columns."
                        )
                        exit(1)

                    # FIX: re.search() may return None; the original called
                    # .group() on it unconditionally (AttributeError crash).
                    acc_match = re.search(r"GC[AF].*?" + genomes_dir_suffix, cols[4])

                    if acc_match:
                        mash_hit_acc = re.sub(genomes_dir_suffix, "", str(acc_match.group()))
                        mash_hits.setdefault(cols[0], []).append(mash_hit_acc)
                    else:
                        logging.error(
                            "Did not find an assembly accession in column\n"
                            + f"number 5. Found {cols[4]} instead. Cannot proceed!"
                        )
                        exit(1)
        elif os.path.getsize(mash_screen_res) == 0:
            # Empty screen results: record the failed sample and exit cleanly.
            # FIX: str.rstrip() strips a *character set*, not a suffix, and
            # could eat trailing sample-name characters; strip the literal
            # suffix instead.
            failed_sample_name = os.path.basename(mash_screen_res)
            if mash_screen_res_suffix and failed_sample_name.endswith(mash_screen_res_suffix):
                failed_sample_name = failed_sample_name[: -len(mash_screen_res_suffix)]
            with open(
                os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
            ) as failed_sample_fh:
                failed_sample_fh.write(f"{failed_sample_name}\n")
            exit(0)
    else:
        # FIX: the original fell through to the output-writing section with
        # `mash_hits` undefined (NameError) when -m was absent or pointed to
        # a non-existent file; fail with a clear message instead.
        logging.error(
            "Nothing to do!\nEither `mash screen` results were not supplied with -m\n"
            + "or the supplied file does not exist. See -h for help."
        )
        exit(1)

    # Write the top N unique hits: a text report, optional iTOL popup /
    # accession-list files (nested-dict metadata mode) and a concatenated
    # gzipped FASTA (plain accession -> serotype mode).
    # ppp.pprint(mash_hits)
    msh_out_txt = open(mash_uniq_hits_txt, "w")
    wrote_header_pop = False
    wrote_header_acc = False

    with open(mash_genomes_gz, "wb") as msh_out_gz:
        # Sort by identity, highest first. (FIX: dropped the unused enumerate.)
        for ident, acc_list in sorted(mash_hits.items(), reverse=True):
            for acc in acc_list:
                if len(filter_these_hits) > 0 and acc in filter_these_hits:
                    continue
                if seen_uniq_hits >= num_uniq_hits:
                    break
                if isinstance(unpickled_acc2serotype[acc], dict):
                    if target_acc_key in unpickled_acc2serotype[acc].keys():
                        # Lazily open the popup file on the first such hit.
                        if not wrote_header_pop:
                            mash_out_pop_txt = open(mash_popup_info_txt, "w")
                            mash_out_pop_txt.write("POPUP_INFO\nSEPARATOR COMMA\nDATA\n")
                            wrote_header_pop = True

                        pdt = "".join(unpickled_acc2serotype[acc][target_acc_key])

                        popup_line = ",".join(
                            [
                                acc,
                                ncbi_path_heading,
                                f'<a target="_blank" href="{ncbi_path_uri + pdt}">{pdt}</a>',
                            ]
                        )
                        mash_out_pop_txt.write(popup_line + "\n")

                    if all(
                        k in unpickled_acc2serotype[acc].keys() for k in req_metadata.keys()
                    ):
                        # Lazily open the accession-list file and write the
                        # metadata header on the first complete-metadata hit.
                        if not wrote_header_acc:
                            msh_out_accs_txt = open(mash_uniq_accs_txt, "w")
                            msh_out_txt.write("METADATA\nSEPARATOR COMMA\nFIELD_LABELS,")
                            msh_out_txt.write(
                                f"{','.join([str(key).upper() for key in req_metadata.keys()])}\nDATA\n"
                            )
                            wrote_header_acc = True

                        # Commas inside values would break the CSV row.
                        metadata_line = ",".join(
                            [
                                re.sub(
                                    ",",
                                    "",
                                    "|".join(unpickled_acc2serotype[acc][m]),
                                )
                                for m in req_metadata.keys()
                            ],
                        )

                        msh_out_txt.write(f"{acc.strip()},{metadata_line}\n")
                        msh_out_accs_txt.write(
                            f"{os.path.join(genomes_dir, acc + genomes_dir_suffix)}\n"
                        )
                        seen_mash_sero[acc] = 1
                        seen_uniq_hits += 1
                elif not isinstance(unpickled_acc2serotype[acc], dict):
                    # Plain accession -> serotype mapping: keep only the first
                    # accession seen per serotype and concatenate its genome.
                    if unpickled_acc2serotype[acc] not in seen_mash_sero.keys():
                        seen_mash_sero[unpickled_acc2serotype[acc]] = 1
                        seen_uniq_hits += 1
                        # print(acc.strip() + '\t' + ident + '\t' + unpickled_acc2serotype[acc], file=sys.stdout)
                        msh_out_txt.write(
                            f"{acc.strip()}\t{unpickled_acc2serotype[acc]}\t{ident}\n"
                        )
                        with open(
                            os.path.join(genomes_dir, acc + genomes_dir_suffix),
                            "rb",
                        ) as msh_in_gz:
                            msh_out_gz.writelines(msh_in_gz.readlines())
    msh_out_txt.close()

    # The lazily-opened handles only exist if their headers were written.
    if "msh_out_accs_txt" in locals().keys() and not msh_out_accs_txt.closed:
        msh_out_accs_txt.close()
    if "mash_out_pop_txt" in locals().keys() and not mash_out_pop_txt.closed:
        mash_out_pop_txt.close()

    logging.info(
        f"File {os.path.basename(mash_genomes_gz)}\n"
        + f"written in:\n{os.getcwd()}\nDone! Bye!"
    )
    exit(0)
427 | |
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()