Mercurial > repos > kkonganti > cfsan_cronology
comparison 0.1.0/bin/get_top_unique_mash_hit_genomes.py @ 0:c8597e9e1a97
"planemo upload"
| author | kkonganti |
|---|---|
| date | Mon, 27 Nov 2023 12:37:44 -0500 |
| parents | (none) |
| children | (none) |
Comparison legend: equal | deleted | inserted | replaced

| parent revision | this revision |
|---|---|
| -1:000000000000 | 0:c8597e9e1a97 |
1 #!/usr/bin/env python3 | |
2 | |
3 # Kranti Konganti | |
4 | |
5 import argparse | |
6 import glob | |
7 import inspect | |
8 import logging | |
9 import os | |
10 import pickle | |
11 import pprint | |
12 import re | |
13 from collections import defaultdict | |
14 | |
15 | |
# Combine two argparse help formatters via multiple inheritance so that
# help text keeps its hand-written line breaks AND shows default values.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    """Help formatter preserving raw text while appending argument defaults."""
21 | |
22 | |
# Main
def main() -> None:
    """
    This script works only in the context of a Nextflow workflow.
    It takes:
        1. A pickle file containing a dictionary object where genome accession
           is the key and the computed serotype is the value.
           OR
        1. It takes a pickle file containing a nested dictionary, where genome accession
           is the key and the metadata is a dictionary associated with that key.
        2. A file with `mash screen` results.
        3. A directory containing genomes' FASTA in gzipped format where the
           FASTA file contains 2 lines: one FASTA header followed by
           genome Sequence.
    and then generates a concatenated FASTA file of top N unique `mash screen`
    genome hits as requested.
    """

    # Set logging. The format draws a 55-char "=" banner around each message.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    # Debug print. Only used by the commented-out ppp.pprint() call below.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    # -s: path to the pipe-separated metadata file; when given, the script
    # only serializes it to a pickle and exits (see the pickle_sero branch).
    parser.add_argument(
        "-s",
        dest="sero_snp_metadata",
        default=False,
        required=False,
        help="Absolute UNIX path to metadata text file with the field separator, | "
        + "\nand 5 fields: serotype|asm_lvl|asm_url|snp_cluster_id"
        + "\nEx: serotype=Derby,antigen_formula=4:f,g:-|Scaffold|402440|ftp://...\n|PDS000096654.2\n"
        + "Mentioning this option will create a pickle file for the\nprovided metadata and exits.",
    )
    # -fs: allow overwriting an existing *.ACC2SERO.pickle file.
    parser.add_argument(
        "-fs",
        dest="force_write_pick",
        action="store_true",
        required=False,
        help="By default, when -s flag is on, the pickle file named *.ACC2SERO.pickle\n"
        + "is written to CWD. If the file exists, the program will not overwrite\n"
        + "and exit. Use -fs option to overwrite.",
    )
    # -m: `mash screen` results file; the main concatenation mode.
    parser.add_argument(
        "-m",
        dest="mash_screen_res",
        default=False,
        required=False,
        help="Absolute UNIX path to `mash screen` results file.",
    )
    parser.add_argument(
        "-ms",
        dest="mash_screen_res_suffix",
        default=".screened",
        required=False,
        help="Suffix of the `mash screen` result file.",
    )
    # -ps: pre-built pickle (from a previous -s run); required together with -m.
    parser.add_argument(
        "-ps",
        dest="pickled_sero",
        default=False,
        required=False,
        help="Absolute UNIX Path to serialized metadata object in a pickle file.\n"
        + "You can create the pickle file of the metadata using -s option.\n"
        + "Required if -m is on.",
    )
    parser.add_argument(
        "-gd",
        dest="genomes_dir",
        default=False,
        required=False,
        help="Absolute UNIX path to a directory containing\n"
        + "gzipped genome FASTA files.\n"
        + "Required if -m is on.",
    )
    parser.add_argument(
        "-gds",
        dest="genomes_dir_suffix",
        default="_scaffolded_genomic.fna.gz",
        required=False,
        help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
    )
    # -n: cap on unique serotype hits; parsed as int below.
    parser.add_argument(
        "-n",
        dest="num_uniq_hits",
        default=10,
        required=False,
        help="This many number of serotype genomes' accessions are returned.",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="MASH_SCREEN",
        required=False,
        help="Set the output file prefix for .fna.gz and .txt files.",
    )
    # required = parser.add_argument_group('required arguments')

    args = parser.parse_args()
    num_uniq_hits = int(args.num_uniq_hits)
    mash_screen_res = args.mash_screen_res
    mash_screen_res_suffix = args.mash_screen_res_suffix
    pickle_sero = args.sero_snp_metadata
    pickled_sero = args.pickled_sero
    f_write_pick = args.force_write_pick
    genomes_dir = args.genomes_dir
    genomes_dir_suffix = args.genomes_dir_suffix
    out_prefix = args.out_prefix
    # Metadata fields required (as dict keys) for the iTOL-style METADATA
    # output; values are the short labels (only keys are used below).
    req_metadata = {
        "mlst_sequence_type": "ST",
        "epi_type": "ET",
        "host": "HO",
        "host_disease": "HD",
        "isolation_source": "IS",
        "outbreak": "OU",
        "source_type": "SOT",
        "strain": "GS",
    }
    target_acc_key = "target_acc"
    ncbi_path_heading = "NCBI Pathogen Isolates Browser"
    ncbi_path_uri = "https://www.ncbi.nlm.nih.gov/pathogens/isolates/#"
    # All outputs land in CWD; the .txt/_ACCS.txt/_POPUP.txt names are
    # derived from the .fna.gz name by substituting the extension.
    mash_genomes_gz = os.path.join(
        os.getcwd(), out_prefix + "_TOP_" + str(num_uniq_hits) + "_UNIQUE_HITS.fna.gz"
    )
    mash_uniq_hits_txt = os.path.join(
        os.getcwd(), re.sub(".fna.gz", ".txt", os.path.basename(mash_genomes_gz))
    )
    mash_uniq_accs_txt = os.path.join(
        os.getcwd(), re.sub(".fna.gz", "_ACCS.txt", os.path.basename(mash_genomes_gz))
    )
    mash_popup_info_txt = os.path.join(
        os.getcwd(), re.sub(".fna.gz", "_POPUP.txt", os.path.basename(mash_genomes_gz))
    )

    # Refuse to clobber a pre-existing concatenated FASTA.
    if mash_screen_res and os.path.exists(mash_genomes_gz):
        logging.error(
            "A concatenated genome FASTA file,\n"
            + f"{os.path.basename(mash_genomes_gz)} already exists in:\n"
            + f"{os.getcwd()}\n"
            + "Please remove or move it as we will not "
            + "overwrite it."
        )
        exit(1)

    # Stale .txt results are removed so they are regenerated fresh.
    if os.path.exists(mash_uniq_hits_txt) and os.path.getsize(mash_uniq_hits_txt) > 0:
        os.remove(mash_uniq_hits_txt)

    if mash_screen_res and (not genomes_dir or not pickled_sero):
        logging.error("When -m is on, -ps and -gd are also required.")
        exit(1)

    if genomes_dir:
        if not os.path.isdir(genomes_dir):
            logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
            exit(1)
        if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
            logging.error(
                "Genomes directory"
                + f"{genomes_dir}"
                + "\ndoes not seem to have any\n"
                + f"files ending with suffix: {genomes_dir_suffix}"
            )
            exit(1)

    # --- Mode 1 (-s): build the accession -> serotype pickle and exit. ---
    if pickle_sero and os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0:
        acc2serotype = defaultdict()
        init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".ACC2SERO.pickle")

        if (
            os.path.exists(init_pickled_sero)
            and os.path.getsize(init_pickled_sero)
            and not f_write_pick
        ):
            logging.error(
                f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n"
                + "Use -fs to force overwrite it."
            )
            exit(1)

        with open(pickle_sero, "r") as sero_snp_meta:
            for line in sero_snp_meta:
                cols = line.strip().split("|")
                # Column 4 is an FTP/HTTP assembly URL; the accession lives
                # in one of its path components.
                url_cols = cols[3].split("/")

                if not 4 <= len(cols) <= 5:
                    logging.error(
                        f"The metadata file {pickle_sero} is malformed.\n"
                        + f"Expected 4-5 columns. Got {len(cols)} columns.\n"
                    )
                    exit(1)

                # Short URLs carry the accession at index 3, full NCBI FTP
                # URLs at index 9.
                if not len(url_cols) > 5:
                    acc = url_cols[3]
                else:
                    acc = url_cols[9]

                # Validate the GenBank/RefSeq assembly accession format,
                # e.g. GCA_000123456.1 / GCF_000123456.1.
                if not re.match(r"^GC[AF]\_\d+\.\d+$", acc):
                    logging.error(
                        f"Did not find accession in either field number 4\n"
                        + "or field number 10 of column 4."
                    )
                    exit(1)

                acc2serotype[acc] = cols[0]

        with open(init_pickled_sero, "wb") as write_pickled_sero:
            pickle.dump(file=write_pickled_sero, obj=acc2serotype)

        logging.info(
            f"Created the pickle file for\n{os.path.basename(pickle_sero)}.\n"
            + "This was the only requested function."
        )
        # Redundant: both files were opened via `with` and are already closed.
        sero_snp_meta.close()
        write_pickled_sero.close()
        exit(0)
    elif pickle_sero and not (
        os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0
    ):
        logging.error(
            "Requested to create pickle from metadata, but\n"
            + f"the file, {os.path.basename(pickle_sero)} is empty or\ndoes not exist!"
        )
        exit(1)

    # --- Mode 2 (-m): parse `mash screen` results into mash_hits. ---
    if mash_screen_res and os.path.exists(mash_screen_res):
        if os.path.getsize(mash_screen_res) > 0:
            seen_uniq_hits = 0
            # NOTE(review): the pickle file handle opened inline here is
            # never explicitly closed.
            unpickled_acc2serotype = pickle.load(file=open(pickled_sero, "rb"))

            with open(mash_screen_res, "r") as msh_res:
                # mash_hits: identity (col 1) -> list of hit accessions.
                mash_hits = defaultdict()
                seen_mash_sero = defaultdict()

                for line in msh_res:
                    cols = line.strip().split("\t")

                    if len(cols) < 5:
                        logging.error(
                            f"The file {os.path.basename(mash_screen_res)} seems to\n"
                            + "be malformed. It contains less than required 5-6 columns."
                        )
                        exit(1)

                    # Extract the accession from column 5 by locating
                    # "GC[AF]...<suffix>" and stripping the suffix.
                    # NOTE(review): if re.search() finds no match it returns
                    # None and .group() raises AttributeError before the
                    # else-branch error below can fire — confirm intended.
                    mash_hit_acc = re.sub(
                        genomes_dir_suffix,
                        "",
                        str(
                            (
                                re.search(r"GC[AF].*?" + genomes_dir_suffix, cols[4])
                            ).group()
                        ),
                    )

                    if mash_hit_acc:
                        mash_hits.setdefault(cols[0], []).append(mash_hit_acc)
                    else:
                        logging.error(
                            "Did not find an assembly accession in column\n"
                            + f"number 5. Found {cols[4]} instead. Cannot proceed!"
                        )
                        exit(1)
            # Redundant: `with` already closed the file.
            msh_res.close()
        elif os.path.getsize(mash_screen_res) == 0:
            # Empty screen results: record the sample as FAILED and exit 0
            # so the Nextflow workflow can continue.
            # NOTE(review): rstrip() strips a *character set*, not a suffix —
            # sample names ending in any of the suffix's characters get
            # over-trimmed; str.removesuffix() is likely what was intended.
            failed_sample_name = os.path.basename(mash_screen_res).rstrip(
                mash_screen_res_suffix
            )
            with open(
                os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
            ) as failed_sample_fh:
                failed_sample_fh.write(f"{failed_sample_name}\n")
                # Redundant: `with` already closes the handle.
                failed_sample_fh.close()
            exit(0)

    # --- Emit top-N unique hits: FASTA + METADATA/POPUP text files. ---
    # NOTE(review): this section references mash_hits, which is only defined
    # when -m points at a non-empty file; in any other case execution reaches
    # here and raises NameError. Works only because the Nextflow workflow
    # always supplies -m — confirm.
    # ppp.pprint(mash_hits)
    msh_out_txt = open(mash_uniq_hits_txt, "w")
    msh_out_accs_txt = open(mash_uniq_accs_txt, "w")
    # NOTE(review): mash_out_pop_txt is never closed below; relies on
    # interpreter exit to flush — confirm acceptable.
    mash_out_pop_txt = open(mash_popup_info_txt, "w")
    wrote_header_pop = False
    wrote_header_acc = False

    with open(mash_genomes_gz, "wb") as msh_out_gz:
        # Iterate hits from highest identity downward.
        for _, (ident, acc_list) in enumerate(
            sorted(mash_hits.items(), reverse=True)
        ):
            for acc in acc_list:
                # NOTE(review): break only exits the inner loop; the outer
                # loop keeps iterating (harmless, just extra passes).
                if seen_uniq_hits >= num_uniq_hits:
                    break
                if isinstance(unpickled_acc2serotype[acc], dict):
                    # Nested-dict pickle: rich per-accession metadata.
                    if target_acc_key in unpickled_acc2serotype[acc].keys():
                        if not wrote_header_pop:
                            mash_out_pop_txt.write(
                                "POPUP_INFO\nSEPARATOR COMMA\nDATA\n"
                            )
                            wrote_header_pop = True

                        # PDT/target accession, joined in case it is a list.
                        pdt = "".join(unpickled_acc2serotype[acc][target_acc_key])

                        popup_line = ",".join(
                            [
                                acc,
                                ncbi_path_heading,
                                f'<a target="_blank" href="{ncbi_path_uri + pdt}">{pdt}</a>',
                            ]
                        )
                        mash_out_pop_txt.write(popup_line + "\n")

                    # Only accessions carrying ALL required metadata keys
                    # are emitted and counted as unique hits.
                    if all(
                        k in unpickled_acc2serotype[acc].keys()
                        for k in req_metadata.keys()
                    ):
                        if not wrote_header_acc:
                            msh_out_txt.write(
                                "METADATA\nSEPARATOR COMMA\nFIELD_LABELS,"
                            )
                            msh_out_txt.write(
                                f"{','.join([str(key).upper() for key in req_metadata.keys()])}\nDATA\n"
                            )
                            wrote_header_acc = True

                        # Join each metadata value with "|" and drop commas
                        # so the comma-separated line stays parseable.
                        metadata_line = ",".join(
                            [
                                re.sub(
                                    ",",
                                    "",
                                    "|".join(unpickled_acc2serotype[acc][m]),
                                )
                                for m in req_metadata.keys()
                            ],
                        )

                        msh_out_txt.write(f"{acc.strip()},{metadata_line}\n")
                        msh_out_accs_txt.write(
                            f"{os.path.join(genomes_dir, acc + genomes_dir_suffix)}\n"
                        )
                        seen_mash_sero[acc] = 1
                        seen_uniq_hits += 1
                elif not isinstance(unpickled_acc2serotype[acc], dict):
                    # Flat pickle: value is the serotype string; uniqueness
                    # is per serotype, and the genome FASTA is concatenated.
                    if unpickled_acc2serotype[acc] not in seen_mash_sero.keys():
                        seen_mash_sero[unpickled_acc2serotype[acc]] = 1
                        seen_uniq_hits += 1
                        # print(acc.strip() + '\t' + ident + '\t' + unpickled_acc2serotype[acc], file=sys.stdout)
                        msh_out_txt.write(
                            f"{acc.strip()}\t{unpickled_acc2serotype[acc]}\t{ident}\n"
                        )
                        with open(
                            os.path.join(genomes_dir, acc + genomes_dir_suffix),
                            "rb",
                        ) as msh_in_gz:
                            # Gzip members concatenate into a valid .gz file.
                            msh_out_gz.writelines(msh_in_gz.readlines())
                        # Redundant: `with` already closed the handle.
                        msh_in_gz.close()
    # Redundant: `with` already closed msh_out_gz.
    msh_out_gz.close()
    msh_out_txt.close()
    msh_out_accs_txt.close()
    logging.info(
        f"File {os.path.basename(mash_genomes_gz)}\n"
        + f"written in:\n{os.getcwd()}\nDone! Bye!"
    )
    exit(0)
393 | |
394 | |
# Script entry point: run main() only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()