comparison 0.1.0/bin/get_top_unique_mash_hit_genomes.py @ 0:c8597e9e1a97

"planemo upload"
author kkonganti
date Mon, 27 Nov 2023 12:37:44 -0500
#!/usr/bin/env python3

# Kranti Konganti

import argparse
import glob
import inspect
import logging
import os
import pickle
import pprint
import re
from collections import defaultdict


# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    pass

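# Example invocations (an illustrative sketch; the file and directory names
# below are hypothetical placeholders, not files shipped with this tool):
#
#   Mode 1: serialize a |-delimited metadata file into *.ACC2SERO.pickle:
#     get_top_unique_mash_hit_genomes.py -s /path/to/metadata.txt -op MASH_SCREEN
#
#   Mode 2: pick the top N unique `mash screen` hits and concatenate their
#   gzipped genome FASTA files:
#     get_top_unique_mash_hit_genomes.py -m sample.screened \
#         -ps MASH_SCREEN.ACC2SERO.pickle -gd /path/to/genomes_dir -n 10
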
# Main
def main() -> None:
    """
    This script works only in the context of a Nextflow workflow.
    It takes:
    1. A pickle file containing a dictionary object where a genome accession
        is the key and the computed serotype is the value,
        OR a pickle file containing a nested dictionary, where a genome accession
        is the key and the value is a dictionary of metadata for that accession.
    2. A file with `mash screen` results.
    3. A directory containing genomes' FASTA files in gzipped format, where each
        FASTA file contains 2 lines: one FASTA header followed by the
        genome sequence.
    It then generates a concatenated FASTA file of the top N unique
    `mash screen` genome hits, as requested.
    """

    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    # Debug print.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    parser.add_argument(
        "-s",
        dest="sero_snp_metadata",
        default=False,
        required=False,
        help="Absolute UNIX path to a metadata text file with | as the field separator\n"
        + "and 4-5 fields: serotype|asm_lvl|asm_url|snp_cluster_id\n"
        + "Ex: serotype=Derby,antigen_formula=4:f,g:-|Scaffold|402440|ftp://...|PDS000096654.2\n"
        + "Mentioning this option will create a pickle file for the\nprovided metadata and exit.",
    )
    parser.add_argument(
        "-fs",
        dest="force_write_pick",
        action="store_true",
        required=False,
        help="By default, when the -s flag is on, the pickle file named *.ACC2SERO.pickle\n"
        + "is written to CWD. If the file exists, the program will not overwrite\n"
        + "it and will exit. Use the -fs option to overwrite.",
    )
    parser.add_argument(
        "-m",
        dest="mash_screen_res",
        default=False,
        required=False,
        help="Absolute UNIX path to the `mash screen` results file.",
    )
    parser.add_argument(
        "-ms",
        dest="mash_screen_res_suffix",
        default=".screened",
        required=False,
        help="Suffix of the `mash screen` results file.",
    )
    parser.add_argument(
        "-ps",
        dest="pickled_sero",
        default=False,
        required=False,
        help="Absolute UNIX path to the serialized metadata object in a pickle file.\n"
        + "You can create the pickle file of the metadata using the -s option.\n"
        + "Required if -m is on.",
    )
    parser.add_argument(
        "-gd",
        dest="genomes_dir",
        default=False,
        required=False,
        help="Absolute UNIX path to a directory containing\n"
        + "gzipped genome FASTA files.\n"
        + "Required if -m is on.",
    )
    parser.add_argument(
        "-gds",
        dest="genomes_dir_suffix",
        default="_scaffolded_genomic.fna.gz",
        required=False,
        help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
    )
    parser.add_argument(
        "-n",
        dest="num_uniq_hits",
        default=10,
        required=False,
        help="Return up to this many unique serotype genome accessions.",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="MASH_SCREEN",
        required=False,
        help="Set the output file prefix for the .fna.gz and .txt files.",
    )
    # required = parser.add_argument_group('required arguments')

    args = parser.parse_args()
    num_uniq_hits = int(args.num_uniq_hits)
    mash_screen_res = args.mash_screen_res
    mash_screen_res_suffix = args.mash_screen_res_suffix
    pickle_sero = args.sero_snp_metadata
    pickled_sero = args.pickled_sero
    f_write_pick = args.force_write_pick
    genomes_dir = args.genomes_dir
    genomes_dir_suffix = args.genomes_dir_suffix
    out_prefix = args.out_prefix
    req_metadata = {
        "mlst_sequence_type": "ST",
        "epi_type": "ET",
        "host": "HO",
        "host_disease": "HD",
        "isolation_source": "IS",
        "outbreak": "OU",
        "source_type": "SOT",
        "strain": "GS",
    }
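    # Expected shapes of the unpickled ACC2SERO object (an illustrative
    # sketch; the accession and values are hypothetical):
    #
    #   Flat:   {"GCF_000000000.1": "serotype=Derby,antigen_formula=4:f,g:-"}
    #   Nested: {"GCF_000000000.1": {"target_acc": ["PDT000000000.1"],
    #            "mlst_sequence_type": ["ST1"], "epi_type": [...], ...}}
    #
    # The nested shape must carry all req_metadata keys; target_acc, when
    # present, links the hit to the NCBI Pathogen Isolates Browser.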
    target_acc_key = "target_acc"
    ncbi_path_heading = "NCBI Pathogen Isolates Browser"
    ncbi_path_uri = "https://www.ncbi.nlm.nih.gov/pathogens/isolates/#"
    mash_genomes_gz = os.path.join(
        os.getcwd(), out_prefix + "_TOP_" + str(num_uniq_hits) + "_UNIQUE_HITS.fna.gz"
    )
    mash_uniq_hits_txt = os.path.join(
        os.getcwd(), re.sub(r"\.fna\.gz", ".txt", os.path.basename(mash_genomes_gz))
    )
    mash_uniq_accs_txt = os.path.join(
        os.getcwd(), re.sub(r"\.fna\.gz", "_ACCS.txt", os.path.basename(mash_genomes_gz))
    )
    mash_popup_info_txt = os.path.join(
        os.getcwd(), re.sub(r"\.fna\.gz", "_POPUP.txt", os.path.basename(mash_genomes_gz))
    )
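    # With the defaults (-op MASH_SCREEN, -n 10), the names derived above are:
    #   MASH_SCREEN_TOP_10_UNIQUE_HITS.fna.gz
    #   MASH_SCREEN_TOP_10_UNIQUE_HITS.txt
    #   MASH_SCREEN_TOP_10_UNIQUE_HITS_ACCS.txt
    #   MASH_SCREEN_TOP_10_UNIQUE_HITS_POPUP.txt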

    if mash_screen_res and os.path.exists(mash_genomes_gz):
        logging.error(
            "A concatenated genome FASTA file,\n"
            + f"{os.path.basename(mash_genomes_gz)} already exists in:\n"
            + f"{os.getcwd()}\n"
            + "Please remove or move it as we will not "
            + "overwrite it."
        )
        exit(1)

    if os.path.exists(mash_uniq_hits_txt) and os.path.getsize(mash_uniq_hits_txt) > 0:
        os.remove(mash_uniq_hits_txt)

    if mash_screen_res and (not genomes_dir or not pickled_sero):
        logging.error("When -m is on, -ps and -gd are also required.")
        exit(1)

    if genomes_dir:
        if not os.path.isdir(genomes_dir):
            logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
            exit(1)
        if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) == 0:
            logging.error(
                "Genomes directory\n"
                + f"{genomes_dir}"
                + "\ndoes not seem to have any\n"
                + f"files ending with suffix: {genomes_dir_suffix}"
            )
            exit(1)
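        # For example, with the default -gds suffix this directory is expected
        # to hold files named like (hypothetical accession):
        #   GCF_000000000.1_scaffolded_genomic.fna.gz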

    if pickle_sero and os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0:
        acc2serotype = defaultdict()
        init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".ACC2SERO.pickle")

        if (
            os.path.exists(init_pickled_sero)
            and os.path.getsize(init_pickled_sero)
            and not f_write_pick
        ):
            logging.error(
                f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n"
                + "Use -fs to force overwrite it."
            )
            exit(1)

        with open(pickle_sero, "r") as sero_snp_meta:
            for line in sero_snp_meta:
                cols = line.strip().split("|")

                # Validate the column count before indexing into cols.
                if not 4 <= len(cols) <= 5:
                    logging.error(
                        f"The metadata file {pickle_sero} is malformed.\n"
                        + f"Expected 4-5 columns. Got {len(cols)} columns.\n"
                    )
                    exit(1)

                url_cols = cols[3].split("/")

                # Short paths carry the accession in field 4; longer, full
                # FTP URLs carry it in field 10.
                acc = url_cols[3] if len(url_cols) <= 5 else url_cols[9]

                if not re.match(r"^GC[AF]_\d+\.\d+$", acc):
                    logging.error(
                        "Did not find an accession in either field number 4\n"
                        + "or field number 10 of column 4."
                    )
                    exit(1)

                acc2serotype[acc] = cols[0]

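        # Illustrative sketch (hypothetical values): for a metadata line such as
        #   serotype=Derby,antigen_formula=4:f,g:-|Scaffold|402440|ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/000/000/GCF_000000000.1|PDS000096654.2
        # the URL splits on "/" into 10 fields, the accession GCF_000000000.1
        # comes from field 10 (index 9), and the mapping stored is:
        #   acc2serotype["GCF_000000000.1"] = "serotype=Derby,antigen_formula=4:f,g:-"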
        with open(init_pickled_sero, "wb") as write_pickled_sero:
            pickle.dump(file=write_pickled_sero, obj=acc2serotype)

        logging.info(
            f"Created the pickle file for\n{os.path.basename(pickle_sero)}.\n"
            + "This was the only requested function."
        )
        exit(0)
    elif pickle_sero and not (
        os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0
    ):
        logging.error(
            "Requested to create a pickle from the metadata, but\n"
            + f"the file, {os.path.basename(pickle_sero)}, is empty or\ndoes not exist!"
        )
        exit(1)

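    # `mash screen` results are tab-separated with the columns:
    #   identity, shared-hashes, median-multiplicity, p-value, query-ID
    # plus an optional sixth query-comment column. Column 1 (identity) is
    # used below for ranking, and column 5 (query-ID) is expected to carry
    # the genome FASTA file name.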
    if mash_screen_res and os.path.exists(mash_screen_res):
        if os.path.getsize(mash_screen_res) > 0:
            seen_uniq_hits = 0
            with open(pickled_sero, "rb") as pickled_sero_fh:
                unpickled_acc2serotype = pickle.load(file=pickled_sero_fh)

            with open(mash_screen_res, "r") as msh_res:
                mash_hits = defaultdict()
                seen_mash_sero = defaultdict()

                for line in msh_res:
                    cols = line.strip().split("\t")

                    if len(cols) < 5:
                        logging.error(
                            f"The file {os.path.basename(mash_screen_res)} seems to\n"
                            + "be malformed. It contains fewer than the required 5-6 columns."
                        )
                        exit(1)

                    # Guard against a failed match before calling .group().
                    acc_match = re.search(
                        r"GC[AF].*?" + re.escape(genomes_dir_suffix), cols[4]
                    )
                    mash_hit_acc = (
                        re.sub(re.escape(genomes_dir_suffix), "", acc_match.group())
                        if acc_match
                        else None
                    )

                    if mash_hit_acc:
                        mash_hits.setdefault(cols[0], []).append(mash_hit_acc)
                    else:
                        logging.error(
                            "Did not find an assembly accession in column\n"
                            + f"number 5. Found {cols[4]} instead. Cannot proceed!"
                        )
                        exit(1)
        elif os.path.getsize(mash_screen_res) == 0:
            # Strip the literal suffix (str.rstrip() would remove a
            # character set instead of the suffix).
            failed_sample_name = re.sub(
                re.escape(mash_screen_res_suffix) + "$",
                "",
                os.path.basename(mash_screen_res),
            )
            with open(
                os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
            ) as failed_sample_fh:
                failed_sample_fh.write(f"{failed_sample_name}\n")
            exit(0)

        # ppp.pprint(mash_hits)
        msh_out_txt = open(mash_uniq_hits_txt, "w")
        msh_out_accs_txt = open(mash_uniq_accs_txt, "w")
        mash_out_pop_txt = open(mash_popup_info_txt, "w")
        wrote_header_pop = False
        wrote_header_acc = False

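        # The .txt outputs written below follow iTOL-style annotation
        # templates (POPUP_INFO and METADATA blocks with SEPARATOR COMMA
        # and DATA sections), so they can be dropped onto a tree
        # visualization alongside the concatenated FASTA.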
        with open(mash_genomes_gz, "wb") as msh_out_gz:
            for ident, acc_list in sorted(mash_hits.items(), reverse=True):
                for acc in acc_list:
                    if seen_uniq_hits >= num_uniq_hits:
                        break
                    if isinstance(unpickled_acc2serotype[acc], dict):
                        if target_acc_key in unpickled_acc2serotype[acc].keys():
                            if not wrote_header_pop:
                                mash_out_pop_txt.write(
                                    "POPUP_INFO\nSEPARATOR COMMA\nDATA\n"
                                )
                                wrote_header_pop = True

                            pdt = "".join(unpickled_acc2serotype[acc][target_acc_key])

                            popup_line = ",".join(
                                [
                                    acc,
                                    ncbi_path_heading,
                                    f'<a target="_blank" href="{ncbi_path_uri + pdt}">{pdt}</a>',
                                ]
                            )
                            mash_out_pop_txt.write(popup_line + "\n")

                        if all(
                            k in unpickled_acc2serotype[acc].keys()
                            for k in req_metadata.keys()
                        ):
                            if not wrote_header_acc:
                                msh_out_txt.write(
                                    "METADATA\nSEPARATOR COMMA\nFIELD_LABELS,"
                                )
                                msh_out_txt.write(
                                    f"{','.join([str(key).upper() for key in req_metadata.keys()])}\nDATA\n"
                                )
                                wrote_header_acc = True

                            metadata_line = ",".join(
                                [
                                    re.sub(
                                        ",",
                                        "",
                                        "|".join(unpickled_acc2serotype[acc][m]),
                                    )
                                    for m in req_metadata.keys()
                                ]
                            )

                            msh_out_txt.write(f"{acc.strip()},{metadata_line}\n")
                            msh_out_accs_txt.write(
                                f"{os.path.join(genomes_dir, acc + genomes_dir_suffix)}\n"
                            )
                            seen_mash_sero[acc] = 1
                            seen_uniq_hits += 1
                    elif not isinstance(unpickled_acc2serotype[acc], dict):
                        if unpickled_acc2serotype[acc] not in seen_mash_sero.keys():
                            seen_mash_sero[unpickled_acc2serotype[acc]] = 1
                            seen_uniq_hits += 1
                            # print(acc.strip() + '\t' + ident + '\t' + unpickled_acc2serotype[acc], file=sys.stdout)
                            msh_out_txt.write(
                                f"{acc.strip()}\t{unpickled_acc2serotype[acc]}\t{ident}\n"
                            )
                    with open(
                        os.path.join(genomes_dir, acc + genomes_dir_suffix),
                        "rb",
                    ) as msh_in_gz:
                        msh_out_gz.writelines(msh_in_gz.readlines())

        msh_out_txt.close()
        msh_out_accs_txt.close()
        mash_out_pop_txt.close()
        logging.info(
            f"File {os.path.basename(mash_genomes_gz)}\n"
            + f"written in:\n{os.getcwd()}\nDone! Bye!"
        )
        exit(0)


if __name__ == "__main__":
    main()