#!/usr/bin/env python3

# Kranti Konganti

import argparse
import glob
import inspect
import logging
import os
import pickle
import pprint
import re
import subprocess
from collections import defaultdict


# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    pass


# Main
def main() -> None:
    """
    This script works only in the context of a Nextflow workflow.
    It takes:
        1. A pickle file containing a dictionary object where genome accession
            is the key and the computed serotype is the value,
        OR
        1. A pickle file containing a nested dictionary, where genome accession
            is the key and the metadata is a dictionary associated with that key.
        2. A file with `mash screen` results.
        3. A directory containing genomes' FASTA in gzipped format where each
            FASTA file contains 2 lines: one FASTA header followed by the
            genome sequence.
    and then generates a concatenated FASTA file of the top N unique `mash screen`
    genome hits as requested.

    In addition:
        1. The user can skip `mash screen` hits that originate from the supplied
            BioProject accession(s).
            For the -skip option to work, the NCBI `datasets` and `dataformat`
            tools should be available in $PATH.
    """
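    # Illustrative invocations (<this_script>.py and the input file names are
    # hypothetical placeholders; the option flags are the ones defined below):
    #
    #   Build the serotype metadata pickle only:
    #       <this_script>.py -s acc2sero_metadata.txt -op SALMONELLA
    #
    #   Pick the top 10 unique hits from a `mash screen` result:
    #       <this_script>.py -m sample.screened -ps SALMONELLA.ACC2SERO.pickle \
    #           -gd /path/to/genomes_dir -gds _scaffolded_genomic.fna.gz -n 10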

    # Set logging.
    logging.basicConfig(
        format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    # Debug print.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    parser.add_argument(
        "-s",
        dest="sero_snp_metadata",
        default=False,
        required=False,
        help="Absolute UNIX path to metadata text file with the field separator | "
        + "\nand 5 fields: serotype|asm_lvl|asm_url|snp_cluster_id"
        + "\nEx: serotype=Derby,antigen_formula=4:f,g:-|Scaffold|402440|ftp://...\n|PDS000096654.2\n"
        + "Mentioning this option will create a pickle file for the\nprovided metadata and exit.",
    )
    parser.add_argument(
        "-fs",
        dest="force_write_pick",
        action="store_true",
        required=False,
        help="By default, when the -s flag is on, the pickle file named *.ACC2SERO.pickle\n"
        + "is written to the CWD. If the file exists, the program will not overwrite\n"
        + "it and will exit. Use the -fs option to force an overwrite.",
    )
    parser.add_argument(
        "-m",
        dest="mash_screen_res",
        default=False,
        required=False,
        help="Absolute UNIX path to `mash screen` results file.",
    )
    parser.add_argument(
        "-ms",
        dest="mash_screen_res_suffix",
        default=".screened",
        required=False,
        help="Suffix of the `mash screen` result file.",
    )
    parser.add_argument(
        "-ps",
        dest="pickled_sero",
        default=False,
        required=False,
        help="Absolute UNIX path to serialized metadata object in a pickle file.\n"
        + "You can create the pickle file of the metadata using the -s option.\n"
        + "Required if -m is on.",
    )
    parser.add_argument(
        "-gd",
        dest="genomes_dir",
        default=False,
        required=False,
        help="Absolute UNIX path to a directory containing\n"
        + "gzipped genome FASTA files.\n"
        + "Required if -m is on.",
    )
    parser.add_argument(
        "-gds",
        dest="genomes_dir_suffix",
        default="_scaffolded_genomic.fna.gz",
        required=False,
        help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
    )
    parser.add_argument(
        "-n",
        dest="num_uniq_hits",
        default=10,
        required=False,
        help="Return up to this many unique serotype genome accessions.",
    )
    parser.add_argument(
        "-skip",
        dest="skip_accs",
        default=str(""),
        required=False,
        help="Skip all hits that belong to the following BioProject accession(s).\n"
        + "Supply a comma-separated list for more than one BioProject.",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="MASH_SCREEN",
        required=False,
        help="Set the output file prefix for .fna.gz and .txt files.",
    )
    # required = parser.add_argument_group('required arguments')

    args = parser.parse_args()
    num_uniq_hits = int(args.num_uniq_hits)
    mash_screen_res = args.mash_screen_res
    mash_screen_res_suffix = args.mash_screen_res_suffix
    pickle_sero = args.sero_snp_metadata
    pickled_sero = args.pickled_sero
    f_write_pick = args.force_write_pick
    genomes_dir = args.genomes_dir
    genomes_dir_suffix = args.genomes_dir_suffix
    out_prefix = args.out_prefix
    skip_accs = args.skip_accs
    skip_accs_list = list()
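    # Validate the -skip value: one or more comma-separated BioProject
    # accessions, e.g. PRJNA766315 or PRJNA766315,PRJNA675435.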
    skip_check = re.compile(r"PRJNA\d+(?:,PRJNA\d+)*")
    req_metadata = {
        "mlst_sequence_type": "ST",
        "epi_type": "ET",
        "host": "HO",
        "host_disease": "HD",
        "isolation_source": "IS",
        "outbreak": "OU",
        "source_type": "SOT",
        "strain": "GS",
    }
    target_acc_key = "target_acc"
    ncbi_path_heading = "NCBI Pathogen Isolates Browser"
    ncbi_path_uri = "https://www.ncbi.nlm.nih.gov/pathogens/isolates/#"
    mash_genomes_gz = os.path.join(
        os.getcwd(), out_prefix + "_TOP_" + str(num_uniq_hits) + "_UNIQUE_HITS.fna.gz"
    )
    mash_uniq_hits_txt = os.path.join(
        os.getcwd(), re.sub(".fna.gz", ".txt", os.path.basename(mash_genomes_gz))
    )
    mash_uniq_accs_txt = os.path.join(
        os.getcwd(), re.sub(".fna.gz", "_ACCS.txt", os.path.basename(mash_genomes_gz))
    )
    mash_popup_info_txt = os.path.join(
        os.getcwd(), re.sub(".fna.gz", "_POPUP.txt", os.path.basename(mash_genomes_gz))
    )
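    # Output paths (all written to the current working directory):
    #   mash_genomes_gz     - concatenated FASTA of the selected top hits (.fna.gz)
    #   mash_uniq_hits_txt  - serotype/metadata lines for the selected hits (.txt)
    #   mash_uniq_accs_txt  - genome FASTA paths of the selected hits (_ACCS.txt)
    #   mash_popup_info_txt - NCBI Pathogen Isolates Browser links (_POPUP.txt)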

    if mash_screen_res and os.path.exists(mash_genomes_gz):
        logging.error(
            "A concatenated genome FASTA file,\n"
            + f"{os.path.basename(mash_genomes_gz)} already exists in:\n"
            + f"{os.getcwd()}\n"
            + "Please remove or move it as we will not "
            + "overwrite it."
        )
        exit(1)

    if os.path.exists(mash_uniq_hits_txt) and os.path.getsize(mash_uniq_hits_txt) > 0:
        os.remove(mash_uniq_hits_txt)

    if mash_screen_res and (not genomes_dir or not pickled_sero):
        logging.error("When -m is on, -ps and -gd are also required.")
        exit(1)

    if skip_accs and not skip_check.match(skip_accs):
        logging.error(
            "Supplied BioProject accessions are not valid!\n"
            + "Valid options:\n\t-skip PRJNA766315\n\t-skip PRJNA766315,PRJNA675435"
        )
        exit(1)
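    # The branch below shells out to NCBI's `datasets` and `dataformat` tools.
    # It is roughly equivalent to the following pipeline (the accession is a
    # hypothetical example):
    #
    #   datasets summary genome accession --as-json-lines --report ids_only PRJNA766315 | \
    #       dataformat tsv genome --fields accession --elide-header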
    elif skip_check.match(skip_accs):
        datasets_cmd = "datasets summary genome accession --as-json-lines --report ids_only".split()
        datasets_cmd.append(skip_accs)
        dataformat_cmd = "dataformat tsv genome --fields accession --elide-header".split()
        try:
            accs_query = subprocess.run(datasets_cmd, capture_output=True, check=True)
            try:
                skip_accs_list = (
                    subprocess.check_output(dataformat_cmd, input=accs_query.stdout)
                    .decode("utf-8")
                    .split("\n")
                )
            except subprocess.CalledProcessError as e:
                logging.error(f"Query failed\n\t{' '.join(dataformat_cmd)}\nError:\n\t{e}")
                exit(1)
        except subprocess.CalledProcessError as e:
            logging.error(f"Query failed\n\t{' '.join(datasets_cmd)}\nError:\n\t{e}")
            exit(1)

    if len(skip_accs_list) > 0:
        filter_these_hits = list(filter(bool, skip_accs_list))
    else:
        filter_these_hits = list()

    if genomes_dir:
        if not os.path.isdir(genomes_dir):
            logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
            exit(1)
        if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
            logging.error(
                "Genomes directory\n"
                + f"{genomes_dir}\n"
                + "does not seem to have any\n"
                + f"files ending with suffix: {genomes_dir_suffix}"
            )
            exit(1)

    if pickle_sero and os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0:
        acc2serotype = defaultdict()
        init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".ACC2SERO.pickle")

        if (
            os.path.exists(init_pickled_sero)
            and os.path.getsize(init_pickled_sero)
            and not f_write_pick
        ):
            logging.error(
                f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n"
                + "Use -fs to force overwrite it."
            )
            exit(1)

        with open(pickle_sero, "r") as sero_snp_meta:
            for line in sero_snp_meta:
                cols = line.strip().split("|")
                url_cols = cols[3].split("/")

                if not 4 <= len(cols) <= 5:
                    logging.error(
                        f"The metadata file {pickle_sero} is malformed.\n"
                        + f"Expected 4-5 columns. Got {len(cols)} columns.\n"
                    )
                    exit(1)

                if not len(url_cols) > 5:
                    acc = url_cols[3]
                else:
                    acc = url_cols[9]

                if not re.match(r"^GC[AF]\_\d+\.\d+$", acc):
                    logging.error(
                        f"Did not find accession in either field number 4\n"
                        + "or field number 10 of column 4."
                    )
                    exit(1)

                acc2serotype[acc] = cols[0]

        with open(init_pickled_sero, "wb") as write_pickled_sero:
            pickle.dump(file=write_pickled_sero, obj=acc2serotype)

        logging.info(
            f"Created the pickle file for\n{os.path.basename(pickle_sero)}.\n"
            + "This was the only requested function."
        )
        sero_snp_meta.close()
        write_pickled_sero.close()
        exit(0)
    elif pickle_sero and not (os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0):
        logging.error(
            "Requested to create pickle from metadata, but\n"
            + f"the file {os.path.basename(pickle_sero)} is empty or\ndoes not exist!"
        )
        exit(1)

    if mash_screen_res and os.path.exists(mash_screen_res):
        if os.path.getsize(mash_screen_res) > 0:
            seen_uniq_hits = 0
            unpickled_acc2serotype = pickle.load(file=open(pickled_sero, "rb"))

            with open(mash_screen_res, "r") as msh_res:
                mash_hits = defaultdict()
                seen_mash_sero = defaultdict()

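                # Each `mash screen` result line typically has 6 tab-separated
                # columns: identity, shared-hashes, median-multiplicity, p-value,
                # query-ID, query-comment. Only the identity (cols[0]) and the
                # query-ID (cols[4], the genome file name) are used below.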
                for line in msh_res:
                    cols = line.strip().split("\t")

                    if len(cols) < 5:
                        logging.error(
                            f"The file {os.path.basename(mash_screen_res)} seems to\n"
                            + "be malformed. It contains fewer than the required 5-6 columns."
                        )
                        exit(1)

                    acc_match = re.search(r"GC[AF].*?" + genomes_dir_suffix, cols[4])

                    if acc_match:
                        mash_hit_acc = re.sub(genomes_dir_suffix, "", acc_match.group())
                        mash_hits.setdefault(cols[0], []).append(mash_hit_acc)
                    else:
                        logging.error(
                            "Did not find an assembly accession in column\n"
                            + f"number 5. Found {cols[4]} instead. Cannot proceed!"
                        )
                        exit(1)
                msh_res.close()
        elif os.path.getsize(mash_screen_res) == 0:
            # Strip the result-file suffix to recover the sample name.
            failed_sample_name = re.sub(
                mash_screen_res_suffix + "$", "", os.path.basename(mash_screen_res)
            )
            with open(
                os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
            ) as failed_sample_fh:
                failed_sample_fh.write(f"{failed_sample_name}\n")
                failed_sample_fh.close()
                exit(0)

        # ppp.pprint(mash_hits)
        msh_out_txt = open(mash_uniq_hits_txt, "w")
        wrote_header_pop = False
        wrote_header_acc = False

        with open(mash_genomes_gz, "wb") as msh_out_gz:
            for _, (ident, acc_list) in enumerate(sorted(mash_hits.items(), reverse=True)):
                for acc in acc_list:
                    if len(filter_these_hits) > 0 and acc in filter_these_hits:
                        continue
                    if seen_uniq_hits >= num_uniq_hits:
                        break
                    if isinstance(unpickled_acc2serotype[acc], dict):
                        if target_acc_key in unpickled_acc2serotype[acc].keys():
                            if not wrote_header_pop:
                                mash_out_pop_txt = open(mash_popup_info_txt, "w")
                                mash_out_pop_txt.write("POPUP_INFO\nSEPARATOR COMMA\nDATA\n")
                                wrote_header_pop = True

                            pdt = "".join(unpickled_acc2serotype[acc][target_acc_key])

                            popup_line = ",".join(
                                [
                                    acc,
                                    ncbi_path_heading,
                                    f'<a target="_blank" href="{ncbi_path_uri + pdt}">{pdt}</a>',
                                ]
                            )
                            mash_out_pop_txt.write(popup_line + "\n")

                        if all(
                            k in unpickled_acc2serotype[acc].keys() for k in req_metadata.keys()
                        ):
                            if not wrote_header_acc:
                                msh_out_accs_txt = open(mash_uniq_accs_txt, "w")
                                msh_out_txt.write("METADATA\nSEPARATOR COMMA\nFIELD_LABELS,")
                                msh_out_txt.write(
                                    f"{','.join([str(key).upper() for key in req_metadata.keys()])}\nDATA\n"
                                )
                                wrote_header_acc = True

                            metadata_line = ",".join(
                                [
                                    re.sub(
                                        ",",
                                        "",
                                        "|".join(unpickled_acc2serotype[acc][m]),
                                    )
                                    for m in req_metadata.keys()
                                ],
                            )

                            msh_out_txt.write(f"{acc.strip()},{metadata_line}\n")
                            msh_out_accs_txt.write(
                                f"{os.path.join(genomes_dir, acc + genomes_dir_suffix)}\n"
                            )
                            seen_mash_sero[acc] = 1
                            seen_uniq_hits += 1
                    elif not isinstance(unpickled_acc2serotype[acc], dict):
                        if unpickled_acc2serotype[acc] not in seen_mash_sero.keys():
                            seen_mash_sero[unpickled_acc2serotype[acc]] = 1
                            seen_uniq_hits += 1
                            # print(acc.strip() + '\t' + ident + '\t' + unpickled_acc2serotype[acc], file=sys.stdout)
                            msh_out_txt.write(
                                f"{acc.strip()}\t{unpickled_acc2serotype[acc]}\t{ident}\n"
                            )
                    with open(
                        os.path.join(genomes_dir, acc + genomes_dir_suffix),
                        "rb",
                    ) as msh_in_gz:
                        msh_out_gz.writelines(msh_in_gz.readlines())
                        msh_in_gz.close()
            msh_out_gz.close()
        msh_out_txt.close()

        if "msh_out_accs_txt" in locals().keys() and not msh_out_accs_txt.closed:
            msh_out_accs_txt.close()
        if "mash_out_pop_txt" in locals().keys() and not mash_out_pop_txt.closed:
            mash_out_pop_txt.close()

        logging.info(
            f"File {os.path.basename(mash_genomes_gz)}\n"
            + f"written in:\n{os.getcwd()}\nDone! Bye!"
        )
        exit(0)


if __name__ == "__main__":
    main()