kkonganti@1
|
1 #!/usr/bin/env python3
|
kkonganti@1
|
2
|
kkonganti@1
|
3 # Kranti Konganti
|
kkonganti@1
|
4
|
kkonganti@1
|
5 import os
|
kkonganti@1
|
6 import glob
|
kkonganti@1
|
7 import pickle
|
kkonganti@1
|
8 import argparse
|
kkonganti@1
|
9 import inspect
|
kkonganti@1
|
10 import logging
|
kkonganti@1
|
11 import re
|
kkonganti@1
|
12 import pprint
|
kkonganti@1
|
13 from collections import defaultdict
|
kkonganti@1
|
14
|
kkonganti@1
|
15 # Multiple inheritence for pretty printing of help text.
|
kkonganti@1
|
16 class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
|
kkonganti@1
|
17 pass
|
kkonganti@1
|
18
|
kkonganti@1
|
19
|
kkonganti@1
|
20 # Main
|
kkonganti@1
|
21 def main() -> None:
|
kkonganti@1
|
22 """
|
kkonganti@1
|
23 This script works only in the context of `bettercallsal` Nextflow workflow.
|
kkonganti@1
|
24 It takes:
|
kkonganti@1
|
25 1. A pickle file containing a dictionary object where genome accession
|
kkonganti@1
|
26 is the key and the computed serotype is the value.
|
kkonganti@1
|
27 2. A file with `mash screen` results run against the Salmonella SNP
|
kkonganti@1
|
28 Cluster genomes' sketch.
|
kkonganti@1
|
29 3. A directory containing genomes' FASTA in gzipped format where the
|
kkonganti@1
|
30 FASTA file contains 2 lines: one FASTA header followed by
|
kkonganti@1
|
31 genome Sequence.
|
kkonganti@1
|
32 and then generates a concatenated FASTA file of top N unique `mash screen`
|
kkonganti@1
|
33 genome hits as requested.
|
kkonganti@1
|
34 """
|
kkonganti@1
|
35
|
kkonganti@1
|
36 # Set logging.
|
kkonganti@1
|
37 logging.basicConfig(
|
kkonganti@1
|
38 format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
|
kkonganti@1
|
39 level=logging.DEBUG,
|
kkonganti@1
|
40 )
|
kkonganti@1
|
41
|
kkonganti@1
|
42 # Debug print.
|
kkonganti@1
|
43 ppp = pprint.PrettyPrinter(width=55)
|
kkonganti@1
|
44 prog_name = os.path.basename(inspect.stack()[0].filename)
|
kkonganti@1
|
45
|
kkonganti@1
|
46 parser = argparse.ArgumentParser(
|
kkonganti@1
|
47 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
|
kkonganti@1
|
48 )
|
kkonganti@1
|
49
|
kkonganti@1
|
50 parser.add_argument(
|
kkonganti@1
|
51 "-s",
|
kkonganti@1
|
52 dest="sero_snp_metadata",
|
kkonganti@1
|
53 default=False,
|
kkonganti@1
|
54 required=False,
|
kkonganti@1
|
55 help="Absolute UNIX path to metadata text file with the field separator, | "
|
kkonganti@1
|
56 + "\nand 5 fields: serotype|asm_lvl|asm_url|snp_cluster_id"
|
kkonganti@1
|
57 + "\nEx: serotype=Derby,antigen_formula=4:f,g:-|Scaffold|402440|ftp://...\n|PDS000096654.2\n"
|
kkonganti@1
|
58 + "Mentioning this option will create a pickle file for the\nprovided metadata and exits.",
|
kkonganti@1
|
59 )
|
kkonganti@1
|
60 parser.add_argument(
|
kkonganti@1
|
61 "-fs",
|
kkonganti@1
|
62 dest="force_write_pick",
|
kkonganti@1
|
63 action="store_true",
|
kkonganti@1
|
64 required=False,
|
kkonganti@1
|
65 help="By default, when -s flag is on, the pickle file named *.ACC2SERO.pickle\n"
|
kkonganti@1
|
66 + "is written to CWD. If the file exists, the program will not overwrite\n"
|
kkonganti@1
|
67 + "and exit. Use -fs option to overwrite.",
|
kkonganti@1
|
68 )
|
kkonganti@1
|
69 parser.add_argument(
|
kkonganti@1
|
70 "-m",
|
kkonganti@1
|
71 dest="mash_screen_res",
|
kkonganti@1
|
72 default=False,
|
kkonganti@1
|
73 required=False,
|
kkonganti@1
|
74 help="Absolute UNIX path to `mash screen` results file.",
|
kkonganti@1
|
75 )
|
kkonganti@1
|
76 parser.add_argument(
|
kkonganti@1
|
77 "-ms",
|
kkonganti@1
|
78 dest="mash_screen_res_suffix",
|
kkonganti@1
|
79 default=".screened",
|
kkonganti@1
|
80 required=False,
|
kkonganti@1
|
81 help="Suffix of the `mash screen` result file.",
|
kkonganti@1
|
82 )
|
kkonganti@1
|
83 parser.add_argument(
|
kkonganti@1
|
84 "-ps",
|
kkonganti@1
|
85 dest="pickled_sero",
|
kkonganti@1
|
86 default=False,
|
kkonganti@1
|
87 required=False,
|
kkonganti@1
|
88 help="Absolute UNIX Path to serialized metadata object in a pickle file.\n"
|
kkonganti@1
|
89 + "You can create the pickle file of the metadata using -s option.\n"
|
kkonganti@1
|
90 + "Required if -m is on.",
|
kkonganti@1
|
91 )
|
kkonganti@1
|
92 parser.add_argument(
|
kkonganti@1
|
93 "-gd",
|
kkonganti@1
|
94 dest="genomes_dir",
|
kkonganti@1
|
95 default=False,
|
kkonganti@1
|
96 required=False,
|
kkonganti@1
|
97 help="Absolute UNIX path to a directory containing\n"
|
kkonganti@1
|
98 + "gzipped genome FASTA files.\n"
|
kkonganti@1
|
99 + "Required if -m is on.",
|
kkonganti@1
|
100 )
|
kkonganti@1
|
101 parser.add_argument(
|
kkonganti@1
|
102 "-gds",
|
kkonganti@1
|
103 dest="genomes_dir_suffix",
|
kkonganti@1
|
104 default="_scaffolded_genomic.fna.gz",
|
kkonganti@1
|
105 required=False,
|
kkonganti@1
|
106 help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
|
kkonganti@1
|
107 )
|
kkonganti@1
|
108 parser.add_argument(
|
kkonganti@1
|
109 "-n",
|
kkonganti@1
|
110 dest="num_uniq_hits",
|
kkonganti@1
|
111 default=10,
|
kkonganti@1
|
112 help="This many number of serotype genomes' accessions are " + "\nreturned.",
|
kkonganti@1
|
113 )
|
kkonganti@1
|
114 parser.add_argument(
|
kkonganti@1
|
115 "-op",
|
kkonganti@1
|
116 dest="out_prefix",
|
kkonganti@1
|
117 default="MASH_SCREEN",
|
kkonganti@1
|
118 help="Set the output file prefix for .fna.gz and .txt files.",
|
kkonganti@1
|
119 )
|
kkonganti@1
|
120 # required = parser.add_argument_group('required arguments')
|
kkonganti@1
|
121
|
kkonganti@1
|
122 args = parser.parse_args()
|
kkonganti@1
|
123 num_uniq_hits = int(args.num_uniq_hits)
|
kkonganti@1
|
124 mash_screen_res = args.mash_screen_res
|
kkonganti@1
|
125 mash_screen_res_suffix = args.mash_screen_res_suffix
|
kkonganti@1
|
126 pickle_sero = args.sero_snp_metadata
|
kkonganti@1
|
127 pickled_sero = args.pickled_sero
|
kkonganti@1
|
128 f_write_pick = args.force_write_pick
|
kkonganti@1
|
129 genomes_dir = args.genomes_dir
|
kkonganti@1
|
130 genomes_dir_suffix = args.genomes_dir_suffix
|
kkonganti@1
|
131 out_prefix = args.out_prefix
|
kkonganti@1
|
132 mash_genomes_gz = os.path.join(
|
kkonganti@1
|
133 os.getcwd(), out_prefix + "_TOP_" + str(num_uniq_hits) + "_UNIQUE_HITS.fna.gz"
|
kkonganti@1
|
134 )
|
kkonganti@1
|
135 mash_uniq_hits_txt = os.path.join(
|
kkonganti@1
|
136 os.getcwd(), re.sub(".fna.gz", ".txt", os.path.basename(mash_genomes_gz))
|
kkonganti@1
|
137 )
|
kkonganti@1
|
138
|
kkonganti@1
|
139 if mash_screen_res and os.path.exists(mash_genomes_gz):
|
kkonganti@1
|
140 logging.error(
|
kkonganti@1
|
141 "A concatenated genome FASTA file,\n"
|
kkonganti@1
|
142 + f"{os.path.basename(mash_genomes_gz)} already exists in:\n"
|
kkonganti@1
|
143 + f"{os.getcwd()}\n"
|
kkonganti@1
|
144 + "Please remove or move it as we will not "
|
kkonganti@1
|
145 + "overwrite it."
|
kkonganti@1
|
146 )
|
kkonganti@1
|
147 exit(1)
|
kkonganti@1
|
148
|
kkonganti@1
|
149 if os.path.exists(mash_uniq_hits_txt) and os.path.getsize(mash_uniq_hits_txt) > 0:
|
kkonganti@1
|
150 os.remove(mash_uniq_hits_txt)
|
kkonganti@1
|
151
|
kkonganti@1
|
152 if mash_screen_res and (not genomes_dir or not pickled_sero):
|
kkonganti@1
|
153 logging.error("When -m is on, -ps and -gd are also required.")
|
kkonganti@1
|
154 exit(1)
|
kkonganti@1
|
155
|
kkonganti@1
|
156 if genomes_dir:
|
kkonganti@1
|
157 if not os.path.isdir(genomes_dir):
|
kkonganti@1
|
158 logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
|
kkonganti@1
|
159 exit(1)
|
kkonganti@1
|
160 if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
|
kkonganti@1
|
161 logging.error(
|
kkonganti@1
|
162 "Genomes directory"
|
kkonganti@1
|
163 + f"{genomes_dir}"
|
kkonganti@1
|
164 + "\ndoes not seem to have any\n"
|
kkonganti@1
|
165 + f"files ending with suffix: {genomes_dir_suffix}"
|
kkonganti@1
|
166 )
|
kkonganti@1
|
167 exit(1)
|
kkonganti@1
|
168
|
kkonganti@1
|
169 if pickle_sero and os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0:
|
kkonganti@1
|
170 acc2serotype = defaultdict()
|
kkonganti@1
|
171 init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".ACC2SERO.pickle")
|
kkonganti@1
|
172
|
kkonganti@1
|
173 if (
|
kkonganti@1
|
174 os.path.exists(init_pickled_sero)
|
kkonganti@1
|
175 and os.path.getsize(init_pickled_sero)
|
kkonganti@1
|
176 and not f_write_pick
|
kkonganti@1
|
177 ):
|
kkonganti@1
|
178 logging.error(
|
kkonganti@1
|
179 f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n"
|
kkonganti@1
|
180 + "Use -fs to force overwrite it."
|
kkonganti@1
|
181 )
|
kkonganti@1
|
182 exit(1)
|
kkonganti@1
|
183
|
kkonganti@1
|
184 with open(pickle_sero, "r") as sero_snp_meta:
|
kkonganti@1
|
185 for line in sero_snp_meta:
|
kkonganti@1
|
186 cols = line.strip().split("|")
|
kkonganti@1
|
187 url_cols = cols[3].split("/")
|
kkonganti@1
|
188
|
kkonganti@1
|
189 if not 4 <= len(cols) <= 5:
|
kkonganti@1
|
190 logging.error(
|
kkonganti@1
|
191 f"The metadata file {pickle_sero} is malformed.\n"
|
kkonganti@1
|
192 + f"Expected 4-5 columns. Got {len(cols)} columns.\n"
|
kkonganti@1
|
193 )
|
kkonganti@1
|
194 exit(1)
|
kkonganti@1
|
195
|
kkonganti@1
|
196 if not len(url_cols) > 5:
|
kkonganti@1
|
197 acc = url_cols[3]
|
kkonganti@1
|
198 else:
|
kkonganti@1
|
199 acc = url_cols[9]
|
kkonganti@1
|
200
|
kkonganti@1
|
201 if not re.match(r"^GC[AF]\_\d+\.\d+$", acc):
|
kkonganti@1
|
202 logging.error(
|
kkonganti@1
|
203 f"Did not find accession in either field number 4\n"
|
kkonganti@1
|
204 + "or field number 10 of column 4."
|
kkonganti@1
|
205 )
|
kkonganti@1
|
206 exit(1)
|
kkonganti@1
|
207
|
kkonganti@1
|
208 acc2serotype[acc] = cols[0]
|
kkonganti@1
|
209
|
kkonganti@1
|
210 with open(init_pickled_sero, "wb") as write_pickled_sero:
|
kkonganti@1
|
211 pickle.dump(file=write_pickled_sero, obj=acc2serotype)
|
kkonganti@1
|
212
|
kkonganti@1
|
213 logging.info(
|
kkonganti@1
|
214 f"Created the pickle file for\n{os.path.basename(pickle_sero)}.\n"
|
kkonganti@1
|
215 + "This was the only requested function."
|
kkonganti@1
|
216 )
|
kkonganti@1
|
217 sero_snp_meta.close()
|
kkonganti@1
|
218 write_pickled_sero.close()
|
kkonganti@1
|
219 exit(0)
|
kkonganti@1
|
220 elif pickle_sero and not (os.path.exists(pickle_sero) and os.path.getsize(pickle_sero) > 0):
|
kkonganti@1
|
221
|
kkonganti@1
|
222 logging.error(
|
kkonganti@1
|
223 "Requested to create pickle from metadata, but\n"
|
kkonganti@1
|
224 + f"the file, {os.path.basename(pickle_sero)} is empty or\ndoes not exist!"
|
kkonganti@1
|
225 )
|
kkonganti@1
|
226 exit(1)
|
kkonganti@1
|
227
|
kkonganti@1
|
228 if mash_screen_res and os.path.exists(mash_screen_res):
|
kkonganti@1
|
229 if os.path.getsize(mash_screen_res) > 0:
|
kkonganti@1
|
230
|
kkonganti@1
|
231 seen_uniq_hits = 0
|
kkonganti@1
|
232 unpickled_acc2serotype = pickle.load(file=open(pickled_sero, "rb"))
|
kkonganti@1
|
233
|
kkonganti@1
|
234 with open(mash_screen_res, "r") as msh_res:
|
kkonganti@1
|
235 mash_hits = defaultdict()
|
kkonganti@1
|
236 seen_mash_sero = defaultdict()
|
kkonganti@1
|
237
|
kkonganti@1
|
238 for line in msh_res:
|
kkonganti@1
|
239 cols = line.strip().split("\t")
|
kkonganti@1
|
240
|
kkonganti@1
|
241 if len(cols) < 5:
|
kkonganti@1
|
242 logging.error(
|
kkonganti@1
|
243 f"The file {os.path.basename(mash_screen_res)} seems to\n"
|
kkonganti@1
|
244 + "be malformed. It contains less than required 5-6 columns."
|
kkonganti@1
|
245 )
|
kkonganti@1
|
246 exit(1)
|
kkonganti@1
|
247
|
kkonganti@1
|
248 mash_hit_acc = re.sub(
|
kkonganti@1
|
249 genomes_dir_suffix,
|
kkonganti@1
|
250 "",
|
kkonganti@1
|
251 str((re.search(r"GC[AF].*?" + genomes_dir_suffix, cols[4])).group()),
|
kkonganti@1
|
252 )
|
kkonganti@1
|
253
|
kkonganti@1
|
254 if mash_hit_acc:
|
kkonganti@1
|
255 mash_hits.setdefault(cols[0], []).append(mash_hit_acc)
|
kkonganti@1
|
256 else:
|
kkonganti@1
|
257 logging.error(
|
kkonganti@1
|
258 "Did not find an assembly accession in column\n"
|
kkonganti@1
|
259 + f"number 5. Found {cols[4]} instead. Cannot proceed!"
|
kkonganti@1
|
260 )
|
kkonganti@1
|
261 exit(1)
|
kkonganti@1
|
262 msh_res.close()
|
kkonganti@1
|
263 elif os.path.getsize(mash_screen_res) == 0:
|
kkonganti@1
|
264 failed_sample_name = os.path.basename(mash_screen_res).rstrip(mash_screen_res_suffix)
|
kkonganti@1
|
265 with open(
|
kkonganti@1
|
266 os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
|
kkonganti@1
|
267 ) as failed_sample_fh:
|
kkonganti@1
|
268 failed_sample_fh.write(f"{failed_sample_name}\n")
|
kkonganti@1
|
269 failed_sample_fh.close()
|
kkonganti@1
|
270 exit(0)
|
kkonganti@1
|
271
|
kkonganti@1
|
272 # ppp.pprint(mash_hits)
|
kkonganti@1
|
273 msh_out_txt = open(mash_uniq_hits_txt, "w")
|
kkonganti@1
|
274 with open(mash_genomes_gz, "wb") as msh_out_gz:
|
kkonganti@1
|
275 for _, (ident, acc_list) in enumerate(sorted(mash_hits.items(), reverse=True)):
|
kkonganti@1
|
276 for acc in acc_list:
|
kkonganti@1
|
277 if seen_uniq_hits >= num_uniq_hits:
|
kkonganti@1
|
278 break
|
kkonganti@1
|
279 if unpickled_acc2serotype[acc] not in seen_mash_sero.keys():
|
kkonganti@1
|
280 seen_mash_sero[unpickled_acc2serotype[acc]] = 1
|
kkonganti@1
|
281 seen_uniq_hits += 1
|
kkonganti@1
|
282 # print(acc.strip() + '\t' + ident + '\t' + unpickled_acc2serotype[acc], file=sys.stdout)
|
kkonganti@1
|
283 msh_out_txt.write(
|
kkonganti@1
|
284 f"{acc.strip()}\t{unpickled_acc2serotype[acc]}\t{ident}\n"
|
kkonganti@1
|
285 )
|
kkonganti@1
|
286 with open(
|
kkonganti@1
|
287 os.path.join(genomes_dir, acc + genomes_dir_suffix), "rb"
|
kkonganti@1
|
288 ) as msh_in_gz:
|
kkonganti@1
|
289 msh_out_gz.writelines(msh_in_gz.readlines())
|
kkonganti@1
|
290 msh_in_gz.close()
|
kkonganti@1
|
291 msh_out_gz.close()
|
kkonganti@1
|
292 msh_out_txt.close()
|
kkonganti@1
|
293 logging.info(
|
kkonganti@1
|
294 f"File {os.path.basename(mash_genomes_gz)}\n"
|
kkonganti@1
|
295 + f"written in:\n{os.getcwd()}\nDone! Bye!"
|
kkonganti@1
|
296 )
|
kkonganti@1
|
297 exit(0)
|
kkonganti@1
|
298
|
kkonganti@1
|
299
|
kkonganti@1
|
300 if __name__ == "__main__":
|
kkonganti@1
|
301 main()
|