|
0
|
1 #!/usr/bin/env python3
|
|
|
2
|
|
|
3 # Kranti Konganti
|
|
|
4
|
|
|
5 import argparse
|
|
|
6 import inspect
|
|
|
7 import json
|
|
|
8 import logging
|
|
|
9 import os
|
|
|
10 import shutil
|
|
|
11 import tempfile
|
|
|
12 from urllib.parse import urlparse
|
|
|
13 from urllib.request import urlopen
|
|
|
14
|
|
|
# Configure the root logger: every record is printed as a banner made of a
# 55-character rule, the timestamp and level, another rule, then the message.
logging.basicConfig(
    level=logging.DEBUG,
    format="\n".join(
        [
            "",
            "=" * 55,
            "%(asctime)s - %(levelname)s",
            "=" * 55,
            "%(message)s",
            "",
        ]
    ),
)
|
|
|
24
|
|
|
25
|
|
|
# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    """
    Help formatter that combines two stock argparse formatters: explicit
    line breaks in help strings are kept (RawTextHelpFormatter) and each
    option's default value is appended (ArgumentDefaultsHelpFormatter).
    """
|
|
|
31
|
|
|
32
|
|
|
def dl_pubmlst(**kwargs):
    """
    Download a URL and either return it as parsed JSON or save it to disk.

    The six settings are read from the keyword arguments *by position*, not
    by name, so they must always be supplied in this order (the names used
    by the callers in this file are shown for reference only):

        1. path       - directory in which a downloaded file is saved.
        2. url        - the URL to download.
        3. suffix     - extension appended to the saved file's name.
        4. parent     - if true, nothing is written to disk.
        5. filename   - base name of the saved file; if false, the last
                        component of the URL path is used instead.
        6. expectjson - if true, parse the response as JSON and return it.

    Returns:
        The decoded JSON document when `expectjson` is true, otherwise None.

    Raises:
        ValueError: if the number of keyword arguments is not exactly six.
        SystemExit: with status 1 if the output directory or the URL is
            None, or if JSON was expected but the response did not parse.

    Errors raised by `urlopen` or by writing the output file propagate
    unchanged.
    """
    # Positional unpacking is deliberate: callers spell the keywords
    # differently (`path`, `expectjson`, `expectJson`), so only the order of
    # the arguments is reliable.
    outdir, url, suffix, parent, filename, expectjson = kwargs.values()

    # Test each value on its own; chaining them with `or` would hide a
    # missing output directory whenever a URL is present.
    if outdir is None or url is None:
        logging.error(
            "Please provide absolute UNIX path\n" + "to store the result DB flat files."
        )
        raise SystemExit(1)

    logging.info(f"Downloading... Please wait...\n{url}")

    # Fetch the resource exactly once and keep it in memory, so no temporary
    # file is left behind and the server is not queried a second time.
    with urlopen(url) as response:
        payload = response.read()

    if expectjson:
        try:
            jsonresponse = json.loads(payload)
        except (json.JSONDecodeError, UnicodeDecodeError):
            logging.error(f"The response from\n{url}\nwas not valid JSON!")
            raise SystemExit(1) from None

        logging.info(f"Got a valid JSON response from:\n{url}")
        return jsonresponse

    # A "parent" resource is only queried, never stored.
    if parent:
        return None

    # Fall back to the last component of the URL path when no name is given.
    base_name = filename or os.path.basename(urlparse(url).path)
    save_to = os.path.join(outdir, base_name + suffix)

    logging.info(f"Saving to:\n{os.path.basename(save_to)}")

    # The payload is decoded as UTF-8, so write it back with the same
    # encoding instead of whatever the current locale happens to be.
    with open(save_to, "w", encoding="utf-8") as fout:
        fout.write(payload.decode("utf-8"))

    return None
|
|
|
79
|
|
|
80
|
|
|
def main() -> None:
    """
    This script is part of the `cronology_db` Nextflow workflow and is only
    tested on POSIX systems.
    It:
        1. Downloads the MLST scheme in JSON format from PubMLST.
            and then,
        2. Downloads the alleles' FASTA and profile table
            suitable to run MLST analysis.
    """
    # NOTE: the docstring above is passed to argparse as the program
    # description, so it is shown verbatim in the --help output.

    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-org",
        dest="organism",
        required=True,
        help="The organism name to download the MLST alleles'\nFASTA and profile CSV for."
        + "\nEx: -org salmonella",
    )
    parser.add_argument(
        "-f",
        dest="overwrite",
        default=False,
        required=False,
        action="store_true",
        help="Force overwrite the results directory\nmentioned with -out.",
    )
    parser.add_argument(
        "-out",
        dest="outdir",
        default=os.getcwd(),
        required=False,
        help="The absolute UNIX path to store the MLST alleles'\nFASTA and profile CSV.\n",
    )
    parser.add_argument(
        "-mlsts",
        dest="schemes",
        default="schemes/2",
        required=False,
        help="The MLST scheme ID to download.",
    )
    parser.add_argument(
        "-profile",
        dest="profile",
        default="profiles_csv",
        required=False,
        help="The MLST profile name in the scheme.",
    )
    parser.add_argument(
        "-loci",
        dest="loci",
        default="loci",
        required=False,
        help="The key name in the JSON response which lists the\nallele URLs to download.",
    )
    parser.add_argument(
        "-suffix",
        dest="asuffix",
        default=".tfa",
        required=False,
        help="What should be the suffix of the downloaded allele\nFASTA.",
    )
    parser.add_argument(
        "-akey",
        dest="allele_fa_key",
        default="alleles_fasta",
        required=False,
        help="What is the key in the JSON response that contains\nthe URL for allele FASTA.",
    )
    parser.add_argument(
        "-id",
        dest="id_key",
        default="id",
        required=False,
        help="What is the key in the JSON response that contains\nthe name of the allele FASTA.",
    )

    args = parser.parse_args()
    org = args.organism
    # Results always go into a sub-directory named after the organism.
    outdir = os.path.join(args.outdir, org)
    overwrite = args.overwrite
    # REST endpoint of the organism's sequence definition database.
    pubmlst_loc = "_".join(["https://rest.pubmlst.org/db/pubmlst", org, "seqdef"])
    schemes = args.schemes
    profile = args.profile
    loci = args.loci
    suffix = args.asuffix
    allele_fa_key = args.allele_fa_key
    id_key = args.id_key

    if os.path.exists(outdir):
        if not overwrite:
            logging.error(
                f"Output directory\n{os.path.basename(outdir)}\nexists. Please use -f to overwrite."
            )
            raise SystemExit(1)
        shutil.rmtree(outdir, ignore_errors=True)

    # Create required output directory. Removal above is best effort, so
    # tolerate a directory that could not be deleted completely.
    os.makedirs(outdir, exist_ok=True)

    # Query MLST scheme for an organism.
    # dl_pubmlst() reads its keyword arguments by position, so the order
    # used in every call below must not be changed.
    pubmlst_json = dl_pubmlst(
        path=outdir,
        url="/".join([pubmlst_loc, schemes]),
        suffix=suffix,
        parent=True,
        filename=False,
        expectjson=True,
    )

    # Save profile_csv as organism.txt.
    if profile in pubmlst_json:
        dl_pubmlst(
            path=outdir,
            url=pubmlst_json[profile],
            suffix=".txt",
            parent=False,
            filename=org,
            expectjson=False,
        )
    else:
        logging.warning(
            f"Key '{profile}' not found in the scheme JSON.\nNo profile table was downloaded."
        )

    # Save MLST alleles' FASTA
    if loci in pubmlst_json:
        for allele in pubmlst_json[loci]:
            # Each locus URL returns a JSON record that holds the locus
            # name and the URL of its allele FASTA.
            allele_fa_json = dl_pubmlst(
                path=outdir,
                url=allele,
                suffix=suffix,
                parent=True,
                filename=False,
                expectjson=True,
            )

            dl_pubmlst(
                path=outdir,
                url=allele_fa_json[allele_fa_key],
                suffix=suffix,
                parent=False,
                filename=allele_fa_json[id_key],
                expectjson=False,
            )
    else:
        logging.warning(
            f"Key '{loci}' not found in the scheme JSON.\nNo allele FASTA was downloaded."
        )

    logging.info(f"Finished downloading MLST scheme and profile for {org}.")
|
|
|
231
|
|
|
232
|
|
|
# Run the downloader only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|