kkonganti@11
|
1 #!/usr/bin/env python3
|
kkonganti@11
|
2
|
kkonganti@11
|
3 # Kranti Konganti
|
kkonganti@11
|
4
|
kkonganti@11
|
5 import argparse
|
kkonganti@11
|
6 import inspect
|
kkonganti@11
|
7 import json
|
kkonganti@11
|
8 import logging
|
kkonganti@11
|
9 import os
|
kkonganti@11
|
10 import shutil
|
kkonganti@11
|
11 import tempfile
|
kkonganti@11
|
12 from urllib.parse import urlparse
|
kkonganti@11
|
13 from urllib.request import urlopen
|
kkonganti@11
|
14
|
kkonganti@11
|
# Configure the root logger once, at import time. Each record is rendered as
# a banner: a 55-character "=" rule, the timestamp and level name, a second
# rule, then the message itself. DEBUG is the threshold, so nothing is hidden.
logging.basicConfig(
    level=logging.DEBUG,
    format=f"\n{'=' * 55}\n%(asctime)s - %(levelname)s\n{'=' * 55}\n%(message)s\n",
)
|
kkonganti@11
|
20
|
kkonganti@11
|
21
|
kkonganti@11
|
# Multiple inheritance so that help text keeps its hand-written line breaks
# (RawTextHelpFormatter) while argument defaults are still appended
# automatically (ArgumentDefaultsHelpFormatter).
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Argparse help formatter combining raw-text layout with default display."""
|
kkonganti@11
|
25
|
kkonganti@11
|
26
|
kkonganti@11
|
def dl_pubmlst(**kwargs) -> "dict | None":
    """
    Download a URL and either return it as parsed JSON or save it to disk.

    Keyword arguments are looked up by name (case-insensitively), not by the
    order in which they were passed:

        outdir / path: Directory in which downloaded files are stored.
                       Required.
        url:           The URL to download. Required.
        suffix:        Text appended to the saved file's name. Defaults to "".
        parent:        When truthy (and `expectjson` is falsy) nothing is
                       written to disk.
        filename:      Base name of the saved file. When falsy, the last path
                       component of `url` is used instead.
        expectjson:    When truthy, the response body is parsed as JSON and
                       returned; nothing is written to disk.

    Returns:
        The decoded JSON document when `expectjson` is truthy (typically a
        dict), otherwise None.

    Raises:
        SystemExit: With status 1 when the output directory or URL is
            missing, or when a JSON response was expected but could not be
            decoded.
    """
    # Call sites spell the keywords inconsistently (`path` vs `outdir`,
    # `expectjson` vs `expectJson`), so normalise the names instead of
    # depending on the order in which they happen to be passed.
    opts = {key.lower(): value for key, value in kwargs.items()}
    outdir = opts.get("outdir", opts.get("path"))
    url = opts.get("url")
    suffix = opts.get("suffix") or ""
    parent = opts.get("parent", False)
    filename = opts.get("filename")
    expectjson = opts.get("expectjson", False)

    if outdir is None or url is None:
        logging.error("Please provide absolute UNIX path\n" + "to store the result DB flat files.")
        raise SystemExit(1)

    logging.info(f"Downloading... Please wait...\n{url}")

    # Fetch the body exactly once and keep it in memory; no temporary file is
    # left behind and the same bytes serve both the JSON and the save path.
    with urlopen(url) as response:
        payload = response.read()

    if expectjson:
        try:
            jsonresponse = json.loads(payload)
        except (json.JSONDecodeError, UnicodeDecodeError):
            logging.error(f"The response from\n{url}\nwas not valid JSON!")
            raise SystemExit(1)

        logging.info(f"Got a valid JSON response from:\n{url}")
        return jsonresponse

    if parent:
        # Parent (index) documents are only queried, never stored.
        return None

    stem = filename if filename else os.path.basename(urlparse(url).path)
    save_to = os.path.join(outdir, stem + suffix)

    logging.info(f"Saving to:\n{os.path.basename(save_to)}")

    # The payload is decoded as UTF-8, so write it back with the same
    # encoding rather than whatever the locale default happens to be.
    with open(save_to, "w", encoding="utf-8") as fout:
        fout.write(payload.decode("utf-8"))

    return None
|
kkonganti@11
|
67
|
kkonganti@11
|
68
|
kkonganti@11
|
def main() -> None:
    """
    This script is part of the `cronology_db` Nextflow workflow and is only
    tested on POSIX sytems.
    It:
        1. Downloads the MLST scheme in JSON format from PubMLST.
            and then,
        2. Downloads the alleles' FASTA and profile table
            suitable to run MLST analysis.
    """
    # NOTE: the docstring above is passed to argparse as the --help
    # description, so it is user-facing text and its wording is kept as is.

    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-org",
        dest="organism",
        required=True,
        help="The organism name to download the MLST alleles'\nFASTA and profile CSV for."
        + "\nEx: -org cronobacter",
    )
    parser.add_argument(
        "-f",
        dest="overwrite",
        default=False,
        required=False,
        action="store_true",
        help="Force overwrite the results directory\nmentioned with -out.",
    )
    parser.add_argument(
        "-out",
        dest="outdir",
        default=os.getcwd(),
        required=False,
        help="The absolute UNIX path to store the MLST alleles'\nFASTA and profile CSV.\n",
    )
    parser.add_argument(
        "-mlsts",
        dest="schemes",
        default="schemes/1",
        required=False,
        help="The MLST scheme ID to download.",
    )
    parser.add_argument(
        "-profile",
        dest="profile",
        default="profiles_csv",
        required=False,
        help="The MLST profile name in the scheme.",
    )
    parser.add_argument(
        "-loci",
        dest="loci",
        default="loci",
        required=False,
        help="The key name in the JSON response which lists the\nallele URLs to download.",
    )
    parser.add_argument(
        "-suffix",
        dest="asuffix",
        default=".tfa",
        required=False,
        help="What should be the suffix of the downloaded allele\nFASTA.",
    )
    parser.add_argument(
        "-akey",
        dest="allele_fa_key",
        default="alleles_fasta",
        required=False,
        help="What is the key in the JSON response that contains\nthe URL for allele FASTA.",
    )
    parser.add_argument(
        "-id",
        dest="id_key",
        default="id",
        required=False,
        help="What is the key in the JSON response that contains\nthe name of the allele FASTA.",
    )

    args = parser.parse_args()
    org = args.organism
    # Results always go into a sub-directory named after the organism.
    outdir = os.path.join(args.outdir, org)
    pubmlst_loc = "_".join(["https://rest.pubmlst.org/db/pubmlst", org, "seqdef"])
    profile = args.profile
    loci = args.loci
    suffix = args.asuffix
    allele_fa_key = args.allele_fa_key
    id_key = args.id_key

    if os.path.exists(outdir):
        if not args.overwrite:
            logging.error(
                f"Output directory\n{os.path.basename(outdir)}\nexists. Please use -f to overwrite."
            )
            raise SystemExit(1)
        shutil.rmtree(outdir, ignore_errors=True)

    # Create required output directory.
    os.makedirs(outdir)

    # Query MLST scheme for an organism.
    pubmlst_json = dl_pubmlst(
        path=outdir,
        url="/".join([pubmlst_loc, args.schemes]),
        suffix=suffix,
        parent=True,
        filename=False,
        expectjson=True,
    )

    # Save profile_csv as organism.txt.
    if profile in pubmlst_json:
        dl_pubmlst(
            path=outdir,
            url=pubmlst_json[profile],
            suffix=".txt",
            parent=False,
            filename=org,
            expectjson=False,
        )
    else:
        logging.warning(f"Key '{profile}' not found in the scheme JSON.\nNo profile table was saved.")

    # Save MLST alleles' FASTA
    if loci in pubmlst_json:
        for allele in pubmlst_json[loci]:
            # Each locus URL returns a JSON record that in turn points at the
            # FASTA download and carries the locus name used for the file.
            allele_fa_json = dl_pubmlst(
                path=outdir,
                url=allele,
                suffix=suffix,
                parent=True,
                filename=False,
                expectjson=True,
            )

            dl_pubmlst(
                path=outdir,
                url=allele_fa_json[allele_fa_key],
                suffix=suffix,
                parent=False,
                filename=allele_fa_json[id_key],
                expectjson=False,
            )
    else:
        logging.warning(f"Key '{loci}' not found in the scheme JSON.\nNo allele FASTA was saved.")

    logging.info(f"Finished downloading MLST scheme and profile for {org}.")


if __name__ == "__main__":
    main()
|