comparison 1.0.0/bin/dl_pubmlst_profiles_and_schemes.py @ 0:0a8dda29956e draft default tip

planemo upload
author galaxytrakr
date Thu, 28 May 2026 20:41:10 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0a8dda29956e
1 #!/usr/bin/env python3
2
3 # Kranti Konganti
4
5 import argparse
6 import inspect
7 import json
8 import logging
9 import os
10 import shutil
11 import tempfile
12 from urllib.parse import urlparse
13 from urllib.request import urlopen
14
# Configure the root logger: every record is framed by two 55-character
# "=" rules, with the timestamp and level between them and the message below.
logging.basicConfig(
    format="\n".join(
        [
            "",
            "=" * 55,
            "%(asctime)s - %(levelname)s",
            "=" * 55,
            "%(message)s",
            "",
        ]
    ),
    level=logging.DEBUG,
)
24
25
# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter,
    argparse.ArgumentDefaultsHelpFormatter,
):
    """
    Help formatter that keeps the line breaks written in help strings
    (RawTextHelpFormatter) and appends each option's default value
    (ArgumentDefaultsHelpFormatter).
    """
31
32
def dl_pubmlst(**kwargs):
    """
    Download one resource from a URL and either return it as parsed JSON
    or save it as a text file.

    Keyword arguments are looked up by name, so their order does not matter:
        path (or outdir): Directory in which the downloaded file is saved.
        url: The URL to download.
        suffix: Suffix appended to the saved file name.
        parent: When truthy, nothing is written to disk.
        filename: Base name of the saved file. When falsy, the last
            component of the URL path is used instead.
        expectjson (or expectJson): When truthy, the response is parsed as
            JSON and returned, and nothing is written to disk.

    Returns:
        The decoded JSON document when expectjson is truthy, otherwise None.

    Raises:
        SystemExit: With status 1 when the output directory or the URL is
            missing, or when a JSON response was expected but could not be
            decoded.
    """
    # Both spellings are accepted because callers in this file use both.
    outdir = kwargs.get("path", kwargs.get("outdir"))
    url = kwargs.get("url")
    suffix = kwargs.get("suffix") or ""
    parent = kwargs.get("parent", False)
    filename = kwargs.get("filename")
    expectjson = kwargs.get("expectjson", kwargs.get("expectJson", False))

    if outdir is None or url is None:
        logging.error(
            "Please provide absolute UNIX path\n" + "to store the result DB flat files."
        )
        raise SystemExit(1)

    logging.info(f"Downloading... Please wait...\n{url}")

    # Fetch the resource exactly once and keep it in memory; the same bytes
    # serve both the JSON branch and the save-to-disk branch.
    with urlopen(url) as response:
        payload = response.read()

    if expectjson:
        try:
            jsonresponse = json.loads(payload)
        except (json.JSONDecodeError, UnicodeDecodeError):
            logging.error(f"The response from\n{url}\nwas not valid JSON!")
            raise SystemExit(1)

        logging.info(f"Got a valid JSON response from:\n{url}")
        return jsonresponse

    if not parent:
        base_name = filename if filename else os.path.basename(urlparse(url).path)
        save_to = os.path.join(outdir, base_name + suffix)

        logging.info(f"Saving to:\n{os.path.basename(save_to)}")

        # Write with an explicit encoding so the result does not depend on
        # the locale of the machine running the download.
        with open(save_to, "w", encoding="utf-8") as fout:
            fout.write(payload.decode("utf-8"))

    return None
79
80
def main() -> None:
    """
    This script is part of the `cronology_db` Nextflow workflow and is only
    tested on POSIX systems.
    It:
        1. Downloads the MLST scheme in JSON format from PubMLST.
            and then,
        2. Downloads the alleles' FASTA and profile table
            suitable to run MLST analysis.
    """
    # NOTE: the docstring above is passed to argparse as the --help
    # description, so it is user-facing text and not only documentation.

    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-org",
        dest="organism",
        required=True,
        help="The organism name to download the MLST alleles'\nFASTA and profile CSV for."
        + "\nEx: -org salmonella",
    )
    parser.add_argument(
        "-f",
        dest="overwrite",
        default=False,
        required=False,
        action="store_true",
        help="Force overwrite the results directory\nmentioned with -out.",
    )
    parser.add_argument(
        "-out",
        dest="outdir",
        default=os.getcwd(),
        required=False,
        help="The absolute UNIX path to store the MLST alleles'\nFASTA and profile CSV.\n",
    )
    parser.add_argument(
        "-mlsts",
        dest="schemes",
        default="schemes/2",
        required=False,
        help="The MLST scheme ID to download.",
    )
    parser.add_argument(
        "-profile",
        dest="profile",
        default="profiles_csv",
        required=False,
        help="The MLST profile name in the scheme.",
    )
    parser.add_argument(
        "-loci",
        dest="loci",
        default="loci",
        required=False,
        help="The key name in the JSON response which lists the\nallele URLs to download.",
    )
    parser.add_argument(
        "-suffix",
        dest="asuffix",
        default=".tfa",
        required=False,
        help="What should be the suffix of the downloaded allele\nFASTA.",
    )
    parser.add_argument(
        "-akey",
        dest="allele_fa_key",
        default="alleles_fasta",
        required=False,
        help="What is the key in the JSON response that contains\nthe URL for allele FASTA.",
    )
    parser.add_argument(
        "-id",
        dest="id_key",
        default="id",
        required=False,
        help="What is the key in the JSON response that contains\nthe name of the allele FASTA.",
    )

    args = parser.parse_args()
    org = args.organism
    outdir = os.path.join(args.outdir, org)
    overwrite = args.overwrite
    pubmlst_loc = "_".join(["https://rest.pubmlst.org/db/pubmlst", org, "seqdef"])
    schemes = args.schemes
    profile = args.profile
    loci = args.loci
    suffix = args.asuffix
    allele_fa_key = args.allele_fa_key
    id_key = args.id_key

    # Refuse to touch an existing output directory unless -f was given.
    if os.path.exists(outdir):
        if not overwrite:
            logging.error(
                f"Output directory\n{os.path.basename(outdir)}\nexists. Please use -f to overwrite."
            )
            raise SystemExit(1)
        shutil.rmtree(outdir, ignore_errors=True)

    # Create required output directory.
    os.makedirs(outdir)

    # Query MLST scheme for an organism.
    pubmlst_json = dl_pubmlst(
        path=outdir,
        url="/".join([pubmlst_loc, schemes]),
        suffix=suffix,
        parent=True,
        filename=False,
        expectjson=True,
    )

    # Save profile_csv as organism.txt.
    if profile in pubmlst_json:
        dl_pubmlst(
            path=outdir,
            url=pubmlst_json[profile],
            suffix=".txt",
            parent=False,
            filename=org,
            expectjson=False,
        )
    else:
        logging.warning(
            f"Key '{profile}' not found in the scheme JSON. No profile table was saved."
        )

    # Save MLST alleles' FASTA
    if loci in pubmlst_json:
        for allele in pubmlst_json[loci]:
            allele_fa_json = dl_pubmlst(
                path=outdir,
                url=allele,
                suffix=suffix,
                parent=True,
                filename=False,
                expectjson=True,
            )

            # Report an incomplete locus record clearly instead of
            # failing with a bare KeyError traceback.
            missing = [k for k in (allele_fa_key, id_key) if k not in allele_fa_json]
            if missing:
                logging.error(
                    f"The response from\n{allele}\nis missing key(s): "
                    + ", ".join(missing)
                )
                raise SystemExit(1)

            dl_pubmlst(
                path=outdir,
                url=allele_fa_json[allele_fa_key],
                suffix=suffix,
                parent=False,
                filename=allele_fa_json[id_key],
                expectjson=False,
            )
    else:
        logging.warning(
            f"Key '{loci}' not found in the scheme JSON. No allele FASTA was saved."
        )

    logging.info(f"Finished downloading MLST scheme and profile for {org}.")
231
232
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()