comparison 0.1.0/bin/dl_pubmlst_profiles_and_schemes.py @ 0:c8597e9e1a97

"planemo upload"
author kkonganti
date Mon, 27 Nov 2023 12:37:44 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c8597e9e1a97
1 #!/usr/bin/env python3
2
3 # Kranti Konganti
4
5 import argparse
6 import inspect
7 import json
8 import logging
9 import os
10 import shutil
11 import tempfile
12 from urllib.parse import urlparse
13 from urllib.request import urlopen
14
# Set logging format.
# Configured once at import time for the whole script: a 55-char "=" banner
# around the timestamp/level header, the message body below, at DEBUG
# verbosity so every download step is visible in workflow logs.
logging.basicConfig(
    format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n",
    level=logging.DEBUG,
)
20
21
# Multiple inheritance so argparse help text keeps raw newlines *and*
# shows each argument's default value.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter combining raw-text layout with default display."""

    pass
25
26
def dl_pubmlst(**kwargs) -> "dict | None":
    """
    Download a resource from a PubMLST REST URL.

    Keyword Args:
        path: Absolute UNIX path of the directory to save files into.
        url: The PubMLST REST URL to fetch.
        suffix: Suffix appended to the saved file's basename (default "").
        parent: If truthy, the payload is not saved to disk (default False).
        filename: Basename (without suffix) for the saved file; when falsy,
            the basename of the URL path is used instead (default False).
        expectjson: If truthy, parse the response as JSON and return it
            (default False). The legacy spelling ``expectJson`` is also
            accepted for backward compatibility with existing callers.

    Returns:
        The decoded JSON object when ``expectjson`` is set, otherwise None.

    Exits:
        With status 1 when path/url are missing or the response is not
        valid JSON while JSON was expected.
    """
    # Unpack by key name. The previous implementation unpacked positionally
    # via dict insertion order, which silently broke if a caller reordered
    # keywords and only matched the expectJson/expectjson spelling by accident.
    outdir = kwargs.get("path")
    url = kwargs.get("url")
    suffix = kwargs.get("suffix", "")
    parent = kwargs.get("parent", False)
    filename = kwargs.get("filename", False)
    expectjson = kwargs.get("expectjson", kwargs.get("expectJson", False))

    # Was `(outdir or url) == None`, which skipped the url check whenever
    # outdir was truthy; check each value explicitly.
    if outdir is None or url is None:
        logging.error("Please provide absolute UNIX path\n" + "to store the result DB flat files.")
        exit(1)

    logging.info(f"Downloading... Please wait...\n{url}")

    # Fetch once and keep the payload in memory. The original downloaded the
    # same URL twice (temp file for JSON probing, then a second request for
    # saving) and leaked the NamedTemporaryFile (delete=False, never removed).
    with urlopen(url) as response:
        payload = response.read()

    if expectjson:
        try:
            jsonresponse = json.loads(payload.decode("utf-8"))
        except json.JSONDecodeError:
            logging.error(f"The response from\n{url}\nwas not valid JSON!")
            exit(1)

        logging.info(f"Got a valid JSON response from:\n{url}")
        return jsonresponse

    if not parent:
        # Derive the destination name either from the caller-supplied
        # filename or from the last path component of the URL.
        if not filename:
            save_to = os.path.join(outdir, os.path.basename(urlparse(url).path) + suffix)
        else:
            save_to = os.path.join(outdir, filename + suffix)

        logging.info(f"Saving to:\n{os.path.basename(save_to)}")

        with open(save_to, "w") as fout:
            fout.write(payload.decode("utf-8"))

    return None
67
68
def main() -> None:
    """
    This script is part of the `cronology_db` Nextflow workflow and is only
    tested on POSIX systems.
    It:
        1. Downloads the MLST scheme in JSON format from PubMLST.
            and then,
        2. Downloads the alleles' FASTA and profile table
            suitable to run MLST analysis.
    """

    # Program name as shown in --help, derived from this file's name.
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-org",
        dest="organism",
        required=True,
        help="The organism name to download the MLST alleles'\nFASTA and profile CSV for."
        + "\nEx: -org cronobacter",
    )
    parser.add_argument(
        "-f",
        dest="overwrite",
        default=False,
        required=False,
        action="store_true",
        help="Force overwrite the results directory\nmentioned with -out.",
    )
    parser.add_argument(
        "-out",
        dest="outdir",
        default=os.getcwd(),
        required=False,
        help="The absolute UNIX path to store the MLST alleles'\nFASTA and profile CSV.\n",
    )
    parser.add_argument(
        "-mlsts",
        dest="schemes",
        default="schemes/1",
        required=False,
        help="The MLST scheme ID to download.",
    )
    parser.add_argument(
        "-profile",
        dest="profile",
        default="profiles_csv",
        required=False,
        help="The MLST profile name in the scheme.",
    )
    parser.add_argument(
        "-loci",
        dest="loci",
        default="loci",
        required=False,
        help="The key name in the JSON response which lists the\nallele URLs to download.",
    )
    parser.add_argument(
        "-suffix",
        dest="asuffix",
        default=".tfa",
        required=False,
        help="What should be the suffix of the downloaded allele\nFASTA.",
    )
    parser.add_argument(
        "-akey",
        dest="allele_fa_key",
        default="alleles_fasta",
        required=False,
        help="What is the key in the JSON response that contains\nthe URL for allele FASTA.",
    )
    parser.add_argument(
        "-id",
        dest="id_key",
        default="id",
        required=False,
        help="What is the key in the JSON response that contains\nthe name of the allele FASTA.",
    )

    args = parser.parse_args()
    org = args.organism
    outdir = os.path.join(args.outdir, org)
    overwrite = args.overwrite
    # PubMLST sequence-definition database base URL for the organism.
    pubmlst_loc = "_".join(["https://rest.pubmlst.org/db/pubmlst", org, "seqdef"])
    schemes = args.schemes
    profile = args.profile
    loci = args.loci
    suffix = args.asuffix
    allele_fa_key = args.allele_fa_key
    id_key = args.id_key

    # Refuse to clobber an existing result directory unless -f was given.
    if not overwrite and os.path.exists(outdir):
        logging.error(
            f"Output directory\n{os.path.basename(outdir)}\nexists. Please use -f to overwrite."
        )
        exit(1)
    elif overwrite and os.path.exists(outdir):
        shutil.rmtree(outdir, ignore_errors=True)

    # Create required output directory.
    os.makedirs(outdir)

    # Query MLST scheme for an organism.
    # NOTE: keyword spelling normalized to `expectjson` throughout; the old
    # mixed expectJson/expectjson spellings only worked because dl_pubmlst
    # unpacked kwargs by position.
    pubmlst_json = dl_pubmlst(
        path=outdir,
        url="/".join([pubmlst_loc, schemes]),
        suffix=suffix,
        parent=True,
        filename=False,
        expectjson=True,
    )

    # Save profile_csv as organism.txt.
    if profile in pubmlst_json.keys():
        dl_pubmlst(
            path=outdir,
            url=pubmlst_json[profile],
            suffix=".txt",
            parent=False,
            filename=org,
            expectjson=False,
        )

    # Save MLST alleles' FASTA: each locus URL returns JSON that in turn
    # points at the allele FASTA download.
    if loci in pubmlst_json.keys():
        for allele in pubmlst_json[loci]:
            allele_fa_json = dl_pubmlst(
                path=outdir,
                url=allele,
                suffix=suffix,
                parent=True,
                filename=False,
                expectjson=True,
            )

            dl_pubmlst(
                path=outdir,
                url=allele_fa_json[allele_fa_key],
                suffix=suffix,
                parent=False,
                filename=allele_fa_json[id_key],
                expectjson=False,
            )

    logging.info(f"Finished downloading MLST scheme and profile for {org}.")
219
220
# Run the CLI entry point only when executed directly (not on import).
if __name__ == "__main__":
    main()