Mercurial > repos > kkonganti > cfsan_cronology
comparison 0.1.0/bin/dl_pubmlst_profiles_and_schemes.py @ 0:c8597e9e1a97
"planemo upload"
author | kkonganti |
---|---|
date | Mon, 27 Nov 2023 12:37:44 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c8597e9e1a97 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 # Kranti Konganti | |
4 | |
5 import argparse | |
6 import inspect | |
7 import json | |
8 import logging | |
9 import os | |
10 import shutil | |
11 import tempfile | |
12 from urllib.parse import urlparse | |
13 from urllib.request import urlopen | |
14 | |
# Configure root logging so every record is framed by "=" banner lines.
logging.basicConfig(
    level=logging.DEBUG,
    format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n",
)
20 | |
21 | |
# Multiple inheritance so argparse help text keeps raw newlines AND shows defaults.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter combining raw-text layout with default-value display."""
25 | |
26 | |
def dl_pubmlst(**kwargs) -> "dict | None":
    """
    Download a PubMLST resource from a URL.

    Keyword arguments are consumed POSITIONALLY (dict insertion order), so
    callers must always pass them in this order, whatever the key names:
        1. outdir     - absolute UNIX path to the output directory.
        2. url        - the URL to fetch.
        3. suffix     - filename suffix appended to saved files.
        4. parent     - truthy: do not save a non-JSON response to disk.
        5. filename   - basename to save as; falsy derives it from the URL path.
        6. expectjson - truthy: parse the response as JSON and return it.

    Returns the decoded JSON object when ``expectjson`` is truthy, else None.
    Exits with status 1 when outdir/url is missing or the response is not
    valid JSON.
    """
    # NOTE: positional unpacking is kept for backward compatibility with
    # callers that spell the last key differently (expectjson vs. expectJson).
    outdir, url, suffix, parent, filename, expectjson = list(kwargs.values())

    # Fix: the original `(outdir or url) == None` only triggered when BOTH
    # were falsy; each one must be present on its own.
    if outdir is None or url is None:
        logging.error("Please provide absolute UNIX path\n" + "to store the result DB flat files.")
        exit(1)

    logging.info(f"Downloading... Please wait...\n{url}")

    # Fetch once and keep the payload in memory. The original wrote the
    # response to a NamedTemporaryFile(delete=False) that was never removed
    # (temp-file leak) and then re-downloaded the URL for saving.
    with urlopen(url) as response:
        payload = response.read().decode("utf-8")

    if expectjson:
        try:
            jsonresponse = json.loads(payload)
        except json.JSONDecodeError:
            logging.error(f"The response from\n{url}\nwas not valid JSON!")
            exit(1)

        logging.info(f"Got a valid JSON response from:\n{url}")
        return jsonresponse

    if not parent:
        # Derive the output file name from the URL path unless an explicit
        # filename was supplied.
        if not filename:
            save_to = os.path.join(outdir, os.path.basename(urlparse(url).path) + suffix)
        else:
            save_to = os.path.join(outdir, filename + suffix)

        logging.info(f"Saving to:\n{os.path.basename(save_to)}")

        # `with` closes the handle; the original's explicit .close() calls
        # inside the with-blocks were redundant.
        with open(save_to, "w") as fout:
            fout.write(payload)

    return None
67 | |
68 | |
def main() -> None:
    """
    This script is part of the `cronology_db` Nextflow workflow and is only
    tested on POSIX systems.
    It:
        1. Downloads the MLST scheme in JSON format from PubMLST.
            and then,
        2. Downloads the alleles' FASTA and profile table
            suitable to run MLST analysis.
    """

    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-org",
        dest="organism",
        required=True,
        help="The organism name to download the MLST alleles'\nFASTA and profile CSV for."
        + "\nEx: -org cronobacter",
    )
    parser.add_argument(
        "-f",
        dest="overwrite",
        default=False,
        required=False,
        action="store_true",
        help="Force overwrite the results directory\nmentioned with -out.",
    )
    parser.add_argument(
        "-out",
        dest="outdir",
        default=os.getcwd(),
        required=False,
        help="The absolute UNIX path to store the MLST alleles'\nFASTA and profile CSV.\n",
    )
    parser.add_argument(
        "-mlsts",
        dest="schemes",
        default="schemes/1",
        required=False,
        help="The MLST scheme ID to download.",
    )
    parser.add_argument(
        "-profile",
        dest="profile",
        default="profiles_csv",
        required=False,
        help="The MLST profile name in the scheme.",
    )
    parser.add_argument(
        "-loci",
        dest="loci",
        default="loci",
        required=False,
        help="The key name in the JSON response which lists the\nallele URLs to download.",
    )
    parser.add_argument(
        "-suffix",
        dest="asuffix",
        default=".tfa",
        required=False,
        help="What should be the suffix of the downloaded allele\nFASTA.",
    )
    parser.add_argument(
        "-akey",
        dest="allele_fa_key",
        default="alleles_fasta",
        required=False,
        help="What is the key in the JSON response that contains\nthe URL for allele FASTA.",
    )
    parser.add_argument(
        "-id",
        dest="id_key",
        default="id",
        required=False,
        help="What is the key in the JSON response that contains\nthe name of the allele FASTA.",
    )

    args = parser.parse_args()
    org = args.organism
    outdir = os.path.join(args.outdir, org)
    overwrite = args.overwrite
    pubmlst_loc = "_".join(["https://rest.pubmlst.org/db/pubmlst", org, "seqdef"])
    schemes = args.schemes
    profile = args.profile
    loci = args.loci
    suffix = args.asuffix
    allele_fa_key = args.allele_fa_key
    id_key = args.id_key

    # Refuse to clobber an existing result directory unless -f was given.
    if not overwrite and os.path.exists(outdir):
        logging.error(
            f"Output directory\n{os.path.basename(outdir)}\nexists. Please use -f to overwrite."
        )
        exit(1)
    elif overwrite and os.path.exists(outdir):
        shutil.rmtree(outdir, ignore_errors=True)

    # Create required output directory.
    os.makedirs(outdir)

    # Query MLST scheme for an organism. NOTE: dl_pubmlst consumes its
    # keyword arguments positionally, so the argument ORDER below matters.
    pubmlst_json = dl_pubmlst(
        path=outdir,
        url="/".join([pubmlst_loc, schemes]),
        suffix=suffix,
        parent=True,
        filename=False,
        expectjson=True,
    )

    # Save profile_csv as organism.txt.
    if profile in pubmlst_json:
        dl_pubmlst(
            path=outdir,
            url=pubmlst_json[profile],
            suffix=".txt",
            parent=False,
            filename=org,
            expectjson=False,
        )

    # Save MLST alleles' FASTA.
    if loci in pubmlst_json:
        for allele in pubmlst_json[loci]:
            # First fetch the allele's JSON record, then the FASTA URL it
            # points to. Key name normalized from expectJson to expectjson
            # for consistency (it only ever worked via positional unpacking).
            allele_fa_json = dl_pubmlst(
                path=outdir,
                url=allele,
                suffix=suffix,
                parent=True,
                filename=False,
                expectjson=True,
            )

            dl_pubmlst(
                path=outdir,
                url=allele_fa_json[allele_fa_key],
                suffix=suffix,
                parent=False,
                filename=allele_fa_json[id_key],
                expectjson=False,
            )

    logging.info(f"Finished downloading MLST scheme and profile for {org}.")
219 | |
220 | |
# Run the CLI entry point only when executed as a script (not on import).
if __name__ == "__main__":
    main()