Mercurial > repos > galaxytrakr > hfp_bettercallsal_konda
comparison 1.0.0/bin/dl_pubmlst_profiles_and_schemes.py @ 0:0a8dda29956e draft default tip
planemo upload
| author | galaxytrakr |
|---|---|
| date | Thu, 28 May 2026 20:41:10 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0a8dda29956e |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 # Kranti Konganti | |
| 4 | |
| 5 import argparse | |
| 6 import inspect | |
| 7 import json | |
| 8 import logging | |
| 9 import os | |
| 10 import shutil | |
| 11 import tempfile | |
| 12 from urllib.parse import urlparse | |
| 13 from urllib.request import urlopen | |
| 14 | |
# Configure the root logger at DEBUG level. Each record is printed as a banner:
# a 55-character "=" rule, the timestamp and level, a second rule, then the
# message itself.
logging.basicConfig(
    level=logging.DEBUG,
    format="".join(
        [
            "\n",
            "=" * 55,
            "\n%(asctime)s - %(levelname)s\n",
            "=" * 55,
            "\n%(message)s\n",
        ]
    ),
)
| 24 | |
| 25 | |
# Multiple inheritance so that one formatter class provides both behaviours
# argparse otherwise offers separately: raw-text help strings and automatic
# display of argument defaults.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter,
    argparse.ArgumentDefaultsHelpFormatter,
):
    """Help formatter combining raw-text layout with default-value display."""
| 31 | |
| 32 | |
def dl_pubmlst(**kwargs) -> object:
    """
    Download one URL and either return it as parsed JSON or save it to disk.

    Exactly six keyword arguments are required. They are consumed strictly in
    the ORDER in which they are passed; their names are not inspected (which
    is why callers may spell the first one ``path=``). The expected order is:

        1. output directory the file is saved into
        2. URL to download
        3. suffix appended to the saved file name
        4. "parent" flag: when truthy, nothing is written to disk
        5. base name of the saved file; when falsy, the last component of
           the URL path is used instead
        6. "expect JSON" flag: when truthy, the response is parsed as JSON
           and returned instead of being saved

    Returns:
        The decoded JSON document when the "expect JSON" flag is truthy,
        otherwise None.

    Raises:
        ValueError: if the number of keyword arguments is not six.
        SystemExit: with code 1, after logging the reason, if the output
            directory or the URL is None, or if a response that was expected
            to be JSON cannot be parsed.
    """
    if len(kwargs) != 6:
        raise ValueError(
            f"dl_pubmlst() expects exactly 6 keyword arguments, got {len(kwargs)}."
        )
    outdir, url, suffix, parent, filename, expectjson = kwargs.values()

    # Both values are mandatory, so test each one explicitly. An expression
    # such as `(outdir or url) == None` is only true when outdir is falsy AND
    # url is None, which lets a missing URL through.
    if outdir is None or url is None:
        logging.error(
            "Please provide absolute UNIX path\n" + "to store the result DB flat files."
        )
        raise SystemExit(1)

    logging.info(f"Downloading... Please wait...\n{url}")

    # Fetch the payload exactly once and keep it in memory: no temporary file
    # is left behind and the URL is not requested a second time when saving.
    with urlopen(url) as response:
        payload = response.read()

    if expectjson:
        try:
            # json.loads() accepts bytes and detects the UTF encoding itself.
            jsonresponse = json.loads(payload)
        except (json.JSONDecodeError, UnicodeDecodeError):
            logging.error(f"The response from\n{url}\nwas not valid JSON!")
            raise SystemExit(1) from None

        logging.info(f"Got a valid JSON response from:\n{url}")
        return jsonresponse

    if parent:
        # "Parent" resources are only queried, never written to disk.
        return None

    basename = filename if filename else os.path.basename(urlparse(url).path)
    save_to = os.path.join(outdir, basename + suffix)

    logging.info(f"Saving to:\n{os.path.basename(save_to)}")

    # The payload is decoded as UTF-8, so write it back out as UTF-8 rather
    # than in whatever the locale's default encoding happens to be.
    with open(save_to, "w", encoding="utf-8") as fout:
        fout.write(payload.decode("utf-8"))

    return None
| 79 | |
| 80 | |
def _build_arg_parser(prog_name, description):
    """Return the argparse parser holding every command-line option of the script."""
    parser = argparse.ArgumentParser(
        prog=prog_name, description=description, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-org",
        dest="organism",
        required=True,
        help="The organism name to download the MLST alleles'\nFASTA and profile CSV for."
        + "\nEx: -org salmonella",
    )
    parser.add_argument(
        "-f",
        dest="overwrite",
        default=False,
        required=False,
        action="store_true",
        help="Force overwrite the results directory\nmentioned with -out.",
    )
    parser.add_argument(
        "-out",
        dest="outdir",
        default=os.getcwd(),
        required=False,
        help="The absolute UNIX path to store the MLST alleles'\nFASTA and profile CSV.\n",
    )
    parser.add_argument(
        "-mlsts",
        dest="schemes",
        default="schemes/2",
        required=False,
        help="The MLST scheme ID to download.",
    )
    parser.add_argument(
        "-profile",
        dest="profile",
        default="profiles_csv",
        required=False,
        help="The MLST profile name in the scheme.",
    )
    parser.add_argument(
        "-loci",
        dest="loci",
        default="loci",
        required=False,
        help="The key name in the JSON response which lists the\nallele URLs to download.",
    )
    parser.add_argument(
        "-suffix",
        dest="asuffix",
        default=".tfa",
        required=False,
        help="What should be the suffix of the downloaded allele\nFASTA.",
    )
    parser.add_argument(
        "-akey",
        dest="allele_fa_key",
        default="alleles_fasta",
        required=False,
        help="What is the key in the JSON response that contains\nthe URL for allele FASTA.",
    )
    parser.add_argument(
        "-id",
        dest="id_key",
        default="id",
        required=False,
        help="What is the key in the JSON response that contains\nthe name of the allele FASTA.",
    )

    return parser


def main() -> None:
    """
    This script is part of the `cronology_db` Nextflow workflow and is only
    tested on POSIX systems.
    It:
        1. Downloads the MLST scheme in JSON format from PubMLST.
            and then,
        2. Downloads the alleles' FASTA and profile table
            suitable to run MLST analysis.
    """
    # NOTE: the docstring above is also passed to argparse as the description
    # shown by --help, so it stays user-facing; implementation notes are kept
    # in comments instead.
    prog_name = os.path.basename(inspect.stack()[0].filename)
    args = _build_arg_parser(prog_name, main.__doc__).parse_args()

    org = args.organism
    # Everything is stored in a per-organism sub-directory of -out.
    outdir = os.path.join(args.outdir, org)
    pubmlst_loc = "_".join(["https://rest.pubmlst.org/db/pubmlst", org, "seqdef"])
    suffix = args.asuffix

    if os.path.exists(outdir):
        if not args.overwrite:
            logging.error(
                f"Output directory\n{os.path.basename(outdir)}\nexists. Please use -f to overwrite."
            )
            raise SystemExit(1)
        shutil.rmtree(outdir, ignore_errors=True)

    # Create required output directory.
    os.makedirs(outdir)

    # dl_pubmlst() consumes its keyword arguments by position, so every call
    # below passes them in the same order: output directory, url, suffix,
    # parent, filename, expectjson.

    # Query MLST scheme for an organism.
    pubmlst_json = dl_pubmlst(
        path=outdir,
        url="/".join([pubmlst_loc, args.schemes]),
        suffix=suffix,
        parent=True,
        filename=False,
        expectjson=True,
    )

    # Save profile_csv as organism.txt.
    if args.profile in pubmlst_json:
        dl_pubmlst(
            path=outdir,
            url=pubmlst_json[args.profile],
            suffix=".txt",
            parent=False,
            filename=org,
            expectjson=False,
        )

    # Save MLST alleles' FASTA: each locus URL returns a JSON record that
    # holds the FASTA download URL and the locus id used as the file name.
    if args.loci in pubmlst_json:
        for allele in pubmlst_json[args.loci]:
            allele_fa_json = dl_pubmlst(
                path=outdir,
                url=allele,
                suffix=suffix,
                parent=True,
                filename=False,
                expectjson=True,
            )

            dl_pubmlst(
                path=outdir,
                url=allele_fa_json[args.allele_fa_key],
                suffix=suffix,
                parent=False,
                filename=allele_fa_json[args.id_key],
                expectjson=False,
            )

    logging.info(f"Finished downloading MLST scheme and profile for {org}.")
| 231 | |
| 232 | |
# Script entry point: run the downloader only when this file is executed
# directly, so that importing the module does not start a download.
if __name__ == "__main__":
    main()
