annotate 0.1.0/bin/dl_pubmlst_profiles_and_schemes.py @ 5:6e5ceea33843

"planemo upload"
author kkonganti
date Mon, 27 Nov 2023 14:50:43 -0500
parents c8597e9e1a97
children
rev   line source
kkonganti@0 1 #!/usr/bin/env python3
kkonganti@0 2
kkonganti@0 3 # Kranti Konganti
kkonganti@0 4
kkonganti@0 5 import argparse
kkonganti@0 6 import inspect
kkonganti@0 7 import json
kkonganti@0 8 import logging
kkonganti@0 9 import os
kkonganti@0 10 import shutil
kkonganti@0 11 import tempfile
kkonganti@0 12 from urllib.parse import urlparse
kkonganti@0 13 from urllib.request import urlopen
kkonganti@0 14
# Configure the root logger: each record is framed by two 55-character
# rules, with the timestamp and level between them and the message below.
_LOG_RULE = "=" * 55
logging.basicConfig(
    format=f"\n{_LOG_RULE}\n%(asctime)s - %(levelname)s\n{_LOG_RULE}\n%(message)s\n",
    level=logging.DEBUG,
)
kkonganti@0 20
kkonganti@0 21
# Multiple inheritance is used purely to combine two stock argparse
# help formatters; no behaviour is added here.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter that keeps help-text line breaks and shows argument defaults."""
kkonganti@0 25
kkonganti@0 26
def dl_pubmlst(**kwargs):
    """
    Fetch a resource from a URL and either parse it as JSON or save it to disk.

    Keyword arguments (looked up by name, so their order does not matter):
        path (or outdir): Directory in which a downloaded file is stored.
        url: Address of the resource to fetch.
        suffix: Extension appended to the saved file name.
        parent: When truthy and JSON is not expected, nothing is saved.
        filename: Base name of the saved file. When falsy, the last
            component of the URL path is used instead.
        expectjson (or expectJson): When truthy, the response is parsed
            as JSON and returned instead of being saved.

    Returns:
        The decoded JSON document when JSON is expected, otherwise None.

    Raises:
        SystemExit: If the output directory or the URL is missing, or if
            the response was expected to be JSON but could not be decoded.
    """

    def _pick(*names, default=None):
        # Return the value of the first keyword that was actually supplied.
        for name in names:
            if name in kwargs:
                return kwargs[name]
        return default

    # Callers spell some keywords differently (`path`, `expectJson`), so
    # both spellings are accepted instead of relying on argument order.
    outdir = _pick("path", "outdir")
    url = _pick("url")
    suffix = _pick("suffix") or ""
    parent = _pick("parent", default=False)
    filename = _pick("filename")
    expectjson = _pick("expectjson", "expectJson", default=False)

    if outdir is None or url is None:
        logging.error(
            "Please provide the URL to download and the absolute UNIX path\n"
            + "to store the result DB flat files."
        )
        raise SystemExit(1)

    logging.info(f"Downloading... Please wait...\n{url}")

    # Fetch once and keep the payload in memory; it is reused below for
    # both the JSON and the save-to-file paths, so no temporary file is
    # left behind and the URL is not requested a second time.
    with urlopen(url) as response:
        payload = response.read()

    if expectjson:
        try:
            jsonresponse = json.loads(payload)
        except (json.JSONDecodeError, UnicodeDecodeError):
            logging.error(f"The response from\n{url}\nwas not valid JSON!")
            raise SystemExit(1) from None

        logging.info(f"Got a valid JSON response from:\n{url}")
        return jsonresponse

    if parent:
        # Parent resources are only queried, never written to disk.
        return None

    basename = filename if filename else os.path.basename(urlparse(url).path)
    save_to = os.path.join(outdir, f"{basename}{suffix}")

    logging.info(f"Saving to:\n{os.path.basename(save_to)}")

    with open(save_to, "w", encoding="utf-8") as fout:
        fout.write(payload.decode("utf-8"))

    return None
kkonganti@0 67
kkonganti@0 68
def main() -> None:
    """
    This script is part of the `cronology_db` Nextflow workflow and is only
    tested on POSIX systems.
    It:
        1. Downloads the MLST scheme in JSON format from PubMLST.
            and then,
        2. Downloads the alleles' FASTA and profile table
            suitable to run MLST analysis.
    """
    # NOTE: the docstring above doubles as the command-line description
    # (it is handed to argparse below), so it is kept user-facing.

    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-org",
        dest="organism",
        required=True,
        help="The organism name to download the MLST alleles'\nFASTA and profile CSV for."
        + "\nEx: -org cronobacter",
    )
    parser.add_argument(
        "-f",
        dest="overwrite",
        default=False,
        required=False,
        action="store_true",
        help="Force overwrite the results directory\nmentioned with -out.",
    )
    parser.add_argument(
        "-out",
        dest="outdir",
        default=os.getcwd(),
        required=False,
        help="The absolute UNIX path to store the MLST alleles'\nFASTA and profile CSV.\n",
    )
    parser.add_argument(
        "-mlsts",
        dest="schemes",
        default="schemes/1",
        required=False,
        help="The MLST scheme ID to download.",
    )
    parser.add_argument(
        "-profile",
        dest="profile",
        default="profiles_csv",
        required=False,
        help="The MLST profile name in the scheme.",
    )
    parser.add_argument(
        "-loci",
        dest="loci",
        default="loci",
        required=False,
        help="The key name in the JSON response which lists the\nallele URLs to download.",
    )
    parser.add_argument(
        "-suffix",
        dest="asuffix",
        default=".tfa",
        required=False,
        help="What should be the suffix of the downloaded allele\nFASTA.",
    )
    parser.add_argument(
        "-akey",
        dest="allele_fa_key",
        default="alleles_fasta",
        required=False,
        help="What is the key in the JSON response that contains\nthe URL for allele FASTA.",
    )
    parser.add_argument(
        "-id",
        dest="id_key",
        default="id",
        required=False,
        help="What is the key in the JSON response that contains\nthe name of the allele FASTA.",
    )

    args = parser.parse_args()
    org = args.organism
    # Everything for one organism goes into its own sub-directory.
    outdir = os.path.join(args.outdir, org)
    overwrite = args.overwrite
    # Builds https://rest.pubmlst.org/db/pubmlst_<org>_seqdef
    pubmlst_loc = "_".join(["https://rest.pubmlst.org/db/pubmlst", org, "seqdef"])
    schemes = args.schemes
    profile = args.profile
    loci = args.loci
    suffix = args.asuffix
    allele_fa_key = args.allele_fa_key
    id_key = args.id_key

    if os.path.exists(outdir):
        if not overwrite:
            logging.error(
                f"Output directory\n{os.path.basename(outdir)}\nexists. Please use -f to overwrite."
            )
            raise SystemExit(1)
        shutil.rmtree(outdir, ignore_errors=True)

    # Create required output directory.
    os.makedirs(outdir)

    # Query MLST scheme for an organism.
    pubmlst_json = dl_pubmlst(
        path=outdir,
        url="/".join([pubmlst_loc, schemes]),
        suffix=suffix,
        parent=True,
        filename=False,
        expectjson=True,
    )

    # Save profile_csv as organism.txt.
    if profile in pubmlst_json:
        dl_pubmlst(
            path=outdir,
            url=pubmlst_json[profile],
            suffix=".txt",
            parent=False,
            filename=org,
            expectjson=False,
        )
    else:
        logging.warning(
            f"The scheme response has no '{profile}' key.\nNo profile table was saved."
        )

    # Save MLST alleles' FASTA
    if loci in pubmlst_json:
        for allele in pubmlst_json[loci]:
            # Each locus URL returns a JSON record that holds the locus id
            # and the URL of its allele FASTA.
            allele_fa_json = dl_pubmlst(
                path=outdir,
                url=allele,
                suffix=suffix,
                parent=True,
                filename=False,
                expectjson=True,
            )

            dl_pubmlst(
                path=outdir,
                url=allele_fa_json[allele_fa_key],
                suffix=suffix,
                parent=False,
                filename=allele_fa_json[id_key],
                expectjson=False,
            )
    else:
        logging.warning(
            f"The scheme response has no '{loci}' key.\nNo allele FASTA files were saved."
        )

    logging.info(f"Finished downloading MLST scheme and profile for {org}.")


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()