diff 0.1.0/bin/dl_pubmlst_profiles_and_schemes.py @ 0:c8597e9e1a97
"planemo upload"
| author | kkonganti |
| --- | --- |
| date | Mon, 27 Nov 2023 12:37:44 -0500 |
| parents | |
| children | |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/0.1.0/bin/dl_pubmlst_profiles_and_schemes.py	Mon Nov 27 12:37:44 2023 -0500
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+
+# Kranti Konganti
+
+import argparse
+import inspect
+import json
+import logging
+import os
+import shutil
+from urllib.parse import urlparse
+from urllib.request import urlopen
+
+# Set logging format.
+logging.basicConfig(
+    format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n",
+    level=logging.DEBUG,
+)
+
+
+# Multiple inheritance for pretty printing of help text.
+class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def dl_pubmlst(outdir=None, url=None, suffix="", parent=False, filename=None, expectjson=False):
+    """
+    Download raw data from a URL. Returns the parsed JSON response when
+    expectjson is True; otherwise saves the response body to a file in outdir.
+    """
+    if outdir is None or url is None:
+        logging.error("Please provide an absolute UNIX path\nto store the result DB flat files.")
+        exit(1)
+
+    logging.info(f"Downloading... Please wait...\n{url}")
+
+    if expectjson:
+        with urlopen(url) as response:
+            try:
+                jsonresponse = json.loads(response.read().decode("utf-8"))
+            except json.JSONDecodeError:
+                logging.error(f"The response from\n{url}\nwas not valid JSON!")
+                exit(1)
+
+        logging.info(f"Got a valid JSON response from:\n{url}")
+        return jsonresponse
+
+    if not parent:
+        if not filename:
+            save_to = os.path.join(outdir, os.path.basename(urlparse(url).path) + suffix)
+        else:
+            save_to = os.path.join(outdir, filename + suffix)
+
+        logging.info(f"Saving to:\n{os.path.basename(save_to)}")
+
+        # The with statement closes both handles; no explicit close() is needed.
+        with urlopen(url) as url_response, open(save_to, "w") as fout:
+            fout.write(url_response.read().decode("utf-8"))
+
+    return None
+
+
+def main() -> None:
+    """
+    This script is part of the `cronology_db` Nextflow workflow and is only
+    tested on POSIX systems. It:
+
+    1. Downloads the MLST scheme in JSON format from PubMLST, and then
+    2. Downloads the alleles' FASTA and the profile table suitable for
+       running an MLST analysis.
+    """
+
+    prog_name = os.path.basename(inspect.stack()[0].filename)
+
+    parser = argparse.ArgumentParser(
+        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
+    )
+
+    required = parser.add_argument_group("required arguments")
+
+    required.add_argument(
+        "-org",
+        dest="organism",
+        required=True,
+        help="The organism name to download the MLST alleles'\n"
+        + "FASTA and profile CSV for.\nEx: -org cronobacter",
+    )
+    parser.add_argument(
+        "-f",
+        dest="overwrite",
+        default=False,
+        required=False,
+        action="store_true",
+        help="Force overwrite the results directory\nmentioned with -out.",
+    )
+    parser.add_argument(
+        "-out",
+        dest="outdir",
+        default=os.getcwd(),
+        required=False,
+        help="The absolute UNIX path to store the MLST alleles'\nFASTA and profile CSV.\n",
+    )
+    parser.add_argument(
+        "-mlsts",
+        dest="schemes",
+        default="schemes/1",
+        required=False,
+        help="The MLST scheme ID to download.",
+    )
+    parser.add_argument(
+        "-profile",
+        dest="profile",
+        default="profiles_csv",
+        required=False,
+        help="The MLST profile name in the scheme.",
+    )
+    parser.add_argument(
+        "-loci",
+        dest="loci",
+        default="loci",
+        required=False,
+        help="The key name in the JSON response which lists the\nallele URLs to download.",
+    )
+    parser.add_argument(
+        "-suffix",
+        dest="asuffix",
+        default=".tfa",
+        required=False,
+        help="What should be the suffix of the downloaded allele\nFASTA.",
+    )
+    parser.add_argument(
+        "-akey",
+        dest="allele_fa_key",
+        default="alleles_fasta",
+        required=False,
+        help="What is the key in the JSON response that contains\nthe URL for allele FASTA.",
+    )
+    parser.add_argument(
+        "-id",
+        dest="id_key",
+        default="id",
+        required=False,
+        help="What is the key in the JSON response that contains\nthe name of the allele FASTA.",
+    )
+
+    args = parser.parse_args()
+    org = args.organism
+    outdir = os.path.join(args.outdir, org)
+    overwrite = args.overwrite
+    pubmlst_loc = "_".join(["https://rest.pubmlst.org/db/pubmlst", org, "seqdef"])
+    schemes = args.schemes
+    profile = args.profile
+    loci = args.loci
+    suffix = args.asuffix
+    allele_fa_key = args.allele_fa_key
+    id_key = args.id_key
+
+    if not overwrite and os.path.exists(outdir):
+        logging.error(
+            f"Output directory\n{os.path.basename(outdir)}\nexists. Please use -f to overwrite."
+        )
+        exit(1)
+    elif overwrite and os.path.exists(outdir):
+        shutil.rmtree(outdir, ignore_errors=True)
+
+    # Create the required output directory.
+    os.makedirs(outdir)
+
+    # Query the MLST scheme for an organism.
+    pubmlst_json = dl_pubmlst(
+        outdir=outdir,
+        url="/".join([pubmlst_loc, schemes]),
+        suffix=suffix,
+        parent=True,
+        filename=None,
+        expectjson=True,
+    )
+
+    # Save profiles_csv as organism.txt.
+    if profile in pubmlst_json.keys():
+        dl_pubmlst(
+            outdir=outdir,
+            url=pubmlst_json[profile],
+            suffix=".txt",
+            parent=False,
+            filename=org,
+            expectjson=False,
+        )
+
+    # Save the MLST alleles' FASTA.
+    if loci in pubmlst_json.keys():
+        for allele in pubmlst_json[loci]:
+            allele_fa_json = dl_pubmlst(
+                outdir=outdir,
+                url=allele,
+                suffix=suffix,
+                parent=True,
+                filename=None,
+                expectjson=True,
+            )
+
+            dl_pubmlst(
+                outdir=outdir,
+                url=allele_fa_json[allele_fa_key],
+                suffix=suffix,
+                parent=False,
+                filename=allele_fa_json[id_key],
+                expectjson=False,
+            )
+
+    logging.info(f"Finished downloading the MLST scheme and profile for {org}.")
+
+
+if __name__ == "__main__":
+    main()