diff 0.1.0/bin/dl_pubmlst_profiles_and_schemes.py @ 0:c8597e9e1a97

"planemo upload"
author kkonganti
date Mon, 27 Nov 2023 12:37:44 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/0.1.0/bin/dl_pubmlst_profiles_and_schemes.py	Mon Nov 27 12:37:44 2023 -0500
@@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+
+# Kranti Konganti
+
+import argparse
+import inspect
+import json
+import logging
+import os
+import shutil
+import tempfile
+from urllib.parse import urlparse
+from urllib.request import urlopen
+
+# Set logging format.
+logging.basicConfig(
+    format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n",
+    level=logging.DEBUG,
+)
+
+
+# Multiple inheritance for pretty printing of help text.
+class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def dl_pubmlst(**kwargs):
+    """
+    Download raw data from a URL. Returns the parsed JSON response when
+    `expectjson` is set; otherwise saves the response body to a file.
+    """
+    outdir, url, suffix, parent, filename, expectjson = [
+        kwargs.get(k) for k in ("path", "url", "suffix", "parent", "filename", "expectjson")
+    ]
+
+    if outdir is None or url is None:
+        logging.error("Please provide an absolute UNIX path\nto store the result DB flat files.")
+        exit(1)
+
+    logging.info(f"Downloading... Please wait...\n{url}")
+
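+    # Stream the response body into a named temporary file; it is parsed as
+    # JSON or written to its final location below.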
+    with urlopen(url) as response:
+        with tempfile.NamedTemporaryFile(delete=False) as tmp_html_file:
+            shutil.copyfileobj(response, tmp_html_file)
+
+    if expectjson:
+        try:
+            with open(tmp_html_file.name, "r") as tmp_fh:
+                jsonresponse = json.load(tmp_fh)
+        except json.JSONDecodeError:
+            logging.error(f"The response from\n{url}\nwas not valid JSON!")
+            exit(1)
+        finally:
+            os.unlink(tmp_html_file.name)
+
+        logging.info(f"Got a valid JSON response from:\n{url}")
+        return jsonresponse
+
+    if not parent:
+        if not filename:
+            save_to = os.path.join(outdir, os.path.basename(urlparse(url).path) + suffix)
+        else:
+            save_to = os.path.join(outdir, filename + suffix)
+
+        logging.info(f"Saving to:\n{os.path.basename(save_to)}")
+
+        # The response body is already in the temporary file, so copy it into
+        # place instead of requesting the same URL a second time.
+        shutil.copyfile(tmp_html_file.name, save_to)
+
+    os.unlink(tmp_html_file.name)
+
+
+def main() -> None:
+    """
+    This script is part of the `cronology_db` Nextflow workflow and is only
+    tested on POSIX systems.
+    It:
+        1. Downloads the MLST scheme in JSON format from PubMLST,
+            and then
+        2. Downloads the alleles' FASTA files and the profile table
+            needed to run an MLST analysis.
+    """
+
+    prog_name = os.path.basename(inspect.stack()[0].filename)
+
+    parser = argparse.ArgumentParser(
+        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
+    )
+
+    required = parser.add_argument_group("required arguments")
+
+    required.add_argument(
+        "-org",
+        dest="organism",
+        required=True,
+        help="The organism name to download the MLST alleles'\nFASTA and profile CSV for."
+        + "\nEx: -org cronobacter",
+    )
+    parser.add_argument(
+        "-f",
+        dest="overwrite",
+        default=False,
+        required=False,
+        action="store_true",
+        help="Force overwrite the results directory\nmentioned with -out.",
+    )
+    parser.add_argument(
+        "-out",
+        dest="outdir",
+        default=os.getcwd(),
+        required=False,
+        help="The absolute UNIX path to store the MLST alleles'\nFASTA and profile CSV.\n",
+    )
+    parser.add_argument(
+        "-mlsts",
+        dest="schemes",
+        default="schemes/1",
+        required=False,
+        help="The MLST scheme ID to download.",
+    )
+    parser.add_argument(
+        "-profile",
+        dest="profile",
+        default="profiles_csv",
+        required=False,
+        help="The MLST profile name in the scheme.",
+    )
+    parser.add_argument(
+        "-loci",
+        dest="loci",
+        default="loci",
+        required=False,
+        help="The key name in the JSON response which lists the\nallele URLs to download.",
+    )
+    parser.add_argument(
+        "-suffix",
+        dest="asuffix",
+        default=".tfa",
+        required=False,
+        help="What should be the suffix of the downloaded allele\nFASTA.",
+    )
+    parser.add_argument(
+        "-akey",
+        dest="allele_fa_key",
+        default="alleles_fasta",
+        required=False,
+        help="What is the key in the JSON response that contains\nthe URL for allele FASTA.",
+    )
+    parser.add_argument(
+        "-id",
+        dest="id_key",
+        default="id",
+        required=False,
+        help="What is the key in the JSON response that contains\nthe name of the allele FASTA.",
+    )
+
+    args = parser.parse_args()
+    org = args.organism
+    outdir = os.path.join(args.outdir, org)
+    overwrite = args.overwrite
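+    # Base URL of the PubMLST 'seqdef' REST endpoint for the chosen organism,
+    # e.g. https://rest.pubmlst.org/db/pubmlst_cronobacter_seqdef for -org cronobacter.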
+    pubmlst_loc = "_".join(["https://rest.pubmlst.org/db/pubmlst", org, "seqdef"])
+    schemes = args.schemes
+    profile = args.profile
+    loci = args.loci
+    suffix = args.asuffix
+    allele_fa_key = args.allele_fa_key
+    id_key = args.id_key
+
+    if not overwrite and os.path.exists(outdir):
+        logging.error(
+            f"Output directory\n{os.path.basename(outdir)}\nexists. Please use -f to overwrite."
+        )
+        exit(1)
+    elif overwrite and os.path.exists(outdir):
+        shutil.rmtree(outdir, ignore_errors=True)
+
+    # Create required output directory.
+    os.makedirs(outdir)
+
+    # Query MLST scheme for an organism.
+    pubmlst_json = dl_pubmlst(
+        path=outdir,
+        url="/".join([pubmlst_loc, schemes]),
+        suffix=suffix,
+        parent=True,
+        filename=False,
+        expectjson=True,
+    )
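+    # The scheme JSON is expected to carry the profile table URL (key from
+    # -profile, default: profiles_csv) and the list of locus URLs (key from
+    # -loci, default: loci), both queried below.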
+
+    # Save the MLST profile table (profiles_csv) as <organism>.txt.
+    if profile in pubmlst_json:
+        dl_pubmlst(
+            path=outdir,
+            url=pubmlst_json[profile],
+            suffix=".txt",
+            parent=False,
+            filename=org,
+            expectjson=False,
+        )
+
+    # Save MLST alleles' FASTA
+    if loci in pubmlst_json:
+        for allele in pubmlst_json[loci]:
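+            # Each locus URL returns JSON that points at the allele FASTA
+            # (key from -akey) and the identifier (key from -id) used as the
+            # output file name.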
+            allele_fa_json = dl_pubmlst(
+                path=outdir,
+                url=allele,
+                suffix=suffix,
+                parent=True,
+                filename=False,
+                expectjson=True,
+            )
+
+            dl_pubmlst(
+                path=outdir,
+                url=allele_fa_json[allele_fa_key],
+                suffix=suffix,
+                parent=False,
+                filename=allele_fa_json[id_key],
+                expectjson=False,
+            )
+
+    logging.info(f"Finished downloading MLST scheme and profile for {org}.")
+
+
+if __name__ == "__main__":
+    main()