annotate 0.7.0/bin/dl_pdg_metadata.py @ 20:133571bf891f

planemo upload
author kkonganti
date Mon, 15 Jul 2024 11:42:00 -0400
parents 0e7a0053e4a6
children
rev   line source
kkonganti@17 1 #!/usr/bin/env python3
kkonganti@17 2
kkonganti@17 3 # Kranti Konganti
kkonganti@17 4
kkonganti@17 5 import argparse
kkonganti@17 6 import inspect
kkonganti@17 7 import logging
kkonganti@17 8 import os
kkonganti@17 9 import re
kkonganti@17 10 import shutil
kkonganti@17 11 import ssl
kkonganti@17 12 import tempfile
kkonganti@17 13 from html.parser import HTMLParser
kkonganti@17 14 from urllib.request import urlopen
kkonganti@17 15
# Configure root logging: every record is framed by two 55-character rules.
logging.basicConfig(
    format="\n".join(
        ["", "=" * 55, "%(asctime)s - %(levelname)s", "=" * 55, "%(message)s", ""]
    ),
    level=logging.DEBUG,
)
kkonganti@17 21
# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter that keeps raw line breaks and also appends argument defaults."""
kkonganti@17 25
kkonganti@17 26
# HTMLParser override class to get PDG release and latest Cluster .tsv file
class NCBIPathogensHTMLParser(HTMLParser):
    """
    HTML parser that records every text node it encounters.

    After feeding a document, ``href_data`` holds the text chunks in
    document order; callers join them and search the result for the
    file name they need.
    """

    def __init__(self, *, convert_charrefs: bool = True) -> None:
        """
        Args:
            convert_charrefs: Forwarded to ``HTMLParser``. The default is
                ``True``; the previous default was the ``...`` placeholder
                object, which only worked because it is truthy.
        """
        # HTMLParser.__init__() already calls reset(), so no explicit
        # reset() is needed here.
        super().__init__(convert_charrefs=convert_charrefs)
        self.href_data = []

    def handle_data(self, data):
        """Store one chunk of text found between tags."""
        self.href_data.append(data)
kkonganti@17 36
kkonganti@17 37
def _release_from_filename(filename, suffix):
    """Return filename with the literal suffix removed (the PDG release id)."""
    # Callers have historically passed the suffix with regex-style backslash
    # escapes; drop the backslashes so the comparison is against literal text.
    literal_suffix = suffix.replace("\\", "")
    if literal_suffix and filename.endswith(literal_suffix):
        filename = filename[: -len(literal_suffix)]
    # A suffix given without its leading dot leaves a trailing "." behind.
    return filename.rstrip(".")


def dl_pdg(**kwargs) -> tuple:
    """
    Method to save a PDG flat file and return where it was stored.

    The directory listing at ``url`` is fetched, the first file name matching
    ``regex`` is located in its text, and that file is downloaded into
    ``<db_path>/<release>/``.

    Keyword Args:
        db_path: Directory under which the release directory is created.
        url: URL of the directory listing to scan. If ``release`` is a valid
            release identifier, ``latest_snps`` in the URL is replaced by it.
        regex: Pattern (string or compiled) matching the wanted file name.
        suffix: File name suffix that follows the release identifier.
        overwrite: If true, an existing release directory is removed first
            (except when the file is the cluster list, which is added to the
            directory created for the metadata file).
        release: Optional release identifier such as ``PDG000000002.2507``.
            Any non-string or non-matching value means "use latest".

    Returns:
        Tuple ``(tsv_at, dest_dir)``: path of the saved file and the release
        directory containing it.

    Raises:
        SystemExit: If ``db_path`` or ``url`` is missing, if no file name
            matches ``regex``, or if the release directory already exists
            for a metadata download and ``overwrite`` is false.
    """
    # Look values up by name; unpacking kwargs positionally silently depends
    # on the order in which the caller happened to write the keywords.
    db_path = kwargs.get("db_path")
    url = kwargs.get("url")
    regex = kwargs["regex"]
    suffix = kwargs.get("suffix") or ""
    overwrite = bool(kwargs.get("overwrite", False))
    release = kwargs.get("release")

    # NOTE(review): certificate and hostname verification are disabled here,
    # which allows man-in-the-middle tampering of the downloads. Kept as-is
    # because it looks deliberate; confirm whether it is still required.
    contxt = ssl.create_default_context()
    contxt.check_hostname = False
    contxt.verify_mode = ssl.CERT_NONE

    if db_path is None or url is None:
        logging.error("Please provide absolute UNIX path\n" + "to store the result DB flat files.")
        raise SystemExit(1)

    # The CLI default for the release is not a string, so normalise before
    # handing it to the regex engine.
    release = release.strip() if isinstance(release, str) else ""
    if re.match(r"^PDG\d+\.\d+$", release):
        url = url.replace("latest_snps", release)

    html_parser = NCBIPathogensHTMLParser()
    logging.info(f"Finding latest NCBI PDG release at:\n{url}")

    with urlopen(url, context=contxt) as response:
        listing = response.read().decode("utf-8", errors="replace")
    html_parser.feed(listing)
    html_parser.close()  # flush any text still buffered by the parser

    found = re.search(regex, "".join(html_parser.href_data))
    if found is None:
        logging.error(f"No file matching the expected PDG file name was found at:\n{url}")
        raise SystemExit(1)

    pdg_filename = found.group(0)
    pdg_release = _release_from_filename(pdg_filename, suffix)
    if not pdg_release:
        # An empty release would make dest_dir equal db_path, which the
        # overwrite branch below would then delete.
        logging.error(f"Could not derive a PDG release from file name:\n{pdg_filename}")
        raise SystemExit(1)

    pdg_metadata_url = "/".join([url, pdg_filename])
    dest_dir = os.path.join(db_path, pdg_release)

    logging.info(f"Found NCBI PDG file:\n{pdg_metadata_url}")

    is_metadata = pdg_filename.endswith(".metadata.tsv")
    is_cluster_list = pdg_filename.endswith(".reference_target.cluster_list.tsv")

    if not overwrite and is_metadata and os.path.exists(dest_dir):
        logging.error(f"DB path\n{dest_dir}\nalready exists. Please use -f to overwrite.")
        raise SystemExit(1)
    if overwrite and not is_cluster_list:
        # The cluster list is stored next to the metadata file, so it must
        # not wipe the directory that was just populated.
        shutil.rmtree(dest_dir, ignore_errors=True)
    os.makedirs(dest_dir, exist_ok=True)

    tsv_at = os.path.join(dest_dir, pdg_filename)
    logging.info(f"Saving to:\n{tsv_at}")

    # Stream the payload to disk in chunks instead of holding the whole file
    # in memory and writing it one character at a time.
    with urlopen(pdg_metadata_url, context=contxt) as response, open(tsv_at, "wb") as tsv:
        shutil.copyfileobj(response, tsv)

    return tsv_at, dest_dir
kkonganti@17 105
kkonganti@17 106
def main() -> None:
    """
    This script is part of the `bettercallsal_db` Nextflow workflow and is only
    tested on POSIX systems.
    It:
        1. Downloads the latest NCBI Pathogens Release metadata file, which
            looks like PDGXXXXXXXXXX.2504.metadata.tsv and also the SNP cluster
            information file which looks like PDGXXXXXXXXXX.2504.reference_target.cluster_list.tsv
        2. Writes accs_all.txt, the list of assembly accessions found in the
            metadata file, for the dataformat tool.
    """

    prog_name = os.path.basename(inspect.stack()[0].filename)

    # The docstring above doubles as the --help description.
    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    parser.add_argument(
        "-db",
        dest="db_path",
        default=os.getcwd(),
        required=False,
        help="Absolute UNIX path to a path where all results files are\nstored.",
    )
    parser.add_argument(
        "-f",
        dest="overwrite_db",
        default=False,
        required=False,
        action="store_true",
        help="Force overwrite a PDG release directory at DB path\nmentioned with -db.",
    )
    parser.add_argument(
        "-org",
        dest="organism",
        default="Salmonella",
        required=False,
        help="The organism to create the DB flat files\nfor.",
    )
    parser.add_argument(
        "-rel",
        dest="release",
        default=None,
        required=False,
        help="If you get a 404 error, try mentioning the actual release identifier.\n"
        + "Ex: For Salmonella, you can get the release identifier by going to:\n"
        + "    https://ftp.ncbi.nlm.nih.gov/pathogen/Results/Salmonella\n"
        + "Ex: If you want metadata belonging to release PDG000000002.2507, then you\n"
        + "    would use this command-line option as:\n    -rel PDG000000002.2507",
    )

    args = parser.parse_args()
    db_path = args.db_path or os.getcwd()
    org = args.organism
    overwrite = args.overwrite_db
    # Always hand dl_pdg a string: it runs a regex match on this value.
    release = args.release or ""
    ncbi_pathogens_loc = "/".join(
        ["https://ftp.ncbi.nlm.nih.gov/pathogen/Results", org, "latest_snps"]
    )

    # Save metadata
    metadata_tsv, dest_dir = dl_pdg(
        db_path=db_path,
        url="/".join([ncbi_pathogens_loc, "Metadata"]),
        regex=re.compile(r"PDG\d+\.\d+\.metadata\.tsv"),
        suffix=".metadata.tsv",
        overwrite=overwrite,
        release=release,
    )

    # Save cluster to target mapping
    dl_pdg(
        db_path=db_path,
        url="/".join([ncbi_pathogens_loc, "Clusters"]),
        regex=re.compile(r"PDG\d+\.\d+\.reference_target\.cluster_list\.tsv"),
        # A literal suffix, not a regex: no backslash escapes.
        suffix=".reference_target.cluster_list.tsv",
        overwrite=overwrite,
        release=release,
    )

    # Create accs.txt for dataformat to fetch required ACC fields
    # 0-based column read as the assembly accession; presumably the
    # "asm_acc" column of the PDG metadata table - TODO confirm against header.
    asm_acc_col = 9
    accs_file = os.path.join(dest_dir, "accs_all.txt")
    # errors="replace": only the accession column is consumed, so an
    # undecodable byte in another column must not abort the run.
    with open(metadata_tsv, "r", errors="replace") as pdg_metadata_fh, open(
        accs_file, "w"
    ) as accs_fh:
        for line in pdg_metadata_fh:
            if line.startswith("#") or not line.strip():
                continue
            # Strip only the line terminator: stripping tabs would drop empty
            # leading columns and shift every index.
            cols = line.rstrip("\r\n").split("\t")
            if len(cols) <= asm_acc_col:
                continue
            asm_acc = cols[asm_acc_col]
            if asm_acc and asm_acc != "NULL":
                accs_fh.write(f"{asm_acc}\n")

    logging.info("Finished writing accessions for dataformat tool.")
kkonganti@17 208
kkonganti@17 209
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()