Mercurial > repos > galaxytrakr > hfp_bettercallsal_awsbatch
comparison 1.0.0/bin/dl_pdg_metadata.py @ 0:801b85b03a17 draft default tip
planemo upload
| author | galaxytrakr |
|---|---|
| date | Thu, 28 May 2026 20:31:42 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:801b85b03a17 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 # Kranti Konganti | |
| 4 | |
| 5 import argparse | |
| 6 import inspect | |
| 7 import logging | |
| 8 import os | |
| 9 import re | |
| 10 import shutil | |
| 11 import ssl | |
| 12 import tempfile | |
| 13 from html.parser import HTMLParser | |
| 14 from urllib.request import urlopen | |
| 15 | |
# Configure the root logger. Every record is rendered as a rule of 55 "="
# characters, the timestamp and level, a second rule, and then the message.
# The level is DEBUG, so records of every severity are emitted.
logging.basicConfig(
    format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n",
    level=logging.DEBUG,
)
| 21 | |
# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter that keeps explicit line breaks and shows argument defaults."""
| 25 | |
| 26 | |
# HTMLParser subclass used to find the PDG release and the latest Cluster .tsv file
class NCBIPathogensHTMLParser(HTMLParser):
    """
    Collect the text content of an HTML document.

    Every text node passed to `handle_data` is appended, in document order,
    to `href_data`. Despite the attribute name, this is all text found in
    the page, not only the text of links.
    """

    def __init__(self, *, convert_charrefs: bool = True) -> None:
        """
        Create the parser with an empty `href_data` list.

        Args:
            convert_charrefs: Forwarded to `HTMLParser`. The default is a
                real boolean; the previous default was the `Ellipsis`
                placeholder, which only worked because it is truthy.
        """
        # HTMLParser.__init__ already resets the parser state, so no explicit
        # reset() call is needed here.
        super().__init__(convert_charrefs=convert_charrefs)
        self.href_data = []

    def handle_data(self, data):
        """Record one chunk of text found between tags."""
        self.href_data.append(data)
| 36 | |
| 37 | |
def dl_pdg(**kwargs) -> tuple:
    """
    Download one PDG flat file into a per-release directory.

    The HTML page at `url` is fetched, the first text matching `regex` is
    taken as the file name, and that file is downloaded from
    `<url>/<file name>` into `<db_path>/<release>/`, where `<release>` is the
    file name with `suffix` removed.

    Keyword Args:
        db_path: Directory under which the release directory is created.
        url: URL of the listing page to search for the file name.
        regex: Pattern (string or compiled) that matches the file name.
        suffix: Trailing part of the file name to remove to obtain the
            release identifier. Backslashes are ignored so that
            regex-escaped suffixes are accepted as well.
        overwrite: If true, an existing release directory is removed before
            a non-cluster-list file is saved. Defaults to False.
        release: Optional release identifier such as "PDG000000002.2507".
            When it has that form it replaces "latest_snps" in `url`.

    Returns:
        A `(tsv_at, dest_dir)` tuple: the path of the saved file and the
        release directory that contains it.

    Raises:
        SystemExit: With status 1 if `db_path` or `url` is missing, if no
            file name matching `regex` is found on the page, or if the
            release directory already exists for a metadata file and
            `overwrite` is false.
        KeyError: If `regex` or `suffix` is not supplied.
    """
    # Look arguments up by name so the result does not depend on the order in
    # which the caller happened to pass the keywords.
    db_path = kwargs.get("db_path")
    url = kwargs.get("url")

    if db_path is None or url is None:
        logging.error("Please provide absolute UNIX path\n" + "to store the result DB flat files.")
        raise SystemExit(1)

    regex = kwargs["regex"]
    suffix = kwargs["suffix"]
    overwrite = kwargs.get("overwrite", False)
    release = kwargs.get("release")

    # NOTE(review): certificate and hostname verification are switched off, so
    # the downloads are not protected against a man-in-the-middle. Kept as in
    # the original; confirm whether this is still required.
    contxt = ssl.create_default_context()
    contxt.check_hostname = False
    contxt.verify_mode = ssl.CERT_NONE

    # `release` may be False/None when no release was requested; only a string
    # of the expected form redirects the URL to that release.
    if isinstance(release, str) and re.match(r"^PDG\d+\.\d+$", release):
        url = url.replace("latest_snps", release.strip())

    logging.info(f"Finding latest NCBI PDG release at:\n{url}")

    # The listing is read straight into memory; no temporary file is needed.
    with urlopen(url, context=contxt) as response:
        listing = response.read().decode("utf-8", errors="replace")

    html_parser = NCBIPathogensHTMLParser()
    html_parser.feed(listing)
    html_parser.close()  # flush any text still buffered by the parser

    found = re.search(regex, "".join(html_parser.href_data))
    if found is None:
        wanted = getattr(regex, "pattern", regex)
        logging.error(f"Could not find a file name matching\n{wanted}\nat:\n{url}")
        raise SystemExit(1)
    pdg_filename = found.group(0)

    # Remove the suffix as a suffix. str.rstrip() would treat it as a set of
    # characters and could strip too much.
    plain_suffix = suffix.replace("\\", "")
    pdg_release = pdg_filename
    if plain_suffix and pdg_release.endswith(plain_suffix):
        pdg_release = pdg_release[: -len(plain_suffix)]
    pdg_release = pdg_release.rstrip(".")

    pdg_metadata_url = "/".join([url, pdg_filename])
    dest_dir = os.path.join(db_path, pdg_release)

    logging.info(f"Found NCBI PDG file:\n{pdg_metadata_url}")

    is_metadata = re.match(r".+?\.metadata\.tsv$", pdg_filename) is not None
    is_cluster_list = (
        re.match(r".+?\.reference_target\.cluster_list\.tsv$", pdg_filename) is not None
    )

    if not overwrite and is_metadata and os.path.exists(dest_dir):
        logging.error(f"DB path\n{dest_dir}\nalready exists. Please use -f to overwrite.")
        raise SystemExit(1)

    # The cluster list is saved next to an already downloaded metadata file,
    # so the release directory is only wiped for other files.
    if overwrite and not is_cluster_list:
        shutil.rmtree(dest_dir, ignore_errors=True)
    os.makedirs(dest_dir, exist_ok=True)

    tsv_at = os.path.join(dest_dir, pdg_filename)
    logging.info(f"Saving to:\n{tsv_at}")

    # Stream the download to disk instead of holding the whole file in memory.
    with urlopen(pdg_metadata_url, context=contxt) as response, open(tsv_at, "wb") as tsv:
        shutil.copyfileobj(response, tsv)

    return tsv_at, dest_dir
| 105 | |
| 106 | |
def main() -> None:
    """
    This script is part of the `bettercallsal_db` Nextflow workflow and is only
    tested on POSIX systems.
    It:
        1. Downloads the latest NCBI Pathogens Release metadata file, which
            looks like PDGXXXXXXXXXX.2504.metadata.tsv and also the SNP cluster
            information file which looks like PDGXXXXXXXXXX.2504.reference_target.cluster_list.tsv
        2. Writes accs_all.txt next to the downloaded files, with one accession
            per line taken from the tenth column of the metadata file
            (rows whose value is NULL are skipped).
    """

    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    parser.add_argument(
        "-db",
        dest="db_path",
        default=os.getcwd(),
        required=False,
        help="Absolute UNIX path to a path where all results files are\nstored.",
    )
    parser.add_argument(
        "-f",
        dest="overwrite_db",
        default=False,
        required=False,
        action="store_true",
        help="Force overwrite a PDG release directory at DB path\nmentioned with -db.",
    )
    parser.add_argument(
        "-org",
        dest="organism",
        default="Salmonella",
        required=False,
        help="The organism to create the DB flat files\nfor.",
    )
    parser.add_argument(
        "-rel",
        dest="release",
        default=False,
        required=False,
        help="If you get a 404 error, try mentioning the actual release identifier.\n"
        + "Ex: For Salmonella, you can get the release identifier by going to:\n"
        + "    https://ftp.ncbi.nlm.nih.gov/pathogen/Results/Salmonella\n"
        + "Ex: If you want metadata beloginging to release PDG000000002.2507, then you\n"
        + "    would use this command-line option as:\n    -rel PDG000000002.2507",
    )

    args = parser.parse_args()
    db_path = args.db_path or os.getcwd()
    org = args.organism
    overwrite = args.overwrite_db
    # -rel defaults to False; pass a string on so the release check in dl_pdg
    # is never handed a bool.
    release = args.release or ""
    ncbi_pathogens_loc = "/".join(
        ["https://ftp.ncbi.nlm.nih.gov/pathogen/Results", org, "latest_snps"]
    )

    # Save metadata
    metadata_tsv, dest_dir = dl_pdg(
        db_path=db_path,
        url="/".join([ncbi_pathogens_loc, "Metadata"]),
        regex=re.compile(r"PDG\d+\.\d+\.metadata\.tsv"),
        suffix=".metadata.tsv",
        overwrite=overwrite,
        release=release,
    )

    # Save cluster to target mapping
    dl_pdg(
        db_path=db_path,
        url="/".join([ncbi_pathogens_loc, "Clusters"]),
        regex=re.compile(r"PDG\d+\.\d+\.reference_target\.cluster_list\.tsv"),
        suffix=".reference_target.cluster_list.tsv",
        overwrite=overwrite,
        release=release,
    )

    # Create accs.txt for dataformat to fetch required ACC fields
    # NOTE(review): column index 9 is assumed to hold the assembly accession;
    # confirm against the metadata header.
    asm_acc_col = 9
    accs_file = os.path.join(dest_dir, "accs_all.txt")
    with open(metadata_tsv, "r", encoding="utf-8") as pdg_metadata_fh, open(
        accs_file, "w"
    ) as accs_fh:
        # Iterate lazily; the metadata file can be large.
        for line in pdg_metadata_fh:
            if line.startswith("#") or not line.strip():
                continue
            # Strip only the line ending so empty leading/trailing fields do
            # not shift the column positions.
            cols = line.rstrip("\r\n").split("\t")
            if len(cols) <= asm_acc_col:
                continue
            asm_acc = cols[asm_acc_col].strip()
            if asm_acc and asm_acc != "NULL":
                accs_fh.write(f"{asm_acc}\n")

    logging.info("Finished writing accessions for dataformat tool.")
| 209 | |
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
