annotate 1.0.0/bin/dl_pdg_metadata.py @ 0:801b85b03a17 draft default tip

planemo upload
author galaxytrakr
date Thu, 28 May 2026 20:31:42 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env python3
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
2
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
3 # Kranti Konganti
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
4
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
5 import argparse
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
6 import inspect
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
7 import logging
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
8 import os
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
9 import re
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
10 import shutil
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
11 import ssl
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
12 import tempfile
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
13 from html.parser import HTMLParser
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
14 from urllib.request import urlopen
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
15
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
16 # Set logging format.
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
# Every log record is printed as a banner: a 55-character rule, the
# timestamp and level, a second rule, and then the message itself.
logging.basicConfig(
    level=logging.DEBUG,
    format="\n{rule}\n%(asctime)s - %(levelname)s\n{rule}\n%(message)s\n".format(rule="=" * 55),
)
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
21
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
22 # Multiple inheritance for pretty printing of help text.
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter,
    argparse.ArgumentDefaultsHelpFormatter,
):
    """Help formatter that keeps raw line breaks and appends argument defaults."""
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
25
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
26
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
27 # HTMLParser override class to get PDG release and latest Cluster .tsv file
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
class NCBIPathogensHTMLParser(HTMLParser):
    """
    Collect every text node of an HTML document.

    The text chunks are appended, in document order, to ``href_data``.
    Despite the attribute name, this is the text between tags (for a
    directory listing that includes the link labels), not the ``href``
    attribute values.
    """

    def __init__(self, *, convert_charrefs: bool = True) -> None:
        # The default used to be the Ellipsis object (``...``), which was
        # forwarded to HTMLParser as-is. It only behaved like ``True``
        # because Ellipsis is truthy; use the real default instead.
        super().__init__(convert_charrefs=convert_charrefs)
        # HTMLParser.__init__ already calls reset(), so no extra call is needed.
        self.href_data: list = []

    def handle_data(self, data: str) -> None:
        """Store one chunk of text found between tags."""
        self.href_data.append(data)
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
36
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
37
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
def _pdg_release_from_filename(pdg_filename, suffix):
    """Return the release identifier (``PDG<digits>.<digits>``) of a PDG file name."""
    # str.rstrip(suffix) strips a *set of characters*, not a suffix, and only
    # produced the right answer by accident. Parse the identifier instead.
    release_match = re.match(r"PDG\d+\.\d+", pdg_filename)
    if release_match:
        return release_match.group(0)
    # Fallback for file names that do not start with a PDG identifier.
    if suffix and pdg_filename.endswith(suffix):
        return pdg_filename[: -len(suffix)]
    return pdg_filename


def _prepare_dest_dir(dest_dir, pdg_filename, overwrite):
    """Create (or, when overwriting, recreate) the release directory for a download."""
    is_metadata = re.match(r".+?\.metadata\.tsv$", pdg_filename) is not None
    is_cluster_list = (
        re.match(r".+?\.reference_target\.cluster_list\.tsv$", pdg_filename) is not None
    )

    if not overwrite and is_metadata and os.path.exists(dest_dir):
        logging.error(f"DB path\n{dest_dir}\nalready exists. Please use -f to overwrite.")
        raise SystemExit(1)

    # A cluster list never wipes the directory: it is saved next to whatever
    # is already stored for the same release.
    if overwrite and not is_cluster_list:
        shutil.rmtree(dest_dir, ignore_errors=True)

    # Also covers a cluster list whose release directory does not exist yet,
    # which previously failed later when opening the output file.
    os.makedirs(dest_dir, exist_ok=True)


def dl_pdg(**kwargs) -> tuple:
    """
    Download one NCBI Pathogens (PDG) flat file into a per-release directory.

    The directory listing at ``url`` is fetched, the first file name matching
    ``regex`` is looked up in its text, and that file is saved to
    ``<db_path>/<PDG release>/<file name>``.

    Keyword arguments (read by name, so their order does not matter):
        db_path:   Directory under which the release directory is created.
        url:       URL of the directory listing to search.
        regex:     Pattern (string or compiled) matching the wanted file name.
        suffix:    File name suffix following the release identifier. Only
                   used when the file name does not start with ``PDG<n>.<n>``.
        overwrite: When true, an existing release directory is replaced
                   (except when saving a cluster list file).
        release:   Optional release identifier such as ``PDG000000002.2507``.
                   If valid, it replaces ``latest_snps`` in ``url``.

    Returns:
        A ``(tsv_path, dest_dir)`` tuple: the saved file and its directory.

    Raises:
        SystemExit: with status 1 if ``db_path`` or ``url`` is missing, if no
            file name matches ``regex``, or if the release directory exists
            for a metadata download and ``overwrite`` is false.
    """
    db_path = kwargs.get("db_path")
    url = kwargs.get("url")
    regex = kwargs.get("regex")
    suffix = kwargs.get("suffix") or ""
    overwrite = bool(kwargs.get("overwrite", False))
    release = kwargs.get("release")

    # `(db_path or url) == None` only caught the case where both were unset.
    if db_path is None or url is None:
        logging.error("Please provide absolute UNIX path\n" + "to store the result DB flat files.")
        raise SystemExit(1)

    # NOTE(review): certificate and host name verification are switched off,
    # exactly as before, so downloads are not protected against tampering.
    # Presumably this was needed on hosts without a CA bundle -- confirm and
    # re-enable verification if possible.
    contxt = ssl.create_default_context()
    contxt.check_hostname = False
    contxt.verify_mode = ssl.CERT_NONE

    # `release` may be False/None when no release was requested; passing that
    # to re.match() raises TypeError, so only inspect real strings.
    if isinstance(release, str) and re.match(r"^PDG\d+\.\d+$", release.strip()):
        url = url.replace("latest_snps", release.strip())

    html_parser = NCBIPathogensHTMLParser()
    logging.info(f"Finding latest NCBI PDG release at:\n{url}")

    # Parse the listing straight from memory; no temporary file is needed.
    with urlopen(url, context=contxt) as response:
        listing_html = response.read().decode("utf-8", errors="replace")
    html_parser.feed(listing_html)
    html_parser.close()

    filename_match = re.search(regex, "".join(html_parser.href_data))
    if filename_match is None:
        wanted = getattr(regex, "pattern", regex)
        logging.error(f"Could not find a file matching\n{wanted}\nat:\n{url}")
        raise SystemExit(1)

    pdg_filename = filename_match.group(0)
    pdg_release = _pdg_release_from_filename(pdg_filename, suffix)
    pdg_metadata_url = "/".join([url, pdg_filename])
    dest_dir = os.path.join(db_path, pdg_release)

    logging.info(f"Found NCBI PDG file:\n{pdg_metadata_url}")

    _prepare_dest_dir(dest_dir, pdg_filename, overwrite)

    tsv_at = os.path.join(dest_dir, pdg_filename)
    logging.info(f"Saving to:\n{tsv_at}")

    # Stream the payload to disk in chunks instead of decoding the whole file
    # in memory and writing it one character at a time.
    with urlopen(pdg_metadata_url, context=contxt) as response, open(tsv_at, "wb") as tsv:
        shutil.copyfileobj(response, tsv)

    return tsv_at, dest_dir
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
105
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
106
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
107 def main() -> None:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
108 """
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
109 This script is part of the `bettercallsal_db` Nextflow workflow and is only
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
110 tested on POSIX sytems.
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
111 It:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
112 1. Downloads the latest NCBI Pathogens Release metadata file, which
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
113 looks like PDGXXXXXXXXXX.2504.metadata.csv and also the SNP cluster
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
114 information file which looks like PDGXXXXXXXXXX.2504.reference_target.cluster_list.tsv
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
115 2. Generates a new metadata file with only required information such as
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
116 computed_serotype, isolates GenBank or RefSeq downloadable genome FASTA
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
117 URL.
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
118 """
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
119
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
120 prog_name = os.path.basename(inspect.stack()[0].filename)
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
121
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
122 parser = argparse.ArgumentParser(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
123 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
124 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
125
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
126 # required = parser.add_argument_group("required arguments")
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
127
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
128 parser.add_argument(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
129 "-db",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
130 dest="db_path",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
131 default=os.getcwd(),
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
132 required=False,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
133 help="Absolute UNIX path to a path where all results files are\nstored.",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
134 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
135 parser.add_argument(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
136 "-f",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
137 dest="overwrite_db",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
138 default=False,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
139 required=False,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
140 action="store_true",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
141 help="Force overwrite a PDG release directory at DB path\nmentioned with -db.",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
142 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
143 parser.add_argument(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
144 "-org",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
145 dest="organism",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
146 default="Salmonella",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
147 required=False,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
148 help="The organism to create the DB flat files\nfor.",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
149 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
150 parser.add_argument(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
151 "-rel",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
152 dest="release",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
153 default=False,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
154 required=False,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
155 help="If you get a 404 error, try mentioning the actual release identifier.\n"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
156 + "Ex: For Salmonella, you can get the release identifier by going to:\n"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
157 + " https://ftp.ncbi.nlm.nih.gov/pathogen/Results/Salmonella\n"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
158 + "Ex: If you want metadata beloginging to release PDG000000002.2507, then you\n"
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
159 + " would use this command-line option as:\n -rel PDG000000002.2507",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
160 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
161
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
162 args = parser.parse_args()
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
163 db_path = args.db_path
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
164 org = args.organism
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
165 overwrite = args.overwrite_db
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
166 release = args.release
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
167 ncbi_pathogens_loc = "/".join(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
168 ["https://ftp.ncbi.nlm.nih.gov/pathogen/Results", org, "latest_snps"]
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
169 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
170
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
171 if not db_path:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
172 db_path = os.getcwd()
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
173
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
174 # Save metadata
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
175 file, dest_dir = dl_pdg(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
176 db_path=db_path,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
177 url="/".join([ncbi_pathogens_loc, "Metadata"]),
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
178 regex=re.compile(r"PDG\d+\.\d+\.metadata\.tsv"),
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
179 suffix=".metadata.tsv",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
180 overwrite=overwrite,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
181 release=release,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
182 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
183
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
184 # Save cluster to target mapping
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
185 dl_pdg(
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
186 db_path=db_path,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
187 url="/".join([ncbi_pathogens_loc, "Clusters"]),
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
188 regex=re.compile(r"PDG\d+\.\d+\.reference_target\.cluster_list\.tsv"),
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
189 suffix="reference_target\.cluster_list\.tsv",
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
190 overwrite=overwrite,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
191 release=release,
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
192 )
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
193
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
194 # Create accs.txt for dataformat to fetch required ACC fields
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
195 accs_file = os.path.join(dest_dir, "accs_all.txt")
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
196 with open(file, "r") as pdg_metadata_fh:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
197 with open(accs_file, "w") as accs_fh:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
198 for line in pdg_metadata_fh.readlines():
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
199 if re.match(r"^#", line) or line in ["\n", "\n\r", "\r"]:
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
200 continue
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
201 cols = line.strip().split("\t")
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
202 asm_acc = cols[9]
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
203 accs_fh.write(f"{asm_acc}\n") if (asm_acc != "NULL") else None
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
204 accs_fh.close()
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
205 pdg_metadata_fh.close()
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
206
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
207 logging.info("Finished writing accessions for dataformat tool.")
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
208
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
209
801b85b03a17 planemo upload
galaxytrakr
parents:
diff changeset
# Run the download only when executed as a script, never on import.
if __name__ == "__main__":
    main()