kkonganti@17
|
1 #!/usr/bin/env python3
|
kkonganti@17
|
2
|
kkonganti@17
|
3 # Kranti Konganti
|
kkonganti@17
|
4
|
kkonganti@17
|
5 import argparse
|
kkonganti@17
|
6 import inspect
|
kkonganti@17
|
7 import logging
|
kkonganti@17
|
8 import os
|
kkonganti@17
|
9 import re
|
kkonganti@17
|
10 import shutil
|
kkonganti@17
|
11 import ssl
|
kkonganti@17
|
12 import tempfile
|
kkonganti@17
|
13 from html.parser import HTMLParser
|
kkonganti@17
|
14 from urllib.request import urlopen
|
kkonganti@17
|
15
|
kkonganti@17
|
# Set logging.
# Every record is emitted as a banner: a 55-character "=" rule, the timestamp
# and level name, a second rule, and then the message on its own line(s).
logging.basicConfig(
    format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n",
    level=logging.DEBUG,
)
|
kkonganti@17
|
21
|
kkonganti@17
|
# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """
    argparse help formatter that mixes two stock formatters:
    RawTextHelpFormatter (listed first) and ArgumentDefaultsHelpFormatter.
    It adds no behaviour of its own.
    """
|
kkonganti@17
|
25
|
kkonganti@17
|
26
|
kkonganti@17
|
# HTMLParser override class to get PDG release and latest Cluster .tsv file
class NCBIPathogensHTMLParser(HTMLParser):
    """
    HTML parser that records every text node it encounters.

    Despite its name, `href_data` does not hold href attribute values: it is
    the list of strings passed to `handle_data`, in document order. Callers
    join the list and search it for the file name they are after.
    """

    def __init__(self, *, convert_charrefs: bool = True) -> None:
        """
        Create the parser with an empty `href_data` list.

        convert_charrefs is forwarded unchanged to HTMLParser. It defaults to
        True; the previous default was the Ellipsis object, which HTMLParser
        only treated as "on" because Ellipsis happens to be truthy.
        """
        # HTMLParser.__init__ already calls reset(), so no explicit reset here.
        super().__init__(convert_charrefs=convert_charrefs)
        self.href_data = []

    def handle_data(self, data: str) -> None:
        """Append one chunk of text content to `href_data`."""
        self.href_data.append(data)
|
kkonganti@17
|
36
|
kkonganti@17
|
37
|
kkonganti@17
|
def _pdg_release_from_filename(pdg_filename: str, suffix: str) -> str:
    """Return the release identifier (e.g. PDG000000002.2507) of a PDG file name."""
    # str.rstrip() treats its argument as a *set of characters*, not as a
    # suffix, so match the identifier explicitly whenever the name allows it.
    release_match = re.match(r"PDG\d+\.\d+", pdg_filename)
    if release_match:
        return release_match.group(0)
    # Unrecognised naming scheme: fall back to the historical behaviour.
    return pdg_filename.rstrip(suffix)


def dl_pdg(**kwargs) -> tuple:
    """
    Method to save a PDG flat file (metadata or cluster list) and
    return where it was stored.

    Keyword arguments (looked up by name, so their order does not matter):
        db_path:   Directory under which a <PDG release> directory is used.
        url:       URL of the listing page that links to the wanted file.
        regex:     Pattern (str or compiled) that finds the file name in the
                   text of the listing page.
        suffix:    File name suffix; only used as a fallback when the release
                   identifier cannot be recognised from the file name.
        overwrite: If true, an existing release directory is removed and
                   re-created, except when the file is a
                   *.reference_target.cluster_list.tsv file.
        release:   Optional release identifier such as PDG000000002.2507. When
                   valid, it replaces "latest_snps" in `url`. Falsy values
                   (None, False, "") mean "use latest_snps".

    Returns:
        (tsv_at, dest_dir): path of the saved file and its release directory.

    Raises:
        SystemExit(1): when db_path/url/regex is missing, when no file name
        matching `regex` is found on the listing page, or when a *.metadata.tsv
        file is requested without `overwrite` and the release directory
        already exists.
    """
    db_path = kwargs.get("db_path")
    url = kwargs.get("url")
    regex = kwargs.get("regex")
    suffix = kwargs.get("suffix", "")
    overwrite = kwargs.get("overwrite", False)
    release = kwargs.get("release")

    # Both values are required; the old `(db_path or url) == None` test only
    # triggered when *both* were missing.
    if not db_path or not url:
        logging.error("Please provide absolute UNIX path\n" + "to store the result DB flat files.")
        raise SystemExit(1)

    if regex is None:
        logging.error("Please provide a regex to find the PDG file name.")
        raise SystemExit(1)

    # NOTE(review): certificate and host name verification are switched off,
    # so the downloads are not authenticated. Kept as-is because it looks
    # deliberate (hosts without a CA bundle?) -- confirm before hardening.
    contxt = ssl.create_default_context()
    contxt.check_hostname = False
    contxt.verify_mode = ssl.CERT_NONE

    # `release` may be False/None when -rel was not given; re.match() would
    # raise TypeError on a non-string, so only inspect truthy values.
    if release:
        release = str(release).strip()
        if re.match(r"^PDG\d+\.\d+$", release):
            url = url.replace("latest_snps", release)
        else:
            logging.warning(f"Ignoring invalid PDG release identifier:\n{release}")

    logging.info(f"Finding latest NCBI PDG release at:\n{url}")

    # The listing page is small: read it straight into memory instead of
    # going through a temporary file that could be left behind on errors.
    with urlopen(url, context=contxt) as response:
        listing_html = response.read().decode("utf-8", errors="replace")

    html_parser = NCBIPathogensHTMLParser()
    html_parser.feed(listing_html)
    html_parser.close()  # flush any text still buffered by the parser

    found = re.search(regex, "".join(html_parser.href_data))
    if found is None:
        pattern_text = getattr(regex, "pattern", regex)
        logging.error(f"Could not find a file matching\n{pattern_text}\nat:\n{url}")
        raise SystemExit(1)

    pdg_filename = found.group(0)
    pdg_release = _pdg_release_from_filename(pdg_filename, suffix)
    pdg_metadata_url = "/".join([url, pdg_filename])
    dest_dir = os.path.join(db_path, pdg_release)

    logging.info(f"Found NCBI PDG file:\n{pdg_metadata_url}")

    is_metadata = bool(re.match(r".+?\.metadata\.tsv$", pdg_filename))
    is_cluster_list = bool(
        re.match(r".+?\.reference_target\.cluster_list\.tsv$", pdg_filename)
    )
    dest_exists = os.path.exists(dest_dir)

    if not overwrite and is_metadata and dest_exists:
        logging.error(f"DB path\n{dest_dir}\nalready exists. Please use -f to overwrite.")
        raise SystemExit(1)

    # The cluster list is saved into the directory created for the metadata
    # file, so it must never wipe that directory.
    if overwrite and not is_cluster_list and dest_exists:
        shutil.rmtree(dest_dir, ignore_errors=True)
    os.makedirs(dest_dir, exist_ok=True)

    tsv_at = os.path.join(dest_dir, pdg_filename)
    logging.info(f"Saving to:\n{tsv_at}")

    # Stream the (potentially large) file to disk in chunks rather than
    # decoding the whole payload in memory.
    with urlopen(pdg_metadata_url, context=contxt) as response, open(tsv_at, "wb") as tsv:
        shutil.copyfileobj(response, tsv)

    return tsv_at, dest_dir
|
kkonganti@17
|
105
|
kkonganti@17
|
106
|
kkonganti@17
|
def main() -> None:
    """
    This script is part of the `bettercallsal_db` Nextflow workflow and is only
    tested on POSIX systems.
    It:
        1. Downloads the latest NCBI Pathogens Release metadata file, which
            looks like PDGXXXXXXXXXX.2504.metadata.tsv and also the SNP cluster
            information file which looks like PDGXXXXXXXXXX.2504.reference_target.cluster_list.tsv
        2. Writes accs_all.txt next to the downloaded files: one assembly
            accession per line (10th column of the metadata file, NULL values
            skipped) for the dataformat tool.
    """

    prog_name = os.path.basename(inspect.stack()[0].filename)

    # The docstring above doubles as the command-line description.
    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    parser.add_argument(
        "-db",
        dest="db_path",
        default=os.getcwd(),
        required=False,
        help="Absolute UNIX path to a path where all results files are\nstored.",
    )
    parser.add_argument(
        "-f",
        dest="overwrite_db",
        default=False,
        required=False,
        action="store_true",
        help="Force overwrite a PDG release directory at DB path\nmentioned with -db.",
    )
    parser.add_argument(
        "-org",
        dest="organism",
        default="Salmonella",
        required=False,
        help="The organism to create the DB flat files\nfor.",
    )
    parser.add_argument(
        "-rel",
        dest="release",
        default=False,
        required=False,
        help="If you get a 404 error, try mentioning the actual release identifier.\n"
        + "Ex: For Salmonella, you can get the release identifier by going to:\n"
        + "    https://ftp.ncbi.nlm.nih.gov/pathogen/Results/Salmonella\n"
        + "Ex: If you want metadata beloginging to release PDG000000002.2507, then you\n"
        + "    would use this command-line option as:\n    -rel PDG000000002.2507",
    )

    args = parser.parse_args()
    db_path = args.db_path or os.getcwd()  # an empty -db falls back to the CWD
    org = args.organism
    overwrite = args.overwrite_db
    # -rel defaults to False; hand the downloader a string in every case so a
    # regex match on it cannot raise TypeError.
    release = args.release or ""
    ncbi_pathogens_loc = "/".join(
        ["https://ftp.ncbi.nlm.nih.gov/pathogen/Results", org, "latest_snps"]
    )

    # Save metadata
    metadata_tsv, dest_dir = dl_pdg(
        db_path=db_path,
        url="/".join([ncbi_pathogens_loc, "Metadata"]),
        regex=re.compile(r"PDG\d+\.\d+\.metadata\.tsv"),
        suffix=".metadata.tsv",
        overwrite=overwrite,
        release=release,
    )

    # Save cluster to target mapping
    dl_pdg(
        db_path=db_path,
        url="/".join([ncbi_pathogens_loc, "Clusters"]),
        regex=re.compile(r"PDG\d+\.\d+\.reference_target\.cluster_list\.tsv"),
        # A literal suffix, not a regex: no backslash escapes.
        suffix=".reference_target.cluster_list.tsv",
        overwrite=overwrite,
        release=release,
    )

    # Create accs.txt for dataformat to fetch required ACC fields
    asm_acc_col = 9  # zero-based index of the assembly accession column
    skipped_rows = 0
    accs_file = os.path.join(dest_dir, "accs_all.txt")
    with open(metadata_tsv, "r") as pdg_metadata_fh, open(accs_file, "w") as accs_fh:
        # Iterate lazily; the metadata file can be large.
        for line in pdg_metadata_fh:
            if line.startswith("#") or not line.strip():
                continue
            # Strip only the line terminator: a full strip() would also eat
            # leading/trailing tabs and shift the column positions.
            cols = line.rstrip("\r\n").split("\t")
            if len(cols) <= asm_acc_col:
                skipped_rows += 1
                continue
            asm_acc = cols[asm_acc_col].strip()
            if asm_acc and asm_acc != "NULL":
                accs_fh.write(f"{asm_acc}\n")

    if skipped_rows:
        logging.warning(
            f"Skipped {skipped_rows} metadata row(s) with fewer than {asm_acc_col + 1} columns."
        )

    logging.info("Finished writing accessions for dataformat tool.")
|
kkonganti@17
|
208
|
kkonganti@17
|
209
|
kkonganti@17
|
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
|