cfsan_bettercallsal: 0.6.1/bin/dl_pdg

annotate 0.6.1/bin/dl_pdg_metadata.py @ 14:b0a37e88ecb5

"planemo upload"

author	kkonganti
date	Thu, 07 Sep 2023 10:13:31 -0400
parents	749faef1caa9
children

rev	line source
kkonganti@11	1 #!/usr/bin/env python3
kkonganti@11	2
kkonganti@11	3 # Kranti Konganti
kkonganti@11	4
kkonganti@11	5 import os
kkonganti@11	6 import shutil
kkonganti@11	7 import tempfile
kkonganti@11	8 import argparse
kkonganti@11	9 import inspect
kkonganti@11	10 import logging
kkonganti@11	11 import re
kkonganti@11	12 from urllib.request import urlopen
kkonganti@11	13 from html.parser import HTMLParser
kkonganti@11	14
# Set logging: every record is printed as a block framed by two 55-character
# "=" rules, with the timestamp and level between them and the message below.
logging.basicConfig(
    level=logging.DEBUG,
    format="\n{rule}\n%(asctime)s - %(levelname)s\n{rule}\n%(message)s\n".format(rule="=" * 55),
)
kkonganti@11	20
# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter combining raw-text line breaks with automatic "(default: ...)" notes."""
kkonganti@11	24
kkonganti@11	25
# HTMLParser override class to get PDG release and latest Cluster .tsv file
class NCBIPathogensHTMLParser(HTMLParser):
    """
    HTML parser that records every text node it encounters.

    The collected chunks are stored, in document order, in ``href_data``.
    Tags and attributes are ignored; only ``handle_data`` is overridden.
    """

    def __init__(self, *, convert_charrefs: bool = True) -> None:
        """
        Create the parser with an empty ``href_data`` list.

        ``convert_charrefs`` is forwarded unchanged to ``HTMLParser``. The
        default is the real boolean ``True`` (the standard library default)
        rather than the ``...`` placeholder found in type stubs.
        """
        # HTMLParser.__init__() already calls reset(), so no explicit reset
        # is needed here.
        super().__init__(convert_charrefs=convert_charrefs)
        self.href_data = []

    def handle_data(self, data):
        """Append one chunk of text found between tags to ``href_data``."""
        self.href_data.append(data)
kkonganti@11	35
kkonganti@11	36
def _pdg_release_from_filename(filename: str, suffix: str) -> str:
    """Return *filename* with the literal trailing *suffix* removed."""
    # Suffixes may arrive regex-escaped (e.g. "\.tsv"); compare them literally.
    literal_suffix = suffix.replace("\\", "")
    if literal_suffix and filename.endswith(literal_suffix):
        filename = filename[: -len(literal_suffix)]
    # A suffix passed without its leading dot leaves a dangling "." behind.
    return filename.rstrip(".")


def dl_pdg(**kwargs) -> tuple:
    """
    Method to save a PDG flat file and return where it was stored.

    Keyword arguments (looked up by name, so their order does not matter):
        db_path:   directory under which the release directory is created.
        url:       URL of the directory listing to scan.
        regex:     pattern (string or compiled) matching the wanted file name.
        suffix:    trailing part of the file name to remove in order to get
                   the release directory name.
        overwrite: if true, an existing release directory is removed first,
                   unless the file is a ``reference_target.cluster_list.tsv``.
        release:   optional release identifier such as ``PDG000000002.2507``;
                   when valid it replaces ``latest_snps`` in ``url``.

    Returns:
        A tuple ``(path of the saved file, release directory)``.

    Raises:
        SystemExit: with code 1 if ``db_path`` or ``url`` is missing, if no
            file name matching ``regex`` is found in the listing, or if the
            release directory already exists for a metadata file and
            ``overwrite`` is false.
    """
    db_path = kwargs.get("db_path")
    url = kwargs.get("url")
    regex = kwargs.get("regex")
    suffix = kwargs.get("suffix") or ""
    overwrite = bool(kwargs.get("overwrite", False))
    release = kwargs.get("release")

    # Either value missing is fatal; "(db_path or url) == None" only caught
    # the case where both were missing.
    if db_path is None or url is None:
        logging.error("Please provide absolute UNIX path\n" + "to store the result DB flat files.")
        raise SystemExit(1)

    # The command line passes a non-string (False/None) when -rel is not
    # given, which must not reach the regex engine.
    release = release.strip() if isinstance(release, str) else ""
    if release:
        if re.fullmatch(r"PDG\d+\.\d+", release):
            url = url.replace("latest_snps", release)
        else:
            logging.warning(f"Ignoring release identifier with unexpected format:\n{release}")

    html_parser = NCBIPathogensHTMLParser()
    logging.info(f"Finding latest NCBI PDG release at:\n{url}")

    # Parse the listing straight from the response; no temporary file is
    # needed, so nothing is left behind if a later step fails.
    with urlopen(url) as response:
        html_parser.feed(response.read().decode("utf-8", errors="replace"))
    # Flush any text still buffered by the parser.
    html_parser.close()

    found = re.search(regex, "".join(html_parser.href_data))
    if found is None:
        logging.error(f"Could not find a file matching the expected pattern at:\n{url}")
        raise SystemExit(1)

    pdg_filename = found.group(0)
    pdg_release = _pdg_release_from_filename(pdg_filename, suffix)
    pdg_metadata_url = "/".join([url, pdg_filename])
    dest_dir = os.path.join(db_path, pdg_release)

    logging.info(f"Found NCBI PDG file:\n{pdg_metadata_url}")

    is_metadata = re.match(r".+?\.metadata\.tsv$", pdg_filename) is not None
    is_cluster_list = (
        re.match(r".+?\.reference_target\.cluster_list\.tsv$", pdg_filename) is not None
    )

    if overwrite:
        # The cluster list is saved next to the metadata file, so it must not
        # wipe the release directory.
        if not is_cluster_list:
            shutil.rmtree(dest_dir, ignore_errors=True)
    elif is_metadata and os.path.exists(dest_dir):
        logging.error(f"DB path\n{dest_dir}\nalready exists. Please use -f to overwrite.")
        raise SystemExit(1)
    os.makedirs(dest_dir, exist_ok=True)

    tsv_at = os.path.join(dest_dir, pdg_filename)
    logging.info(f"Saving to:\n{tsv_at}")

    # Stream the download to disk in chunks instead of holding the whole file
    # in memory and writing it one character at a time.
    with urlopen(pdg_metadata_url) as response, open(tsv_at, "wb") as tsv:
        shutil.copyfileobj(response, tsv)

    return tsv_at, dest_dir
kkonganti@11	100
kkonganti@11	101
def main() -> None:
    """
    This script is part of the `bettercallsal_db` Nextflow workflow and is only
    tested on POSIX systems.
    It:
        1. Downloads the latest NCBI Pathogens Release metadata file, which
            looks like PDGXXXXXXXXXX.2504.metadata.tsv and also the SNP cluster
            information file which looks like PDGXXXXXXXXXX.2504.reference_target.cluster_list.tsv
        2. Generates a new metadata file with only required information such as
            computed_serotype, isolates GenBank or RefSeq downloadable genome FASTA
            URL.
    """
    # NOTE: the docstring above doubles as the command-line description.

    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    parser.add_argument(
        "-db",
        dest="db_path",
        default=os.getcwd(),
        required=False,
        help="Absolute UNIX path to a path where all results files are\nstored.",
    )
    parser.add_argument(
        "-f",
        dest="overwrite_db",
        default=False,
        required=False,
        action="store_true",
        help="Force overwrite a PDG release directory at DB path\nmentioned with -db.",
    )
    parser.add_argument(
        "-org",
        dest="organism",
        default="Salmonella",
        required=False,
        help="The organism to create the DB flat files\nfor.",
    )
    parser.add_argument(
        "-rel",
        dest="release",
        default=None,
        required=False,
        help="If you get a 404 error, try mentioning the actual release identifier.\n"
        + "Ex: For Salmonella, you can get the release identifier by going to:\n"
        + " https://ftp.ncbi.nlm.nih.gov/pathogen/Results/Salmonella\n"
        + "Ex: If you want metadata beloginging to release PDG000000002.2507, then you\n"
        + " would use this command-line option as:\n -rel PDG000000002.2507",
    )

    args = parser.parse_args()
    # An empty -db value falls back to the current working directory.
    db_path = args.db_path or os.getcwd()
    org = args.organism
    overwrite = args.overwrite_db
    # Always hand dl_pdg a string: it runs a regex match on this value.
    release = args.release or ""
    ncbi_pathogens_loc = "/".join(
        ["https://ftp.ncbi.nlm.nih.gov/pathogen/Results", org, "latest_snps"]
    )

    # Save metadata
    metadata_tsv, dest_dir = dl_pdg(
        db_path=db_path,
        url="/".join([ncbi_pathogens_loc, "Metadata"]),
        regex=re.compile(r"PDG\d+\.\d+\.metadata\.tsv"),
        suffix=".metadata.tsv",
        overwrite=overwrite,
        release=release,
    )

    # Save cluster to target mapping
    dl_pdg(
        db_path=db_path,
        url="/".join([ncbi_pathogens_loc, "Clusters"]),
        regex=re.compile(r"PDG\d+\.\d+\.reference_target\.cluster_list\.tsv"),
        # Plain literal suffix: the backslashes of the regex do not belong here.
        suffix=".reference_target.cluster_list.tsv",
        overwrite=overwrite,
        release=release,
    )

    # Create accs.txt for dataformat to fetch required ACC fields
    asm_acc_col = 9  # zero-based index of the column written to accs_all.txt
    skipped_rows = 0
    accs_file = os.path.join(dest_dir, "accs_all.txt")
    with open(metadata_tsv, "r") as pdg_metadata_fh, open(accs_file, "w") as accs_fh:
        # Iterate lazily; the metadata file is not loaded into memory at once.
        for line in pdg_metadata_fh:
            if line.startswith("#") or not line.strip():
                continue
            # Only drop the line terminator so that empty leading or trailing
            # fields cannot shift the column positions.
            cols = line.rstrip("\r\n").split("\t")
            if len(cols) <= asm_acc_col:
                skipped_rows += 1
                continue
            asm_acc = cols[asm_acc_col].strip()
            if asm_acc and asm_acc != "NULL":
                accs_fh.write(f"{asm_acc}\n")

    if skipped_rows:
        logging.warning(f"Skipped {skipped_rows} metadata row(s) with too few columns.")

    logging.info("Finished writing accessions for dataformat tool.")
kkonganti@11	203
kkonganti@11	204
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Mercurial > repos > kkonganti > cfsan_bettercallsal

annotate 0.6.1/bin/dl_pdg_metadata.py @ 14:b0a37e88ecb5