Mercurial > repos > estrain > data_manager_mlst
comparison data_manager/data_manager_mlst.py @ 0:a9ff6184213f draft default tip
planemo upload commit bdb45cf3a98e21f5002866b6789a1457f521bf5d
| author | estrain |
|---|---|
| date | Thu, 12 Mar 2026 20:06:31 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a9ff6184213f |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 """ | |
| 3 Galaxy Data Manager for PubMLST. | |
| 4 Downloads all PubMLST databases, creates a combined BLAST database, | |
| 5 and writes a Galaxy-compatible data table JSON. | |
| 6 """ | |
| 7 | |
| 8 import argparse | |
| 9 import json | |
| 10 import os | |
| 11 import shutil | |
| 12 import subprocess | |
| 13 import sys | |
| 14 import datetime | |
| 15 from urllib.request import urlopen | |
| 16 | |
| 17 | |
class MLSTDataManager:
    """Build a combined PubMLST BLAST database and register it with Galaxy.

    The manager reads the Galaxy data manager JSON to locate the output
    directory, downloads the PubMLST schemes, concatenates their allele
    FASTA files into one file, runs ``makeblastdb`` on it, and finally
    overwrites the JSON file with the new data table entry.
    """

    def __init__(self, json_path: str):
        """Store the Galaxy JSON path and derive a timestamped database name.

        Args:
            json_path: Path to the Galaxy data manager JSON file. It is read
                as input and, on success, overwritten with the output entry.
        """
        self.json_path = json_path
        # Both paths are filled in by read_input_json().
        self.extra_files_path = None
        self.output_dir = None
        self.timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        self.db_name = f"mlst_database_{self.timestamp}"

    # ----------------------------------------------------------------------
    # Galaxy JSON I/O
    # ----------------------------------------------------------------------

    def read_input_json(self):
        """Read the Galaxy input JSON and create the output directory.

        Raises:
            OSError: If the JSON file cannot be read or the directory
                cannot be created.
            KeyError, IndexError: If the JSON lacks
                ``output_data[0]["extra_files_path"]``.
        """
        with open(self.json_path, encoding="utf-8") as fh:
            params = json.load(fh)
        self.extra_files_path = params["output_data"][0]["extra_files_path"]
        os.makedirs(self.extra_files_path, exist_ok=True)
        self.output_dir = os.path.abspath(self.extra_files_path)

    def write_output_json(self):
        """Overwrite the Galaxy JSON file with the ``mlst`` data table entry."""
        entry = {
            "data_tables": {
                "mlst": [
                    {
                        "value": self.db_name,
                        "name": self.db_name,
                        "path": "mlst-db",
                    }
                ]
            }
        }

        with open(self.json_path, "w") as fh:
            json.dump(entry, fh, indent=2, sort_keys=True)
            # Force the entry to disk before the process exits.
            fh.flush()
            os.fsync(fh.fileno())

    # ----------------------------------------------------------------------
    # Database steps
    # ----------------------------------------------------------------------

    def download_pubmlst_databases(self):
        """Download all PubMLST databases into ``./pubmlst``.

        Exits the process with status 1 if the download tool fails or
        cannot be started.
        """
        print("Downloading PubMLST databases...")
        try:
            subprocess.run(["mlst-download_pub_mlst", "-d", "pubmlst"], check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing or non-executable download tool.
            print(f"Error downloading databases: {e}", file=sys.stderr)
            sys.exit(1)

    @staticmethod
    def _write_combined_fasta(schemes_dir, fasta_path):
        """Concatenate every ``<scheme>/*.tfa`` file into one FASTA file.

        Header lines are rewritten from ``>allele`` to ``>scheme.allele``.
        Any line containing ``not a locus`` is dropped (only that line
        itself). The output file is truncated first, and directory entries
        are visited in sorted order so repeated runs give identical output.

        Args:
            schemes_dir: Directory holding one sub-directory per scheme.
            fasta_path: Destination FASTA file (overwritten).
        """
        with open(fasta_path, "w") as outfile:
            for scheme in sorted(os.listdir(schemes_dir)):
                scheme_path = os.path.join(schemes_dir, scheme)
                if not os.path.isdir(scheme_path):
                    continue
                for fname in sorted(os.listdir(scheme_path)):
                    if not fname.endswith(".tfa"):
                        continue
                    with open(os.path.join(scheme_path, fname)) as infile:
                        for line in infile:
                            if "not a locus" in line:
                                continue
                            # A file without a trailing newline must not
                            # fuse its last line with the next header.
                            if not line.endswith("\n"):
                                line += "\n"
                            if line.startswith(">"):
                                outfile.write(f">{scheme}.{line[1:]}")
                            else:
                                outfile.write(line)

    def make_blast_database(self):
        """Move the downloaded schemes to the output dir and build the BLAST DB.

        Raises:
            subprocess.CalledProcessError: If ``makeblastdb`` fails.
            OSError: If files cannot be moved, read, or written.
        """
        src_dir = os.path.join(os.getcwd(), "pubmlst")
        dst_dir = os.path.join(self.output_dir, "pubmlst")

        # Replace any earlier copy so the move cannot nest or fail.
        if os.path.exists(dst_dir):
            shutil.rmtree(dst_dir)
        shutil.move(src_dir, dst_dir)

        blast_dir = os.path.join(self.output_dir, "blast")
        os.makedirs(blast_dir, exist_ok=True)
        blast_file = os.path.join(blast_dir, "mlst.fa")

        print("Building combined FASTA for BLAST database...")
        self._write_combined_fasta(dst_dir, blast_file)

        print("Running makeblastdb...")
        subprocess.run([
            "makeblastdb", "-hash_index",
            "-in", blast_file, "-dbtype", "nucl",
            "-title", "PubMLST", "-parse_seqids"
        ], check=True)

    def download_scheme_species_map(self):
        """Fetch ``scheme_species_map.tab`` from GitHub (best effort).

        A failure is reported but does not abort the run.
        """
        url = "https://raw.githubusercontent.com/tseemann/mlst/master/db/scheme_species_map.tab"
        dst_file = os.path.join(self.output_dir, "scheme_species_map.tab")
        print("Downloading scheme_species_map.tab...")
        try:
            # Read the whole response before opening the destination so a
            # failed download does not leave an empty file behind.
            with urlopen(url) as response:
                content = response.read().decode("utf-8")
            with open(dst_file, "w", encoding="utf-8") as out:
                out.write(content)
            print("scheme_species_map.tab downloaded successfully")
        except Exception as e:
            print(f"Failed to retrieve scheme_species_map.tab: {e}")

    # ----------------------------------------------------------------------
    # Run
    # ----------------------------------------------------------------------

    def run(self):
        """Run every step and register the database only if all succeeded.

        On failure the error is reported on stderr and the process exits
        with status 1 without writing the data table entry, so Galaxy never
        registers a database that was not built.
        """
        try:
            self.read_input_json()
            self.download_pubmlst_databases()
            self.make_blast_database()
            self.download_scheme_species_map()
        except Exception as e:
            print(f"MLST Data Manager failed: {e}", file=sys.stderr)
            sys.exit(1)
        else:
            self.write_output_json()
| 132 | |
| 133 | |
| 134 # ---------------------------------------------------------------------- | |
| 135 # CLI | |
| 136 # ---------------------------------------------------------------------- | |
| 137 | |
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only argument is the positional ``data_manager_json`` path.
    """
    cli = argparse.ArgumentParser(description="Galaxy Data Manager for PubMLST")
    cli.add_argument("data_manager_json", help="Galaxy data manager JSON file")
    namespace = cli.parse_args()
    return namespace
| 142 | |
| 143 | |
def main():
    """Script entry point: run the data manager on the JSON given on the CLI."""
    json_path = parse_args().data_manager_json
    manager = MLSTDataManager(json_path)
    manager.run()
| 148 | |
| 149 | |
# Run the data manager only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
| 152 |
