comparison data_manager/data_manager_mlst.py @ 0:a9ff6184213f draft default tip

planemo upload commit bdb45cf3a98e21f5002866b6789a1457f521bf5d
author estrain
date Thu, 12 Mar 2026 20:06:31 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a9ff6184213f
1 #!/usr/bin/env python3
2 """
3 Galaxy Data Manager for PubMLST.
4 Downloads all PubMLST databases, creates a combined BLAST database,
5 and writes a Galaxy-compatible data table JSON.
6 """
7
8 import argparse
9 import json
10 import os
11 import shutil
12 import subprocess
13 import sys
14 import datetime
15 from urllib.request import urlopen
16
17
class MLSTDataManager:
    """Build a PubMLST database bundle and describe it in a Galaxy data table JSON.

    The workflow (see :meth:`run`) reads the Galaxy-provided JSON to find the
    output directory, downloads the PubMLST schemes with
    ``mlst-download_pub_mlst``, concatenates their ``.tfa`` files into one
    FASTA, indexes it with ``makeblastdb`` and finally overwrites the JSON
    file with a ``mlst`` data table entry.
    """

    # Location of the scheme -> species mapping in the tseemann/mlst repository.
    SCHEME_SPECIES_MAP_URL = (
        "https://raw.githubusercontent.com/tseemann/mlst/master/db/scheme_species_map.tab"
    )
    # Seconds to wait for the optional mapping download before giving up, so
    # an unresponsive server cannot stall the whole job.
    DOWNLOAD_TIMEOUT = 60

    def __init__(self, json_path: str):
        """Remember the JSON path and derive a timestamped database name.

        Args:
            json_path: Path of the Galaxy data manager JSON file. It is read
                as input by :meth:`read_input_json` and overwritten as output
                by :meth:`write_output_json`.
        """
        self.json_path = json_path
        # Both paths are filled in by read_input_json().
        self.extra_files_path = None
        self.output_dir = None
        self.timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        self.db_name = f"mlst_database_{self.timestamp}"

    # ----------------------------------------------------------------------
    # Galaxy JSON I/O
    # ----------------------------------------------------------------------

    def read_input_json(self) -> None:
        """Read Galaxy input JSON and create the output directory.

        Raises:
            OSError: If the JSON file cannot be read or the directory cannot
                be created.
            KeyError, IndexError: If the JSON lacks
                ``output_data[0].extra_files_path``.
        """
        with open(self.json_path) as fh:
            params = json.load(fh)
        self.extra_files_path = params["output_data"][0]["extra_files_path"]
        os.makedirs(self.extra_files_path, exist_ok=True)
        self.output_dir = os.path.abspath(self.extra_files_path)

    def write_output_json(self) -> None:
        """Write the final Galaxy data manager JSON.

        The file at ``self.json_path`` is replaced by a single ``mlst`` data
        table entry whose ``value`` and ``name`` are the timestamped database
        name.
        """
        entry = {
            "data_tables": {
                "mlst": [
                    {
                        "value": self.db_name,
                        "name": self.db_name,
                        # NOTE(review): fixed relative name; presumably
                        # resolved by the data manager configuration, which
                        # is not visible here - confirm it matches where the
                        # files are written.
                        "path": "mlst-db",
                    }
                ]
            }
        }

        with open(self.json_path, "w") as fh:
            json.dump(entry, fh, indent=2, sort_keys=True)
            # Push the data to disk before the process ends.
            fh.flush()
            os.fsync(fh.fileno())

    # ----------------------------------------------------------------------
    # Database steps
    # ----------------------------------------------------------------------

    def download_pubmlst_databases(self) -> None:
        """Download all PubMLST databases into ``./pubmlst``.

        Exits the process with status 1 if the downloader fails or cannot be
        started.
        """
        print("Downloading PubMLST databases...")
        try:
            subprocess.run(["mlst-download_pub_mlst", "-d", "pubmlst"], check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing or non-executable downloader binary.
            print(f"Error downloading databases: {e}", file=sys.stderr)
            sys.exit(1)

    def _write_combined_fasta(self, schemes_dir: str, fasta_path: str) -> None:
        """Concatenate every ``<scheme>/*.tfa`` under *schemes_dir* into *fasta_path*.

        Header lines are rewritten from ``>allele`` to ``>scheme.allele``.
        Lines containing ``not a locus`` are dropped. Schemes and files are
        visited in sorted order so the output is reproducible.
        """
        # "w", not "a": a stale file from an earlier attempt would otherwise
        # contribute duplicate sequence ids.
        with open(fasta_path, "w") as outfile:
            for scheme in sorted(os.listdir(schemes_dir)):
                scheme_path = os.path.join(schemes_dir, scheme)
                if not os.path.isdir(scheme_path):
                    continue
                for fname in sorted(os.listdir(scheme_path)):
                    if not fname.endswith(".tfa"):
                        continue
                    last_written = "\n"
                    with open(os.path.join(scheme_path, fname)) as infile:
                        for line in infile:
                            if "not a locus" in line:
                                continue
                            if line.startswith(">"):
                                line = f">{scheme}.{line[1:]}"
                            outfile.write(line)
                            last_written = line
                    # A file without a trailing newline must not glue its
                    # last sequence line onto the next file's header.
                    if not last_written.endswith("\n"):
                        outfile.write("\n")

    def make_blast_database(self) -> None:
        """Build a BLAST database from the downloaded data.

        Moves ``./pubmlst`` into the output directory (replacing any existing
        copy), writes the combined FASTA to ``blast/mlst.fa`` and runs
        ``makeblastdb`` on it.

        Raises:
            OSError: If the downloaded directory is missing or cannot be moved.
            subprocess.CalledProcessError: If ``makeblastdb`` fails.
        """
        src_dir = os.path.join(os.getcwd(), "pubmlst")
        dst_dir = os.path.join(self.output_dir, "pubmlst")

        if os.path.exists(dst_dir):
            shutil.rmtree(dst_dir)
        shutil.move(src_dir, dst_dir)

        blast_dir = os.path.join(self.output_dir, "blast")
        os.makedirs(blast_dir, exist_ok=True)
        blast_file = os.path.join(blast_dir, "mlst.fa")

        print("Building combined FASTA for BLAST database...")
        self._write_combined_fasta(dst_dir, blast_file)

        print("Running makeblastdb...")
        subprocess.run([
            "makeblastdb", "-hash_index",
            "-in", blast_file, "-dbtype", "nucl",
            "-title", "PubMLST", "-parse_seqids"
        ], check=True)

    def download_scheme_species_map(self) -> None:
        """Fetch the scheme_species_map.tab file from GitHub.

        This step is best-effort: any failure is reported on stdout and
        otherwise ignored. The destination file is only created once the
        download has completed, so a failed download leaves no empty file.
        """
        dst_file = os.path.join(self.output_dir, "scheme_species_map.tab")
        print("Downloading scheme_species_map.tab...")
        try:
            with urlopen(self.SCHEME_SPECIES_MAP_URL, timeout=self.DOWNLOAD_TIMEOUT) as response:
                content = response.read().decode("utf-8")
            with open(dst_file, "w", encoding="utf-8") as out:
                out.write(content)
            print("scheme_species_map.tab downloaded successfully")
        except Exception as e:
            print(f"Failed to retrieve scheme_species_map.tab: {e}")

    # ----------------------------------------------------------------------
    # Run
    # ----------------------------------------------------------------------

    def run(self) -> None:
        """Execute all steps and, only on success, write the data table JSON.

        If a required step fails, the error is printed to stderr and the
        process exits with status 1 without touching the JSON file, so a
        failed run never advertises a database that was not built.
        """
        try:
            self.read_input_json()
            self.download_pubmlst_databases()
            self.make_blast_database()
            self.download_scheme_species_map()
        except Exception as e:
            print(f"MLST Data Manager failed: {e}", file=sys.stderr)
            sys.exit(1)
        self.write_output_json()
132
133
134 # ----------------------------------------------------------------------
135 # CLI
136 # ----------------------------------------------------------------------
137
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only argument is the positional ``data_manager_json``: the path of
    the Galaxy data manager JSON file.
    """
    cli = argparse.ArgumentParser(description="Galaxy Data Manager for PubMLST")
    cli.add_argument("data_manager_json", help="Galaxy data manager JSON file")
    namespace = cli.parse_args()
    return namespace
142
143
def main():
    """Run the data manager on the JSON file named on the command line."""
    json_path = parse_args().data_manager_json
    MLSTDataManager(json_path).run()


# Execute only when run as a script, not when imported.
if __name__ == "__main__":
    main()
152