# HG changeset patch
# User estrain
# Date 1773345991 0
# Node ID a9ff6184213fd6eb7ce1267b20bba7202dbc4b5a
planemo upload commit bdb45cf3a98e21f5002866b6789a1457f521bf5d
diff -r 000000000000 -r a9ff6184213f data_manager/data_manager_mlst.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_mlst.py Thu Mar 12 20:06:31 2026 +0000
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+"""
+Galaxy Data Manager for PubMLST.
+Downloads all PubMLST databases, creates a combined BLAST database,
+and writes a Galaxy-compatible data table JSON.
+"""
+
+import argparse
+import datetime
+import json
+import os
+import shutil
+import subprocess
+import sys
+from urllib.request import urlopen
+
+
+class MLSTDataManager:
+    """Build a PubMLST BLAST database and register it in Galaxy's ``mlst`` table.
+
+    ``json_path`` is the Galaxy data manager JSON file: it is read for the
+    output dataset's ``extra_files_path`` and, once every build step has
+    succeeded, overwritten with the new data table entry.
+    """
+
+    def __init__(self, json_path: str):
+        self.json_path = json_path
+        self.extra_files_path = None  # set by read_input_json()
+        self.output_dir = None  # absolute extra_files_path, set alongside it
+        self.timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        self.db_name = f"mlst_database_{self.timestamp}"
+
+    # ----------------------------------------------------------------------
+    # Galaxy JSON I/O
+    # ----------------------------------------------------------------------
+
+    def read_input_json(self):
+        """Read Galaxy input JSON and create the output directory."""
+        with open(self.json_path) as fh:
+            params = json.load(fh)
+        self.extra_files_path = params["output_data"][0]["extra_files_path"]
+        os.makedirs(self.extra_files_path, exist_ok=True)
+        self.output_dir = os.path.abspath(self.extra_files_path)
+
+    def write_output_json(self):
+        """Overwrite the Galaxy JSON file with the ``mlst`` data table entry."""
+        entry = {
+            "data_tables": {
+                "mlst": [
+                    {
+                        "value": self.db_name,
+                        "name": self.db_name,
+                        "path": "mlst-db"
+                    }
+                ]
+            }
+        }
+
+        with open(self.json_path, "w") as fh:
+            json.dump(entry, fh, indent=2, sort_keys=True)
+            fh.flush()
+            os.fsync(fh.fileno())
+
+    # ----------------------------------------------------------------------
+    # Database steps
+    # ----------------------------------------------------------------------
+
+    def download_pubmlst_databases(self):
+        """Download all PubMLST databases into ``./pubmlst``.
+
+        Exits with status 1 if ``mlst-download_pub_mlst`` fails or cannot be
+        started.
+        """
+        print("Downloading PubMLST databases...")
+        try:
+            subprocess.run(["mlst-download_pub_mlst", "-d", "pubmlst"], check=True)
+        except (subprocess.CalledProcessError, OSError) as e:
+            print(f"Error downloading databases: {e}", file=sys.stderr)
+            sys.exit(1)
+
+    def make_blast_database(self):
+        """Move ``./pubmlst`` into the output directory and build a BLAST database.
+
+        Every ``*.tfa`` file of every scheme directory is concatenated into
+        ``blast/mlst.fa`` with each FASTA header prefixed by ``<scheme>.``;
+        lines containing "not a locus" are dropped. ``makeblastdb`` is then
+        run on the combined file.
+
+        Raises:
+            subprocess.CalledProcessError: if ``makeblastdb`` fails.
+        """
+        src_dir = os.path.join(os.getcwd(), "pubmlst")
+        dst_dir = os.path.join(self.output_dir, "pubmlst")
+        if os.path.exists(dst_dir):
+            shutil.rmtree(dst_dir)
+        shutil.move(src_dir, dst_dir)
+
+        blast_dir = os.path.join(self.output_dir, "blast")
+        os.makedirs(blast_dir, exist_ok=True)
+        blast_file = os.path.join(blast_dir, "mlst.fa")
+
+        print("Building combined FASTA for BLAST database...")
+        # "w", not "a": a FASTA left over from an earlier run must not be
+        # appended to, or every sequence would end up in the file twice.
+        with open(blast_file, "w") as outfile:
+            # Sorted so the combined FASTA is reproducible across runs.
+            for scheme in sorted(os.listdir(dst_dir)):
+                scheme_path = os.path.join(dst_dir, scheme)
+                if not os.path.isdir(scheme_path):
+                    continue
+                for name in sorted(os.listdir(scheme_path)):
+                    if not name.endswith(".tfa"):
+                        continue
+                    with open(os.path.join(scheme_path, name)) as infile:
+                        for line in infile:
+                            if "not a locus" in line:
+                                continue
+                            if line.startswith(">"):
+                                line = f">{scheme}.{line[1:]}"
+                            # Guard against a file without a final newline
+                            # gluing its last line to the next header.
+                            if not line.endswith("\n"):
+                                line += "\n"
+                            outfile.write(line)
+
+        print("Running makeblastdb...")
+        subprocess.run([
+            "makeblastdb", "-hash_index",
+            "-in", blast_file, "-dbtype", "nucl",
+            "-title", "PubMLST", "-parse_seqids"
+        ], check=True)
+
+    def download_scheme_species_map(self):
+        """Fetch scheme_species_map.tab from GitHub (best effort).
+
+        A failure is reported but does not abort the run. The file is written
+        only after the whole response has been read, so a failed download
+        leaves no truncated file behind.
+        """
+        url = "https://raw.githubusercontent.com/tseemann/mlst/master/db/scheme_species_map.tab"
+        dst_file = os.path.join(self.output_dir, "scheme_species_map.tab")
+        print("Downloading scheme_species_map.tab...")
+        try:
+            # The timeout keeps a stalled connection from hanging the job.
+            with urlopen(url, timeout=60) as response:
+                data = response.read()
+            # Raw bytes: no decode/encode round trip through the locale's
+            # default text encoding.
+            with open(dst_file, "wb") as out:
+                out.write(data)
+            print("scheme_species_map.tab downloaded successfully")
+        except Exception as e:  # broad on purpose: this step is optional
+            print(f"Failed to retrieve scheme_species_map.tab: {e}")
+
+    # ----------------------------------------------------------------------
+    # Run
+    # ----------------------------------------------------------------------
+
+    def run(self):
+        """Run every build step, then register the database with Galaxy.
+
+        The data table JSON is written only when all steps succeeded, so a
+        failed build never registers a database that does not exist. On
+        failure the error goes to stderr and the process exits with status 1.
+        """
+        try:
+            self.read_input_json()
+            self.download_pubmlst_databases()
+            self.make_blast_database()
+            self.download_scheme_species_map()
+        except Exception as e:
+            print(f"MLST Data Manager failed: {e}", file=sys.stderr)
+            sys.exit(1)
+        self.write_output_json()
+
+
+# ----------------------------------------------------------------------
+# CLI
+# ----------------------------------------------------------------------
+
+def parse_args():
+    """Parse the command line: one positional path to the Galaxy JSON file."""
+    parser = argparse.ArgumentParser(description="Galaxy Data Manager for PubMLST")
+    parser.add_argument("data_manager_json", help="Galaxy data manager JSON file")
+    return parser.parse_args()
+
+
+def main():
+    """Command-line entry point: build the database named by the JSON argument."""
+    args = parse_args()
+    mgr = MLSTDataManager(args.data_manager_json)
+    mgr.run()
+
+
+if __name__ == "__main__":
+    main()
+
diff -r 000000000000 -r a9ff6184213f data_manager/data_manager_mlst.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_mlst.xml Thu Mar 12 20:06:31 2026 +0000
@@ -0,0 +1,47 @@
+
+
+ Download and build PubMLST BLAST databases
+
+
+ mlst
+ blast
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+@misc{seemann_mlst,
+ author={Torsten Seemann},
+ title={mlst: Scan contig files against PubMLST typing schemes},
+ year={2025},
+ howpublished={https://github.com/tseemann/mlst}
+}
+
+
+
+
diff -r 000000000000 -r a9ff6184213f data_manager_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml Thu Mar 12 20:06:31 2026 +0000
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+