Mercurial > repos > galaxytrakr > data_manager_mapseq
comparison data_manager/data_manager_fetch_mapseq_db.py @ 0:f99c1cc0b190 draft
planemo upload commit bae430588c393222b68c5a7aa47f57bffee99475
| author | galaxytrakr |
|---|---|
| date | Thu, 26 Mar 2026 20:22:36 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f99c1cc0b190 |
|---|---|
#!/usr/bin/env python
# Python 3.6 compatible

import argparse
import json
import os
import shutil
import tarfile
import tempfile
import urllib.parse
import urllib.request
from datetime import datetime, timezone

import wget
| 14 DB_paths = { | |
| 15 "mgnify_v5_lsu": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/silva_lsu-20200130.tar.gz", | |
| 16 "mgnify_v5_ssu": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/silva_ssu-20200130.tar.gz", | |
| 17 "mgnify_v5_its_unite": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/UNITE-20200214.tar.gz", | |
| 18 "mgnify_v5_its_itsonedb": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/ITSoneDB-20200214.tar.gz", | |
| 19 "mgnify_v6_lsu": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/tool-dbs/silva-lsu/silva-lsu_138.1.tar.gz", | |
| 20 "mgnify_v6_ssu": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/tool-dbs/silva-ssu/silva-ssu_138.1.tar.gz", | |
| 21 "mgnify_v6_its_unite": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/tool-dbs/unite/unite_9.0.tar.gz", | |
| 22 "mgnify_v6_its_itsonedb": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/tool-dbs/itsonedb/itsonedb_1.141.tar.gz", | |
| 23 "mgnify_v6_pr2": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/tool-dbs/pr2/pr2_5.0.0.tar.gz", | |
| 24 "test_lsu": "https://zenodo.org/record/8205348/files/test_lsu.tar.gz", | |
| 25 } | |
| 26 | |
| 27 DB_names = { | |
| 28 "mgnify_v5_lsu": "MGnify LSU (v5.0.7) - silva_lsu-20200130", | |
| 29 "mgnify_v5_ssu": "MGnify SSU (v5.0.7) - silva_ssu-20200130", | |
| 30 "mgnify_v5_its_unite": "MGnify ITS UNITE (v5.0.7) - UNITE-20200214", | |
| 31 "mgnify_v5_its_itsonedb": "MGnify ITS ITSonedb (v5.0.7) - ITSoneDB-20200214", | |
| 32 "mgnify_v6_lsu": "MGnify LSU (v6.0) - silva_lsu-20240702", | |
| 33 "mgnify_v6_ssu": "MGnify SSU (v6.0) - silva_ssu-20240701", | |
| 34 "mgnify_v6_its_unite": "MGnify ITS UNITE (v6.0) - UNITE-20240702", | |
| 35 "mgnify_v6_its_itsonedb": "MGnify ITS ITSonedb (v6.0) - ITSoneDB-20240702", | |
| 36 "mgnify_v6_pr2": "MGnify PR2 (v6.0) - PR2-20240702", | |
| 37 "test_lsu": "Trimmed LSU Test DB", | |
| 38 } | |
| 39 | |
| 40 def _empty_dir(path): | |
| 41 if not os.path.isdir(path): | |
| 42 os.makedirs(path) | |
| 43 return | |
| 44 for name in os.listdir(path): | |
| 45 p = os.path.join(path, name) | |
| 46 if os.path.isdir(p) and not os.path.islink(p): | |
| 47 shutil.rmtree(p) | |
| 48 else: | |
| 49 try: | |
| 50 os.remove(p) | |
| 51 except OSError: | |
| 52 pass | |
| 53 | |
| 54 def _copy_all(src_dir, dest_dir): | |
| 55 for name in os.listdir(src_dir): | |
| 56 s = os.path.join(src_dir, name) | |
| 57 d = os.path.join(dest_dir, name) | |
| 58 if os.path.isdir(s) and not os.path.islink(s): | |
| 59 if os.path.exists(d): | |
| 60 shutil.rmtree(d) | |
| 61 shutil.copytree(s, d) | |
| 62 else: | |
| 63 shutil.copy2(s, dest_dir) | |
| 64 | |
| 65 def _safe_members(members): | |
| 66 for m in members: | |
| 67 mpath = m.name | |
| 68 norm = mpath.replace("\\", "/") | |
| 69 if os.path.isabs(mpath) or ".." in norm.split("/"): | |
| 70 continue | |
| 71 yield m | |
| 72 | |
| 73 def materialize_db_into(output_dir, url): | |
| 74 # Work entirely under /tmp to avoid job-dir races | |
| 75 tmp_root = tempfile.mkdtemp(prefix="mapseq_dm_", dir="/tmp") | |
| 76 temp_extract = tempfile.mkdtemp(prefix="extract_", dir=tmp_root) | |
| 77 | |
| 78 tar_path = wget.download(url, out=tmp_root) | |
| 79 with tarfile.open(tar_path, "r:*") as tar: | |
| 80 tar.extractall(temp_extract, members=_safe_members(tar)) | |
| 81 | |
| 82 _empty_dir(output_dir) | |
| 83 | |
| 84 entries = os.listdir(temp_extract) | |
| 85 if len(entries) == 1 and os.path.isdir(os.path.join(temp_extract, entries[0])): | |
| 86 inner = os.path.join(temp_extract, entries[0]) | |
| 87 vf = os.path.join(inner, "VERSION.txt") | |
| 88 if os.path.exists(vf): | |
| 89 try: | |
| 90 os.remove(vf) | |
| 91 except OSError: | |
| 92 pass | |
| 93 _copy_all(inner, output_dir) | |
| 94 else: | |
| 95 vf = os.path.join(temp_extract, "VERSION.txt") | |
| 96 if os.path.exists(vf): | |
| 97 try: | |
| 98 os.remove(vf) | |
| 99 except OSError: | |
| 100 pass | |
| 101 _copy_all(temp_extract, output_dir) | |
| 102 | |
| 103 try: | |
| 104 shutil.rmtree(tmp_root) | |
| 105 except Exception: | |
| 106 pass | |
| 107 | |
| 108 def main(): | |
| 109 parser = argparse.ArgumentParser(description="Fetch and register MAPseq DB") | |
| 110 parser.add_argument("--out", dest="output", required=True, help="Galaxy params/DM JSON path") | |
| 111 parser.add_argument("--version", dest="version", required=False, help="DB version label (e.g., 6.0)") | |
| 112 parser.add_argument("--database-type", dest="db_type", required=False, help="DB key (e.g., mgnify_v6_lsu)") | |
| 113 parser.add_argument("--test", action="store_true", help="Use small test DB") | |
| 114 args = parser.parse_args() | |
| 115 | |
| 116 # Read Galaxy params to get extra_files_path | |
| 117 with open(args.output) as fh: | |
| 118 params = json.load(fh) | |
| 119 | |
| 120 output_dir = params["output_data"][0]["extra_files_path"] | |
| 121 if not os.path.isdir(output_dir): | |
| 122 os.makedirs(output_dir) | |
| 123 | |
| 124 # Resolve db_type/version: prefer CLI, else try params.select_version | |
| 125 db_type = args.db_type | |
| 126 version = args.version | |
| 127 sel = params.get("select_version") or {} | |
| 128 if not db_type: | |
| 129 db_type = sel.get("database_type") | |
| 130 if not version: | |
| 131 version = sel.get("version") | |
| 132 | |
| 133 if not db_type: | |
| 134 raise RuntimeError("Missing --database-type and no params['select_version']['database_type'] provided.") | |
| 135 if db_type not in DB_paths: | |
| 136 raise KeyError("Unknown database type: {}. Valid keys: {}".format(db_type, ", ".join(sorted(DB_paths.keys())))) | |
| 137 | |
| 138 url = DB_paths["test_lsu"] if args.test else DB_paths[db_type] | |
| 139 materialize_db_into(output_dir, url) | |
| 140 | |
| 141 # Build DM JSON (value/name pattern you used earlier) | |
| 142 date_str = datetime.utcnow().strftime("%Y-%m-%d") | |
| 143 db_value = "{}_from_{}".format(db_type, date_str) | |
| 144 db_name = DB_names.get(db_type, db_type) | |
| 145 entry_name = "{} downloaded at {}".format(db_name, date_str) | |
| 146 | |
| 147 data_manager_entry = { | |
| 148 "data_tables": { | |
| 149 "mapseq_db": [ | |
| 150 { | |
| 151 "value": db_value, | |
| 152 "name": entry_name, | |
| 153 "path": output_dir, | |
| 154 "version": version, | |
| 155 } | |
| 156 ] | |
| 157 } | |
| 158 } | |
| 159 | |
| 160 with open(args.output, "w") as fh: | |
| 161 json.dump(data_manager_entry, fh, indent=2, sort_keys=True) | |
| 162 | |
| 163 try: | |
| 164 count = len(os.listdir(output_dir)) | |
| 165 print("[INFO] Wrote {} items into {}".format(count, output_dir)) | |
| 166 except Exception: | |
| 167 pass | |
| 168 | |
| 169 print("[SUCCESS] Data Manager JSON written to {}".format(args.output)) | |
| 170 | |
| 171 if __name__ == "__main__": | |
| 172 main() |
