annotate 0.2.0/bin/rmlst_post.py @ 0:9e8b1c747a6a draft default tip

planemo upload
author galaxytrakr
date Fri, 29 May 2026 13:32:17 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env python3
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
2
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
3 # Kranti Konganti
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
4
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
5 import argparse
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
6 import base64
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
7 import gzip
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
8 import inspect
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
9 import json
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
10 import logging
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
11 import os
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
12 import pprint
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
13 import re
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
14 from collections import defaultdict
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
15
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
16 import requests
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
17
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
18
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
19 # Multiple inheritance for pretty printing of help text.
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
20 class MultiArgFormatClasses(
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
21 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
22 ):
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
23 pass
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
24
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
25
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
26 # Main
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
def main() -> None:
    """
    This script takes as input an assembly .fasta format (gzipped or ungzipped)
    and posts to PubMLST to get the species taxonomy.
    """
    # NOTE: the docstring above is passed to argparse as the program
    # description, so it is user-facing text. Implementation notes live in
    # comments instead.
    #
    # Files written to the current working directory:
    #   <prefix>_rmlstd.tsv          one row per taxonomy prediction
    #   <prefix>_rmlst_req.log.json  the JSON response, pretty printed
    #
    # Exit status:
    #   1  FASTA file missing/empty, or the results could not be written
    #   0  response carried no usable taxonomy predictions (treated as
    #      "no match"), the HTTP request was not successful (logged as an
    #      error, nothing is written), or everything succeeded

    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-fasta",
        dest="fasta",
        default=False,
        required=True,
        help="Absolute UNIX path to FASTA file.",
    )
    parser.add_argument(
        "-prefix",
        dest="prefix",
        default="response",
        required=False,
        help="The prefix of the file name that will be created in\nthe current working directory.",
    )
    parser.add_argument(
        "-fkey",
        dest="fkey",
        default="fields",
        required=False,
        help="The key name in the JSON response that contains ST results.",
    )
    parser.add_argument(
        "-tkey",
        dest="tkey",
        default="taxon_prediction",
        required=False,
        help="The key name in the JSON response that contains a list of\ntaxonomy predictions.",
    )

    # Define defaults
    args = parser.parse_args()
    fasta = args.fasta
    fkey = args.fkey
    tkey = args.tkey
    outfile = os.path.join(os.getcwd(), args.prefix + "_rmlstd.tsv")
    logfile = os.path.join(os.getcwd(), args.prefix + "_rmlst_req.log.json")
    field_keys = ["rST", "other_designation"]
    tax_pred_keys = ["rank", "support", "taxon", "taxonomy"]
    uri = "https://rest.pubmlst.org/db/pubmlst_rmlst_seqdef_kiosk/schemes/1/sequence"
    sample_name = str(args.prefix)
    # (connect, read) timeouts in seconds so that an unresponsive server
    # cannot hang the job forever. The read value is deliberately generous.
    request_timeout = (30, 600)

    # Basic checks
    if not (os.path.exists(fasta) and os.path.getsize(fasta) > 0):
        logging.error(
            f"File\n{os.path.basename(fasta)}\ndoes not exist or the file is empty."
        )
        raise SystemExit(1)

    # The sequence is always handled as text here. Calling str() on the bytes
    # returned by a binary gzip read would encode the "b'...'" repr instead of
    # the actual FASTA content.
    seqs = _read_fasta_text(fasta)
    payload = (
        '{"base64":true, "details":true, "sequence":"'
        + base64.b64encode(seqs.encode()).decode()
        + '"}'
    )
    response = requests.post(uri, data=payload, timeout=request_timeout)

    if response.status_code != requests.codes.ok:
        # Report the failure instead of finishing silently. The exit status is
        # left untouched so callers that tolerate a missing result still do.
        logging.error(
            f"Request to\n{uri}\nwas not successful. HTTP status: {response.status_code}"
        )
        return

    res = response.json()
    with open(logfile, "w") as log_fh:
        json.dump(res, log_fh, indent=4, sort_keys=True)

    # Keep only the expected keys of every taxonomy prediction.
    try:
        predictions = [
            {key: prediction[key] for key in tax_pred_keys}
            for prediction in res[tkey]
        ]
    except (KeyError, AttributeError, TypeError) as e:
        logging.warning(
            "Did not get taxonomy prediction from JSON response. Probably no match?\n"
            + f"KeyError or AttributeError or TypeError:\n{e}"
        )
        raise SystemExit(0)

    if not predictions:
        # An empty prediction list cannot produce a header or any row. Fail
        # before the output file is created rather than leaving an empty one.
        logging.error(
            "Unable to write final results.\n"
            + f"Exception: '{tkey}' in the JSON response is empty."
        )
        raise SystemExit(1)

    # rST / other_designation are optional. Any key that could not be read is
    # reported as "-", while keys read before the failure keep their value.
    fields = {}
    try:
        for key in field_keys:
            fields[key] = res[fkey][key]
    except (KeyError, AttributeError, TypeError) as e:
        for key in field_keys:
            fields.setdefault(key, "-")
        logging.info(
            "Did not get rST or other_designation from JSON response. Will skip.\n"
            + f"KeyError or AttributeError or TypeError:\n{e}"
        )

    # Columns with an empty value are left out. Values are stringified because
    # str.join() rejects non-string items (rST is presumably numeric in the
    # response - TODO confirm against the API).
    shown_fields = {key: str(value) for key, value in fields.items() if value}
    # Prediction columns are fixed by the first prediction, and every row then
    # emits exactly these columns so cells always line up with the header.
    pred_columns = [key for key, value in predictions[0].items() if value]

    try:
        with open(outfile, "w") as out_fh:
            # Header
            out_fh.write("\t".join(["Sample", *shown_fields, *pred_columns]))
            for prediction in predictions:
                cells = [sample_name, *shown_fields.values()]
                cells += [
                    _format_prediction_cell(prediction[column])
                    for column in pred_columns
                ]
                # Leading and trailing newline kept as is: the established
                # output layout has an empty line after every data row.
                out_fh.write("\n" + "\t".join(cells) + "\n")
    except (KeyError, AttributeError, TypeError) as e:
        logging.error(f"Unable to write final results.\nException: {e}")
        raise SystemExit(1)


def _read_fasta_text(path: str) -> str:
    """Return the contents of a FASTA file as text, gzipped or not."""
    try:
        with gzip.open(path, "rt") as fasta_fh:
            return fasta_fh.read()
    except gzip.BadGzipFile:
        # Not gzip compressed: read it as a regular text file.
        with open(path, "r") as fasta_fh:
            return fasta_fh.read()


def _format_prediction_cell(value: object) -> str:
    """Return one TSV cell for a prediction value, with ' > ' turned into ';'."""
    if value is None or value == "":
        return "-"
    return re.sub(r"\s*\>\s*", ";", str(value))
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
170
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
171
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()