annotate 0.2.0/bin/index_pdg_metadata.py @ 0:9e8b1c747a6a draft default tip

planemo upload
author galaxytrakr
date Fri, 29 May 2026 13:32:17 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env python3
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
2
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
3 # Kranti Konganti
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
4
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
5 import argparse
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
6 import inspect
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
7 import logging
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
8 import os
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
9 import pickle
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
10 import pprint
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
11 import re
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
12 from collections import defaultdict
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
13
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
14
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter that keeps raw line breaks and also shows argument defaults."""


def _die(message):
    """Log `message` at ERROR level and terminate with exit status 1 (never returns)."""
    logging.error(message)
    raise SystemExit(1)


def _is_nonempty_file(path):
    """Return True if `path` is an existing regular file of non-zero size."""
    return os.path.isfile(path) and os.path.getsize(path) > 0


def _split_fields(line, sep):
    """Split one delimited text line into its fields.

    Only the line terminator is removed, so empty leading or trailing
    fields keep their positions (a plain `strip()` would drop them and
    shift every column index).
    """
    return line.rstrip("\r\n").split(sep)


def _read_mlst_sts(mlst_res, acc_pat, acc_col=0, st_col=2, expected_cols=10):
    """Return a dict of genome accession -> MLST sequence type.

    The first line of the tab-delimited file `mlst_res` is skipped as a
    header. A row is indexed only if it has exactly `expected_cols` fields
    and its ST field starts with digits or "-"; an ST of "-" is stored as
    "NULL". The program exits if not a single row qualifies.
    """
    sts = {}
    found_usable_row = False

    with open(mlst_res, "r") as mlst_fh:
        mlst_fh.readline()  # Header line carries no data.
        for line in mlst_fh:
            cols = _split_fields(line, "\t")
            accs = acc_pat.findall(cols[acc_col])
            if len(accs) > 1:
                _die(f"Found more than 1 accession in column:\n{cols[acc_col]}\n")
            acc = "".join(accs)

            if len(cols) == expected_cols and re.match(r"\d+|\-", cols[st_col]):
                found_usable_row = True
                # Rows without a recognizable accession can never be looked
                # up later, so they are not stored.
                if acc:
                    sts[acc] = "NULL" if cols[st_col].startswith("-") else cols[st_col]

    # NOTE(review): one qualifying row is enough to pass this check; rows of
    # any other shape are skipped silently. Confirm that is the intent.
    if not found_usable_row:
        _die(
            "Requested to incorporate MLST ST's but file"
            + f"\n{os.path.basename(mlst_res)}"
            + "\ndoes not have 10 columns in all rows."
        )

    return sts


def _index_snp_clusters(snp_file, sep, acc_pat):
    """Return a dict of genome accession (4th field) -> 1st field of `snp_file`.

    Every data row must have exactly 4 fields. Rows whose 4th field starts
    with "NULL" are skipped and reported at INFO level; any other value
    that does not look like an accession aborts the program.
    """
    acc2snp = {}
    skipped = set()

    with open(snp_file, "r") as snp_fh:
        snp_fh.readline()  # Header line carries no data.
        for line in snp_fh:
            cols = _split_fields(line, sep)
            if len(cols) != 4:
                _die(
                    f"The metadata file {snp_file} is malformed.\n"
                    + f"Expected 4 columns. Got {len(cols)} columns.\n"
                )

            genome_acc = cols[3]
            if genome_acc.startswith("NULL"):
                skipped.add(f"Isolate {cols[1]} has no genome accession: {genome_acc}")
            elif not acc_pat.match(genome_acc):
                _die("Did not find a genome accession in column 4." + f"\nLine: {line}")
            else:
                acc2snp[genome_acc] = cols[0]

    if skipped:
        logging.info(
            f"While indexing\n{os.path.basename(snp_file)},"
            + "\nthese isolates were skipped:\n\n"
            + "\n".join(skipped)
        )

    return acc2snp


def _index_pdg_metadata(meta_file, sep, wanted_cols, acc_pat, mlst_sts, acc_col, target_acc_col):
    """Return accession -> {column name: [values]} for the requested columns.

    `wanted_cols` must all be present in the header of `meta_file`. Rows
    whose field at index `acc_col` is not an accession are skipped and
    reported at INFO level. When an accession has an entry in `mlst_sts`,
    it is added under the key "mlst_sequence_type".
    """
    # Kept as defaultdict(defaultdict) so the pickled object has the same
    # type as before for whatever unpickles it.
    acc2meta = defaultdict(defaultdict)
    skipped = set()

    with open(meta_file, "r") as meta_fh:
        header = _split_fields(meta_fh.readline(), sep)
        missing = [name for name in wanted_cols if name not in header]
        if missing:
            _die(
                "The following columns do not exist in the"
                + f"\nmetadata file [ {os.path.basename(meta_file)} ]:\n"
                + ", ".join(missing)
            )

        wanted = set(wanted_cols)
        wanted_idx = [idx for idx, name in enumerate(header) if name in wanted]
        # Smallest row width for which every index used below is valid.
        min_width = max([acc_col] + wanted_idx) + 1

        for line in meta_fh:
            if not line.strip():
                continue  # Tolerate blank lines.
            cols = _split_fields(line, sep)
            if len(cols) < min_width:
                _die(
                    f"The metadata file {meta_file} is malformed.\n"
                    + f"Expected at least {min_width} columns. Got {len(cols)} columns.\n"
                    + f"Line: {line}"
                )

            assm_acc = cols[acc_col]
            if not acc_pat.match(assm_acc):
                isolate = cols[target_acc_col] if len(cols) > target_acc_col else "NA"
                skipped.add(f"Isolate {isolate} has no genome accession: {assm_acc}")
                continue

            record = acc2meta[assm_acc]
            if assm_acc in mlst_sts:
                record.setdefault("mlst_sequence_type", []).append(str(mlst_sts[assm_acc]))
            for idx in wanted_idx:
                record.setdefault(header[idx], []).append(str(cols[idx]))

    if skipped:
        logging.info(
            f"While indexing\n{os.path.basename(meta_file)},"
            + "\nthese isolates were skipped:\n\n"
            + "\n".join(skipped)
        )

    return acc2meta


# Main
def main() -> None:
    """
    This script works only in the context of `cronology_db` Nextflow workflow.
    It takes an UNIX path to directory containing the following files:
        1. PDG metadata file (Ex: `PDG000000043.204.metadata.tsv`)
        2. PDG SNP Cluster metadata file (Ex: `PDG000000043.204.reference_target.cluster_list.tsv`)
        3. A list of possibly downloadable assembly accessions (one per line) from the metadata file.
    and then generates a pickled file with relevant metadata columns mentioned with the -cols option.
    """
    # The docstring above doubles as the --help description, so developer
    # notes live here instead: arguments come from sys.argv, the index is
    # written to <out_prefix>.pickle in the current working directory, and
    # every validation failure is logged and ends in SystemExit(1). On
    # success the function simply returns.

    # Set logging.
    logging.basicConfig(
        format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-pdg_dir",
        dest="pdg_dir",
        required=True,
        help="Absolute UNIX path to directory containing the following files.\nEx:"
        + "\n1. PDG000000043.204.metadata.tsv"
        + "\n2. PDG000000043.204.reference_target.cluster_list.tsv"
        + "\n3. A file of assembly accessions, one per line parsed out from"
        + "\n   the metadata file.",
    )
    parser.add_argument(
        "-mlst",
        dest="mlst_results",
        required=False,
        help="Absolute UNIX path to MLST results file\nIf MLST results exists for a accession, they"
        + "\nwill be included in the index.",
    )
    parser.add_argument(
        "-pdg_meta_pat",
        dest="pdg_meta_pat",
        default=r"PDG\d+\.\d+\.metadata\.tsv",
        required=False,
        help="The pattern to be used to search for PDG metadata\nfile.",
    )
    parser.add_argument(
        "-pdg_snp_meta_pat",
        dest="pdg_snp_meta_pat",
        default=r"PDG\d+\.\d+\.reference\_target\.cluster\_list\.tsv",
        required=False,
        help="The pattern to be used to search for PDG SNP Cluster metadata\nfile.",
    )
    parser.add_argument(
        "-pdg_accs_filename_pat",
        dest="pdg_accs_fn_pat",
        default="accs_all.txt",
        required=False,
        help="The filename to look for where all the parsed GC[AF] accessions are stored,\n"
        + "one per line.",
    )
    parser.add_argument(
        "-cols",
        dest="metadata_cols",
        default="epi_type,collected_by,collection_date,host,"
        + "\nhost_disease,isolation_source,outbreak,sample_name,scientific_name,serovar,"
        + "\nsource_type,strain,computed_types,target_acc",
        required=False,
        help="The data in these metadata columns will be indexed for each\nisolate.",
    )
    parser.add_argument(
        "-fs",
        dest="force_write_pick",
        action="store_true",
        required=False,
        help="By default, the pickle file named <output prefix>.pickle"
        + "\nis written to CWD. If the file exists, the program will not overwrite"
        + "\nand exit. Use -fs option to overwrite.",
    )
    parser.add_argument(
        "-op",
        dest="out_prefix",
        default="IDXD_PDG_METAD",
        help="Set the output file prefix for indexed PDG metadata.",
    )
    parser.add_argument(
        "-pfs",
        dest="pdg_meta_fs",
        default="\t",
        help="Change the field separator of the PDG metadata file.",
    )

    args = parser.parse_args()
    pdg_dir = os.path.abspath(args.pdg_dir)
    f_write_pick = args.force_write_pick
    out_prefix = args.out_prefix
    pdg_meta_fs = args.pdg_meta_fs
    mlst_res = args.mlst_results
    acc_pat = re.compile(r"^GC[AF]\_\d+\.?\d*")
    mcols_pat = re.compile(r"\w+(?:,\w+)*")
    pdg_meta_pat = re.compile(args.pdg_meta_pat)
    pdg_snp_meta_pat = re.compile(args.pdg_snp_meta_pat)
    pdg_accs_fn_pat = re.compile(args.pdg_accs_fn_pat)
    # Fixed 0-based positions in the PDG metadata file.
    # NOTE(review): presumably the isolate identifier and the assembly
    # accession columns of the PDG layout -- confirm against a current file.
    target_acc_col = 41
    acc_col = 9
    # Layout of the MLST results file.
    acceptable_num_mlst_cols = 10
    mlst_st_col = 2
    mlst_acc_col = 0
    mlst_sts = dict()

    # Basic checks

    if not os.path.isdir(pdg_dir):
        _die(f"Directory path {pdg_dir} does not exist.")

    dir_listing = os.listdir(pdg_dir)
    meta_hits = [f for f in dir_listing if pdg_meta_pat.match(f)]
    snp_hits = [f for f in dir_listing if pdg_snp_meta_pat.match(f)]
    accs_hits = [f for f in dir_listing if pdg_accs_fn_pat.match(f)]
    hit_counts = [len(meta_hits), len(snp_hits), len(accs_hits)]

    if any(count > 1 for count in hit_counts):
        _die(
            f"Directory {os.path.basename(pdg_dir)}"
            + "\ncontains multiple files matching the search pattern."
        )
    if any(count == 0 for count in hit_counts):
        _die(
            f"Directory {os.path.basename(pdg_dir)} does not contain"
            + "\nany files matching the following file patterns:"
            + f"\n{pdg_meta_pat.pattern}"
            + f"\n{pdg_snp_meta_pat.pattern}"
            + f"\n{pdg_accs_fn_pat.pattern}"
        )

    pdg_meta_file = os.path.join(pdg_dir, meta_hits[0])
    pdg_snp_meta_file = os.path.join(pdg_dir, snp_hits[0])
    pdg_acc_all = os.path.join(pdg_dir, accs_hits[0])

    if mlst_res:
        # The file must exist AND be non-empty before it is parsed.
        if not _is_nonempty_file(mlst_res):
            _die(
                f"Requested to index MLST results. but the file {os.path.basename(mlst_res)}"
                + "\ndoes not exist or the file is empty."
            )
        mlst_sts = _read_mlst_sts(
            mlst_res, acc_pat, mlst_acc_col, mlst_st_col, acceptable_num_mlst_cols
        )

    # Accessions expected in the final index; blank lines are not accessions.
    with open(pdg_acc_all, "r") as pdg_acc_all_fh:
        num_accs_check = [acc.strip() for acc in pdg_acc_all_fh if acc.strip()]

    # The default value of -cols embeds newlines for help formatting only.
    mcols = args.metadata_cols.replace("\n", "")
    if not mcols_pat.fullmatch(mcols):
        _die(
            "Supplied columns' names should only be"
            + "\nalphanumeric (including _) separated by a comma."
        )
    mcols = mcols.split(",")

    # Both metadata files are needed to build the index.
    for needed_file in (pdg_snp_meta_file, pdg_meta_file):
        if not _is_nonempty_file(needed_file):
            _die(
                "Requested to create pickle from metadata, but\n"
                + f"the file, {os.path.basename(needed_file)} is empty or\ndoes not exist!"
            )

    init_pickled_sero = os.path.join(os.getcwd(), out_prefix + ".pickle")
    if _is_nonempty_file(init_pickled_sero) and not f_write_pick:
        _die(
            f"File {os.path.basename(init_pickled_sero)} already exists in\n{os.getcwd()}\n"
            + "Use -fs to force overwrite it."
        )

    acc2snp = _index_snp_clusters(pdg_snp_meta_file, pdg_meta_fs, acc_pat)
    acc2meta = _index_pdg_metadata(
        pdg_meta_file, pdg_meta_fs, mcols, acc_pat, mlst_sts, acc_col, target_acc_col
    )

    with open(init_pickled_sero, "wb") as write_pickled_sero:
        pickle.dump(file=write_pickled_sero, obj=acc2meta)

    # The pickle is written before this sanity check, as it always was.
    if len(num_accs_check) != len(acc2meta):
        _die(
            "Failed the accession count check."
            + f"\nExpected {len(num_accs_check)} accessions but got {len(acc2meta)}."
        )

    logging.info(
        f"Number of valid accessions: {len(num_accs_check)}"
        + f"\nNumber of accessions indexed: {len(acc2meta)}"
        + f"\nNumber of accessions participating in any of the SNP Clusters: {len(acc2snp)}"
        + f"\n\nCreated the pickle file for\n{os.path.basename(pdg_meta_file)}."
        + "\nThis was the only requested function."
    )
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
338
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
339
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
# Run the indexer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()