annotate 1.0.0/bin/gen_otf_genome.py @ 0:0a8dda29956e draft default tip

planemo upload
author galaxytrakr
date Thu, 28 May 2026 20:41:10 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env python3
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
2
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
3 # Kranti Konganti
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
4
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
5 import argparse
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
6 import glob
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
7 import gzip
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
8 import inspect
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
9 import logging
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
10 import os
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
11 import pprint
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
12 import re
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
13
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
14 # Set logging.
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
15 logging.basicConfig(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
16 format="\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
17 + "=" * 55
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
18 + "\n%(asctime)s - %(levelname)s\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
19 + "=" * 55
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
20 + "\n%(message)s\n\n",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
21 level=logging.DEBUG,
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
22 )
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
23
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
24 # Debug print.
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
25 ppp = pprint.PrettyPrinter(width=50, indent=4)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
26
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
27
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
28 # Multiple inheritence for pretty printing of help text.
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
29 class MultiArgFormatClasses(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
30 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
31 ):
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
32 pass
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
33
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
34
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
35 def main() -> None:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
36 """
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
37 This script works only in the context of `bettercallsal` Nextflow workflow.
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
38 It takes:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
39 1. A text file containing accessions or FASTA IDs, one per line and
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
40 then,
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
41 2. Searches for a genome FASTA file in gzipped format in specified
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
42 search path, where the prefix of the filename is the accession or
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
43 FASTA ID from 1. and then,
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
44 creates a new concatenated gzipped genome FASTA file with all the genomes
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
45 in the text file from 1.
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
46 """
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
47
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
48 prog_name = os.path.basename(inspect.stack()[0].filename)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
49
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
50 parser = argparse.ArgumentParser(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
51 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
52 )
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
53
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
54 required = parser.add_argument_group("required arguments")
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
55
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
56 required.add_argument(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
57 "-txt",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
58 dest="accs_txt",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
59 default=False,
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
60 required=True,
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
61 help="Absolute UNIX path to .txt file containing accessions\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
62 + "FASTA IDs, one per line.",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
63 )
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
64 required.add_argument(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
65 "-gd",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
66 dest="genomes_dir",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
67 default=False,
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
68 required=True,
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
69 help="Absolute UNIX path to a directory containing\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
70 + "gzipped genome FASTA files.\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
71 + "Required if -m is on.",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
72 )
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
73 parser.add_argument(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
74 "-gds",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
75 dest="genomes_dir_suffix",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
76 default="_scaffolded_genomic.fna.gz",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
77 required=False,
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
78 help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
79 )
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
80 parser.add_argument(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
81 "-op",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
82 dest="out_prefix",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
83 default="CATTED_GENOMES",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
84 help="Set the output file prefix for .fna.gz and .txt\n" + "files.",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
85 )
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
86 parser.add_argument(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
87 "-txts",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
88 dest="accs_suffix",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
89 default="_template_hits.txt",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
90 required=False,
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
91 help="The suffix of the file supplied with -txt option. It is assumed that the\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
92 + "sample name is present in the file supplied with -txt option and the suffix\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
93 + "will be stripped and stored in a file that logs samples which have no hits.",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
94 )
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
95 parser.add_argument(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
96 "-frag_delim",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
97 dest="frag_delim",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
98 default="\t",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
99 required=False,
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
100 help="The delimitor by which the fields are separated in *_frag.gz file.",
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
101 )
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
102
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
103 args = parser.parse_args()
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
104 accs_txt = args.accs_txt
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
105 genomes_dir = args.genomes_dir
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
106 genomes_dir_suffix = args.genomes_dir_suffix
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
107 out_prefix = args.out_prefix
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
108 accs_suffix = args.accs_suffix
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
109 frag_delim = args.frag_delim
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
110 accs_seen = dict()
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
111 cat_genomes_gz = os.path.join(os.getcwd(), out_prefix + "_" + genomes_dir_suffix)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
112 cat_genomes_gz = re.sub("__", "_", str(cat_genomes_gz))
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
113 frags_gz = os.path.join(os.getcwd(), out_prefix + ".frag.gz")
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
114 cat_reads_gz = os.path.join(os.getcwd(), out_prefix + "_aln_reads.fna.gz")
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
115 cat_reads_gz = re.sub("__", "_", cat_reads_gz)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
116
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
117 if (
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
118 accs_txt
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
119 and os.path.exists(cat_genomes_gz)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
120 and os.path.getsize(cat_genomes_gz) > 0
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
121 ):
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
122 logging.error(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
123 "A concatenated genome FASTA file,\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
124 + f"{os.path.basename(cat_genomes_gz)} already exists in:\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
125 + f"{os.getcwd()}\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
126 + "Please remove or move it as we will not "
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
127 + "overwrite it."
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
128 )
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
129 exit(1)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
130
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
131 if accs_txt and (not os.path.exists(accs_txt) or not os.path.getsize(accs_txt) > 0):
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
132 logging.error("File,\n" + f"{accs_txt}\ndoes not exist " + "or is empty!")
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
133 failed_sample_name = re.sub(accs_suffix, "", os.path.basename(accs_txt))
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
134 with open(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
135 os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
136 ) as failed_sample_fh:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
137 failed_sample_fh.write(f"{failed_sample_name}\n")
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
138 failed_sample_fh.close()
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
139 exit(0)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
140
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
141 if genomes_dir:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
142 if not os.path.isdir(genomes_dir):
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
143 logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
144 exit(1)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
145 if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
146 logging.error(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
147 "Genomes directory"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
148 + f"{genomes_dir}"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
149 + "\ndoes not seem to have any\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
150 + f"files ending with suffix: {genomes_dir_suffix}"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
151 )
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
152 exit(1)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
153
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
154 # ppp.pprint(mash_hits)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
155 empty_lines = 0
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
156 empty_lines_msg = ""
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
157 with open(cat_genomes_gz, "wb") as genomes_out_gz:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
158 with open(accs_txt, "r") as accs_txt_fh:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
159 for line in accs_txt_fh:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
160 if line in ["\n", "\n\r"]:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
161 empty_lines += 1
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
162 continue
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
163 else:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
164 line = line.strip()
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
165
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
166 if line in accs_seen.keys():
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
167 continue
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
168 else:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
169 accs_seen[line] = 1
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
170
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
171 genome_file = os.path.join(genomes_dir, line + genomes_dir_suffix)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
172
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
173 if (
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
174 not os.path.exists(genome_file)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
175 or os.path.getsize(genome_file) <= 0
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
176 ):
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
177 logging.error(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
178 f"Genome file {os.path.basename(genome_file)} does not\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
179 + "exits or is empty!"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
180 )
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
181 exit(1)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
182 else:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
183 with open(genome_file, "rb") as genome_file_h:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
184 genomes_out_gz.writelines(genome_file_h.readlines())
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
185 genome_file_h.close()
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
186 accs_txt_fh.close()
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
187 genomes_out_gz.close()
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
188
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
189 if (
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
190 len(accs_seen.keys()) > 0
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
191 and os.path.exists(frags_gz)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
192 and os.path.getsize(frags_gz) > 0
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
193 ):
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
194 with gzip.open(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
195 cat_reads_gz, "wt", encoding="utf-8", compresslevel=6
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
196 ) as cat_reads_gz_fh:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
197 with gzip.open(frags_gz, "rb", compresslevel=6) as fragz_gz_fh:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
198 for frag_line in fragz_gz_fh:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
199 frag_lines = frag_line.decode("utf-8").strip().split(frag_delim)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
200 # Per KMA specification, 6=template, 7=query, 1=read
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
201 cat_reads_gz_fh.write(f">{frag_lines[6]}\n{frag_lines[0]}\n")
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
202 fragz_gz_fh.close()
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
203 cat_reads_gz_fh.close()
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
204
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
205 if empty_lines > 0:
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
206 empty_lines_msg = f"Skipped {empty_lines} empty line(s).\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
207
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
208 logging.info(
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
209 empty_lines_msg
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
210 + f"File {os.path.basename(cat_genomes_gz)}\n"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
211 + f"written in:\n{os.getcwd()}\nDone! Bye!"
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
212 )
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
213 exit(0)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
214
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
215
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
216 if __name__ == "__main__":
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
217 main()