comparison 0.5.0/bin/gen_otf_genome.py @ 0:97cd2f532efe

planemo upload
author kkonganti
date Mon, 31 Mar 2025 14:50:40 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:97cd2f532efe
1 #!/usr/bin/env python3
2
3 # Kranti Konganti
4
5 import argparse
6 import glob
7 import gzip
8 import inspect
9 import logging
10 import os
11 import pprint
12 import re
13
14 # Set logging.
15 logging.basicConfig(
16 format="\n"
17 + "=" * 55
18 + "\n%(asctime)s - %(levelname)s\n"
19 + "=" * 55
20 + "\n%(message)s\n\n",
21 level=logging.DEBUG,
22 )
23
24 # Debug print.
25 ppp = pprint.PrettyPrinter(width=50, indent=4)
26
27
28 # Multiple inheritence for pretty printing of help text.
29 class MultiArgFormatClasses(
30 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
31 ):
32 pass
33
34
35 def main() -> None:
36 """
37 This script works only in the context of a Nextflow workflow.
38 It takes:
39 1. A text file containing accessions or FASTA IDs, one per line and
40 then,
41 2. Optionally, searches for a genome FASTA file in gzipped format in specified
42 search path, where the prefix of the filename is the accession or
43 FASTA ID from 1. and then, creates a new concatenated gzipped genome FASTA
44 file with all the genomes in the text file from 1.
45 3. Creates a new FASTQ file with reads aligned to the accessions in the text
46 file from 1.
47 """
48
49 prog_name = os.path.basename(inspect.stack()[0].filename)
50
51 parser = argparse.ArgumentParser(
52 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
53 )
54
55 required = parser.add_argument_group("required arguments")
56
57 required.add_argument(
58 "-txt",
59 dest="accs_txt",
60 default=False,
61 required=True,
62 help="Absolute UNIX path to .txt file containing accessions\n"
63 + "FASTA IDs, one per line.",
64 )
65 required.add_argument(
66 "-op",
67 dest="out_prefix",
68 default="CATTED_GENOMES",
69 required=True,
70 help="Set the output file prefix for .fna.gz and .txt\n" + "files.",
71 )
72 parser.add_argument(
73 "-gd",
74 dest="genomes_dir",
75 default=False,
76 required=False,
77 help="Absolute UNIX path to a directory containing\n"
78 + "gzipped genome FASTA files or a file.\n",
79 )
80 parser.add_argument(
81 "-gds",
82 dest="genomes_dir_suffix",
83 default="_scaffolded_genomic.fna.gz",
84 required=False,
85 help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
86 )
87 parser.add_argument(
88 "-query",
89 dest="id_is_query",
90 default=False,
91 action="store_true",
92 required=False,
93 help="In the produced FASTQ file, should the FASTA ID should be of KMA query ID\n"
94 + "or template ID.",
95 )
96 parser.add_argument(
97 "-txts",
98 dest="accs_suffix",
99 default="_template_hits.txt",
100 required=False,
101 help="The suffix of the file supplied with -txt option. It is assumed that the\n"
102 + "sample name is present in the file supplied with -txt option and the suffix\n"
103 + "will be stripped and stored in a file that logs samples which have no hits.",
104 )
105 parser.add_argument(
106 "-frag_delim",
107 dest="frag_delim",
108 default="\t",
109 required=False,
110 help="The delimitor by which the fields are separated in *_frag.gz file.",
111 )
112
113 args = parser.parse_args()
114 accs_txt = args.accs_txt
115 genomes_dir = args.genomes_dir
116 genomes_dir_suffix = args.genomes_dir_suffix
117 id_is_query = args.id_is_query
118 out_prefix = args.out_prefix
119 accs_suffix = args.accs_suffix
120 frag_delim = args.frag_delim
121 accs_seen = dict()
122 cat_genomes_gz = os.path.join(os.getcwd(), out_prefix + "_" + genomes_dir_suffix)
123 cat_genomes_gz = re.sub("__", "_", str(cat_genomes_gz))
124 frags_gz = os.path.join(os.getcwd(), out_prefix + ".frag.gz")
125 cat_reads_gz = os.path.join(os.getcwd(), out_prefix + "_aln_reads.fna.gz")
126 cat_reads_gz = re.sub("__", "_", cat_reads_gz)
127
128 if (
129 accs_txt
130 and os.path.exists(cat_genomes_gz)
131 and os.path.getsize(cat_genomes_gz) > 0
132 ):
133 logging.error(
134 "A concatenated genome FASTA file,\n"
135 + f"{os.path.basename(cat_genomes_gz)} already exists in:\n"
136 + f"{os.getcwd()}\n"
137 + "Please remove or move it as we will not "
138 + "overwrite it."
139 )
140 exit(1)
141
142 if accs_txt and (not os.path.exists(accs_txt) or not os.path.getsize(accs_txt) > 0):
143 logging.error("File,\n" + f"{accs_txt}\ndoes not exist " + "or is empty!")
144 failed_sample_name = re.sub(accs_suffix, "", os.path.basename(accs_txt))
145 with open(
146 os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
147 ) as failed_sample_fh:
148 failed_sample_fh.write(f"{failed_sample_name}\n")
149 failed_sample_fh.close()
150 exit(0)
151
152 # ppp.pprint(mash_hits)
153 empty_lines = 0
154 empty_lines_msg = ""
155
156 with open(accs_txt, "r") as accs_txt_fh:
157 for line in accs_txt_fh:
158 if line in ["\n", "\n\r"]:
159 empty_lines += 1
160 continue
161 else:
162 line = line.strip()
163
164 if line in accs_seen.keys():
165 continue
166 else:
167 accs_seen[line] = 1
168 accs_txt_fh.close()
169
170 if genomes_dir:
171 if not os.path.isdir(genomes_dir):
172 logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
173 exit(1)
174 if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
175 logging.error(
176 "Genomes directory"
177 + f"{genomes_dir}"
178 + "\ndoes not seem to have any\n"
179 + f"files ending with suffix: {genomes_dir_suffix}"
180 )
181 exit(1)
182
183 with open(cat_genomes_gz, "wb") as genomes_out_gz:
184 for line in accs_seen.keys():
185 genome_file = os.path.join(genomes_dir, line + genomes_dir_suffix)
186
187 if not os.path.exists(genome_file) or os.path.getsize(genome_file) <= 0:
188 logging.error(
189 f"Genome file {os.path.basename(genome_file)} does not\n"
190 + "exits or is empty!"
191 )
192 exit(1)
193 else:
194 with open(genome_file, "rb") as genome_file_h:
195 genomes_out_gz.writelines(genome_file_h.readlines())
196 genome_file_h.close()
197 genomes_out_gz.close()
198
199 if (
200 len(accs_seen.keys()) > 0
201 and os.path.exists(frags_gz)
202 and os.path.getsize(frags_gz) > 0
203 ):
204 with gzip.open(
205 cat_reads_gz, "wt", encoding="utf-8", compresslevel=6
206 ) as cat_reads_gz_fh:
207 with gzip.open(frags_gz, "rb", compresslevel=6) as fragz_gz_fh:
208 fasta_id = 7 if id_is_query else 6
209 for frag_line in fragz_gz_fh:
210 frag_lines = frag_line.decode("utf-8").strip().split(frag_delim)
211 # Per KMA specification, 6=template, 7=query, 1=read
212 cat_reads_gz_fh.write(f">{frag_lines[fasta_id]}\n{frag_lines[0]}\n")
213 fragz_gz_fh.close()
214 cat_reads_gz_fh.close()
215
216 if empty_lines > 0:
217 empty_lines_msg = f"Skipped {empty_lines} empty line(s).\n"
218
219 logging.info(
220 empty_lines_msg
221 + f"File {os.path.basename(cat_genomes_gz)}\n"
222 + f"written in:\n{os.getcwd()}\nDone! Bye!"
223 )
224
225
226 if __name__ == "__main__":
227 main()