kkonganti@17
|
1 #!/usr/bin/env python3
|
kkonganti@17
|
2
|
kkonganti@17
|
3 # Kranti Konganti
|
kkonganti@17
|
4
|
kkonganti@17
|
5 import argparse
|
kkonganti@17
|
6 import glob
|
kkonganti@17
|
7 import gzip
|
kkonganti@17
|
8 import inspect
|
kkonganti@17
|
9 import logging
|
kkonganti@17
|
10 import os
|
kkonganti@17
|
11 import pprint
|
kkonganti@17
|
12 import re
|
kkonganti@17
|
13
|
kkonganti@17
|
14 # Set logging.
|
kkonganti@17
|
15 logging.basicConfig(
|
kkonganti@17
|
16 format="\n"
|
kkonganti@17
|
17 + "=" * 55
|
kkonganti@17
|
18 + "\n%(asctime)s - %(levelname)s\n"
|
kkonganti@17
|
19 + "=" * 55
|
kkonganti@17
|
20 + "\n%(message)s\n\n",
|
kkonganti@17
|
21 level=logging.DEBUG,
|
kkonganti@17
|
22 )
|
kkonganti@17
|
23
|
kkonganti@17
|
24 # Debug print.
|
kkonganti@17
|
25 ppp = pprint.PrettyPrinter(width=50, indent=4)
|
kkonganti@17
|
26
|
kkonganti@17
|
27
|
kkonganti@17
|
28 # Multiple inheritence for pretty printing of help text.
|
kkonganti@17
|
29 class MultiArgFormatClasses(
|
kkonganti@17
|
30 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
|
kkonganti@17
|
31 ):
|
kkonganti@17
|
32 pass
|
kkonganti@17
|
33
|
kkonganti@17
|
34
|
kkonganti@17
|
35 def main() -> None:
|
kkonganti@17
|
36 """
|
kkonganti@17
|
37 This script works only in the context of `bettercallsal` Nextflow workflow.
|
kkonganti@17
|
38 It takes:
|
kkonganti@17
|
39 1. A text file containing accessions or FASTA IDs, one per line and
|
kkonganti@17
|
40 then,
|
kkonganti@17
|
41 2. Searches for a genome FASTA file in gzipped format in specified
|
kkonganti@17
|
42 search path, where the prefix of the filename is the accession or
|
kkonganti@17
|
43 FASTA ID from 1. and then,
|
kkonganti@17
|
44 creates a new concatenated gzipped genome FASTA file with all the genomes
|
kkonganti@17
|
45 in the text file from 1.
|
kkonganti@17
|
46 """
|
kkonganti@17
|
47
|
kkonganti@17
|
48 prog_name = os.path.basename(inspect.stack()[0].filename)
|
kkonganti@17
|
49
|
kkonganti@17
|
50 parser = argparse.ArgumentParser(
|
kkonganti@17
|
51 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
|
kkonganti@17
|
52 )
|
kkonganti@17
|
53
|
kkonganti@17
|
54 required = parser.add_argument_group("required arguments")
|
kkonganti@17
|
55
|
kkonganti@17
|
56 required.add_argument(
|
kkonganti@17
|
57 "-txt",
|
kkonganti@17
|
58 dest="accs_txt",
|
kkonganti@17
|
59 default=False,
|
kkonganti@17
|
60 required=True,
|
kkonganti@17
|
61 help="Absolute UNIX path to .txt file containing accessions\n"
|
kkonganti@17
|
62 + "FASTA IDs, one per line.",
|
kkonganti@17
|
63 )
|
kkonganti@17
|
64 required.add_argument(
|
kkonganti@17
|
65 "-gd",
|
kkonganti@17
|
66 dest="genomes_dir",
|
kkonganti@17
|
67 default=False,
|
kkonganti@17
|
68 required=True,
|
kkonganti@17
|
69 help="Absolute UNIX path to a directory containing\n"
|
kkonganti@17
|
70 + "gzipped genome FASTA files.\n"
|
kkonganti@17
|
71 + "Required if -m is on.",
|
kkonganti@17
|
72 )
|
kkonganti@17
|
73 parser.add_argument(
|
kkonganti@17
|
74 "-gds",
|
kkonganti@17
|
75 dest="genomes_dir_suffix",
|
kkonganti@17
|
76 default="_scaffolded_genomic.fna.gz",
|
kkonganti@17
|
77 required=False,
|
kkonganti@17
|
78 help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
|
kkonganti@17
|
79 )
|
kkonganti@17
|
80 parser.add_argument(
|
kkonganti@17
|
81 "-op",
|
kkonganti@17
|
82 dest="out_prefix",
|
kkonganti@17
|
83 default="CATTED_GENOMES",
|
kkonganti@17
|
84 help="Set the output file prefix for .fna.gz and .txt\n" + "files.",
|
kkonganti@17
|
85 )
|
kkonganti@17
|
86 parser.add_argument(
|
kkonganti@17
|
87 "-txts",
|
kkonganti@17
|
88 dest="accs_suffix",
|
kkonganti@17
|
89 default="_template_hits.txt",
|
kkonganti@17
|
90 required=False,
|
kkonganti@17
|
91 help="The suffix of the file supplied with -txt option. It is assumed that the\n"
|
kkonganti@17
|
92 + "sample name is present in the file supplied with -txt option and the suffix\n"
|
kkonganti@17
|
93 + "will be stripped and stored in a file that logs samples which have no hits.",
|
kkonganti@17
|
94 )
|
kkonganti@17
|
95 parser.add_argument(
|
kkonganti@17
|
96 "-frag_delim",
|
kkonganti@17
|
97 dest="frag_delim",
|
kkonganti@17
|
98 default="\t",
|
kkonganti@17
|
99 required=False,
|
kkonganti@17
|
100 help="The delimitor by which the fields are separated in *_frag.gz file.",
|
kkonganti@17
|
101 )
|
kkonganti@17
|
102
|
kkonganti@17
|
103 args = parser.parse_args()
|
kkonganti@17
|
104 accs_txt = args.accs_txt
|
kkonganti@17
|
105 genomes_dir = args.genomes_dir
|
kkonganti@17
|
106 genomes_dir_suffix = args.genomes_dir_suffix
|
kkonganti@17
|
107 out_prefix = args.out_prefix
|
kkonganti@17
|
108 accs_suffix = args.accs_suffix
|
kkonganti@17
|
109 frag_delim = args.frag_delim
|
kkonganti@17
|
110 accs_seen = dict()
|
kkonganti@17
|
111 cat_genomes_gz = os.path.join(os.getcwd(), out_prefix + "_" + genomes_dir_suffix)
|
kkonganti@17
|
112 cat_genomes_gz = re.sub("__", "_", str(cat_genomes_gz))
|
kkonganti@17
|
113 frags_gz = os.path.join(os.getcwd(), out_prefix + ".frag.gz")
|
kkonganti@17
|
114 cat_reads_gz = os.path.join(os.getcwd(), out_prefix + "_aln_reads.fna.gz")
|
kkonganti@17
|
115 cat_reads_gz = re.sub("__", "_", cat_reads_gz)
|
kkonganti@17
|
116
|
kkonganti@17
|
117 if (
|
kkonganti@17
|
118 accs_txt
|
kkonganti@17
|
119 and os.path.exists(cat_genomes_gz)
|
kkonganti@17
|
120 and os.path.getsize(cat_genomes_gz) > 0
|
kkonganti@17
|
121 ):
|
kkonganti@17
|
122 logging.error(
|
kkonganti@17
|
123 "A concatenated genome FASTA file,\n"
|
kkonganti@17
|
124 + f"{os.path.basename(cat_genomes_gz)} already exists in:\n"
|
kkonganti@17
|
125 + f"{os.getcwd()}\n"
|
kkonganti@17
|
126 + "Please remove or move it as we will not "
|
kkonganti@17
|
127 + "overwrite it."
|
kkonganti@17
|
128 )
|
kkonganti@17
|
129 exit(1)
|
kkonganti@17
|
130
|
kkonganti@17
|
131 if accs_txt and (not os.path.exists(accs_txt) or not os.path.getsize(accs_txt) > 0):
|
kkonganti@17
|
132 logging.error("File,\n" + f"{accs_txt}\ndoes not exist " + "or is empty!")
|
kkonganti@17
|
133 failed_sample_name = re.sub(accs_suffix, "", os.path.basename(accs_txt))
|
kkonganti@17
|
134 with open(
|
kkonganti@17
|
135 os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
|
kkonganti@17
|
136 ) as failed_sample_fh:
|
kkonganti@17
|
137 failed_sample_fh.write(f"{failed_sample_name}\n")
|
kkonganti@17
|
138 failed_sample_fh.close()
|
kkonganti@17
|
139 exit(0)
|
kkonganti@17
|
140
|
kkonganti@17
|
141 if genomes_dir:
|
kkonganti@17
|
142 if not os.path.isdir(genomes_dir):
|
kkonganti@17
|
143 logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
|
kkonganti@17
|
144 exit(1)
|
kkonganti@17
|
145 if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
|
kkonganti@17
|
146 logging.error(
|
kkonganti@17
|
147 "Genomes directory"
|
kkonganti@17
|
148 + f"{genomes_dir}"
|
kkonganti@17
|
149 + "\ndoes not seem to have any\n"
|
kkonganti@17
|
150 + f"files ending with suffix: {genomes_dir_suffix}"
|
kkonganti@17
|
151 )
|
kkonganti@17
|
152 exit(1)
|
kkonganti@17
|
153
|
kkonganti@17
|
154 # ppp.pprint(mash_hits)
|
kkonganti@17
|
155 empty_lines = 0
|
kkonganti@17
|
156 empty_lines_msg = ""
|
kkonganti@17
|
157 with open(cat_genomes_gz, "wb") as genomes_out_gz:
|
kkonganti@17
|
158 with open(accs_txt, "r") as accs_txt_fh:
|
kkonganti@17
|
159 for line in accs_txt_fh:
|
kkonganti@17
|
160 if line in ["\n", "\n\r"]:
|
kkonganti@17
|
161 empty_lines += 1
|
kkonganti@17
|
162 continue
|
kkonganti@17
|
163 else:
|
kkonganti@17
|
164 line = line.strip()
|
kkonganti@17
|
165
|
kkonganti@17
|
166 if line in accs_seen.keys():
|
kkonganti@17
|
167 continue
|
kkonganti@17
|
168 else:
|
kkonganti@17
|
169 accs_seen[line] = 1
|
kkonganti@17
|
170
|
kkonganti@17
|
171 genome_file = os.path.join(genomes_dir, line + genomes_dir_suffix)
|
kkonganti@17
|
172
|
kkonganti@17
|
173 if (
|
kkonganti@17
|
174 not os.path.exists(genome_file)
|
kkonganti@17
|
175 or os.path.getsize(genome_file) <= 0
|
kkonganti@17
|
176 ):
|
kkonganti@17
|
177 logging.error(
|
kkonganti@17
|
178 f"Genome file {os.path.basename(genome_file)} does not\n"
|
kkonganti@17
|
179 + "exits or is empty!"
|
kkonganti@17
|
180 )
|
kkonganti@17
|
181 exit(1)
|
kkonganti@17
|
182 else:
|
kkonganti@17
|
183 with open(genome_file, "rb") as genome_file_h:
|
kkonganti@17
|
184 genomes_out_gz.writelines(genome_file_h.readlines())
|
kkonganti@17
|
185 genome_file_h.close()
|
kkonganti@17
|
186 accs_txt_fh.close()
|
kkonganti@17
|
187 genomes_out_gz.close()
|
kkonganti@17
|
188
|
kkonganti@17
|
189 if (
|
kkonganti@17
|
190 len(accs_seen.keys()) > 0
|
kkonganti@17
|
191 and os.path.exists(frags_gz)
|
kkonganti@17
|
192 and os.path.getsize(frags_gz) > 0
|
kkonganti@17
|
193 ):
|
kkonganti@17
|
194 with gzip.open(
|
kkonganti@17
|
195 cat_reads_gz, "wt", encoding="utf-8", compresslevel=6
|
kkonganti@17
|
196 ) as cat_reads_gz_fh:
|
kkonganti@17
|
197 with gzip.open(frags_gz, "rb", compresslevel=6) as fragz_gz_fh:
|
kkonganti@17
|
198 for frag_line in fragz_gz_fh:
|
kkonganti@17
|
199 frag_lines = frag_line.decode("utf-8").strip().split(frag_delim)
|
kkonganti@17
|
200 # Per KMA specification, 6=template, 7=query, 1=read
|
kkonganti@17
|
201 cat_reads_gz_fh.write(f">{frag_lines[6]}\n{frag_lines[0]}\n")
|
kkonganti@17
|
202 fragz_gz_fh.close()
|
kkonganti@17
|
203 cat_reads_gz_fh.close()
|
kkonganti@17
|
204
|
kkonganti@17
|
205 if empty_lines > 0:
|
kkonganti@17
|
206 empty_lines_msg = f"Skipped {empty_lines} empty line(s).\n"
|
kkonganti@17
|
207
|
kkonganti@17
|
208 logging.info(
|
kkonganti@17
|
209 empty_lines_msg
|
kkonganti@17
|
210 + f"File {os.path.basename(cat_genomes_gz)}\n"
|
kkonganti@17
|
211 + f"written in:\n{os.getcwd()}\nDone! Bye!"
|
kkonganti@17
|
212 )
|
kkonganti@17
|
213 exit(0)
|
kkonganti@17
|
214
|
kkonganti@17
|
215
|
kkonganti@17
|
216 if __name__ == "__main__":
|
kkonganti@17
|
217 main()
|