kkonganti@11
|
1 #!/usr/bin/env python3
|
kkonganti@11
|
2
|
kkonganti@11
|
3 # Kranti Konganti
|
kkonganti@11
|
4
|
kkonganti@11
|
5 import argparse
|
kkonganti@11
|
6 import glob
|
kkonganti@11
|
7 import gzip
|
kkonganti@11
|
8 import inspect
|
kkonganti@11
|
9 import logging
|
kkonganti@11
|
10 import os
|
kkonganti@11
|
11 import pprint
|
kkonganti@11
|
12 import re
|
kkonganti@11
|
13
|
kkonganti@11
|
14 # Set logging.
|
kkonganti@11
|
15 logging.basicConfig(
|
kkonganti@11
|
16 format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
|
kkonganti@11
|
17 level=logging.DEBUG,
|
kkonganti@11
|
18 )
|
kkonganti@11
|
19
|
kkonganti@11
|
20 # Debug print.
|
kkonganti@11
|
21 ppp = pprint.PrettyPrinter(width=50, indent=4)
|
kkonganti@11
|
22
|
kkonganti@11
|
23 # Multiple inheritence for pretty printing of help text.
|
kkonganti@11
|
24 class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
|
kkonganti@11
|
25 pass
|
kkonganti@11
|
26
|
kkonganti@11
|
27
|
kkonganti@11
|
28 def main() -> None:
|
kkonganti@11
|
29 """
|
kkonganti@11
|
30 This script works only in the context of `bettercallsal` Nextflow workflow.
|
kkonganti@11
|
31 It takes:
|
kkonganti@11
|
32 1. A text file containing accessions or FASTA IDs, one per line and
|
kkonganti@11
|
33 then,
|
kkonganti@11
|
34 2. Searches for a genome FASTA file in gzipped format in specified
|
kkonganti@11
|
35 search path, where the prefix of the filename is the accession or
|
kkonganti@11
|
36 FASTA ID from 1. and then,
|
kkonganti@11
|
37 creates a new concatenated gzipped genome FASTA file with all the genomes
|
kkonganti@11
|
38 in the text file from 1.
|
kkonganti@11
|
39 """
|
kkonganti@11
|
40
|
kkonganti@11
|
41 prog_name = os.path.basename(inspect.stack()[0].filename)
|
kkonganti@11
|
42
|
kkonganti@11
|
43 parser = argparse.ArgumentParser(
|
kkonganti@11
|
44 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
|
kkonganti@11
|
45 )
|
kkonganti@11
|
46
|
kkonganti@11
|
47 required = parser.add_argument_group("required arguments")
|
kkonganti@11
|
48
|
kkonganti@11
|
49 required.add_argument(
|
kkonganti@11
|
50 "-txt",
|
kkonganti@11
|
51 dest="accs_txt",
|
kkonganti@11
|
52 default=False,
|
kkonganti@11
|
53 required=True,
|
kkonganti@11
|
54 help="Absolute UNIX path to .txt file containing accessions\n" + "FASTA IDs, one per line.",
|
kkonganti@11
|
55 )
|
kkonganti@11
|
56 required.add_argument(
|
kkonganti@11
|
57 "-gd",
|
kkonganti@11
|
58 dest="genomes_dir",
|
kkonganti@11
|
59 default=False,
|
kkonganti@11
|
60 required=True,
|
kkonganti@11
|
61 help="Absolute UNIX path to a directory containing\n"
|
kkonganti@11
|
62 + "gzipped genome FASTA files.\n"
|
kkonganti@11
|
63 + "Required if -m is on.",
|
kkonganti@11
|
64 )
|
kkonganti@11
|
65 parser.add_argument(
|
kkonganti@11
|
66 "-gds",
|
kkonganti@11
|
67 dest="genomes_dir_suffix",
|
kkonganti@11
|
68 default="_scaffolded_genomic.fna.gz",
|
kkonganti@11
|
69 required=False,
|
kkonganti@11
|
70 help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
|
kkonganti@11
|
71 )
|
kkonganti@11
|
72 parser.add_argument(
|
kkonganti@11
|
73 "-op",
|
kkonganti@11
|
74 dest="out_prefix",
|
kkonganti@11
|
75 default="CATTED_GENOMES",
|
kkonganti@11
|
76 help="Set the output file prefix for .fna.gz and .txt\n" + "files.",
|
kkonganti@11
|
77 )
|
kkonganti@11
|
78 parser.add_argument(
|
kkonganti@11
|
79 "-txts",
|
kkonganti@11
|
80 dest="accs_suffix",
|
kkonganti@11
|
81 default="_template_hits.txt",
|
kkonganti@11
|
82 required=False,
|
kkonganti@11
|
83 help="The suffix of the file supplied with -txt option. It is assumed that the\n"
|
kkonganti@11
|
84 + "sample name is present in the file supplied with -txt option and the suffix\n"
|
kkonganti@11
|
85 + "will be stripped and stored in a file that logs samples which have no hits.",
|
kkonganti@11
|
86 )
|
kkonganti@11
|
87 parser.add_argument(
|
kkonganti@11
|
88 "-frag_delim",
|
kkonganti@11
|
89 dest="frag_delim",
|
kkonganti@11
|
90 default="\t",
|
kkonganti@11
|
91 required=False,
|
kkonganti@11
|
92 help="The delimitor by which the fields are separated in *_frag.gz file.",
|
kkonganti@11
|
93 )
|
kkonganti@11
|
94
|
kkonganti@11
|
95 args = parser.parse_args()
|
kkonganti@11
|
96 accs_txt = args.accs_txt
|
kkonganti@11
|
97 genomes_dir = args.genomes_dir
|
kkonganti@11
|
98 genomes_dir_suffix = args.genomes_dir_suffix
|
kkonganti@11
|
99 out_prefix = args.out_prefix
|
kkonganti@11
|
100 accs_suffix = args.accs_suffix
|
kkonganti@11
|
101 frag_delim = args.frag_delim
|
kkonganti@11
|
102 accs_seen = dict()
|
kkonganti@11
|
103 cat_genomes_gz = os.path.join(os.getcwd(), out_prefix + "_" + genomes_dir_suffix)
|
kkonganti@11
|
104 cat_genomes_gz = re.sub("__", "_", str(cat_genomes_gz))
|
kkonganti@11
|
105 frags_gz = os.path.join(os.getcwd(), out_prefix + ".frag.gz")
|
kkonganti@11
|
106 cat_reads_gz = os.path.join(os.getcwd(), out_prefix + "_aln_reads.fna.gz")
|
kkonganti@11
|
107 cat_reads_gz = re.sub("__", "_", cat_reads_gz)
|
kkonganti@11
|
108
|
kkonganti@11
|
109 if accs_txt and os.path.exists(cat_genomes_gz) and os.path.getsize(cat_genomes_gz) > 0:
|
kkonganti@11
|
110 logging.error(
|
kkonganti@11
|
111 "A concatenated genome FASTA file,\n"
|
kkonganti@11
|
112 + f"{os.path.basename(cat_genomes_gz)} already exists in:\n"
|
kkonganti@11
|
113 + f"{os.getcwd()}\n"
|
kkonganti@11
|
114 + "Please remove or move it as we will not "
|
kkonganti@11
|
115 + "overwrite it."
|
kkonganti@11
|
116 )
|
kkonganti@11
|
117 exit(1)
|
kkonganti@11
|
118
|
kkonganti@11
|
119 if accs_txt and (not os.path.exists(accs_txt) or not os.path.getsize(accs_txt) > 0):
|
kkonganti@11
|
120 logging.error("File,\n" + f"{accs_txt}\ndoes not exist " + "or is empty!")
|
kkonganti@11
|
121 failed_sample_name = re.sub(accs_suffix, "", os.path.basename(accs_txt))
|
kkonganti@11
|
122 with open(
|
kkonganti@11
|
123 os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
|
kkonganti@11
|
124 ) as failed_sample_fh:
|
kkonganti@11
|
125 failed_sample_fh.write(f"{failed_sample_name}\n")
|
kkonganti@11
|
126 failed_sample_fh.close()
|
kkonganti@11
|
127 exit(0)
|
kkonganti@11
|
128
|
kkonganti@11
|
129 if genomes_dir:
|
kkonganti@11
|
130 if not os.path.isdir(genomes_dir):
|
kkonganti@11
|
131 logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
|
kkonganti@11
|
132 exit(1)
|
kkonganti@11
|
133 if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
|
kkonganti@11
|
134 logging.error(
|
kkonganti@11
|
135 "Genomes directory"
|
kkonganti@11
|
136 + f"{genomes_dir}"
|
kkonganti@11
|
137 + "\ndoes not seem to have any\n"
|
kkonganti@11
|
138 + f"files ending with suffix: {genomes_dir_suffix}"
|
kkonganti@11
|
139 )
|
kkonganti@11
|
140 exit(1)
|
kkonganti@11
|
141
|
kkonganti@11
|
142 # ppp.pprint(mash_hits)
|
kkonganti@11
|
143 empty_lines = 0
|
kkonganti@11
|
144 empty_lines_msg = ""
|
kkonganti@11
|
145 with open(cat_genomes_gz, "wb") as genomes_out_gz:
|
kkonganti@11
|
146 with open(accs_txt, "r") as accs_txt_fh:
|
kkonganti@11
|
147 for line in accs_txt_fh:
|
kkonganti@11
|
148 if line in ["\n", "\n\r"]:
|
kkonganti@11
|
149 empty_lines += 1
|
kkonganti@11
|
150 continue
|
kkonganti@11
|
151 else:
|
kkonganti@11
|
152 line = line.strip()
|
kkonganti@11
|
153
|
kkonganti@11
|
154 if line in accs_seen.keys():
|
kkonganti@11
|
155 continue
|
kkonganti@11
|
156 else:
|
kkonganti@11
|
157 accs_seen[line] = 1
|
kkonganti@11
|
158
|
kkonganti@11
|
159 genome_file = os.path.join(genomes_dir, line + genomes_dir_suffix)
|
kkonganti@11
|
160
|
kkonganti@11
|
161 if not os.path.exists(genome_file) or os.path.getsize(genome_file) <= 0:
|
kkonganti@11
|
162 logging.error(
|
kkonganti@11
|
163 f"Genome file {os.path.basename(genome_file)} does not\n"
|
kkonganti@11
|
164 + "exits or is empty!"
|
kkonganti@11
|
165 )
|
kkonganti@11
|
166 exit(1)
|
kkonganti@11
|
167 else:
|
kkonganti@11
|
168 with open(genome_file, "rb") as genome_file_h:
|
kkonganti@11
|
169 genomes_out_gz.writelines(genome_file_h.readlines())
|
kkonganti@11
|
170 genome_file_h.close()
|
kkonganti@11
|
171 accs_txt_fh.close()
|
kkonganti@11
|
172 genomes_out_gz.close()
|
kkonganti@11
|
173
|
kkonganti@11
|
174 if len(accs_seen.keys()) > 0 and os.path.exists(frags_gz) and os.path.getsize(frags_gz) > 0:
|
kkonganti@11
|
175 with gzip.open(cat_reads_gz, "wt", encoding="utf-8") as cat_reads_gz_fh:
|
kkonganti@11
|
176 with gzip.open(frags_gz, "rb") as fragz_gz_fh:
|
kkonganti@11
|
177 for frag_line in fragz_gz_fh.readlines():
|
kkonganti@11
|
178 frag_lines = frag_line.decode("utf-8").strip().split(frag_delim)
|
kkonganti@11
|
179 # Per KMA specification, 6=template, 7=query, 1=read
|
kkonganti@11
|
180 cat_reads_gz_fh.write(f">{frag_lines[6]}\n{frag_lines[0]}\n")
|
kkonganti@11
|
181 fragz_gz_fh.close()
|
kkonganti@11
|
182 cat_reads_gz_fh.close()
|
kkonganti@11
|
183
|
kkonganti@11
|
184 if empty_lines > 0:
|
kkonganti@11
|
185 empty_lines_msg = f"Skipped {empty_lines} empty line(s).\n"
|
kkonganti@11
|
186
|
kkonganti@11
|
187 logging.info(
|
kkonganti@11
|
188 empty_lines_msg
|
kkonganti@11
|
189 + f"File {os.path.basename(cat_genomes_gz)}\n"
|
kkonganti@11
|
190 + f"written in:\n{os.getcwd()}\nDone! Bye!"
|
kkonganti@11
|
191 )
|
kkonganti@11
|
192 exit(0)
|
kkonganti@11
|
193
|
kkonganti@11
|
194
|
kkonganti@11
|
195 if __name__ == "__main__":
|
kkonganti@11
|
196 main()
|