comparison 0.6.1/bin/gen_otf_genome.py @ 11:749faef1caa9

"planemo upload"
author kkonganti
date Tue, 05 Sep 2023 11:51:40 -0400
parents
children
comparison
equal deleted inserted replaced
10:1b9de878b04a 11:749faef1caa9
1 #!/usr/bin/env python3
2
3 # Kranti Konganti
4
5 import argparse
6 import glob
7 import gzip
8 import inspect
9 import logging
10 import os
11 import pprint
12 import re
13
14 # Set logging.
15 logging.basicConfig(
16 format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
17 level=logging.DEBUG,
18 )
19
20 # Debug print.
21 ppp = pprint.PrettyPrinter(width=50, indent=4)
22
23 # Multiple inheritence for pretty printing of help text.
24 class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
25 pass
26
27
28 def main() -> None:
29 """
30 This script works only in the context of `bettercallsal` Nextflow workflow.
31 It takes:
32 1. A text file containing accessions or FASTA IDs, one per line and
33 then,
34 2. Searches for a genome FASTA file in gzipped format in specified
35 search path, where the prefix of the filename is the accession or
36 FASTA ID from 1. and then,
37 creates a new concatenated gzipped genome FASTA file with all the genomes
38 in the text file from 1.
39 """
40
41 prog_name = os.path.basename(inspect.stack()[0].filename)
42
43 parser = argparse.ArgumentParser(
44 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
45 )
46
47 required = parser.add_argument_group("required arguments")
48
49 required.add_argument(
50 "-txt",
51 dest="accs_txt",
52 default=False,
53 required=True,
54 help="Absolute UNIX path to .txt file containing accessions\n" + "FASTA IDs, one per line.",
55 )
56 required.add_argument(
57 "-gd",
58 dest="genomes_dir",
59 default=False,
60 required=True,
61 help="Absolute UNIX path to a directory containing\n"
62 + "gzipped genome FASTA files.\n"
63 + "Required if -m is on.",
64 )
65 parser.add_argument(
66 "-gds",
67 dest="genomes_dir_suffix",
68 default="_scaffolded_genomic.fna.gz",
69 required=False,
70 help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
71 )
72 parser.add_argument(
73 "-op",
74 dest="out_prefix",
75 default="CATTED_GENOMES",
76 help="Set the output file prefix for .fna.gz and .txt\n" + "files.",
77 )
78 parser.add_argument(
79 "-txts",
80 dest="accs_suffix",
81 default="_template_hits.txt",
82 required=False,
83 help="The suffix of the file supplied with -txt option. It is assumed that the\n"
84 + "sample name is present in the file supplied with -txt option and the suffix\n"
85 + "will be stripped and stored in a file that logs samples which have no hits.",
86 )
87 parser.add_argument(
88 "-frag_delim",
89 dest="frag_delim",
90 default="\t",
91 required=False,
92 help="The delimitor by which the fields are separated in *_frag.gz file.",
93 )
94
95 args = parser.parse_args()
96 accs_txt = args.accs_txt
97 genomes_dir = args.genomes_dir
98 genomes_dir_suffix = args.genomes_dir_suffix
99 out_prefix = args.out_prefix
100 accs_suffix = args.accs_suffix
101 frag_delim = args.frag_delim
102 accs_seen = dict()
103 cat_genomes_gz = os.path.join(os.getcwd(), out_prefix + "_" + genomes_dir_suffix)
104 cat_genomes_gz = re.sub("__", "_", str(cat_genomes_gz))
105 frags_gz = os.path.join(os.getcwd(), out_prefix + ".frag.gz")
106 cat_reads_gz = os.path.join(os.getcwd(), out_prefix + "_aln_reads.fna.gz")
107 cat_reads_gz = re.sub("__", "_", cat_reads_gz)
108
109 if accs_txt and os.path.exists(cat_genomes_gz) and os.path.getsize(cat_genomes_gz) > 0:
110 logging.error(
111 "A concatenated genome FASTA file,\n"
112 + f"{os.path.basename(cat_genomes_gz)} already exists in:\n"
113 + f"{os.getcwd()}\n"
114 + "Please remove or move it as we will not "
115 + "overwrite it."
116 )
117 exit(1)
118
119 if accs_txt and (not os.path.exists(accs_txt) or not os.path.getsize(accs_txt) > 0):
120 logging.error("File,\n" + f"{accs_txt}\ndoes not exist " + "or is empty!")
121 failed_sample_name = re.sub(accs_suffix, "", os.path.basename(accs_txt))
122 with open(
123 os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
124 ) as failed_sample_fh:
125 failed_sample_fh.write(f"{failed_sample_name}\n")
126 failed_sample_fh.close()
127 exit(0)
128
129 if genomes_dir:
130 if not os.path.isdir(genomes_dir):
131 logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
132 exit(1)
133 if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
134 logging.error(
135 "Genomes directory"
136 + f"{genomes_dir}"
137 + "\ndoes not seem to have any\n"
138 + f"files ending with suffix: {genomes_dir_suffix}"
139 )
140 exit(1)
141
142 # ppp.pprint(mash_hits)
143 empty_lines = 0
144 empty_lines_msg = ""
145 with open(cat_genomes_gz, "wb") as genomes_out_gz:
146 with open(accs_txt, "r") as accs_txt_fh:
147 for line in accs_txt_fh:
148 if line in ["\n", "\n\r"]:
149 empty_lines += 1
150 continue
151 else:
152 line = line.strip()
153
154 if line in accs_seen.keys():
155 continue
156 else:
157 accs_seen[line] = 1
158
159 genome_file = os.path.join(genomes_dir, line + genomes_dir_suffix)
160
161 if not os.path.exists(genome_file) or os.path.getsize(genome_file) <= 0:
162 logging.error(
163 f"Genome file {os.path.basename(genome_file)} does not\n"
164 + "exits or is empty!"
165 )
166 exit(1)
167 else:
168 with open(genome_file, "rb") as genome_file_h:
169 genomes_out_gz.writelines(genome_file_h.readlines())
170 genome_file_h.close()
171 accs_txt_fh.close()
172 genomes_out_gz.close()
173
174 if len(accs_seen.keys()) > 0 and os.path.exists(frags_gz) and os.path.getsize(frags_gz) > 0:
175 with gzip.open(cat_reads_gz, "wt", encoding="utf-8") as cat_reads_gz_fh:
176 with gzip.open(frags_gz, "rb") as fragz_gz_fh:
177 for frag_line in fragz_gz_fh.readlines():
178 frag_lines = frag_line.decode("utf-8").strip().split(frag_delim)
179 # Per KMA specification, 6=template, 7=query, 1=read
180 cat_reads_gz_fh.write(f">{frag_lines[6]}\n{frag_lines[0]}\n")
181 fragz_gz_fh.close()
182 cat_reads_gz_fh.close()
183
184 if empty_lines > 0:
185 empty_lines_msg = f"Skipped {empty_lines} empty line(s).\n"
186
187 logging.info(
188 empty_lines_msg
189 + f"File {os.path.basename(cat_genomes_gz)}\n"
190 + f"written in:\n{os.getcwd()}\nDone! Bye!"
191 )
192 exit(0)
193
194
195 if __name__ == "__main__":
196 main()