comparison 0.5.0/bin/gen_otf_genome.py @ 1:365849f031fd

"planemo upload"
author kkonganti
date Mon, 05 Jun 2023 18:48:51 -0400
parents
children
comparison
equal deleted inserted replaced
0:a4b1ee4b68b1 1:365849f031fd
1 #!/usr/bin/env python3
2
3 # Kranti Konganti
4
5 import os
6 import argparse
7 import inspect
8 import logging
9 import re
10 import pprint
11 import glob
12
13 # Set logging.
14 logging.basicConfig(
15 format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
16 level=logging.DEBUG,
17 )
18
19 # Debug print.
20 ppp = pprint.PrettyPrinter(width=50, indent=4)
21
22 # Multiple inheritence for pretty printing of help text.
23 class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
24 pass
25
26
27 def main() -> None:
28 """
29 This script works only in the context of `bettercallsal` Nextflow workflow.
30 It takes:
31 1. A text file containing accessions or FASTA IDs, one per line and
32 then,
33 2. Searches for a genome FASTA file in gzipped format in specified
34 search path, where the prefix of the filename is the accession or
35 FASTA ID from 1. and then,
36 creates a new concatenated gzipped genome FASTA file with all the genomes
37 in the text file from 1.
38 """
39
40 prog_name = os.path.basename(inspect.stack()[0].filename)
41
42 parser = argparse.ArgumentParser(
43 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
44 )
45
46 required = parser.add_argument_group("required arguments")
47
48 required.add_argument(
49 "-txt",
50 dest="accs_txt",
51 default=False,
52 required=True,
53 help="Absolute UNIX path to .txt file containing accessions\n" + "FASTA IDs, one per line.",
54 )
55 required.add_argument(
56 "-gd",
57 dest="genomes_dir",
58 default=False,
59 required=True,
60 help="Absolute UNIX path to a directory containing\n"
61 + "gzipped genome FASTA files.\n"
62 + "Required if -m is on.",
63 )
64 parser.add_argument(
65 "-gds",
66 dest="genomes_dir_suffix",
67 default="_scaffolded_genomic.fna.gz",
68 required=False,
69 help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
70 )
71 parser.add_argument(
72 "-op",
73 dest="out_prefix",
74 default="CATTED_GENOMES",
75 help="Set the output file prefix for .fna.gz and .txt\n" + "files.",
76 )
77 parser.add_argument(
78 "-txts",
79 dest="accs_suffix",
80 default="_template_hits.txt",
81 required=False,
82 help="The suffix of the file supplied with -txt option. It is assumed that the\n"
83 + "sample name is present in the file supplied with -txt option and the suffix\n"
84 + "will be stripped and stored in a file that logs samples which have no hits.",
85 )
86
87 args = parser.parse_args()
88 accs_txt = args.accs_txt
89 genomes_dir = args.genomes_dir
90 genomes_dir_suffix = args.genomes_dir_suffix
91 out_prefix = args.out_prefix
92 accs_suffix = args.accs_suffix
93 accs_seen = dict()
94 cat_genomes_gz = os.path.join(os.getcwd(), out_prefix + "_" + genomes_dir_suffix)
95 cat_genomes_gz = re.sub("__", "_", str(cat_genomes_gz))
96
97 if accs_txt and os.path.exists(cat_genomes_gz) and os.path.getsize(cat_genomes_gz) > 0:
98 logging.error(
99 "A concatenated genome FASTA file,\n"
100 + f"{os.path.basename(cat_genomes_gz)} already exists in:\n"
101 + f"{os.getcwd()}\n"
102 + "Please remove or move it as we will not "
103 + "overwrite it."
104 )
105 exit(1)
106
107 if accs_txt and (not os.path.exists(accs_txt) or not os.path.getsize(accs_txt) > 0):
108 logging.error("File,\n" + f"{accs_txt}\ndoes not exist " + "or is empty!")
109 failed_sample_name = re.sub(accs_suffix, "", os.path.basename(accs_txt))
110 with open(
111 os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
112 ) as failed_sample_fh:
113 failed_sample_fh.write(f"{failed_sample_name}\n")
114 failed_sample_fh.close()
115 exit(0)
116
117 if genomes_dir:
118 if not os.path.isdir(genomes_dir):
119 logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
120 exit(1)
121 if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
122 logging.error(
123 "Genomes directory"
124 + f"{genomes_dir}"
125 + "\ndoes not seem to have any\n"
126 + f"files ending with suffix: {genomes_dir_suffix}"
127 )
128 exit(1)
129
130 # ppp.pprint(mash_hits)
131 empty_lines = 0
132 empty_lines_msg = ""
133 with open(cat_genomes_gz, "wb") as genomes_out_gz:
134 with open(accs_txt, "r") as accs_txt_fh:
135 for line in accs_txt_fh:
136 if line in ["\n", "\n\r"]:
137 empty_lines += 1
138 continue
139 else:
140 line = line.strip()
141
142 if line in accs_seen.keys():
143 continue
144 else:
145 accs_seen[line] = 1
146
147 genome_file = os.path.join(genomes_dir, line + genomes_dir_suffix)
148
149 if not os.path.exists(genome_file) or os.path.getsize(genome_file) <= 0:
150 logging.error(
151 f"Genome file {os.path.basename(genome_file)} does not\n"
152 + "exits or is empty!"
153 )
154 exit(1)
155 else:
156 with open(genome_file, "rb") as genome_file_h:
157 genomes_out_gz.writelines(genome_file_h.readlines())
158 genome_file_h.close()
159 accs_txt_fh.close()
160 genomes_out_gz.close()
161
162 if empty_lines > 0:
163 empty_lines_msg = f"Skipped {empty_lines} empty line(s).\n"
164
165 logging.info(
166 empty_lines_msg
167 + f"File {os.path.basename(cat_genomes_gz)}\n"
168 + f"written in:\n{os.getcwd()}\nDone! Bye!"
169 )
170 exit(0)
171
172
173 if __name__ == "__main__":
174 main()