Mercurial > repos > kkonganti > cfsan_bettercallsal
comparison 0.5.0/bin/gen_otf_genome.py @ 1:365849f031fd
"planemo upload"
author | kkonganti |
---|---|
date | Mon, 05 Jun 2023 18:48:51 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:a4b1ee4b68b1 | 1:365849f031fd |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 # Kranti Konganti | |
4 | |
5 import os | |
6 import argparse | |
7 import inspect | |
8 import logging | |
9 import re | |
10 import pprint | |
11 import glob | |
12 | |
13 # Set logging. | |
14 logging.basicConfig( | |
15 format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n", | |
16 level=logging.DEBUG, | |
17 ) | |
18 | |
19 # Debug print. | |
20 ppp = pprint.PrettyPrinter(width=50, indent=4) | |
21 | |
22 # Multiple inheritence for pretty printing of help text. | |
23 class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): | |
24 pass | |
25 | |
26 | |
27 def main() -> None: | |
28 """ | |
29 This script works only in the context of `bettercallsal` Nextflow workflow. | |
30 It takes: | |
31 1. A text file containing accessions or FASTA IDs, one per line and | |
32 then, | |
33 2. Searches for a genome FASTA file in gzipped format in specified | |
34 search path, where the prefix of the filename is the accession or | |
35 FASTA ID from 1. and then, | |
36 creates a new concatenated gzipped genome FASTA file with all the genomes | |
37 in the text file from 1. | |
38 """ | |
39 | |
40 prog_name = os.path.basename(inspect.stack()[0].filename) | |
41 | |
42 parser = argparse.ArgumentParser( | |
43 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses | |
44 ) | |
45 | |
46 required = parser.add_argument_group("required arguments") | |
47 | |
48 required.add_argument( | |
49 "-txt", | |
50 dest="accs_txt", | |
51 default=False, | |
52 required=True, | |
53 help="Absolute UNIX path to .txt file containing accessions\n" + "FASTA IDs, one per line.", | |
54 ) | |
55 required.add_argument( | |
56 "-gd", | |
57 dest="genomes_dir", | |
58 default=False, | |
59 required=True, | |
60 help="Absolute UNIX path to a directory containing\n" | |
61 + "gzipped genome FASTA files.\n" | |
62 + "Required if -m is on.", | |
63 ) | |
64 parser.add_argument( | |
65 "-gds", | |
66 dest="genomes_dir_suffix", | |
67 default="_scaffolded_genomic.fna.gz", | |
68 required=False, | |
69 help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.", | |
70 ) | |
71 parser.add_argument( | |
72 "-op", | |
73 dest="out_prefix", | |
74 default="CATTED_GENOMES", | |
75 help="Set the output file prefix for .fna.gz and .txt\n" + "files.", | |
76 ) | |
77 parser.add_argument( | |
78 "-txts", | |
79 dest="accs_suffix", | |
80 default="_template_hits.txt", | |
81 required=False, | |
82 help="The suffix of the file supplied with -txt option. It is assumed that the\n" | |
83 + "sample name is present in the file supplied with -txt option and the suffix\n" | |
84 + "will be stripped and stored in a file that logs samples which have no hits.", | |
85 ) | |
86 | |
87 args = parser.parse_args() | |
88 accs_txt = args.accs_txt | |
89 genomes_dir = args.genomes_dir | |
90 genomes_dir_suffix = args.genomes_dir_suffix | |
91 out_prefix = args.out_prefix | |
92 accs_suffix = args.accs_suffix | |
93 accs_seen = dict() | |
94 cat_genomes_gz = os.path.join(os.getcwd(), out_prefix + "_" + genomes_dir_suffix) | |
95 cat_genomes_gz = re.sub("__", "_", str(cat_genomes_gz)) | |
96 | |
97 if accs_txt and os.path.exists(cat_genomes_gz) and os.path.getsize(cat_genomes_gz) > 0: | |
98 logging.error( | |
99 "A concatenated genome FASTA file,\n" | |
100 + f"{os.path.basename(cat_genomes_gz)} already exists in:\n" | |
101 + f"{os.getcwd()}\n" | |
102 + "Please remove or move it as we will not " | |
103 + "overwrite it." | |
104 ) | |
105 exit(1) | |
106 | |
107 if accs_txt and (not os.path.exists(accs_txt) or not os.path.getsize(accs_txt) > 0): | |
108 logging.error("File,\n" + f"{accs_txt}\ndoes not exist " + "or is empty!") | |
109 failed_sample_name = re.sub(accs_suffix, "", os.path.basename(accs_txt)) | |
110 with open( | |
111 os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w" | |
112 ) as failed_sample_fh: | |
113 failed_sample_fh.write(f"{failed_sample_name}\n") | |
114 failed_sample_fh.close() | |
115 exit(0) | |
116 | |
117 if genomes_dir: | |
118 if not os.path.isdir(genomes_dir): | |
119 logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!") | |
120 exit(1) | |
121 if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0: | |
122 logging.error( | |
123 "Genomes directory" | |
124 + f"{genomes_dir}" | |
125 + "\ndoes not seem to have any\n" | |
126 + f"files ending with suffix: {genomes_dir_suffix}" | |
127 ) | |
128 exit(1) | |
129 | |
130 # ppp.pprint(mash_hits) | |
131 empty_lines = 0 | |
132 empty_lines_msg = "" | |
133 with open(cat_genomes_gz, "wb") as genomes_out_gz: | |
134 with open(accs_txt, "r") as accs_txt_fh: | |
135 for line in accs_txt_fh: | |
136 if line in ["\n", "\n\r"]: | |
137 empty_lines += 1 | |
138 continue | |
139 else: | |
140 line = line.strip() | |
141 | |
142 if line in accs_seen.keys(): | |
143 continue | |
144 else: | |
145 accs_seen[line] = 1 | |
146 | |
147 genome_file = os.path.join(genomes_dir, line + genomes_dir_suffix) | |
148 | |
149 if not os.path.exists(genome_file) or os.path.getsize(genome_file) <= 0: | |
150 logging.error( | |
151 f"Genome file {os.path.basename(genome_file)} does not\n" | |
152 + "exits or is empty!" | |
153 ) | |
154 exit(1) | |
155 else: | |
156 with open(genome_file, "rb") as genome_file_h: | |
157 genomes_out_gz.writelines(genome_file_h.readlines()) | |
158 genome_file_h.close() | |
159 accs_txt_fh.close() | |
160 genomes_out_gz.close() | |
161 | |
162 if empty_lines > 0: | |
163 empty_lines_msg = f"Skipped {empty_lines} empty line(s).\n" | |
164 | |
165 logging.info( | |
166 empty_lines_msg | |
167 + f"File {os.path.basename(cat_genomes_gz)}\n" | |
168 + f"written in:\n{os.getcwd()}\nDone! Bye!" | |
169 ) | |
170 exit(0) | |
171 | |
172 | |
173 if __name__ == "__main__": | |
174 main() |