Mercurial > repos > kkonganti > hfp_nowayout
comparison 0.5.0/bin/gen_per_species_fa_from_lin.py @ 0:97cd2f532efe
planemo upload
author | kkonganti |
---|---|
date | Mon, 31 Mar 2025 14:50:40 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:97cd2f532efe |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import argparse | |
4 import gzip | |
5 import inspect | |
6 import logging | |
7 import os | |
8 import pprint | |
9 import re | |
10 import shutil | |
11 from collections import defaultdict | |
12 from typing import BinaryIO, TextIO, Union | |
13 | |
14 from Bio import SeqIO | |
15 from Bio.Seq import Seq | |
16 from Bio.SeqRecord import SeqRecord | |
17 | |
18 | |
19 # Multiple inheritence for pretty printing of help text. | |
20 class MultiArgFormatClasses( | |
21 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter | |
22 ): | |
23 pass | |
24 | |
25 | |
26 def get_lineages(csv: os.PathLike) -> defaultdict: | |
27 """ | |
28 Parse the lineages.csv file and store a list of | |
29 accessions. | |
30 """ | |
31 lineages = dict() | |
32 if csv == None or not (os.path.exists(csv) or os.path.getsize(csv) > 0): | |
33 logging.error( | |
34 f"The CSV file [{os.path.basename(csv)}] is empty or does not exist!" | |
35 ) | |
36 exit(1) | |
37 | |
38 logging.info(f"Indexing {os.path.basename(csv)}...") | |
39 | |
40 with open(csv, "r") as csv_fh: | |
41 _ = csv_fh.readline().strip().split(",") | |
42 for line in csv_fh: | |
43 cols = line.strip().split(",") | |
44 | |
45 if len(cols) < 9: | |
46 logging.error( | |
47 f"The CSV file {os.path.basename(csv)} should have a mandatory 9 columns." | |
48 + "\n\nEx: identifiers,superkingdom,phylum,class,order,family,genus,species,strain" | |
49 + "\nAB211151.1,Eukaryota,Arthropoda,Malacostraca,Decapoda,Majidae,Chionoecetes,Chionoecetes opilio," | |
50 + f"\n\nGot:\n{line}" | |
51 ) | |
52 exit(1) | |
53 | |
54 lineages[cols[0]] = re.sub(r"\W+", "-", "_".join(cols[7].split(" "))) | |
55 | |
56 csv_fh.close() | |
57 return lineages | |
58 | |
59 | |
60 def write_fasta(recs: list, basedir: os.PathLike, name: str, suffix: str) -> None: | |
61 """ | |
62 Write sequence with no description to a specified file. | |
63 """ | |
64 SeqIO.write( | |
65 recs, | |
66 os.path.join(basedir, name + suffix), | |
67 "fasta", | |
68 ) | |
69 | |
70 | |
71 def parse_fasta(fh: Union[TextIO, BinaryIO], sp2accs: dict) -> list: | |
72 """ | |
73 Parse the sequences and create per species FASTA record. | |
74 """ | |
75 records = defaultdict() | |
76 logging.info("") | |
77 | |
78 for record in SeqIO.parse(fh, "fasta"): | |
79 | |
80 id = record.id | |
81 seq = record.seq | |
82 | |
83 if id in sp2accs.keys(): | |
84 records.setdefault(sp2accs[id], []).append( | |
85 SeqRecord(Seq(seq), id=id, description=str()) | |
86 ) | |
87 else: | |
88 print(f"Lineage row does not exist for accession: {id}") | |
89 | |
90 logging.info(f"Collected FASTA records for {len(records.keys())} species'.") | |
91 fh.close() | |
92 return records | |
93 | |
94 | |
95 # Main | |
96 def main() -> None: | |
97 """ | |
98 This script takes: | |
99 1. The FASTA file and, | |
100 2. Takes the corresponding lineages.csv file and, | |
101 | |
102 then generates a folder containing individual FASTA sequence files | |
103 per species. | |
104 """ | |
105 | |
106 # Set logging. | |
107 logging.basicConfig( | |
108 format="\n" | |
109 + "=" * 55 | |
110 + "\n%(asctime)s - %(levelname)s\n" | |
111 + "=" * 55 | |
112 + "\n%(message)s\r\r", | |
113 level=logging.DEBUG, | |
114 ) | |
115 | |
116 # Debug print. | |
117 ppp = pprint.PrettyPrinter(width=55) | |
118 prog_name = os.path.basename(inspect.stack()[0].filename) | |
119 | |
120 parser = argparse.ArgumentParser( | |
121 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses | |
122 ) | |
123 | |
124 required = parser.add_argument_group("required arguments") | |
125 | |
126 required.add_argument( | |
127 "-fa", | |
128 dest="fna", | |
129 default=False, | |
130 required=True, | |
131 help="Absolute UNIX path to the FASTA file that corresponds" | |
132 + "\nto the lineages.csv file.", | |
133 ) | |
134 required.add_argument( | |
135 "-csv", | |
136 dest="csv", | |
137 default=False, | |
138 required=True, | |
139 help="Absolute UNIX path to lineages.csv which has a guaranteed 9 " | |
140 + "\ncolumns with the first being an accession.", | |
141 ) | |
142 parser.add_argument( | |
143 "-out", | |
144 dest="out_folder", | |
145 default=os.path.join(os.getcwd(), "species"), | |
146 required=False, | |
147 help="By default, the output is written to this\nfolder.", | |
148 ) | |
149 parser.add_argument( | |
150 "-f", | |
151 dest="force_write_out", | |
152 default=False, | |
153 action="store_true", | |
154 required=False, | |
155 help="Force overwrite output directory contents.", | |
156 ) | |
157 parser.add_argument( | |
158 "-suffix", | |
159 dest="fna_suffix", | |
160 default=".fna", | |
161 required=False, | |
162 help="Suffix of the individual species FASTA files\nthat will be saved.", | |
163 ) | |
164 | |
165 # Parse defaults | |
166 args = parser.parse_args() | |
167 csv = args.csv | |
168 fna = args.fna | |
169 out = args.out_folder | |
170 overwrite = args.force_write_out | |
171 fna_suffix = args.fna_suffix | |
172 | |
173 # Basic checks | |
174 if not overwrite and os.path.exists(out): | |
175 logging.warning( | |
176 f"Output destination [{os.path.basename(out)}] already exists!" | |
177 + "\nPlease use -f to delete and overwrite." | |
178 ) | |
179 elif overwrite and os.path.exists(out): | |
180 logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...") | |
181 shutil.rmtree(out) | |
182 | |
183 # Get taxonomy from ncbitax2lin | |
184 lineages = get_lineages(csv) | |
185 | |
186 logging.info(f"Creating new squences per species...") | |
187 | |
188 if not os.path.exists(out): | |
189 os.makedirs(out) | |
190 | |
191 try: | |
192 gz_fh = gzip.open(fna, "rt") | |
193 fa_recs = parse_fasta(gz_fh, lineages) | |
194 except gzip.BadGzipFile: | |
195 logging.info( | |
196 f"Input FASTA file {os.path.basename(csv)} is not in\nGZIP format." | |
197 ) | |
198 txt_fh = open(fna, "r") | |
199 fa_recs = parse_fasta(txt_fh, lineages) | |
200 finally: | |
201 logging.info("Assigned FASTA records per species...") | |
202 | |
203 logging.info("Writing FASTA records per species...") | |
204 | |
205 for sp in fa_recs.keys(): | |
206 write_fasta(fa_recs[sp], out, sp, fna_suffix) | |
207 | |
208 | |
209 if __name__ == "__main__": | |
210 main() | |
211 | |
212 # ~/apps/nowayout/bin/gen_per_species_fa_from_bold.py -tsv BOLD_Public.05-Feb-2024.tsv -csv ../tax.csv ─╯ | |
213 | |
214 # ======================================================= | |
215 # 2024-02-08 21:37:28,541 - INFO | |
216 # ======================================================= | |
217 # Indexing tax.csv... | |
218 | |
219 # ======================================================= | |
220 # 2024-02-08 21:38:06,567 - INFO | |
221 # ======================================================= | |
222 # Creating new squences per species... | |
223 | |
224 # ======================================================= | |
225 # 2024-02-08 21:38:06,572 - INFO | |
226 # ======================================================= | |
227 # Input TSV file BOLD_Public.05-Feb-2024.tsv is not in | |
228 # GZIP format. | |
229 | |
230 # ======================================================= | |
231 # 2024-02-08 22:01:04,554 - INFO | |
232 # ======================================================= | |
233 # Collected FASTA records for 497421 species'. | |
234 | |
235 # ======================================================= | |
236 # 2024-02-08 22:24:35,000 - INFO | |
237 # ======================================================= | |
238 # No. of raw records present in `ncbitax2lin` [tax.csv]: 2550767 | |
239 # No. of valid records collected from `ncbitax2lin` [tax.csv]: 2134980 | |
240 # No. of raw records in TSV [BOLD_Public.05-Feb-2024.tsv]: 9735210 | |
241 # No. of valid records in TSV [BOLD_Public.05-Feb-2024.tsv]: 4988323 | |
242 # No. of FASTA records for which new lineages were created: 4069202 | |
243 # No. of FASTA records for which only genus, species and/or strain information were created: 919121 | |
244 | |
245 # ======================================================= | |
246 # 2024-02-08 22:24:35,001 - INFO | |
247 # ======================================================= | |
248 # Succesfully created lineages and FASTA records! Done!! |