Mercurial > repos > kkonganti > hfp_nowayout
comparison 0.5.0/bin/gen_per_species_fa_from_bold.py @ 0:97cd2f532efe
planemo upload
author | kkonganti |
---|---|
date | Mon, 31 Mar 2025 14:50:40 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:97cd2f532efe |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import argparse | |
4 import gzip | |
5 import inspect | |
6 import logging | |
7 import os | |
8 import pprint | |
9 import re | |
10 import shutil | |
11 from collections import defaultdict | |
12 from typing import BinaryIO, TextIO, Union | |
13 | |
14 from Bio import SeqIO | |
15 from Bio.Seq import Seq | |
16 from Bio.SeqRecord import SeqRecord | |
17 | |
18 | |
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    """Argparse help formatter combining raw-text rendering with defaults.

    Mixes two stock ``argparse`` formatters so that help text keeps its
    manual line breaks while every option also displays its default value.
    """
24 | |
25 | |
def get_lineages(csv: os.PathLike, cols: list) -> tuple:
    """
    Parse the output from the `ncbitax2lin` tool.

    Parameters:
      csv  : path to the CSV produced by `ncbitax2lin`.
      cols : the (up to 9) column names to build lineages from, in order
             tax_id, kingdom..strain.

    Returns a 2-tuple of:
      - dict keyed by "genus species[ strain]" mapping to a comma-joined
        taxonomic lineage built from the requested columns, and
      - the number of raw data rows read from the CSV.

    Exits with status 1 on a missing/empty file or bad column request.
    """
    lineages = dict()
    # Fail early when the CSV is missing or empty. The original check
    # (`exists(csv) or getsize(csv) > 0`) short-circuited on existence and
    # therefore never caught an empty existing file; test both separately.
    if csv is None or not os.path.exists(csv) or os.path.getsize(csv) == 0:
        logging.error(
            f"The CSV file [{os.path.basename(csv)}] is empty or does not exist!"
        )
        exit(1)

    logging.info(f"Indexing {os.path.basename(csv)}...")

    with open(csv, "r") as csv_fh:
        header_cols = csv_fh.readline().strip().split(",")
        user_req_cols = [
            tcol_i for tcol_i, tcol in enumerate(header_cols) if tcol in cols
        ]
        cols_not_found = [tcol for tcol in cols if tcol not in header_cols]
        raw_recs = 0

        if len(cols_not_found) > 0:
            logging.error(
                f"The following columns do not exist in the"
                + f"\nCSV file [ {os.path.basename(csv)} ]:\n"
                # One column per line; "".join printed them fused together.
                + "\n".join(cols_not_found)
            )
            exit(1)
        elif len(user_req_cols) > 9:
            logging.error(
                f"Only a total of 9 columns are needed!"
                + "\ntax_id,kingdom,phylum,class,order,family,genus,species,strain"
            )
            exit(1)

        for tax in csv_fh:
            raw_recs += 1
            lcols = tax.strip().split(",")

            # Prefer the strain-level name (9th requested column) as the
            # key; otherwise fall back to the species-level name with an
            # empty strain field appended so every lineage has 8 fields.
            if bool(lcols[user_req_cols[8]]):
                lineages[lcols[user_req_cols[8]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:]]
                )
            elif bool(lcols[user_req_cols[7]]):
                lineages[lcols[user_req_cols[7]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:8]] + [str()]
                )

    # The `with` block closes the handle; no explicit close() needed.
    return lineages, raw_recs
78 | |
79 | |
def write_fasta(recs: list, basedir: os.PathLike, name: str, suffix: str) -> None:
    """
    Write the given FASTA records to ``<basedir>/<name><suffix>``.
    """
    out_path = os.path.join(basedir, name + suffix)
    SeqIO.write(recs, out_path, "fasta")
89 | |
90 | |
def check_and_get_cols(pat: re.Pattern, cols: str, delim: str) -> list:
    """
    Validate a delimited column-name string and split it into a list.

    Parameters:
      pat   : compiled regex the whole `cols` string must match.
              (Annotation fixed: `re` is a module, not a type.)
      cols  : column names joined by `delim`.
      delim : the delimiter separating the column names.

    Returns the list of column names. Exits with status 1 when `cols`
    does not match `pat`.
    """
    # Guard clause: reject malformed column strings up front.
    if not pat.match(cols):
        logging.error(
            f"Supplied columns' names {cols} should only have words"
            f"\n(alphanumeric) separated by: {delim}."
        )
        exit(1)

    # Strip any stray newlines before splitting into individual names.
    return re.sub("\n", "", cols).split(delim)
106 | |
107 | |
def parse_tsv(fh: Union[TextIO, BinaryIO], tcols: list, delim: str) -> tuple:
    """
    Parse the BOLD TSV file and aggregate sequence records per species.

    Parameters:
      fh    : open text handle to the (possibly gzip-wrapped) TSV file.
      tcols : required column names (id, genus, species, strain, sequence).
      delim : field delimiter of the TSV file.

    Returns a 3-tuple of:
      - dict mapping species name -> list of SeqRecord objects,
      - dict mapping species name -> list of record IDs, and
      - the number of raw data rows read.
    """
    records, sp2accs = (defaultdict(list), defaultdict(list))
    header = fh.readline().strip().split(delim)
    raw_recs = 0

    if not all(col in header for col in tcols):
        logging.error(
            "The following columns were not found in the"
            + f"\nheader row of file {os.path.basename(fh.name)}\n"
            + "\n".join([ele for ele in tcols if ele not in header])
        )
        # Bail out now instead of failing below with an opaque
        # tuple-unpacking ValueError.
        exit(1)

    # NOTE(review): this unpack assumes the requested columns appear in the
    # header in exactly this order (id, genus, species, strain, sequence).
    id_i, genus_i, species_i, strain_i, seq_i = [
        i for i, ele in enumerate(header) if ele in tcols
    ]

    for record in fh:
        raw_recs += 1
        # Split each line once instead of once per extracted field.
        fields = record.strip().split(delim)

        rec_id = fields[id_i]
        # Slashes would break the per-species FASTA file names downstream.
        species = re.sub(r"[\/\\]+", "-", fields[species_i])
        # Keep only A/T/G/C characters, case-insensitively. `flags=` is
        # required: the fourth positional argument of re.sub is `count`,
        # so the original call treated re.IGNORECASE (=2) as "replace at
        # most 2 occurrences" and never ignored case.
        seq = re.sub(r"[^ATGC]+", "", fields[seq_i], flags=re.IGNORECASE)

        # Skip rows whose species field is a placeholder.
        if re.match(r"None|Null", species, re.IGNORECASE):
            continue

        # defaultdict(list) already auto-creates the list; setdefault was
        # redundant here.
        records[species].append(SeqRecord(Seq(seq), id=rec_id, description=str()))
        sp2accs[species].append(rec_id)

    logging.info(f"Collected FASTA records for {len(records.keys())} species'.")
    fh.close()
    return records, sp2accs, raw_recs
154 | |
155 | |
# Main
def main() -> None:
    """
    This script takes:
    1. The TSV file from BOLD systems,
    2. Takes as input a .csv file generated by `ncbitax2lin`.

    and then generates a folder containing individual FASTA sequence files
    per species. This is only possible if the full taxonomy of the barcode
    sequence is present in the FASTA header.
    """

    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\r\r",
        level=logging.DEBUG,
    )

    # Debug print.
    # NOTE(review): `ppp` is created but never used below — presumably left
    # over from debugging; harmless.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-tsv",
        dest="tsv",
        default=False,
        required=True,
        help="Absolute UNIX path to the TSV file from BOLD systems"
        + "\nin uncompressed TXT format.",
    )
    required.add_argument(
        "-csv",
        dest="csv",
        default=False,
        required=True,
        help="Absolute UNIX path to .csv or .csv.gz file which is generated "
        + "\nby the `ncbitax2lin` tool.",
    )
    parser.add_argument(
        "-out",
        dest="out_folder",
        default=os.path.join(os.getcwd(), "species"),
        required=False,
        help="By default, the output is written to this\nfolder.",
    )
    parser.add_argument(
        "-f",
        dest="force_write_out",
        default=False,
        action="store_true",
        required=False,
        help="Force overwrite output directory contents.",
    )
    parser.add_argument(
        "-suffix",
        dest="fna_suffix",
        default=".fna",
        required=False,
        help="Suffix of the individual species FASTA files\nthat will be saved.",
    )
    parser.add_argument(
        "-ccols",
        dest="csv_cols",
        default="tax_id,superkingdom,phylum,class,order,family,genus,species,strain",
        required=False,
        help="Taxonomic lineage will be built using these columns from the output of"
        + "\n`ncbitax2lin`\ntool.",
    )
    parser.add_argument(
        "-ccols-sep",
        dest="csv_delim",
        default=",",
        required=False,
        help="The delimitor of the fields in the CSV file.",
    )
    parser.add_argument(
        "-tcols",
        dest="tsv_cols",
        default="processid\tgenus\tspecies\tsubspecies\tnucraw",
        required=False,
        help="For each species, the nucletide sequences will be\naggregated.",
    )
    parser.add_argument(
        "-tcols-sep",
        dest="tsv_delim",
        default="\t",
        required=False,
        help="The delimitor of the fields in the TSV file.",
    )

    # Parse defaults
    args = parser.parse_args()
    tsv = args.tsv
    csv = args.csv
    csep = args.csv_delim
    tsep = args.tsv_delim
    csv_cols = args.csv_cols
    tsv_cols = args.tsv_cols
    out = args.out_folder
    overwrite = args.force_write_out
    fna_suffix = args.fna_suffix
    # Patterns that validate the user-supplied column-name strings:
    # word characters separated by the chosen delimiter, ending in a word
    # character. NOTE(review): these f-strings are not raw strings, so
    # "\w" relies on Python preserving unknown escapes (DeprecationWarning
    # on newer interpreters) — consider raw f-strings.
    ccols_pat = re.compile(f"^[\w\{csep}]+?\w$")
    tcols_pat = re.compile(f"^[\w\{tsep}]+?\w$")
    final_lineages = os.path.join(out, "lineages.csv")
    lineages_not_found = os.path.join(out, "lineages_not_found.csv")
    base_fasta_dir = os.path.join(out, "fasta")

    # Basic checks
    # NOTE(review): when the output folder exists and -f was NOT given,
    # this only warns and then continues writing into the existing
    # directory — the message implies it should stop; confirm intended.
    if not overwrite and os.path.exists(out):
        logging.warning(
            f"Output destination [{os.path.basename(out)}] already exists!"
            + "\nPlease use -f to delete and overwrite."
        )
    elif overwrite and os.path.exists(out):
        logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
        shutil.rmtree(out)

    # Validate user requested columns
    passed_ccols = check_and_get_cols(ccols_pat, csv_cols, csep)
    passed_tcols = check_and_get_cols(tcols_pat, tsv_cols, tsep)

    # Get taxonomy from ncbitax2lin
    lineages, raw_recs = get_lineages(csv, passed_ccols)

    # Finally, read BOLD tsv if lineage exists.
    logging.info(f"Creating new squences per species...")

    if not os.path.exists(out):
        os.makedirs(out)

    # Try the TSV as gzip first; gzip.open only fails on first read, which
    # happens inside parse_tsv, so BadGzipFile is caught here and we fall
    # back to reading it as plain text.
    try:
        gz_fh = gzip.open(tsv, "rt")
        records, sp2accs, traw_recs = parse_tsv(gz_fh, passed_tcols, tsep)
    except gzip.BadGzipFile:
        logging.info(f"Input TSV file {os.path.basename(tsv)} is not in\nGZIP format.")
        txt_fh = open(tsv, "r")
        records, sp2accs, traw_recs = parse_tsv(txt_fh, passed_tcols, tsep)

    # Counters: records whose full lineage was found in ncbitax2lin vs.
    # records for which only a genus/species skeleton could be written.
    passed_tax_check = 0
    failed_tax_check = 0
    fasta_recs_written = 0
    l_fh = open(final_lineages, "w")
    ln_fh = open(lineages_not_found, "w")
    l_fh.write(
        "identifiers,superkingdom,phylum,class,order,family,genus,species,strain\n"
    )
    ln_fh.write("fna_id,parsed_org\n")

    if not os.path.exists(base_fasta_dir):
        os.makedirs(base_fasta_dir)

    # One FASTA file per species; spaces in the name become underscores.
    for genus_species in records.keys():
        fasta_recs_written += len(records[genus_species])
        write_fasta(
            records[genus_species],
            base_fasta_dir,
            "_".join(genus_species.split(" ")),
            fna_suffix,
        )
        org_words = genus_species.split(" ")

        for id in sp2accs[genus_species]:
            if genus_species in lineages.keys():
                # Full lineage known from ncbitax2lin.
                this_line = ",".join([id, lineages[genus_species]]) + "\n"

                # More than two name words: treat the full name as the
                # strain column appended after the known lineage.
                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [id, lineages[genus_species].rstrip(","), genus_species]
                        )
                        + "\n"
                    )

                l_fh.write(this_line)
                passed_tax_check += 1
            else:
                # No lineage found: emit empty kingdom..family columns and
                # fill only genus/species (and strain when the name has
                # more than two words); also log the miss separately.
                this_line = (
                    ",".join(
                        [
                            id,
                            "",
                            "",
                            "",
                            "",
                            "",
                            org_words[0],
                            genus_species,
                            "",
                        ]
                    )
                    + "\n"
                )
                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [
                                id,
                                "",
                                "",
                                "",
                                "",
                                "",
                                org_words[0],
                                org_words[0] + " " + org_words[1],
                                genus_species,
                            ]
                        )
                        + "\n"
                    )
                l_fh.write(this_line)
                ln_fh.write(",".join([id, genus_species]) + "\n")
                failed_tax_check += 1

    # Summary of everything read and written.
    logging.info(
        f"No. of raw records present in `ncbitax2lin` [{os.path.basename(csv)}]: {raw_recs}"
        + f"\nNo. of valid records collected from `ncbitax2lin` [{os.path.basename(csv)}]: {len(lineages.keys())}"
        + f"\nNo. of raw records in TSV [{os.path.basename(tsv)}]: {traw_recs}"
        + f"\nNo. of valid records in TSV [{os.path.basename(tsv)}]: {passed_tax_check + failed_tax_check}"
        + f"\nNo. of FASTA records for which new lineages were created: {passed_tax_check}"
        + f"\nNo. of FASTA records for which only genus, species and/or strain information were created: {failed_tax_check}"
    )

    # Sanity check: every FASTA record must have produced a lineage row.
    if (passed_tax_check + failed_tax_check) != fasta_recs_written:
        logging.error(
            f"The number of input FASTA records [{fasta_recs_written}]"
            + f"\nis not equal to number of lineages created [{passed_tax_check + failed_tax_check}]!"
        )
        exit(1)
    else:
        logging.info("Succesfully created lineages and FASTA records! Done!!")
396 | |
397 | |
# Script entry point.
if __name__ == "__main__":
    main()
400 | |
401 # ~/apps/nowayout/bin/gen_per_species_fa_from_bold.py -tsv BOLD_Public.05-Feb-2024.tsv -csv ../tax.csv ─╯ | |
402 | |
403 # ======================================================= | |
404 # 2024-02-08 21:37:28,541 - INFO | |
405 # ======================================================= | |
406 # Indexing tax.csv... | |
407 | |
408 # ======================================================= | |
409 # 2024-02-08 21:38:06,567 - INFO | |
410 # ======================================================= | |
411 # Creating new squences per species... | |
412 | |
413 # ======================================================= | |
414 # 2024-02-08 21:38:06,572 - INFO | |
415 # ======================================================= | |
416 # Input TSV file BOLD_Public.05-Feb-2024.tsv is not in | |
417 # GZIP format. | |
418 | |
419 # ======================================================= | |
420 # 2024-02-08 22:01:04,554 - INFO | |
421 # ======================================================= | |
422 # Collected FASTA records for 497421 species'. | |
423 | |
424 # ======================================================= | |
425 # 2024-02-08 22:24:35,000 - INFO | |
426 # ======================================================= | |
427 # No. of raw records present in `ncbitax2lin` [tax.csv]: 2550767 | |
428 # No. of valid records collected from `ncbitax2lin` [tax.csv]: 2134980 | |
429 # No. of raw records in TSV [BOLD_Public.05-Feb-2024.tsv]: 9735210 | |
430 # No. of valid records in TSV [BOLD_Public.05-Feb-2024.tsv]: 4988323 | |
431 # No. of FASTA records for which new lineages were created: 4069202 | |
432 # No. of FASTA records for which only genus, species and/or strain information were created: 919121 | |
433 | |
434 # ======================================================= | |
435 # 2024-02-08 22:24:35,001 - INFO | |
436 # ======================================================= | |
437 # Succesfully created lineages and FASTA records! Done!! |