comparison 0.5.0/bin/gen_per_species_fa_from_bold.py @ 0:97cd2f532efe

planemo upload
author kkonganti
date Mon, 31 Mar 2025 14:50:40 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:97cd2f532efe
1 #!/usr/bin/env python3
2
3 import argparse
4 import gzip
5 import inspect
6 import logging
7 import os
8 import pprint
9 import re
10 import shutil
11 from collections import defaultdict
12 from typing import BinaryIO, TextIO, Union
13
14 from Bio import SeqIO
15 from Bio.Seq import Seq
16 from Bio.SeqRecord import SeqRecord
17
18
# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    """Argparse formatter that keeps raw help-text layout AND shows defaults."""

    pass
24
25
def get_lineages(csv: os.PathLike, cols: list) -> tuple:
    """
    Parse the output from the `ncbitax2lin` tool.

    Parameters:
        csv  : Path to the CSV file produced by `ncbitax2lin`.
        cols : The 9 lineage column names to extract
               (tax_id,kingdom,...,species,strain).

    Returns a tuple (lineages, raw_recs): `lineages` is a dict keyed on
    strain (or on species when the strain field is empty) mapping to the
    comma-joined lineage string; `raw_recs` is the number of data rows
    read from the CSV.

    Exits with status 1 on a missing/empty file or bad columns.
    """
    lineages = dict()
    # Bug fix: the original check `not (exists or getsize > 0)` was False
    # for ANY existing file — even an empty one — because `exists` alone
    # made the parenthesized expression true.
    if csv is None or not os.path.exists(csv) or os.path.getsize(csv) == 0:
        logging.error(
            f"The CSV file [{os.path.basename(csv)}] is empty or does not exist!"
        )
        exit(1)

    logging.info(f"Indexing {os.path.basename(csv)}...")

    with open(csv, "r") as csv_fh:
        header_cols = csv_fh.readline().strip().split(",")
        # Indices (in header order) of the columns the user asked for.
        user_req_cols = [
            tcol_i for tcol_i, tcol in enumerate(header_cols) if tcol in cols
        ]
        cols_not_found = [tcol for tcol in cols if tcol not in header_cols]
        raw_recs = 0

        if len(cols_not_found) > 0:
            logging.error(
                f"The following columns do not exist in the"
                + f"\nCSV file [ {os.path.basename(csv)} ]:\n"
                # Bug fix: "".join mashed the missing names into one word.
                + "\n".join(cols_not_found)
            )
            exit(1)
        elif len(user_req_cols) > 9:
            logging.error(
                f"Only a total of 9 columns are needed!"
                + "\ntax_id,kindom,phylum,class,order,family,genus,species,strain"
            )
            exit(1)

        for tax in csv_fh:
            raw_recs += 1
            lcols = tax.strip().split(",")

            # Prefer the strain column (index 8) as the dict key; fall
            # back to species (index 7) with an empty strain slot so the
            # value always has the same number of fields.
            if bool(lcols[user_req_cols[8]]):
                lineages[lcols[user_req_cols[8]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:]]
                )
            elif bool(lcols[user_req_cols[7]]):
                lineages[lcols[user_req_cols[7]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:8]] + [str()]
                )

    # The `with` block closes the handle; no explicit close() needed.
    return lineages, raw_recs
78
79
def write_fasta(recs: list, basedir: os.PathLike, name: str, suffix: str) -> None:
    """
    Write the given records, FASTA-formatted and without descriptions,
    to `basedir/name + suffix`.
    """
    out_path = os.path.join(basedir, name + suffix)
    SeqIO.write(recs, out_path, "fasta")
89
90
def check_and_get_cols(pat: re.Pattern, cols: str, delim: str) -> list:
    """
    Validate the user-supplied column string against `pat` and split it
    into a list of column names.

    Parameters:
        pat   : Compiled pattern the whole column string must match.
                (Fixed annotation: the original hinted the `re` module.)
        cols  : Delimiter-separated column names, e.g. "a,b,c".
        delim : The delimiter separating the names.

    Exits with status 1 if `cols` does not match `pat`.
    """
    if not pat.match(cols):
        logging.error(
            f"Supplied columns' names {cols} should only have words"
            f"\n(alphanumeric) separated by: {delim}."
        )
        exit(1)

    # Strip stray newlines (e.g. from shell quoting) before splitting.
    return re.sub("\n", "", cols).split(delim)
106
107
def parse_tsv(fh: Union[TextIO, BinaryIO], tcols: list, delim: str) -> tuple:
    """
    Parse the BOLD systems TSV file and aggregate FASTA records per species.

    Parameters:
        fh    : Open (text-mode) handle to the TSV file; closed on return.
        tcols : Column names to extract; their header order is assumed to
                be id, genus, species, strain, sequence.
        delim : Field delimiter of the TSV file.

    Returns a tuple (records, sp2accs, raw_recs): `records` maps species
    name -> list of SeqRecord, `sp2accs` maps species name -> list of
    record IDs, and `raw_recs` is the number of data rows read.
    """
    records, sp2accs = (defaultdict(list), defaultdict(list))
    header = fh.readline().strip().split(delim)
    raw_recs = 0

    if not all(col in header for col in tcols):
        logging.error(
            "The following columns were not found in the"
            + f"\nheader row of file {os.path.basename(fh.name)}\n"
            + "\n".join([ele for ele in tcols if ele not in header])
        )
        # Bug fix: abort here instead of crashing later on the 5-way
        # unpack below (the sibling functions also exit after logging).
        exit(1)

    # NOTE(review): assumes exactly 5 of the requested columns exist and
    # that their header order matches id/genus/species/strain/sequence.
    id_i, genus_i, species_i, strain_i, seq_i = [
        i for i, ele in enumerate(header) if ele in tcols
    ]

    for record in fh:
        raw_recs += 1
        fields = record.strip().split(delim)  # split once per row

        rec_id = fields[id_i]
        # Collapse slashes so the species name is safe in file names.
        species = re.sub(r"[\/\\]+", "-", fields[species_i])
        # Keep only nucleotide letters. Bug fix: the original passed
        # re.IGNORECASE as the positional `count` argument of re.sub,
        # which limited it to 2 replacements and did NOT ignore case.
        seq = re.sub(r"[^ATGC]+", "", fields[seq_i], flags=re.IGNORECASE)

        # Skip rows with no usable species name.
        if re.match(r"None|Null", species, re.IGNORECASE):
            continue

        records[species].append(SeqRecord(Seq(seq), id=rec_id, description=str()))
        sp2accs[species].append(rec_id)

    logging.info(f"Collected FASTA records for {len(records.keys())} species'.")
    fh.close()
    return records, sp2accs, raw_recs
154
155
# Main
def main() -> None:
    """
    This script takes:
        1. The TSV file from BOLD systems,
        2. Takes as input a .csv file generated by `ncbitax2lin`.

    and then generates a folder containing individual FASTA sequence files
    per species. This is only possible if the full taxonomy of the barcode
    sequence is present in the FASTA header.
    """

    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\r\r",
        level=logging.DEBUG,
    )

    # Debug print.
    # NOTE(review): `ppp` is never used below — apparently kept for
    # ad-hoc debugging sessions.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-tsv",
        dest="tsv",
        default=False,
        required=True,
        help="Absolute UNIX path to the TSV file from BOLD systems"
        + "\nin uncompressed TXT format.",
    )
    required.add_argument(
        "-csv",
        dest="csv",
        default=False,
        required=True,
        help="Absolute UNIX path to .csv or .csv.gz file which is generated "
        + "\nby the `ncbitax2lin` tool.",
    )
    parser.add_argument(
        "-out",
        dest="out_folder",
        default=os.path.join(os.getcwd(), "species"),
        required=False,
        help="By default, the output is written to this\nfolder.",
    )
    parser.add_argument(
        "-f",
        dest="force_write_out",
        default=False,
        action="store_true",
        required=False,
        help="Force overwrite output directory contents.",
    )
    parser.add_argument(
        "-suffix",
        dest="fna_suffix",
        default=".fna",
        required=False,
        help="Suffix of the individual species FASTA files\nthat will be saved.",
    )
    parser.add_argument(
        "-ccols",
        dest="csv_cols",
        default="tax_id,superkingdom,phylum,class,order,family,genus,species,strain",
        required=False,
        help="Taxonomic lineage will be built using these columns from the output of"
        + "\n`ncbitax2lin`\ntool.",
    )
    parser.add_argument(
        "-ccols-sep",
        dest="csv_delim",
        default=",",
        required=False,
        help="The delimitor of the fields in the CSV file.",
    )
    parser.add_argument(
        "-tcols",
        dest="tsv_cols",
        default="processid\tgenus\tspecies\tsubspecies\tnucraw",
        required=False,
        help="For each species, the nucletide sequences will be\naggregated.",
    )
    parser.add_argument(
        "-tcols-sep",
        dest="tsv_delim",
        default="\t",
        required=False,
        help="The delimitor of the fields in the TSV file.",
    )

    # Parse defaults
    args = parser.parse_args()
    tsv = args.tsv
    csv = args.csv
    csep = args.csv_delim
    tsep = args.tsv_delim
    csv_cols = args.csv_cols
    tsv_cols = args.tsv_cols
    out = args.out_folder
    overwrite = args.force_write_out
    fna_suffix = args.fna_suffix
    # Patterns requiring the column strings to be word characters
    # separated by the respective delimiter (backslash-escaped so the
    # delimiter is taken literally inside the character class).
    ccols_pat = re.compile(f"^[\w\{csep}]+?\w$")
    tcols_pat = re.compile(f"^[\w\{tsep}]+?\w$")
    final_lineages = os.path.join(out, "lineages.csv")
    lineages_not_found = os.path.join(out, "lineages_not_found.csv")
    base_fasta_dir = os.path.join(out, "fasta")

    # Basic checks
    # NOTE(review): when the output folder exists and -f was not given,
    # this only warns and then keeps going — it does not exit. Confirm
    # whether writing into an existing folder is intended.
    if not overwrite and os.path.exists(out):
        logging.warning(
            f"Output destination [{os.path.basename(out)}] already exists!"
            + "\nPlease use -f to delete and overwrite."
        )
    elif overwrite and os.path.exists(out):
        logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
        shutil.rmtree(out)

    # Validate user requested columns
    passed_ccols = check_and_get_cols(ccols_pat, csv_cols, csep)
    passed_tcols = check_and_get_cols(tcols_pat, tsv_cols, tsep)

    # Get taxonomy from ncbitax2lin
    lineages, raw_recs = get_lineages(csv, passed_ccols)

    # Finally, read BOLD tsv if lineage exists.
    logging.info(f"Creating new squences per species...")

    if not os.path.exists(out):
        os.makedirs(out)

    # Try reading the TSV as GZIP first; gzip raises BadGzipFile on the
    # first read of a non-gzip file, in which case fall back to plain text.
    try:
        gz_fh = gzip.open(tsv, "rt")
        records, sp2accs, traw_recs = parse_tsv(gz_fh, passed_tcols, tsep)
    except gzip.BadGzipFile:
        logging.info(f"Input TSV file {os.path.basename(tsv)} is not in\nGZIP format.")
        txt_fh = open(tsv, "r")
        records, sp2accs, traw_recs = parse_tsv(txt_fh, passed_tcols, tsep)

    # Counters and output handles for the lineage CSVs.
    passed_tax_check = 0
    failed_tax_check = 0
    fasta_recs_written = 0
    l_fh = open(final_lineages, "w")
    ln_fh = open(lineages_not_found, "w")
    l_fh.write(
        "identifiers,superkingdom,phylum,class,order,family,genus,species,strain\n"
    )
    ln_fh.write("fna_id,parsed_org\n")

    if not os.path.exists(base_fasta_dir):
        os.makedirs(base_fasta_dir)

    # One FASTA file per species; spaces in the organism name become
    # underscores in the file name.
    for genus_species in records.keys():
        fasta_recs_written += len(records[genus_species])
        write_fasta(
            records[genus_species],
            base_fasta_dir,
            "_".join(genus_species.split(" ")),
            fna_suffix,
        )
        org_words = genus_species.split(" ")

        for id in sp2accs[genus_species]:
            if genus_species in lineages.keys():
                # Lineage known from ncbitax2lin.
                this_line = ",".join([id, lineages[genus_species]]) + "\n"

                # More than two words => the name carries extra (strain)
                # info; put the full name into the strain column.
                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [id, lineages[genus_species].rstrip(","), genus_species]
                        )
                        + "\n"
                    )

                l_fh.write(this_line)
                passed_tax_check += 1
            else:
                # No lineage found: emit a minimal row with only genus,
                # species (and strain when present) populated, and log
                # the organism to lineages_not_found.csv.
                this_line = (
                    ",".join(
                        [
                            id,
                            "",
                            "",
                            "",
                            "",
                            "",
                            org_words[0],
                            genus_species,
                            "",
                        ]
                    )
                    + "\n"
                )
                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [
                                id,
                                "",
                                "",
                                "",
                                "",
                                "",
                                org_words[0],
                                org_words[0] + " " + org_words[1],
                                genus_species,
                            ]
                        )
                        + "\n"
                    )
                l_fh.write(this_line)
                ln_fh.write(",".join([id, genus_species]) + "\n")
                failed_tax_check += 1

    # Summary of everything read and written.
    logging.info(
        f"No. of raw records present in `ncbitax2lin` [{os.path.basename(csv)}]: {raw_recs}"
        + f"\nNo. of valid records collected from `ncbitax2lin` [{os.path.basename(csv)}]: {len(lineages.keys())}"
        + f"\nNo. of raw records in TSV [{os.path.basename(tsv)}]: {traw_recs}"
        + f"\nNo. of valid records in TSV [{os.path.basename(tsv)}]: {passed_tax_check + failed_tax_check}"
        + f"\nNo. of FASTA records for which new lineages were created: {passed_tax_check}"
        + f"\nNo. of FASTA records for which only genus, species and/or strain information were created: {failed_tax_check}"
    )

    # Sanity check: every FASTA record must have produced a lineage row.
    if (passed_tax_check + failed_tax_check) != fasta_recs_written:
        logging.error(
            f"The number of input FASTA records [{fasta_recs_written}]"
            + f"\nis not equal to number of lineages created [{passed_tax_check + failed_tax_check}]!"
        )
        exit(1)
    else:
        logging.info("Succesfully created lineages and FASTA records! Done!!")
396
397
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
400
401 # ~/apps/nowayout/bin/gen_per_species_fa_from_bold.py -tsv BOLD_Public.05-Feb-2024.tsv -csv ../tax.csv ─╯
402
403 # =======================================================
404 # 2024-02-08 21:37:28,541 - INFO
405 # =======================================================
406 # Indexing tax.csv...
407
408 # =======================================================
409 # 2024-02-08 21:38:06,567 - INFO
410 # =======================================================
411 # Creating new squences per species...
412
413 # =======================================================
414 # 2024-02-08 21:38:06,572 - INFO
415 # =======================================================
416 # Input TSV file BOLD_Public.05-Feb-2024.tsv is not in
417 # GZIP format.
418
419 # =======================================================
420 # 2024-02-08 22:01:04,554 - INFO
421 # =======================================================
422 # Collected FASTA records for 497421 species'.
423
424 # =======================================================
425 # 2024-02-08 22:24:35,000 - INFO
426 # =======================================================
427 # No. of raw records present in `ncbitax2lin` [tax.csv]: 2550767
428 # No. of valid records collected from `ncbitax2lin` [tax.csv]: 2134980
429 # No. of raw records in TSV [BOLD_Public.05-Feb-2024.tsv]: 9735210
430 # No. of valid records in TSV [BOLD_Public.05-Feb-2024.tsv]: 4988323
431 # No. of FASTA records for which new lineages were created: 4069202
432 # No. of FASTA records for which only genus, species and/or strain information were created: 919121
433
434 # =======================================================
435 # 2024-02-08 22:24:35,001 - INFO
436 # =======================================================
437 # Succesfully created lineages and FASTA records! Done!!