annotate 0.5.0/bin/gen_per_species_fa_from_bold.py @ 1:fa47b825716b

planemo upload
author kkonganti
date Mon, 31 Mar 2025 14:53:21 -0400
parents 97cd2f532efe
children
rev   line source
kkonganti@0 1 #!/usr/bin/env python3
kkonganti@0 2
kkonganti@0 3 import argparse
kkonganti@0 4 import gzip
kkonganti@0 5 import inspect
kkonganti@0 6 import logging
kkonganti@0 7 import os
kkonganti@0 8 import pprint
kkonganti@0 9 import re
kkonganti@0 10 import shutil
kkonganti@0 11 from collections import defaultdict
kkonganti@0 12 from typing import BinaryIO, TextIO, Union
kkonganti@0 13
kkonganti@0 14 from Bio import SeqIO
kkonganti@0 15 from Bio.Seq import Seq
kkonganti@0 16 from Bio.SeqRecord import SeqRecord
kkonganti@0 17
kkonganti@0 18
# Multiple inheritance for pretty printing of help text: RawTextHelpFormatter
# preserves the embedded newlines in the help strings below, while
# ArgumentDefaultsHelpFormatter appends each option's default value.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    pass
kkonganti@0 24
kkonganti@0 25
def get_lineages(csv: os.PathLike, cols: list) -> tuple:
    """
    Parse the output from the `ncbitax2lin` tool.

    Returns a 2-tuple of:
      - a dict mapping strain (or species, when the strain field is empty)
        to a comma-joined lineage string built from the requested columns, and
      - the number of raw data rows read from the CSV.

    Exits the program when the CSV is missing/empty or when the requested
    columns are absent from the header or too numerous.
    """
    lineages = dict()
    # Fail early on a None path, a missing file, or an empty file.
    # Existence is checked before os.path.getsize so that a nonexistent
    # file cannot raise OSError here.
    if csv is None or not os.path.exists(csv) or os.path.getsize(csv) == 0:
        logging.error(
            f"The CSV file [{os.path.basename(csv)}] is empty or does not exist!"
        )
        exit(1)

    logging.info(f"Indexing {os.path.basename(csv)}...")

    with open(csv, "r") as csv_fh:
        header_cols = csv_fh.readline().strip().split(",")
        # Indices (in header order) of the columns the caller asked for.
        user_req_cols = [
            tcol_i for tcol_i, tcol in enumerate(header_cols) if tcol in cols
        ]
        cols_not_found = [tcol for tcol in cols if tcol not in header_cols]
        raw_recs = 0

        if len(cols_not_found) > 0:
            logging.error(
                f"The following columns do not exist in the"
                + f"\nCSV file [ {os.path.basename(csv)} ]:\n"
                + "\n".join(cols_not_found)
            )
            exit(1)
        elif len(user_req_cols) > 9:
            logging.error(
                f"Only a total of 9 columns are needed!"
                + "\ntax_id,kindom,phylum,class,order,family,genus,species,strain"
            )
            exit(1)

        for tax in csv_fh:
            raw_recs += 1
            lcols = tax.strip().split(",")

            # Prefer a strain-level key; fall back to the species with an
            # empty strain column so every lineage string has 8 fields.
            if bool(lcols[user_req_cols[8]]):
                lineages[lcols[user_req_cols[8]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:]]
                )
            elif bool(lcols[user_req_cols[7]]):
                lineages[lcols[user_req_cols[7]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:8]] + [str()]
                )

    return lineages, raw_recs
kkonganti@0 78
kkonganti@0 79
def write_fasta(recs: list, basedir: os.PathLike, name: str, suffix: str) -> None:
    """
    Serialize the given SeqRecord objects to <basedir>/<name><suffix>
    in FASTA format (records carry no description text).
    """
    out_path = os.path.join(basedir, name + suffix)
    SeqIO.write(recs, out_path, "fasta")
kkonganti@0 89
kkonganti@0 90
def check_and_get_cols(pat: re.Pattern, cols: str, delim: str) -> list:
    """
    Validate that `cols` (a delimiter-separated string of column names)
    matches the compiled pattern `pat` and return the individual names
    as a list.

    Exits the program when the supplied string does not match.
    """
    # NOTE: annotation fixed from `re` (the module) to `re.Pattern`,
    # which is what callers actually pass (a compiled pattern).
    if not pat.match(cols):
        logging.error(
            f"Supplied columns' names {cols} should only have words"
            f"\n(alphanumeric) separated by: {delim}."
        )
        exit(1)

    # Strip any stray newlines before splitting into individual names.
    return re.sub("\n", "", cols).split(delim)
kkonganti@0 106
kkonganti@0 107
def parse_tsv(fh: Union[TextIO, BinaryIO], tcols: list, delim: str) -> tuple:
    """
    Parse the BOLD TSV file and collect per-species FASTA records.

    Returns a 3-tuple of:
      - dict: species name -> list of SeqRecord objects,
      - dict: species name -> list of accession IDs,
      - the number of raw data rows read.

    Exits the program when any requested column is missing from the header.
    """
    records, sp2accs = (defaultdict(list), defaultdict(list))
    header = fh.readline().strip().split(delim)
    raw_recs = 0

    if not all(col in header for col in tcols):
        logging.error(
            "The following columns were not found in the"
            + f"\nheader row of file {os.path.basename(fh.name)}\n"
            + "\n".join([ele for ele in tcols if ele not in header])
        )
        # Bail out: the 5-way unpack below requires all requested columns.
        exit(1)

    id_i, genus_i, species_i, strain_i, seq_i = [
        i for i, ele in enumerate(header) if ele in tcols
    ]

    for record in fh:
        raw_recs += 1
        # Split each row once instead of once per extracted field.
        fields = record.strip().split(delim)

        rec_id = fields[id_i]
        # Replace path separators so species names are safe as file names.
        species = re.sub(r"[\/\\]+", "-", fields[species_i])
        # Keep only nucleotide bases. `flags=` must be a keyword argument:
        # passing re.IGNORECASE positionally fills re.sub's `count`
        # parameter, limiting the number of substitutions instead of
        # making the match case-insensitive.
        seq = re.sub(r"[^ATGC]+", "", fields[seq_i], flags=re.IGNORECASE)

        # Skip rows with no usable species name.
        if re.match(r"None|Null", species, re.IGNORECASE):
            continue

        records[species].append(SeqRecord(Seq(seq), id=rec_id, description=str()))
        sp2accs[species].append(rec_id)

    logging.info(f"Collected FASTA records for {len(records.keys())} species'.")
    fh.close()
    return records, sp2accs, raw_recs
kkonganti@0 154
kkonganti@0 155
# Main
def main() -> None:
    """
    This script takes:
        1. The TSV file from BOLD systems,
        2. Takes as input a .csv file generated by `ncbitax2lin`.

    and then generates a folder containing individual FASTA sequence files
    per species. This is only possible if the full taxonomy of the barcode
    sequence is present in the FASTA header.
    """

    # Set logging. The banner-style format (rows of "=") matches the example
    # transcript at the bottom of this file.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\r\r",
        level=logging.DEBUG,
    )

    # Debug print.
    # NOTE(review): `ppp` is never used below; apparently kept for ad-hoc
    # debugging sessions.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-tsv",
        dest="tsv",
        default=False,
        required=True,
        help="Absolute UNIX path to the TSV file from BOLD systems"
        + "\nin uncompressed TXT format.",
    )
    required.add_argument(
        "-csv",
        dest="csv",
        default=False,
        required=True,
        help="Absolute UNIX path to .csv or .csv.gz file which is generated "
        + "\nby the `ncbitax2lin` tool.",
    )
    parser.add_argument(
        "-out",
        dest="out_folder",
        default=os.path.join(os.getcwd(), "species"),
        required=False,
        help="By default, the output is written to this\nfolder.",
    )
    parser.add_argument(
        "-f",
        dest="force_write_out",
        default=False,
        action="store_true",
        required=False,
        help="Force overwrite output directory contents.",
    )
    parser.add_argument(
        "-suffix",
        dest="fna_suffix",
        default=".fna",
        required=False,
        help="Suffix of the individual species FASTA files\nthat will be saved.",
    )
    parser.add_argument(
        "-ccols",
        dest="csv_cols",
        default="tax_id,superkingdom,phylum,class,order,family,genus,species,strain",
        required=False,
        help="Taxonomic lineage will be built using these columns from the output of"
        + "\n`ncbitax2lin`\ntool.",
    )
    parser.add_argument(
        "-ccols-sep",
        dest="csv_delim",
        default=",",
        required=False,
        help="The delimitor of the fields in the CSV file.",
    )
    parser.add_argument(
        "-tcols",
        dest="tsv_cols",
        default="processid\tgenus\tspecies\tsubspecies\tnucraw",
        required=False,
        help="For each species, the nucletide sequences will be\naggregated.",
    )
    parser.add_argument(
        "-tcols-sep",
        dest="tsv_delim",
        default="\t",
        required=False,
        help="The delimitor of the fields in the TSV file.",
    )

    # Parse defaults
    args = parser.parse_args()
    tsv = args.tsv
    csv = args.csv
    csep = args.csv_delim
    tsep = args.tsv_delim
    csv_cols = args.csv_cols
    tsv_cols = args.tsv_cols
    out = args.out_folder
    overwrite = args.force_write_out
    fna_suffix = args.fna_suffix
    # Patterns the user-supplied column strings must match: word characters
    # separated by the respective delimiter, ending on a word character
    # (i.e. no trailing delimiter).
    ccols_pat = re.compile(f"^[\w\{csep}]+?\w$")
    tcols_pat = re.compile(f"^[\w\{tsep}]+?\w$")
    final_lineages = os.path.join(out, "lineages.csv")
    lineages_not_found = os.path.join(out, "lineages_not_found.csv")
    base_fasta_dir = os.path.join(out, "fasta")

    # Basic checks
    # NOTE(review): when the output folder exists and -f was not given, this
    # only warns and then keeps going, writing into the existing folder —
    # confirm whether an exit was intended here.
    if not overwrite and os.path.exists(out):
        logging.warning(
            f"Output destination [{os.path.basename(out)}] already exists!"
            + "\nPlease use -f to delete and overwrite."
        )
    elif overwrite and os.path.exists(out):
        logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
        shutil.rmtree(out)

    # Validate user requested columns
    passed_ccols = check_and_get_cols(ccols_pat, csv_cols, csep)
    passed_tcols = check_and_get_cols(tcols_pat, tsv_cols, tsep)

    # Get taxonomy from ncbitax2lin
    lineages, raw_recs = get_lineages(csv, passed_ccols)

    # Finally, read BOLD tsv if lineage exists.
    logging.info(f"Creating new squences per species...")

    if not os.path.exists(out):
        os.makedirs(out)

    # Try GZIP first; fall back to plain text when the magic bytes don't match.
    try:
        gz_fh = gzip.open(tsv, "rt")
        records, sp2accs, traw_recs = parse_tsv(gz_fh, passed_tcols, tsep)
    except gzip.BadGzipFile:
        logging.info(f"Input TSV file {os.path.basename(tsv)} is not in\nGZIP format.")
        txt_fh = open(tsv, "r")
        records, sp2accs, traw_recs = parse_tsv(txt_fh, passed_tcols, tsep)

    passed_tax_check = 0  # records whose species/strain had an ncbitax2lin lineage
    failed_tax_check = 0  # records for which only a minimal lineage was synthesized
    fasta_recs_written = 0
    # NOTE(review): these two handles are never explicitly closed below;
    # they are flushed/closed only on interpreter exit.
    l_fh = open(final_lineages, "w")
    ln_fh = open(lineages_not_found, "w")
    l_fh.write(
        "identifiers,superkingdom,phylum,class,order,family,genus,species,strain\n"
    )
    ln_fh.write("fna_id,parsed_org\n")

    if not os.path.exists(base_fasta_dir):
        os.makedirs(base_fasta_dir)

    # One FASTA file per species; one lineage row per accession ID.
    for genus_species in records.keys():
        fasta_recs_written += len(records[genus_species])
        write_fasta(
            records[genus_species],
            base_fasta_dir,
            "_".join(genus_species.split(" ")),
            fna_suffix,
        )
        org_words = genus_species.split(" ")

        for id in sp2accs[genus_species]:
            if genus_species in lineages.keys():
                this_line = ",".join([id, lineages[genus_species]]) + "\n"

                # More than two words implies a subspecies/strain name:
                # replace the (possibly empty) strain column with the full name.
                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [id, lineages[genus_species].rstrip(","), genus_species]
                        )
                        + "\n"
                    )

                l_fh.write(this_line)
                passed_tax_check += 1
            else:
                # No ncbitax2lin lineage: emit a sparse row with only the
                # genus/species (and strain, when present) columns filled.
                this_line = (
                    ",".join(
                        [
                            id,
                            "",
                            "",
                            "",
                            "",
                            "",
                            org_words[0],
                            genus_species,
                            "",
                        ]
                    )
                    + "\n"
                )
                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [
                                id,
                                "",
                                "",
                                "",
                                "",
                                "",
                                org_words[0],
                                org_words[0] + " " + org_words[1],
                                genus_species,
                            ]
                        )
                        + "\n"
                    )
                l_fh.write(this_line)
                ln_fh.write(",".join([id, genus_species]) + "\n")
                failed_tax_check += 1

    # Summary of record counts at each stage of the pipeline.
    logging.info(
        f"No. of raw records present in `ncbitax2lin` [{os.path.basename(csv)}]: {raw_recs}"
        + f"\nNo. of valid records collected from `ncbitax2lin` [{os.path.basename(csv)}]: {len(lineages.keys())}"
        + f"\nNo. of raw records in TSV [{os.path.basename(tsv)}]: {traw_recs}"
        + f"\nNo. of valid records in TSV [{os.path.basename(tsv)}]: {passed_tax_check + failed_tax_check}"
        + f"\nNo. of FASTA records for which new lineages were created: {passed_tax_check}"
        + f"\nNo. of FASTA records for which only genus, species and/or strain information were created: {failed_tax_check}"
    )

    # Sanity check: every FASTA record must have produced exactly one lineage row.
    if (passed_tax_check + failed_tax_check) != fasta_recs_written:
        logging.error(
            f"The number of input FASTA records [{fasta_recs_written}]"
            + f"\nis not equal to number of lineages created [{passed_tax_check + failed_tax_check}]!"
        )
        exit(1)
    else:
        logging.info("Succesfully created lineages and FASTA records! Done!!")
kkonganti@0 396
kkonganti@0 397
# Script entry point.
if __name__ == "__main__":
    main()
kkonganti@0 400
kkonganti@0 401 # ~/apps/nowayout/bin/gen_per_species_fa_from_bold.py -tsv BOLD_Public.05-Feb-2024.tsv -csv ../tax.csv ─╯
kkonganti@0 402
kkonganti@0 403 # =======================================================
kkonganti@0 404 # 2024-02-08 21:37:28,541 - INFO
kkonganti@0 405 # =======================================================
kkonganti@0 406 # Indexing tax.csv...
kkonganti@0 407
kkonganti@0 408 # =======================================================
kkonganti@0 409 # 2024-02-08 21:38:06,567 - INFO
kkonganti@0 410 # =======================================================
kkonganti@0 411 # Creating new squences per species...
kkonganti@0 412
kkonganti@0 413 # =======================================================
kkonganti@0 414 # 2024-02-08 21:38:06,572 - INFO
kkonganti@0 415 # =======================================================
kkonganti@0 416 # Input TSV file BOLD_Public.05-Feb-2024.tsv is not in
kkonganti@0 417 # GZIP format.
kkonganti@0 418
kkonganti@0 419 # =======================================================
kkonganti@0 420 # 2024-02-08 22:01:04,554 - INFO
kkonganti@0 421 # =======================================================
kkonganti@0 422 # Collected FASTA records for 497421 species'.
kkonganti@0 423
kkonganti@0 424 # =======================================================
kkonganti@0 425 # 2024-02-08 22:24:35,000 - INFO
kkonganti@0 426 # =======================================================
kkonganti@0 427 # No. of raw records present in `ncbitax2lin` [tax.csv]: 2550767
kkonganti@0 428 # No. of valid records collected from `ncbitax2lin` [tax.csv]: 2134980
kkonganti@0 429 # No. of raw records in TSV [BOLD_Public.05-Feb-2024.tsv]: 9735210
kkonganti@0 430 # No. of valid records in TSV [BOLD_Public.05-Feb-2024.tsv]: 4988323
kkonganti@0 431 # No. of FASTA records for which new lineages were created: 4069202
kkonganti@0 432 # No. of FASTA records for which only genus, species and/or strain information were created: 919121
kkonganti@0 433
kkonganti@0 434 # =======================================================
kkonganti@0 435 # 2024-02-08 22:24:35,001 - INFO
kkonganti@0 436 # =======================================================
kkonganti@0 437 # Succesfully created lineages and FASTA records! Done!!