diff 0.5.0/bin/gen_per_species_fa_from_bold.py @ 0:97cd2f532efe

planemo upload
author kkonganti
date Mon, 31 Mar 2025 14:50:40 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/0.5.0/bin/gen_per_species_fa_from_bold.py	Mon Mar 31 14:50:40 2025 -0400
@@ -0,0 +1,437 @@
+#!/usr/bin/env python3
+
+import argparse
+import gzip
+import inspect
+import logging
+import os
+import pprint
+import re
+import shutil
+from collections import defaultdict
+from typing import BinaryIO, TextIO, Union
+
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+
# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    """Argparse help formatter that keeps raw text AND shows argument defaults."""

    pass
+
+
def get_lineages(csv: os.PathLike, cols: list) -> tuple:
    """
    Parse the output from the `ncbitax2lin` tool.

    Parameters:
        csv: Path to the CSV file produced by `ncbitax2lin`.
        cols: Up to 9 column names to extract, in taxonomic order:
              tax_id,superkingdom,phylum,class,order,family,genus,species,strain.

    Returns:
        A 2-tuple of:
          - dict keyed on "genus species[ strain]" mapping to a comma-joined
            lineage string (superkingdom..strain),
          - the number of raw data rows read from the CSV.

    Exits with status 1 on a missing/empty file or a bad column request.
    """
    lineages = dict()
    # Check each condition separately so os.path.getsize() is never called
    # on a non-existent path (the original combined check could raise
    # FileNotFoundError before the error was logged).
    if csv is None or not os.path.exists(csv) or os.path.getsize(csv) == 0:
        logging.error(
            f"The CSV file [{os.path.basename(csv)}] is empty or does not exist!"
        )
        exit(1)

    logging.info(f"Indexing {os.path.basename(csv)}...")

    with open(csv, "r") as csv_fh:
        header_cols = csv_fh.readline().strip().split(",")
        user_req_cols = [
            tcol_i for tcol_i, tcol in enumerate(header_cols) if tcol in cols
        ]
        cols_not_found = [tcol for tcol in cols if tcol not in header_cols]
        raw_recs = 0

        if len(cols_not_found) > 0:
            logging.error(
                f"The following columns do not exist in the"
                + f"\nCSV file [ {os.path.basename(csv)} ]:\n"
                # Join with newlines so each missing column is readable;
                # "".join ran the names together.
                + "\n".join(cols_not_found)
            )
            exit(1)
        elif len(user_req_cols) > 9:
            logging.error(
                f"Only a total of 9 columns are needed!"
                + "\ntax_id,superkingdom,phylum,class,order,family,genus,species,strain"
            )
            exit(1)

        for tax in csv_fh:
            raw_recs += 1
            lcols = tax.strip().split(",")

            # Prefer keying on the strain column (index 8) when populated;
            # otherwise fall back to the species column (index 7) and pad
            # the lineage with an empty strain field.
            if bool(lcols[user_req_cols[8]]):
                lineages[lcols[user_req_cols[8]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:]]
                )
            elif bool(lcols[user_req_cols[7]]):
                lineages[lcols[user_req_cols[7]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:8]] + [str()]
                )

    # The `with` block closes the handle; no explicit close() needed.
    return lineages, raw_recs
+
+
def write_fasta(recs: list, basedir: os.PathLike, name: str, suffix: str) -> None:
    """
    Serialize sequence records to a FASTA file.

    The output file is placed in ``basedir`` and named ``name + suffix``;
    records are written exactly as given (no description text is added).
    """
    out_path = os.path.join(basedir, name + suffix)
    SeqIO.write(recs, out_path, "fasta")
+
+
def check_and_get_cols(pat: re.Pattern, cols: str, delim: str) -> list:
    """
    Validate a delimited column-name string and split it into a list.

    Parameters:
        pat: Compiled pattern the whole `cols` string must match
             (note: the original annotation was `re`, the module itself).
        cols: Delimited column names, e.g. "tax_id,genus,species".
        delim: Delimiter separating the names.

    Returns:
        The list of column names with any newlines stripped.

    Exits with status 1 when `cols` does not match `pat`.
    """
    if not pat.match(cols):
        logging.error(
            f"Supplied columns' names {cols} should only have words"
            f"\n(alphanumeric) separated by: {delim}."
        )
        exit(1)

    # Plain str.replace is sufficient to drop newlines; no regex needed.
    return cols.replace("\n", "").split(delim)
+
+
def parse_tsv(fh: Union[TextIO, BinaryIO], tcols: list, delim: str) -> tuple:
    """
    Parse an open BOLD TSV handle and collect per-species FASTA records.

    Parameters:
        fh: Open text-mode handle to the TSV (plain or gzip-opened "rt").
        tcols: The 5 required column names, expected to appear in the header
               in id, genus, species, strain, sequence order.
        delim: Field delimiter (normally a tab).

    Returns:
        A 3-tuple of:
          - dict mapping species name -> list of Bio SeqRecord objects,
          - dict mapping species name -> list of record IDs,
          - the number of raw data rows read.

    Exits with status 1 when a required column is missing from the header.
    """
    records, sp2accs = (defaultdict(list), defaultdict(list))
    header = fh.readline().strip().split(delim)
    raw_recs = 0

    if not all(col in header for col in tcols):
        logging.error(
            "The following columns were not found in the"
            + f"\nheader row of file {os.path.basename(fh.name)}\n"
            + "\n".join([ele for ele in tcols if ele not in header])
        )
        # Bail out here; the original fell through and crashed on the
        # 5-way unpacking below instead.
        exit(1)

    # NOTE(review): this unpacking assumes the requested columns occur in the
    # header in id/genus/species/strain/sequence order and without duplicates
    # — true for BOLD exports; verify for other inputs.
    id_i, genus_i, species_i, strain_i, seq_i = [
        i for i, ele in enumerate(header) if ele in tcols
    ]

    for record in fh:
        raw_recs += 1
        # Split once per line (the original re-split the line for each field).
        fields = record.strip().split(delim)

        rec_id = fields[id_i]
        # Slashes and backslashes would break the per-species file names.
        species = re.sub(r"[\/\\]+", "-", fields[species_i])
        # Keep only unambiguous nucleotides. `flags=` MUST be a keyword:
        # re.sub()'s 4th positional parameter is `count`, so the original
        # call silently limited replacements to 2 and stayed case-sensitive.
        seq = re.sub(r"[^ATGC]+", "", fields[seq_i], flags=re.IGNORECASE)

        if re.match(r"None|Null", species, re.IGNORECASE):
            continue

        records[species].append(SeqRecord(Seq(seq), id=rec_id, description=str()))
        sp2accs[species].append(rec_id)

    logging.info(f"Collected FASTA records for {len(records.keys())} species'.")
    fh.close()
    return records, sp2accs, raw_recs
+
+
# Main
def main() -> None:
    """
    This script takes:
        1. The TSV file from BOLD systems,
        2. Takes as input a .csv file generated by `ncbitax2lin`.

    and then generates a folder containing individual FASTA sequence files
    per species. This is only possible if the full taxonomy of the barcode
    sequence is present in the FASTA header.
    """

    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\r\r",
        level=logging.DEBUG,
    )

    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-tsv",
        dest="tsv",
        default=False,
        required=True,
        help="Absolute UNIX path to the TSV file from BOLD systems"
        + "\nin uncompressed TXT format.",
    )
    required.add_argument(
        "-csv",
        dest="csv",
        default=False,
        required=True,
        help="Absolute UNIX path to .csv or .csv.gz file which is generated "
        + "\nby the `ncbitax2lin` tool.",
    )
    parser.add_argument(
        "-out",
        dest="out_folder",
        default=os.path.join(os.getcwd(), "species"),
        required=False,
        help="By default, the output is written to this\nfolder.",
    )
    parser.add_argument(
        "-f",
        dest="force_write_out",
        default=False,
        action="store_true",
        required=False,
        help="Force overwrite output directory contents.",
    )
    parser.add_argument(
        "-suffix",
        dest="fna_suffix",
        default=".fna",
        required=False,
        help="Suffix of the individual species FASTA files\nthat will be saved.",
    )
    parser.add_argument(
        "-ccols",
        dest="csv_cols",
        default="tax_id,superkingdom,phylum,class,order,family,genus,species,strain",
        required=False,
        help="Taxonomic lineage will be built using these columns from the output of"
        + "\n`ncbitax2lin`\ntool.",
    )
    parser.add_argument(
        "-ccols-sep",
        dest="csv_delim",
        default=",",
        required=False,
        help="The delimiter of the fields in the CSV file.",
    )
    parser.add_argument(
        "-tcols",
        dest="tsv_cols",
        default="processid\tgenus\tspecies\tsubspecies\tnucraw",
        required=False,
        help="For each species, the nucleotide sequences will be\naggregated.",
    )
    parser.add_argument(
        "-tcols-sep",
        dest="tsv_delim",
        default="\t",
        required=False,
        help="The delimiter of the fields in the TSV file.",
    )

    # Parse defaults
    args = parser.parse_args()
    tsv = args.tsv
    csv = args.csv
    csep = args.csv_delim
    tsep = args.tsv_delim
    csv_cols = args.csv_cols
    tsv_cols = args.tsv_cols
    out = args.out_folder
    overwrite = args.force_write_out
    fna_suffix = args.fna_suffix
    # Raw f-strings avoid invalid-escape warnings; the literal backslash
    # before the interpolated delimiter escapes it inside the char class.
    ccols_pat = re.compile(rf"^[\w\{csep}]+?\w$")
    tcols_pat = re.compile(rf"^[\w\{tsep}]+?\w$")
    final_lineages = os.path.join(out, "lineages.csv")
    lineages_not_found = os.path.join(out, "lineages_not_found.csv")
    base_fasta_dir = os.path.join(out, "fasta")

    # Basic checks
    if not overwrite and os.path.exists(out):
        logging.warning(
            f"Output destination [{os.path.basename(out)}] already exists!"
            + "\nPlease use -f to delete and overwrite."
        )
        # Stop here as the message promises; the original warned but then
        # proceeded to overwrite the lineage files anyway.
        exit(1)
    elif overwrite and os.path.exists(out):
        logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
        shutil.rmtree(out)

    # Validate user requested columns
    passed_ccols = check_and_get_cols(ccols_pat, csv_cols, csep)
    passed_tcols = check_and_get_cols(tcols_pat, tsv_cols, tsep)

    # Get taxonomy from ncbitax2lin
    lineages, raw_recs = get_lineages(csv, passed_ccols)

    # Finally, read BOLD tsv if lineage exists.
    logging.info(f"Creating new sequences per species...")

    if not os.path.exists(out):
        os.makedirs(out)

    # Try gzip first; fall back to plain text when the magic bytes are wrong.
    try:
        gz_fh = gzip.open(tsv, "rt")
        records, sp2accs, traw_recs = parse_tsv(gz_fh, passed_tcols, tsep)
    except gzip.BadGzipFile:
        logging.info(f"Input TSV file {os.path.basename(tsv)} is not in\nGZIP format.")
        txt_fh = open(tsv, "r")
        records, sp2accs, traw_recs = parse_tsv(txt_fh, passed_tcols, tsep)

    passed_tax_check = 0
    failed_tax_check = 0
    fasta_recs_written = 0

    if not os.path.exists(base_fasta_dir):
        os.makedirs(base_fasta_dir)

    # Context managers so both lineage CSVs are flushed and closed even if
    # FASTA writing fails part-way (the original never closed these handles).
    with open(final_lineages, "w") as l_fh, open(lineages_not_found, "w") as ln_fh:
        l_fh.write(
            "identifiers,superkingdom,phylum,class,order,family,genus,species,strain\n"
        )
        ln_fh.write("fna_id,parsed_org\n")

        for genus_species in records.keys():
            fasta_recs_written += len(records[genus_species])
            write_fasta(
                records[genus_species],
                base_fasta_dir,
                "_".join(genus_species.split(" ")),
                fna_suffix,
            )
            org_words = genus_species.split(" ")

            for acc in sp2accs[genus_species]:
                if genus_species in lineages.keys():
                    this_line = ",".join([acc, lineages[genus_species]]) + "\n"

                    # More than two words => the name carries a strain part;
                    # replace the empty strain column with the full name.
                    if len(org_words) > 2:
                        this_line = (
                            ",".join(
                                [acc, lineages[genus_species].rstrip(","), genus_species]
                            )
                            + "\n"
                        )

                    l_fh.write(this_line)
                    passed_tax_check += 1
                else:
                    # No NCBI lineage found: emit a minimal row with only
                    # genus, species (and strain when present) populated.
                    this_line = (
                        ",".join(
                            [
                                acc,
                                "",
                                "",
                                "",
                                "",
                                "",
                                org_words[0],
                                genus_species,
                                "",
                            ]
                        )
                        + "\n"
                    )
                    if len(org_words) > 2:
                        this_line = (
                            ",".join(
                                [
                                    acc,
                                    "",
                                    "",
                                    "",
                                    "",
                                    "",
                                    org_words[0],
                                    org_words[0] + " " + org_words[1],
                                    genus_species,
                                ]
                            )
                            + "\n"
                        )
                    l_fh.write(this_line)
                    ln_fh.write(",".join([acc, genus_species]) + "\n")
                    failed_tax_check += 1

    logging.info(
        f"No. of raw records present in `ncbitax2lin` [{os.path.basename(csv)}]: {raw_recs}"
        + f"\nNo. of valid records collected from `ncbitax2lin` [{os.path.basename(csv)}]: {len(lineages.keys())}"
        + f"\nNo. of raw records in TSV [{os.path.basename(tsv)}]: {traw_recs}"
        + f"\nNo. of valid records in TSV [{os.path.basename(tsv)}]: {passed_tax_check + failed_tax_check}"
        + f"\nNo. of FASTA records for which new lineages were created: {passed_tax_check}"
        + f"\nNo. of FASTA records for which only genus, species and/or strain information were created: {failed_tax_check}"
    )

    # Sanity check: every FASTA record written must have produced a lineage row.
    if (passed_tax_check + failed_tax_check) != fasta_recs_written:
        logging.error(
            f"The number of input FASTA records [{fasta_recs_written}]"
            + f"\nis not equal to number of lineages created [{passed_tax_check + failed_tax_check}]!"
        )
        exit(1)
    else:
        logging.info("Successfully created lineages and FASTA records! Done!!")
+
# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()
+
+# ~/apps/nowayout/bin/gen_per_species_fa_from_bold.py -tsv BOLD_Public.05-Feb-2024.tsv -csv ../tax.csv                                  ─╯
+
+# =======================================================
+# 2024-02-08 21:37:28,541 - INFO
+# =======================================================
+# Indexing tax.csv...
+
+# =======================================================
+# 2024-02-08 21:38:06,567 - INFO
+# =======================================================
+# Creating new squences per species...
+
+# =======================================================
+# 2024-02-08 21:38:06,572 - INFO
+# =======================================================
+# Input TSV file BOLD_Public.05-Feb-2024.tsv is not in
+# GZIP format.
+
+# =======================================================
+# 2024-02-08 22:01:04,554 - INFO
+# =======================================================
+# Collected FASTA records for 497421 species'.
+
+# =======================================================
+# 2024-02-08 22:24:35,000 - INFO
+# =======================================================
+# No. of raw records present in `ncbitax2lin` [tax.csv]: 2550767
+# No. of valid records collected from `ncbitax2lin` [tax.csv]: 2134980
+# No. of raw records in TSV [BOLD_Public.05-Feb-2024.tsv]: 9735210
+# No. of valid records in TSV [BOLD_Public.05-Feb-2024.tsv]: 4988323
+# No. of FASTA records for which new lineages were created: 4069202
+# No. of FASTA records for which only genus, species and/or strain information were created: 919121
+
+# =======================================================
+# 2024-02-08 22:24:35,001 - INFO
+# =======================================================
+# Succesfully created lineages and FASTA records! Done!!