#!/usr/bin/env python3

import argparse
import gzip
import inspect
import logging
import os
import pprint
import re
import shutil
from collections import defaultdict
from typing import BinaryIO, TextIO, Union

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    pass


def get_lineages(csv: os.PathLike, cols: list) -> tuple:
    """
    Parse the output from the `ncbitax2lin` tool.

    Args:
        csv: Path to the CSV produced by `ncbitax2lin`.
        cols: Column names (in lineage order) to extract from the CSV.

    Returns:
        A tuple ``(lineages, raw_recs)`` where ``lineages`` maps a
        species or strain name to its comma-joined taxonomic lineage and
        ``raw_recs`` is the number of data rows read.
    """
    lineages = dict()

    # Fail early on a missing or empty CSV. The original expression
    # `csv == None or not (os.path.exists(csv) or os.path.getsize(csv) > 0)`
    # had the boolean structure wrong: it could call os.path.getsize() on a
    # non-existent path (raising OSError) and never flagged an existing but
    # empty file. str(csv) also guards os.path.basename against None.
    if csv is None or not os.path.exists(csv) or os.path.getsize(csv) == 0:
        logging.error(
            f"The CSV file [{os.path.basename(str(csv))}] is empty or does not exist!"
        )
        exit(1)

    logging.info(f"Indexing {os.path.basename(csv)}...")

    with open(csv, "r") as csv_fh:
        header_cols = csv_fh.readline().strip().split(",")
        # Indices (in header order) of the columns the user asked for.
        user_req_cols = [
            tcol_i for tcol_i, tcol in enumerate(header_cols) if tcol in cols
        ]
        cols_not_found = [tcol for tcol in cols if tcol not in header_cols]
        raw_recs = 0

        if len(cols_not_found) > 0:
            logging.error(
                f"The following columns do not exist in the"
                + f"\nCSV file [ {os.path.basename(csv)} ]:\n"
                # Was "".join(...): missing column names ran together
                # into one unreadable token.
                + "\n".join(cols_not_found)
            )
            exit(1)
        elif len(user_req_cols) > 9:
            logging.error(
                f"Only a total of 9 columns are needed!"
                + "\ntax_id,kindom,phylum,class,order,family,genus,species,strain"
            )
            exit(1)

        for tax in csv_fh:
            raw_recs += 1
            lcols = tax.strip().split(",")

            # Prefer keying on strain (column index 8); fall back to
            # species (index 7) with an empty strain field appended so
            # every lineage value has the same number of fields.
            if bool(lcols[user_req_cols[8]]):
                lineages[lcols[user_req_cols[8]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:]]
                )
            elif bool(lcols[user_req_cols[7]]):
                lineages[lcols[user_req_cols[7]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:8]] + [str()]
                )

    # The `with` block closes the handle; the original's explicit
    # csv_fh.close() afterwards was redundant and has been removed.
    return lineages, raw_recs


def write_fasta(recs: list, basedir: os.PathLike, name: str, suffix: str) -> None:
    """
    Write sequence records with no description to a specified file.

    Args:
        recs: SeqRecord objects to write.
        basedir: Destination directory.
        name: Base file name (without suffix).
        suffix: File name suffix, e.g. ".fna".
    """
    SeqIO.write(
        recs,
        os.path.join(basedir, name + suffix),
        "fasta",
    )


def check_and_get_cols(pat: re.Pattern, cols: str, delim: str) -> list:
    """
    Check that the user-supplied column string matches the expected
    pattern and split it into a list of column names.

    Args:
        pat: Compiled pattern the whole column string must match.
        cols: Delimiter-separated column names.
        delim: The delimiter separating the names.

    Returns:
        The list of column names. Exits with status 1 on a mismatch.
    """
    if not pat.match(cols):
        logging.error(
            f"Supplied columns' names {cols} should only have words"
            f"\n(alphanumeric) separated by: {delim}."
        )
        exit(1)
    else:
        cols = re.sub("\n", "", cols).split(delim)

    return cols


def parse_tsv(fh: Union[TextIO, BinaryIO], tcols: list, delim: str) -> tuple:
    """
    Parse the BOLD TSV file and aggregate FASTA records per species.

    Args:
        fh: Open text handle to the TSV file (plain or gzip-wrapped).
        tcols: The five required column names (id, genus, species,
            strain/subspecies, sequence), in header order.
        delim: Field delimiter.

    Returns:
        A tuple ``(records, sp2accs, raw_recs)``: species -> list of
        SeqRecord, species -> list of record IDs, and the raw row count.
    """
    records, sp2accs = (defaultdict(list), defaultdict(list))
    header = fh.readline().strip().split(delim)
    raw_recs = 0

    if not all(col in header for col in tcols):
        logging.error(
            "The following columns were not found in the"
            + f"\nheader row of file {os.path.basename(fh.name)}\n"
            + "\n".join([ele for ele in tcols if ele not in header])
        )
        # Bail out here: the 5-way unpacking below would otherwise fail
        # with a confusing ValueError. (The original logged but carried on.)
        exit(1)

    id_i, genus_i, species_i, strain_i, seq_i = [
        i for i, ele in enumerate(header) if ele in tcols
    ]

    for record in fh:
        raw_recs += 1
        # Split each row once instead of once per extracted column.
        fields = record.strip().split(delim)

        rec_id = fields[id_i]
        # Slashes in species names would break per-species file paths.
        species = re.sub(r"[\/\\]+", "-", fields[species_i])
        # Keep only unambiguous nucleotide characters. BUG FIX: the
        # original passed re.IGNORECASE as re.sub()'s positional `count`
        # argument (count=2), which limited the cleanup to two
        # substitutions and did NOT ignore case; `flags=` is correct.
        seq = re.sub(r"[^ATGC]+", "", fields[seq_i], flags=re.IGNORECASE)

        # Skip rows whose species field is a None/Null placeholder.
        if re.match(r"None|Null", species, re.IGNORECASE):
            continue

        # defaultdict already supplies the empty list; the original's
        # setdefault() calls were redundant.
        records[species].append(SeqRecord(Seq(seq), id=rec_id, description=str()))
        sp2accs[species].append(rec_id)

    logging.info(f"Collected FASTA records for {len(records.keys())} species'.")
    fh.close()
    return records, sp2accs, raw_recs


# Main
def main() -> None:
    """
    This script takes:
        1. The TSV file from BOLD systems,
        2. Takes as input a .csv file generated by `ncbitax2lin`.

    and then generates a folder containing individual FASTA sequence files
    per species. This is only possible if the full taxonomy of the barcode
    sequence is present in the FASTA header.
    """

    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\r\r",
        level=logging.DEBUG,
    )

    # Debug print.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-tsv",
        dest="tsv",
        default=False,
        required=True,
        help="Absolute UNIX path to the TSV file from BOLD systems"
        + "\nin uncompressed TXT format.",
    )
    required.add_argument(
        "-csv",
        dest="csv",
        default=False,
        required=True,
        help="Absolute UNIX path to .csv or .csv.gz file which is generated "
        + "\nby the `ncbitax2lin` tool.",
    )
    parser.add_argument(
        "-out",
        dest="out_folder",
        default=os.path.join(os.getcwd(), "species"),
        required=False,
        help="By default, the output is written to this\nfolder.",
    )
    parser.add_argument(
        "-f",
        dest="force_write_out",
        default=False,
        action="store_true",
        required=False,
        help="Force overwrite output directory contents.",
    )
    parser.add_argument(
        "-suffix",
        dest="fna_suffix",
        default=".fna",
        required=False,
        help="Suffix of the individual species FASTA files\nthat will be saved.",
    )
    parser.add_argument(
        "-ccols",
        dest="csv_cols",
        default="tax_id,superkingdom,phylum,class,order,family,genus,species,strain",
        required=False,
        help="Taxonomic lineage will be built using these columns from the output of"
        + "\n`ncbitax2lin`\ntool.",
    )
    parser.add_argument(
        "-ccols-sep",
        dest="csv_delim",
        default=",",
        required=False,
        help="The delimitor of the fields in the CSV file.",
    )
    parser.add_argument(
        "-tcols",
        dest="tsv_cols",
        default="processid\tgenus\tspecies\tsubspecies\tnucraw",
        required=False,
        help="For each species, the nucletide sequences will be\naggregated.",
    )
    parser.add_argument(
        "-tcols-sep",
        dest="tsv_delim",
        default="\t",
        required=False,
        help="The delimitor of the fields in the TSV file.",
    )

    # Parse defaults
    args = parser.parse_args()
    tsv = args.tsv
    csv = args.csv
    csep = args.csv_delim
    tsep = args.tsv_delim
    csv_cols = args.csv_cols
    tsv_cols = args.tsv_cols
    out = args.out_folder
    overwrite = args.force_write_out
    fna_suffix = args.fna_suffix
    # re.escape produces the same escaped delimiter as the original
    # f"^[\\w\\{csep}]+?\\w$" while avoiding invalid-escape surprises.
    ccols_pat = re.compile(rf"^[\w{re.escape(csep)}]+?\w$")
    tcols_pat = re.compile(rf"^[\w{re.escape(tsep)}]+?\w$")
    final_lineages = os.path.join(out, "lineages.csv")
    lineages_not_found = os.path.join(out, "lineages_not_found.csv")
    base_fasta_dir = os.path.join(out, "fasta")

    # Basic checks. NOTE(review): when the destination exists and -f was
    # not given, the script warns but still proceeds into the existing
    # directory — preserved as-is; confirm whether an exit was intended.
    if not overwrite and os.path.exists(out):
        logging.warning(
            f"Output destination [{os.path.basename(out)}] already exists!"
            + "\nPlease use -f to delete and overwrite."
        )
    elif overwrite and os.path.exists(out):
        logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
        shutil.rmtree(out)

    # Validate user requested columns
    passed_ccols = check_and_get_cols(ccols_pat, csv_cols, csep)
    passed_tcols = check_and_get_cols(tcols_pat, tsv_cols, tsep)

    # Get taxonomy from ncbitax2lin
    lineages, raw_recs = get_lineages(csv, passed_ccols)

    # Finally, read BOLD tsv if lineage exists.
    logging.info(f"Creating new squences per species...")

    if not os.path.exists(out):
        os.makedirs(out)

    # Try GZIP first; fall back to plain text if the magic bytes differ.
    try:
        gz_fh = gzip.open(tsv, "rt")
        records, sp2accs, traw_recs = parse_tsv(gz_fh, passed_tcols, tsep)
    except gzip.BadGzipFile:
        logging.info(f"Input TSV file {os.path.basename(tsv)} is not in\nGZIP format.")
        txt_fh = open(tsv, "r")
        records, sp2accs, traw_recs = parse_tsv(txt_fh, passed_tcols, tsep)

    passed_tax_check = 0
    failed_tax_check = 0
    fasta_recs_written = 0
    l_fh = open(final_lineages, "w")
    ln_fh = open(lineages_not_found, "w")
    l_fh.write(
        "identifiers,superkingdom,phylum,class,order,family,genus,species,strain\n"
    )
    ln_fh.write("fna_id,parsed_org\n")

    if not os.path.exists(base_fasta_dir):
        os.makedirs(base_fasta_dir)

    for genus_species in records.keys():
        fasta_recs_written += len(records[genus_species])
        write_fasta(
            records[genus_species],
            base_fasta_dir,
            "_".join(genus_species.split(" ")),
            fna_suffix,
        )
        org_words = genus_species.split(" ")

        for id in sp2accs[genus_species]:
            if genus_species in lineages.keys():
                # Lineage known from ncbitax2lin: emit it directly; for
                # names with >2 words treat the full name as the strain.
                this_line = ",".join([id, lineages[genus_species]]) + "\n"

                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [id, lineages[genus_species].rstrip(","), genus_species]
                        )
                        + "\n"
                    )

                l_fh.write(this_line)
                passed_tax_check += 1
            else:
                # Lineage unknown: emit a sparse row with only genus,
                # species (and strain when the name has >2 words).
                this_line = (
                    ",".join(
                        [
                            id,
                            "",
                            "",
                            "",
                            "",
                            "",
                            org_words[0],
                            genus_species,
                            "",
                        ]
                    )
                    + "\n"
                )
                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [
                                id,
                                "",
                                "",
                                "",
                                "",
                                "",
                                org_words[0],
                                org_words[0] + " " + org_words[1],
                                genus_species,
                            ]
                        )
                        + "\n"
                    )
                l_fh.write(this_line)
                ln_fh.write(",".join([id, genus_species]) + "\n")
                failed_tax_check += 1

    logging.info(
        f"No. of raw records present in `ncbitax2lin` [{os.path.basename(csv)}]: {raw_recs}"
        + f"\nNo. of valid records collected from `ncbitax2lin` [{os.path.basename(csv)}]: {len(lineages.keys())}"
        + f"\nNo. of raw records in TSV [{os.path.basename(tsv)}]: {traw_recs}"
        + f"\nNo. of valid records in TSV [{os.path.basename(tsv)}]: {passed_tax_check + failed_tax_check}"
        + f"\nNo. of FASTA records for which new lineages were created: {passed_tax_check}"
        + f"\nNo. of FASTA records for which only genus, species and/or strain information were created: {failed_tax_check}"
    )

    if (passed_tax_check + failed_tax_check) != fasta_recs_written:
        logging.error(
            f"The number of input FASTA records [{fasta_recs_written}]"
            + f"\nis not equal to number of lineages created [{passed_tax_check + failed_tax_check}]!"
        )
        exit(1)
    else:
        logging.info("Succesfully created lineages and FASTA records! Done!!")


if __name__ == "__main__":
    main()

# ~/apps/nowayout/bin/gen_per_species_fa_from_bold.py -tsv BOLD_Public.05-Feb-2024.tsv -csv ../tax.csv ─╯

# =======================================================
# 2024-02-08 21:37:28,541 - INFO
# =======================================================
# Indexing tax.csv...

# =======================================================
# 2024-02-08 21:38:06,567 - INFO
# =======================================================
# Creating new squences per species...

# =======================================================
# 2024-02-08 21:38:06,572 - INFO
# =======================================================
# Input TSV file BOLD_Public.05-Feb-2024.tsv is not in
# GZIP format.

# =======================================================
# 2024-02-08 22:01:04,554 - INFO
# =======================================================
# Collected FASTA records for 497421 species'.

# =======================================================
# 2024-02-08 22:24:35,000 - INFO
# =======================================================
# No. of raw records present in `ncbitax2lin` [tax.csv]: 2550767
# No. of valid records collected from `ncbitax2lin` [tax.csv]: 2134980
# No. of raw records in TSV [BOLD_Public.05-Feb-2024.tsv]: 9735210
# No. of valid records in TSV [BOLD_Public.05-Feb-2024.tsv]: 4988323
# No. of FASTA records for which new lineages were created: 4069202
# No. of FASTA records for which only genus, species and/or strain information were created: 919121

# =======================================================
# 2024-02-08 22:24:35,001 - INFO
# =======================================================
# Succesfully created lineages and FASTA records! Done!!