#!/usr/bin/env python3

import argparse
import gzip
import inspect
import logging
import os
import pprint
import re
import shutil
from collections import defaultdict
from typing import BinaryIO, TextIO, Union

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    pass


def get_lineages(csv: os.PathLike, cols: list) -> tuple:
    """
    Parse the output from the `ncbitax2lin` tool.

    Args:
        csv: Path to the CSV produced by `ncbitax2lin`.
        cols: Column names (in lineage order) to extract from the CSV.

    Returns:
        A tuple ``(lineages, raw_recs)`` where ``lineages`` maps a
        species or strain name to its comma-joined taxonomic lineage and
        ``raw_recs`` is the number of data rows read.
    """
    lineages = dict()

    # Fail early on a missing or empty CSV. The original expression
    # `csv == None or not (os.path.exists(csv) or os.path.getsize(csv) > 0)`
    # had the boolean structure wrong: it could call os.path.getsize() on a
    # non-existent path (raising OSError) and never flagged an existing but
    # empty file. str(csv) also guards os.path.basename against None.
    if csv is None or not os.path.exists(csv) or os.path.getsize(csv) == 0:
        logging.error(
            f"The CSV file [{os.path.basename(str(csv))}] is empty or does not exist!"
        )
        exit(1)

    logging.info(f"Indexing {os.path.basename(csv)}...")

    with open(csv, "r") as csv_fh:
        header_cols = csv_fh.readline().strip().split(",")
        # Indices (in header order) of the columns the user asked for.
        user_req_cols = [
            tcol_i for tcol_i, tcol in enumerate(header_cols) if tcol in cols
        ]
        cols_not_found = [tcol for tcol in cols if tcol not in header_cols]
        raw_recs = 0

        if len(cols_not_found) > 0:
            logging.error(
                f"The following columns do not exist in the"
                + f"\nCSV file [ {os.path.basename(csv)} ]:\n"
                # Was "".join(...): missing column names ran together
                # into one unreadable token.
                + "\n".join(cols_not_found)
            )
            exit(1)
        elif len(user_req_cols) > 9:
            logging.error(
                f"Only a total of 9 columns are needed!"
                + "\ntax_id,kindom,phylum,class,order,family,genus,species,strain"
            )
            exit(1)

        for tax in csv_fh:
            raw_recs += 1
            lcols = tax.strip().split(",")

            # Prefer keying on strain (column index 8); fall back to
            # species (index 7) with an empty strain field appended so
            # every lineage value has the same number of fields.
            if bool(lcols[user_req_cols[8]]):
                lineages[lcols[user_req_cols[8]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:]]
                )
            elif bool(lcols[user_req_cols[7]]):
                lineages[lcols[user_req_cols[7]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:8]] + [str()]
                )

    # The `with` block closes the handle; the original's explicit
    # csv_fh.close() afterwards was redundant and has been removed.
    return lineages, raw_recs


def write_fasta(recs: list, basedir: os.PathLike, name: str, suffix: str) -> None:
    """
    Write sequence records with no description to a specified file.

    Args:
        recs: SeqRecord objects to write.
        basedir: Destination directory.
        name: Base file name (without suffix).
        suffix: File name suffix, e.g. ".fna".
    """
    SeqIO.write(
        recs,
        os.path.join(basedir, name + suffix),
        "fasta",
    )


def check_and_get_cols(pat: re.Pattern, cols: str, delim: str) -> list:
    """
    Check that the user-supplied column string matches the expected
    pattern and split it into a list of column names.

    Args:
        pat: Compiled pattern the whole column string must match.
        cols: Delimiter-separated column names.
        delim: The delimiter separating the names.

    Returns:
        The list of column names. Exits with status 1 on a mismatch.
    """
    if not pat.match(cols):
        logging.error(
            f"Supplied columns' names {cols} should only have words"
            f"\n(alphanumeric) separated by: {delim}."
        )
        exit(1)
    else:
        cols = re.sub("\n", "", cols).split(delim)

    return cols


def parse_tsv(fh: Union[TextIO, BinaryIO], tcols: list, delim: str) -> tuple:
    """
    Parse the BOLD TSV file and aggregate FASTA records per species.

    Args:
        fh: Open text handle to the TSV file (plain or gzip-wrapped).
        tcols: The five required column names (id, genus, species,
            strain/subspecies, sequence), in header order.
        delim: Field delimiter.

    Returns:
        A tuple ``(records, sp2accs, raw_recs)``: species -> list of
        SeqRecord, species -> list of record IDs, and the raw row count.
    """
    records, sp2accs = (defaultdict(list), defaultdict(list))
    header = fh.readline().strip().split(delim)
    raw_recs = 0

    if not all(col in header for col in tcols):
        logging.error(
            "The following columns were not found in the"
            + f"\nheader row of file {os.path.basename(fh.name)}\n"
            + "\n".join([ele for ele in tcols if ele not in header])
        )
        # Bail out here: the 5-way unpacking below would otherwise fail
        # with a confusing ValueError. (The original logged but carried on.)
        exit(1)

    id_i, genus_i, species_i, strain_i, seq_i = [
        i for i, ele in enumerate(header) if ele in tcols
    ]

    for record in fh:
        raw_recs += 1
        # Split each row once instead of once per extracted column.
        fields = record.strip().split(delim)

        rec_id = fields[id_i]
        # Slashes in species names would break per-species file paths.
        species = re.sub(r"[\/\\]+", "-", fields[species_i])
        # Keep only unambiguous nucleotide characters. BUG FIX: the
        # original passed re.IGNORECASE as re.sub()'s positional `count`
        # argument (count=2), which limited the cleanup to two
        # substitutions and did NOT ignore case; `flags=` is correct.
        seq = re.sub(r"[^ATGC]+", "", fields[seq_i], flags=re.IGNORECASE)

        # Skip rows whose species field is a None/Null placeholder.
        if re.match(r"None|Null", species, re.IGNORECASE):
            continue

        # defaultdict already supplies the empty list; the original's
        # setdefault() calls were redundant.
        records[species].append(SeqRecord(Seq(seq), id=rec_id, description=str()))
        sp2accs[species].append(rec_id)

    logging.info(f"Collected FASTA records for {len(records.keys())} species'.")
    fh.close()
    return records, sp2accs, raw_recs


# Main
def main() -> None:
    """
    This script takes:
        1. The TSV file from BOLD systems,
        2. Takes as input a .csv file generated by `ncbitax2lin`.

    and then generates a folder containing individual FASTA sequence files
    per species. This is only possible if the full taxonomy of the barcode
    sequence is present in the FASTA header.
    """

    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\r\r",
        level=logging.DEBUG,
    )

    # Debug print.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-tsv",
        dest="tsv",
        default=False,
        required=True,
        help="Absolute UNIX path to the TSV file from BOLD systems"
        + "\nin uncompressed TXT format.",
    )
    required.add_argument(
        "-csv",
        dest="csv",
        default=False,
        required=True,
        help="Absolute UNIX path to .csv or .csv.gz file which is generated "
        + "\nby the `ncbitax2lin` tool.",
    )
    parser.add_argument(
        "-out",
        dest="out_folder",
        default=os.path.join(os.getcwd(), "species"),
        required=False,
        help="By default, the output is written to this\nfolder.",
    )
    parser.add_argument(
        "-f",
        dest="force_write_out",
        default=False,
        action="store_true",
        required=False,
        help="Force overwrite output directory contents.",
    )
    parser.add_argument(
        "-suffix",
        dest="fna_suffix",
        default=".fna",
        required=False,
        help="Suffix of the individual species FASTA files\nthat will be saved.",
    )
    parser.add_argument(
        "-ccols",
        dest="csv_cols",
        default="tax_id,superkingdom,phylum,class,order,family,genus,species,strain",
        required=False,
        help="Taxonomic lineage will be built using these columns from the output of"
        + "\n`ncbitax2lin`\ntool.",
    )
    parser.add_argument(
        "-ccols-sep",
        dest="csv_delim",
        default=",",
        required=False,
        help="The delimitor of the fields in the CSV file.",
    )
    parser.add_argument(
        "-tcols",
        dest="tsv_cols",
        default="processid\tgenus\tspecies\tsubspecies\tnucraw",
        required=False,
        help="For each species, the nucletide sequences will be\naggregated.",
    )
    parser.add_argument(
        "-tcols-sep",
        dest="tsv_delim",
        default="\t",
        required=False,
        help="The delimitor of the fields in the TSV file.",
    )

    # Parse defaults
    args = parser.parse_args()
    tsv = args.tsv
    csv = args.csv
    csep = args.csv_delim
    tsep = args.tsv_delim
    csv_cols = args.csv_cols
    tsv_cols = args.tsv_cols
    out = args.out_folder
    overwrite = args.force_write_out
    fna_suffix = args.fna_suffix
    # re.escape produces the same escaped delimiter as the original
    # f"^[\\w\\{csep}]+?\\w$" while avoiding invalid-escape surprises.
    ccols_pat = re.compile(rf"^[\w{re.escape(csep)}]+?\w$")
    tcols_pat = re.compile(rf"^[\w{re.escape(tsep)}]+?\w$")
    final_lineages = os.path.join(out, "lineages.csv")
    lineages_not_found = os.path.join(out, "lineages_not_found.csv")
    base_fasta_dir = os.path.join(out, "fasta")

    # Basic checks. NOTE(review): when the destination exists and -f was
    # not given, the script warns but still proceeds into the existing
    # directory — preserved as-is; confirm whether an exit was intended.
    if not overwrite and os.path.exists(out):
        logging.warning(
            f"Output destination [{os.path.basename(out)}] already exists!"
            + "\nPlease use -f to delete and overwrite."
        )
    elif overwrite and os.path.exists(out):
        logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
        shutil.rmtree(out)

    # Validate user requested columns
    passed_ccols = check_and_get_cols(ccols_pat, csv_cols, csep)
    passed_tcols = check_and_get_cols(tcols_pat, tsv_cols, tsep)

    # Get taxonomy from ncbitax2lin
    lineages, raw_recs = get_lineages(csv, passed_ccols)

    # Finally, read BOLD tsv if lineage exists.
    logging.info(f"Creating new squences per species...")

    if not os.path.exists(out):
        os.makedirs(out)

    # Try GZIP first; fall back to plain text if the magic bytes differ.
    try:
        gz_fh = gzip.open(tsv, "rt")
        records, sp2accs, traw_recs = parse_tsv(gz_fh, passed_tcols, tsep)
    except gzip.BadGzipFile:
        logging.info(f"Input TSV file {os.path.basename(tsv)} is not in\nGZIP format.")
        txt_fh = open(tsv, "r")
        records, sp2accs, traw_recs = parse_tsv(txt_fh, passed_tcols, tsep)

    passed_tax_check = 0
    failed_tax_check = 0
    fasta_recs_written = 0
    l_fh = open(final_lineages, "w")
    ln_fh = open(lineages_not_found, "w")
    l_fh.write(
        "identifiers,superkingdom,phylum,class,order,family,genus,species,strain\n"
    )
    ln_fh.write("fna_id,parsed_org\n")

    if not os.path.exists(base_fasta_dir):
        os.makedirs(base_fasta_dir)

    for genus_species in records.keys():
        fasta_recs_written += len(records[genus_species])
        write_fasta(
            records[genus_species],
            base_fasta_dir,
            "_".join(genus_species.split(" ")),
            fna_suffix,
        )
        org_words = genus_species.split(" ")

        for id in sp2accs[genus_species]:
            if genus_species in lineages.keys():
                # Lineage known from ncbitax2lin: emit it directly; for
                # names with >2 words treat the full name as the strain.
                this_line = ",".join([id, lineages[genus_species]]) + "\n"

                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [id, lineages[genus_species].rstrip(","), genus_species]
                        )
                        + "\n"
                    )

                l_fh.write(this_line)
                passed_tax_check += 1
            else:
                # Lineage unknown: emit a sparse row with only genus,
                # species (and strain when the name has >2 words).
                this_line = (
                    ",".join(
                        [
                            id,
                            "",
                            "",
                            "",
                            "",
                            "",
                            org_words[0],
                            genus_species,
                            "",
                        ]
                    )
                    + "\n"
                )
                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [
                                id,
                                "",
                                "",
                                "",
                                "",
                                "",
                                org_words[0],
                                org_words[0] + " " + org_words[1],
                                genus_species,
                            ]
                        )
                        + "\n"
                    )
                l_fh.write(this_line)
                ln_fh.write(",".join([id, genus_species]) + "\n")
                failed_tax_check += 1

    logging.info(
        f"No. of raw records present in `ncbitax2lin` [{os.path.basename(csv)}]: {raw_recs}"
        + f"\nNo. of valid records collected from `ncbitax2lin` [{os.path.basename(csv)}]: {len(lineages.keys())}"
        + f"\nNo. of raw records in TSV [{os.path.basename(tsv)}]: {traw_recs}"
        + f"\nNo. of valid records in TSV [{os.path.basename(tsv)}]: {passed_tax_check + failed_tax_check}"
        + f"\nNo. of FASTA records for which new lineages were created: {passed_tax_check}"
        + f"\nNo. of FASTA records for which only genus, species and/or strain information were created: {failed_tax_check}"
    )

    if (passed_tax_check + failed_tax_check) != fasta_recs_written:
        logging.error(
            f"The number of input FASTA records [{fasta_recs_written}]"
            + f"\nis not equal to number of lineages created [{passed_tax_check + failed_tax_check}]!"
        )
        exit(1)
    else:
        logging.info("Succesfully created lineages and FASTA records! Done!!")


if __name__ == "__main__":
    main()

# ~/apps/nowayout/bin/gen_per_species_fa_from_bold.py -tsv BOLD_Public.05-Feb-2024.tsv -csv ../tax.csv ─╯

# =======================================================
# 2024-02-08 21:37:28,541 - INFO
# =======================================================
# Indexing tax.csv...

# =======================================================
# 2024-02-08 21:38:06,567 - INFO
# =======================================================
# Creating new squences per species...

# =======================================================
# 2024-02-08 21:38:06,572 - INFO
# =======================================================
# Input TSV file BOLD_Public.05-Feb-2024.tsv is not in
# GZIP format.

# =======================================================
# 2024-02-08 22:01:04,554 - INFO
# =======================================================
# Collected FASTA records for 497421 species'.

# =======================================================
# 2024-02-08 22:24:35,000 - INFO
# =======================================================
# No. of raw records present in `ncbitax2lin` [tax.csv]: 2550767
# No. of valid records collected from `ncbitax2lin` [tax.csv]: 2134980
# No. of raw records in TSV [BOLD_Public.05-Feb-2024.tsv]: 9735210
# No. of valid records in TSV [BOLD_Public.05-Feb-2024.tsv]: 4988323
# No. of FASTA records for which new lineages were created: 4069202
# No. of FASTA records for which only genus, species and/or strain information were created: 919121

# =======================================================
# 2024-02-08 22:24:35,001 - INFO
# =======================================================
# Succesfully created lineages and FASTA records! Done!!