annotate 0.5.0/bin/gen_per_species_fa_from_lin.py @ 0:97cd2f532efe

planemo upload
author kkonganti
date Mon, 31 Mar 2025 14:50:40 -0400
parents
children
rev   line source
kkonganti@0 1 #!/usr/bin/env python3
kkonganti@0 2
kkonganti@0 3 import argparse
kkonganti@0 4 import gzip
kkonganti@0 5 import inspect
kkonganti@0 6 import logging
kkonganti@0 7 import os
kkonganti@0 8 import pprint
kkonganti@0 9 import re
kkonganti@0 10 import shutil
kkonganti@0 11 from collections import defaultdict
kkonganti@0 12 from typing import BinaryIO, TextIO, Union
kkonganti@0 13
kkonganti@0 14 from Bio import SeqIO
kkonganti@0 15 from Bio.Seq import Seq
kkonganti@0 16 from Bio.SeqRecord import SeqRecord
kkonganti@0 17
kkonganti@0 18
kkonganti@0 19 # Multiple inheritence for pretty printing of help text.
kkonganti@0 20 class MultiArgFormatClasses(
kkonganti@0 21 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
kkonganti@0 22 ):
kkonganti@0 23 pass
kkonganti@0 24
kkonganti@0 25
kkonganti@0 26 def get_lineages(csv: os.PathLike) -> defaultdict:
kkonganti@0 27 """
kkonganti@0 28 Parse the lineages.csv file and store a list of
kkonganti@0 29 accessions.
kkonganti@0 30 """
kkonganti@0 31 lineages = dict()
kkonganti@0 32 if csv == None or not (os.path.exists(csv) or os.path.getsize(csv) > 0):
kkonganti@0 33 logging.error(
kkonganti@0 34 f"The CSV file [{os.path.basename(csv)}] is empty or does not exist!"
kkonganti@0 35 )
kkonganti@0 36 exit(1)
kkonganti@0 37
kkonganti@0 38 logging.info(f"Indexing {os.path.basename(csv)}...")
kkonganti@0 39
kkonganti@0 40 with open(csv, "r") as csv_fh:
kkonganti@0 41 _ = csv_fh.readline().strip().split(",")
kkonganti@0 42 for line in csv_fh:
kkonganti@0 43 cols = line.strip().split(",")
kkonganti@0 44
kkonganti@0 45 if len(cols) < 9:
kkonganti@0 46 logging.error(
kkonganti@0 47 f"The CSV file {os.path.basename(csv)} should have a mandatory 9 columns."
kkonganti@0 48 + "\n\nEx: identifiers,superkingdom,phylum,class,order,family,genus,species,strain"
kkonganti@0 49 + "\nAB211151.1,Eukaryota,Arthropoda,Malacostraca,Decapoda,Majidae,Chionoecetes,Chionoecetes opilio,"
kkonganti@0 50 + f"\n\nGot:\n{line}"
kkonganti@0 51 )
kkonganti@0 52 exit(1)
kkonganti@0 53
kkonganti@0 54 lineages[cols[0]] = re.sub(r"\W+", "-", "_".join(cols[7].split(" ")))
kkonganti@0 55
kkonganti@0 56 csv_fh.close()
kkonganti@0 57 return lineages
kkonganti@0 58
kkonganti@0 59
kkonganti@0 60 def write_fasta(recs: list, basedir: os.PathLike, name: str, suffix: str) -> None:
kkonganti@0 61 """
kkonganti@0 62 Write sequence with no description to a specified file.
kkonganti@0 63 """
kkonganti@0 64 SeqIO.write(
kkonganti@0 65 recs,
kkonganti@0 66 os.path.join(basedir, name + suffix),
kkonganti@0 67 "fasta",
kkonganti@0 68 )
kkonganti@0 69
kkonganti@0 70
kkonganti@0 71 def parse_fasta(fh: Union[TextIO, BinaryIO], sp2accs: dict) -> list:
kkonganti@0 72 """
kkonganti@0 73 Parse the sequences and create per species FASTA record.
kkonganti@0 74 """
kkonganti@0 75 records = defaultdict()
kkonganti@0 76 logging.info("")
kkonganti@0 77
kkonganti@0 78 for record in SeqIO.parse(fh, "fasta"):
kkonganti@0 79
kkonganti@0 80 id = record.id
kkonganti@0 81 seq = record.seq
kkonganti@0 82
kkonganti@0 83 if id in sp2accs.keys():
kkonganti@0 84 records.setdefault(sp2accs[id], []).append(
kkonganti@0 85 SeqRecord(Seq(seq), id=id, description=str())
kkonganti@0 86 )
kkonganti@0 87 else:
kkonganti@0 88 print(f"Lineage row does not exist for accession: {id}")
kkonganti@0 89
kkonganti@0 90 logging.info(f"Collected FASTA records for {len(records.keys())} species'.")
kkonganti@0 91 fh.close()
kkonganti@0 92 return records
kkonganti@0 93
kkonganti@0 94
kkonganti@0 95 # Main
kkonganti@0 96 def main() -> None:
kkonganti@0 97 """
kkonganti@0 98 This script takes:
kkonganti@0 99 1. The FASTA file and,
kkonganti@0 100 2. Takes the corresponding lineages.csv file and,
kkonganti@0 101
kkonganti@0 102 then generates a folder containing individual FASTA sequence files
kkonganti@0 103 per species.
kkonganti@0 104 """
kkonganti@0 105
kkonganti@0 106 # Set logging.
kkonganti@0 107 logging.basicConfig(
kkonganti@0 108 format="\n"
kkonganti@0 109 + "=" * 55
kkonganti@0 110 + "\n%(asctime)s - %(levelname)s\n"
kkonganti@0 111 + "=" * 55
kkonganti@0 112 + "\n%(message)s\r\r",
kkonganti@0 113 level=logging.DEBUG,
kkonganti@0 114 )
kkonganti@0 115
kkonganti@0 116 # Debug print.
kkonganti@0 117 ppp = pprint.PrettyPrinter(width=55)
kkonganti@0 118 prog_name = os.path.basename(inspect.stack()[0].filename)
kkonganti@0 119
kkonganti@0 120 parser = argparse.ArgumentParser(
kkonganti@0 121 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
kkonganti@0 122 )
kkonganti@0 123
kkonganti@0 124 required = parser.add_argument_group("required arguments")
kkonganti@0 125
kkonganti@0 126 required.add_argument(
kkonganti@0 127 "-fa",
kkonganti@0 128 dest="fna",
kkonganti@0 129 default=False,
kkonganti@0 130 required=True,
kkonganti@0 131 help="Absolute UNIX path to the FASTA file that corresponds"
kkonganti@0 132 + "\nto the lineages.csv file.",
kkonganti@0 133 )
kkonganti@0 134 required.add_argument(
kkonganti@0 135 "-csv",
kkonganti@0 136 dest="csv",
kkonganti@0 137 default=False,
kkonganti@0 138 required=True,
kkonganti@0 139 help="Absolute UNIX path to lineages.csv which has a guaranteed 9 "
kkonganti@0 140 + "\ncolumns with the first being an accession.",
kkonganti@0 141 )
kkonganti@0 142 parser.add_argument(
kkonganti@0 143 "-out",
kkonganti@0 144 dest="out_folder",
kkonganti@0 145 default=os.path.join(os.getcwd(), "species"),
kkonganti@0 146 required=False,
kkonganti@0 147 help="By default, the output is written to this\nfolder.",
kkonganti@0 148 )
kkonganti@0 149 parser.add_argument(
kkonganti@0 150 "-f",
kkonganti@0 151 dest="force_write_out",
kkonganti@0 152 default=False,
kkonganti@0 153 action="store_true",
kkonganti@0 154 required=False,
kkonganti@0 155 help="Force overwrite output directory contents.",
kkonganti@0 156 )
kkonganti@0 157 parser.add_argument(
kkonganti@0 158 "-suffix",
kkonganti@0 159 dest="fna_suffix",
kkonganti@0 160 default=".fna",
kkonganti@0 161 required=False,
kkonganti@0 162 help="Suffix of the individual species FASTA files\nthat will be saved.",
kkonganti@0 163 )
kkonganti@0 164
kkonganti@0 165 # Parse defaults
kkonganti@0 166 args = parser.parse_args()
kkonganti@0 167 csv = args.csv
kkonganti@0 168 fna = args.fna
kkonganti@0 169 out = args.out_folder
kkonganti@0 170 overwrite = args.force_write_out
kkonganti@0 171 fna_suffix = args.fna_suffix
kkonganti@0 172
kkonganti@0 173 # Basic checks
kkonganti@0 174 if not overwrite and os.path.exists(out):
kkonganti@0 175 logging.warning(
kkonganti@0 176 f"Output destination [{os.path.basename(out)}] already exists!"
kkonganti@0 177 + "\nPlease use -f to delete and overwrite."
kkonganti@0 178 )
kkonganti@0 179 elif overwrite and os.path.exists(out):
kkonganti@0 180 logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
kkonganti@0 181 shutil.rmtree(out)
kkonganti@0 182
kkonganti@0 183 # Get taxonomy from ncbitax2lin
kkonganti@0 184 lineages = get_lineages(csv)
kkonganti@0 185
kkonganti@0 186 logging.info(f"Creating new squences per species...")
kkonganti@0 187
kkonganti@0 188 if not os.path.exists(out):
kkonganti@0 189 os.makedirs(out)
kkonganti@0 190
kkonganti@0 191 try:
kkonganti@0 192 gz_fh = gzip.open(fna, "rt")
kkonganti@0 193 fa_recs = parse_fasta(gz_fh, lineages)
kkonganti@0 194 except gzip.BadGzipFile:
kkonganti@0 195 logging.info(
kkonganti@0 196 f"Input FASTA file {os.path.basename(csv)} is not in\nGZIP format."
kkonganti@0 197 )
kkonganti@0 198 txt_fh = open(fna, "r")
kkonganti@0 199 fa_recs = parse_fasta(txt_fh, lineages)
kkonganti@0 200 finally:
kkonganti@0 201 logging.info("Assigned FASTA records per species...")
kkonganti@0 202
kkonganti@0 203 logging.info("Writing FASTA records per species...")
kkonganti@0 204
kkonganti@0 205 for sp in fa_recs.keys():
kkonganti@0 206 write_fasta(fa_recs[sp], out, sp, fna_suffix)
kkonganti@0 207
kkonganti@0 208
kkonganti@0 209 if __name__ == "__main__":
kkonganti@0 210 main()
kkonganti@0 211
kkonganti@0 212 # ~/apps/nowayout/bin/gen_per_species_fa_from_bold.py -tsv BOLD_Public.05-Feb-2024.tsv -csv ../tax.csv ─╯
kkonganti@0 213
kkonganti@0 214 # =======================================================
kkonganti@0 215 # 2024-02-08 21:37:28,541 - INFO
kkonganti@0 216 # =======================================================
kkonganti@0 217 # Indexing tax.csv...
kkonganti@0 218
kkonganti@0 219 # =======================================================
kkonganti@0 220 # 2024-02-08 21:38:06,567 - INFO
kkonganti@0 221 # =======================================================
kkonganti@0 222 # Creating new squences per species...
kkonganti@0 223
kkonganti@0 224 # =======================================================
kkonganti@0 225 # 2024-02-08 21:38:06,572 - INFO
kkonganti@0 226 # =======================================================
kkonganti@0 227 # Input TSV file BOLD_Public.05-Feb-2024.tsv is not in
kkonganti@0 228 # GZIP format.
kkonganti@0 229
kkonganti@0 230 # =======================================================
kkonganti@0 231 # 2024-02-08 22:01:04,554 - INFO
kkonganti@0 232 # =======================================================
kkonganti@0 233 # Collected FASTA records for 497421 species'.
kkonganti@0 234
kkonganti@0 235 # =======================================================
kkonganti@0 236 # 2024-02-08 22:24:35,000 - INFO
kkonganti@0 237 # =======================================================
kkonganti@0 238 # No. of raw records present in `ncbitax2lin` [tax.csv]: 2550767
kkonganti@0 239 # No. of valid records collected from `ncbitax2lin` [tax.csv]: 2134980
kkonganti@0 240 # No. of raw records in TSV [BOLD_Public.05-Feb-2024.tsv]: 9735210
kkonganti@0 241 # No. of valid records in TSV [BOLD_Public.05-Feb-2024.tsv]: 4988323
kkonganti@0 242 # No. of FASTA records for which new lineages were created: 4069202
kkonganti@0 243 # No. of FASTA records for which only genus, species and/or strain information were created: 919121
kkonganti@0 244
kkonganti@0 245 # =======================================================
kkonganti@0 246 # 2024-02-08 22:24:35,001 - INFO
kkonganti@0 247 # =======================================================
kkonganti@0 248 # Succesfully created lineages and FASTA records! Done!!