annotate 0.5.0/bin/gen_per_species_fa_from_bold.py @ 1:fa47b825716b

planemo upload
author kkonganti
date Mon, 31 Mar 2025 14:53:21 -0400
parents 97cd2f532efe
children
rev   line source
kkonganti@0 1 #!/usr/bin/env python3
kkonganti@0 2
kkonganti@0 3 import argparse
kkonganti@0 4 import gzip
kkonganti@0 5 import inspect
kkonganti@0 6 import logging
kkonganti@0 7 import os
kkonganti@0 8 import pprint
kkonganti@0 9 import re
kkonganti@0 10 import shutil
kkonganti@0 11 from collections import defaultdict
kkonganti@0 12 from typing import BinaryIO, TextIO, Union
kkonganti@0 13
kkonganti@0 14 from Bio import SeqIO
kkonganti@0 15 from Bio.Seq import Seq
kkonganti@0 16 from Bio.SeqRecord import SeqRecord
kkonganti@0 17
kkonganti@0 18
# Multiple inheritance for pretty printing of help text: RawTextHelpFormatter
# preserves the embedded newlines in the help strings below, while
# ArgumentDefaultsHelpFormatter appends each option's default value.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    pass
kkonganti@0 24
kkonganti@0 25
def get_lineages(csv: os.PathLike, cols: list) -> tuple:
    """
    Parse the output from the `ncbitax2lin` tool.

    Returns a 2-tuple of:
      - a dict mapping strain (or species, when the strain field is empty)
        to a comma-joined lineage string built from the requested columns, and
      - the number of raw data rows read from the CSV.

    Exits the program when the CSV is missing/empty or when the requested
    columns are absent from the header or too numerous.
    """
    lineages = dict()
    # Fail early on a None path, a missing file, or an empty file.
    # Existence is checked before os.path.getsize so that a nonexistent
    # file cannot raise OSError here.
    if csv is None or not os.path.exists(csv) or os.path.getsize(csv) == 0:
        logging.error(
            f"The CSV file [{os.path.basename(csv)}] is empty or does not exist!"
        )
        exit(1)

    logging.info(f"Indexing {os.path.basename(csv)}...")

    with open(csv, "r") as csv_fh:
        header_cols = csv_fh.readline().strip().split(",")
        # Indices (in header order) of the columns the caller asked for.
        user_req_cols = [
            tcol_i for tcol_i, tcol in enumerate(header_cols) if tcol in cols
        ]
        cols_not_found = [tcol for tcol in cols if tcol not in header_cols]
        raw_recs = 0

        if len(cols_not_found) > 0:
            logging.error(
                f"The following columns do not exist in the"
                + f"\nCSV file [ {os.path.basename(csv)} ]:\n"
                + "\n".join(cols_not_found)
            )
            exit(1)
        elif len(user_req_cols) > 9:
            logging.error(
                f"Only a total of 9 columns are needed!"
                + "\ntax_id,kindom,phylum,class,order,family,genus,species,strain"
            )
            exit(1)

        for tax in csv_fh:
            raw_recs += 1
            lcols = tax.strip().split(",")

            # Prefer a strain-level key; fall back to the species with an
            # empty strain column so every lineage string has 8 fields.
            if bool(lcols[user_req_cols[8]]):
                lineages[lcols[user_req_cols[8]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:]]
                )
            elif bool(lcols[user_req_cols[7]]):
                lineages[lcols[user_req_cols[7]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:8]] + [str()]
                )

    return lineages, raw_recs
kkonganti@0 78
kkonganti@0 79
def write_fasta(recs: list, basedir: os.PathLike, name: str, suffix: str) -> None:
    """
    Serialize the given SeqRecord objects to <basedir>/<name><suffix>
    in FASTA format (records carry no description text).
    """
    out_path = os.path.join(basedir, name + suffix)
    SeqIO.write(recs, out_path, "fasta")
kkonganti@0 89
kkonganti@0 90
def check_and_get_cols(pat: re.Pattern, cols: str, delim: str) -> list:
    """
    Validate that `cols` (a delimiter-separated string of column names)
    matches the compiled pattern `pat` and return the individual names
    as a list.

    Exits the program when the supplied string does not match.
    """
    # NOTE: annotation fixed from `re` (the module) to `re.Pattern`,
    # which is what callers actually pass (a compiled pattern).
    if not pat.match(cols):
        logging.error(
            f"Supplied columns' names {cols} should only have words"
            f"\n(alphanumeric) separated by: {delim}."
        )
        exit(1)

    # Strip any stray newlines before splitting into individual names.
    return re.sub("\n", "", cols).split(delim)
kkonganti@0 106
kkonganti@0 107
def parse_tsv(fh: Union[TextIO, BinaryIO], tcols: list, delim: str) -> tuple:
    """
    Parse the BOLD TSV file and collect per-species FASTA records.

    Returns a 3-tuple of:
      - dict: species name -> list of SeqRecord objects,
      - dict: species name -> list of accession IDs,
      - the number of raw data rows read.

    Exits the program when any requested column is missing from the header.
    """
    records, sp2accs = (defaultdict(list), defaultdict(list))
    header = fh.readline().strip().split(delim)
    raw_recs = 0

    if not all(col in header for col in tcols):
        logging.error(
            "The following columns were not found in the"
            + f"\nheader row of file {os.path.basename(fh.name)}\n"
            + "\n".join([ele for ele in tcols if ele not in header])
        )
        # Bail out: the 5-way unpack below requires all requested columns.
        exit(1)

    id_i, genus_i, species_i, strain_i, seq_i = [
        i for i, ele in enumerate(header) if ele in tcols
    ]

    for record in fh:
        raw_recs += 1
        # Split each row once instead of once per extracted field.
        fields = record.strip().split(delim)

        rec_id = fields[id_i]
        # Replace path separators so species names are safe as file names.
        species = re.sub(r"[\/\\]+", "-", fields[species_i])
        # Keep only nucleotide bases. `flags=` must be a keyword argument:
        # passing re.IGNORECASE positionally fills re.sub's `count`
        # parameter, limiting the number of substitutions instead of
        # making the match case-insensitive.
        seq = re.sub(r"[^ATGC]+", "", fields[seq_i], flags=re.IGNORECASE)

        # Skip rows with no usable species name.
        if re.match(r"None|Null", species, re.IGNORECASE):
            continue

        records[species].append(SeqRecord(Seq(seq), id=rec_id, description=str()))
        sp2accs[species].append(rec_id)

    logging.info(f"Collected FASTA records for {len(records.keys())} species'.")
    fh.close()
    return records, sp2accs, raw_recs
kkonganti@0 154
kkonganti@0 155
# Main
def main() -> None:
    """
    This script takes:
        1. The TSV file from BOLD systems,
        2. Takes as input a .csv file generated by `ncbitax2lin`.

    and then generates a folder containing individual FASTA sequence files
    per species. This is only possible if the full taxonomy of the barcode
    sequence is present in the FASTA header.
    """

    # Set logging. The banner-style format (rows of "=") matches the example
    # transcript at the bottom of this file.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\r\r",
        level=logging.DEBUG,
    )

    # Debug print.
    # NOTE(review): `ppp` is never used below; apparently kept for ad-hoc
    # debugging sessions.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-tsv",
        dest="tsv",
        default=False,
        required=True,
        help="Absolute UNIX path to the TSV file from BOLD systems"
        + "\nin uncompressed TXT format.",
    )
    required.add_argument(
        "-csv",
        dest="csv",
        default=False,
        required=True,
        help="Absolute UNIX path to .csv or .csv.gz file which is generated "
        + "\nby the `ncbitax2lin` tool.",
    )
    parser.add_argument(
        "-out",
        dest="out_folder",
        default=os.path.join(os.getcwd(), "species"),
        required=False,
        help="By default, the output is written to this\nfolder.",
    )
    parser.add_argument(
        "-f",
        dest="force_write_out",
        default=False,
        action="store_true",
        required=False,
        help="Force overwrite output directory contents.",
    )
    parser.add_argument(
        "-suffix",
        dest="fna_suffix",
        default=".fna",
        required=False,
        help="Suffix of the individual species FASTA files\nthat will be saved.",
    )
    parser.add_argument(
        "-ccols",
        dest="csv_cols",
        default="tax_id,superkingdom,phylum,class,order,family,genus,species,strain",
        required=False,
        help="Taxonomic lineage will be built using these columns from the output of"
        + "\n`ncbitax2lin`\ntool.",
    )
    parser.add_argument(
        "-ccols-sep",
        dest="csv_delim",
        default=",",
        required=False,
        help="The delimitor of the fields in the CSV file.",
    )
    parser.add_argument(
        "-tcols",
        dest="tsv_cols",
        default="processid\tgenus\tspecies\tsubspecies\tnucraw",
        required=False,
        help="For each species, the nucletide sequences will be\naggregated.",
    )
    parser.add_argument(
        "-tcols-sep",
        dest="tsv_delim",
        default="\t",
        required=False,
        help="The delimitor of the fields in the TSV file.",
    )

    # Parse defaults
    args = parser.parse_args()
    tsv = args.tsv
    csv = args.csv
    csep = args.csv_delim
    tsep = args.tsv_delim
    csv_cols = args.csv_cols
    tsv_cols = args.tsv_cols
    out = args.out_folder
    overwrite = args.force_write_out
    fna_suffix = args.fna_suffix
    # Patterns the user-supplied column strings must match: word characters
    # separated by the respective delimiter, ending on a word character
    # (i.e. no trailing delimiter).
    ccols_pat = re.compile(f"^[\w\{csep}]+?\w$")
    tcols_pat = re.compile(f"^[\w\{tsep}]+?\w$")
    final_lineages = os.path.join(out, "lineages.csv")
    lineages_not_found = os.path.join(out, "lineages_not_found.csv")
    base_fasta_dir = os.path.join(out, "fasta")

    # Basic checks
    # NOTE(review): when the output folder exists and -f was not given, this
    # only warns and then keeps going, writing into the existing folder —
    # confirm whether an exit was intended here.
    if not overwrite and os.path.exists(out):
        logging.warning(
            f"Output destination [{os.path.basename(out)}] already exists!"
            + "\nPlease use -f to delete and overwrite."
        )
    elif overwrite and os.path.exists(out):
        logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
        shutil.rmtree(out)

    # Validate user requested columns
    passed_ccols = check_and_get_cols(ccols_pat, csv_cols, csep)
    passed_tcols = check_and_get_cols(tcols_pat, tsv_cols, tsep)

    # Get taxonomy from ncbitax2lin
    lineages, raw_recs = get_lineages(csv, passed_ccols)

    # Finally, read BOLD tsv if lineage exists.
    logging.info(f"Creating new squences per species...")

    if not os.path.exists(out):
        os.makedirs(out)

    # Try GZIP first; fall back to plain text when the magic bytes don't match.
    try:
        gz_fh = gzip.open(tsv, "rt")
        records, sp2accs, traw_recs = parse_tsv(gz_fh, passed_tcols, tsep)
    except gzip.BadGzipFile:
        logging.info(f"Input TSV file {os.path.basename(tsv)} is not in\nGZIP format.")
        txt_fh = open(tsv, "r")
        records, sp2accs, traw_recs = parse_tsv(txt_fh, passed_tcols, tsep)

    passed_tax_check = 0  # records whose species/strain had an ncbitax2lin lineage
    failed_tax_check = 0  # records for which only a minimal lineage was synthesized
    fasta_recs_written = 0
    # NOTE(review): these two handles are never explicitly closed below;
    # they are flushed/closed only on interpreter exit.
    l_fh = open(final_lineages, "w")
    ln_fh = open(lineages_not_found, "w")
    l_fh.write(
        "identifiers,superkingdom,phylum,class,order,family,genus,species,strain\n"
    )
    ln_fh.write("fna_id,parsed_org\n")

    if not os.path.exists(base_fasta_dir):
        os.makedirs(base_fasta_dir)

    # One FASTA file per species; one lineage row per accession ID.
    for genus_species in records.keys():
        fasta_recs_written += len(records[genus_species])
        write_fasta(
            records[genus_species],
            base_fasta_dir,
            "_".join(genus_species.split(" ")),
            fna_suffix,
        )
        org_words = genus_species.split(" ")

        for id in sp2accs[genus_species]:
            if genus_species in lineages.keys():
                this_line = ",".join([id, lineages[genus_species]]) + "\n"

                # More than two words implies a subspecies/strain name:
                # replace the (possibly empty) strain column with the full name.
                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [id, lineages[genus_species].rstrip(","), genus_species]
                        )
                        + "\n"
                    )

                l_fh.write(this_line)
                passed_tax_check += 1
            else:
                # No ncbitax2lin lineage: emit a sparse row with only the
                # genus/species (and strain, when present) columns filled.
                this_line = (
                    ",".join(
                        [
                            id,
                            "",
                            "",
                            "",
                            "",
                            "",
                            org_words[0],
                            genus_species,
                            "",
                        ]
                    )
                    + "\n"
                )
                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [
                                id,
                                "",
                                "",
                                "",
                                "",
                                "",
                                org_words[0],
                                org_words[0] + " " + org_words[1],
                                genus_species,
                            ]
                        )
                        + "\n"
                    )
                l_fh.write(this_line)
                ln_fh.write(",".join([id, genus_species]) + "\n")
                failed_tax_check += 1

    # Summary of record counts at each stage of the pipeline.
    logging.info(
        f"No. of raw records present in `ncbitax2lin` [{os.path.basename(csv)}]: {raw_recs}"
        + f"\nNo. of valid records collected from `ncbitax2lin` [{os.path.basename(csv)}]: {len(lineages.keys())}"
        + f"\nNo. of raw records in TSV [{os.path.basename(tsv)}]: {traw_recs}"
        + f"\nNo. of valid records in TSV [{os.path.basename(tsv)}]: {passed_tax_check + failed_tax_check}"
        + f"\nNo. of FASTA records for which new lineages were created: {passed_tax_check}"
        + f"\nNo. of FASTA records for which only genus, species and/or strain information were created: {failed_tax_check}"
    )

    # Sanity check: every FASTA record must have produced exactly one lineage row.
    if (passed_tax_check + failed_tax_check) != fasta_recs_written:
        logging.error(
            f"The number of input FASTA records [{fasta_recs_written}]"
            + f"\nis not equal to number of lineages created [{passed_tax_check + failed_tax_check}]!"
        )
        exit(1)
    else:
        logging.info("Succesfully created lineages and FASTA records! Done!!")
kkonganti@0 396
kkonganti@0 397
# Script entry point.
if __name__ == "__main__":
    main()
kkonganti@0 400
kkonganti@0 401 # ~/apps/nowayout/bin/gen_per_species_fa_from_bold.py -tsv BOLD_Public.05-Feb-2024.tsv -csv ../tax.csv ─╯
kkonganti@0 402
kkonganti@0 403 # =======================================================
kkonganti@0 404 # 2024-02-08 21:37:28,541 - INFO
kkonganti@0 405 # =======================================================
kkonganti@0 406 # Indexing tax.csv...
kkonganti@0 407
kkonganti@0 408 # =======================================================
kkonganti@0 409 # 2024-02-08 21:38:06,567 - INFO
kkonganti@0 410 # =======================================================
kkonganti@0 411 # Creating new squences per species...
kkonganti@0 412
kkonganti@0 413 # =======================================================
kkonganti@0 414 # 2024-02-08 21:38:06,572 - INFO
kkonganti@0 415 # =======================================================
kkonganti@0 416 # Input TSV file BOLD_Public.05-Feb-2024.tsv is not in
kkonganti@0 417 # GZIP format.
kkonganti@0 418
kkonganti@0 419 # =======================================================
kkonganti@0 420 # 2024-02-08 22:01:04,554 - INFO
kkonganti@0 421 # =======================================================
kkonganti@0 422 # Collected FASTA records for 497421 species'.
kkonganti@0 423
kkonganti@0 424 # =======================================================
kkonganti@0 425 # 2024-02-08 22:24:35,000 - INFO
kkonganti@0 426 # =======================================================
kkonganti@0 427 # No. of raw records present in `ncbitax2lin` [tax.csv]: 2550767
kkonganti@0 428 # No. of valid records collected from `ncbitax2lin` [tax.csv]: 2134980
kkonganti@0 429 # No. of raw records in TSV [BOLD_Public.05-Feb-2024.tsv]: 9735210
kkonganti@0 430 # No. of valid records in TSV [BOLD_Public.05-Feb-2024.tsv]: 4988323
kkonganti@0 431 # No. of FASTA records for which new lineages were created: 4069202
kkonganti@0 432 # No. of FASTA records for which only genus, species and/or strain information were created: 919121
kkonganti@0 433
kkonganti@0 434 # =======================================================
kkonganti@0 435 # 2024-02-08 22:24:35,001 - INFO
kkonganti@0 436 # =======================================================
kkonganti@0 437 # Succesfully created lineages and FASTA records! Done!!