annotate 0.5.0/bin/create_fasta_and_lineages.py @ 8:3539fbeb4230

planemo upload
author kkonganti
date Tue, 01 Apr 2025 10:27:25 -0400
parents 97cd2f532efe
children
rev   line source
kkonganti@0 1 #!/usr/bin/env python3
kkonganti@0 2
kkonganti@0 3 import argparse
kkonganti@0 4 import gzip
kkonganti@0 5 import inspect
kkonganti@0 6 import logging
kkonganti@0 7 import os
kkonganti@0 8 import pprint
kkonganti@0 9 import re
kkonganti@0 10 import shutil
kkonganti@0 11 import ssl
kkonganti@0 12 import tempfile
kkonganti@0 13 from html.parser import HTMLParser
kkonganti@0 14 from urllib.request import urlopen
kkonganti@0 15
kkonganti@0 16 from Bio import SeqIO
kkonganti@0 17 from Bio.Seq import Seq
kkonganti@0 18 from Bio.SeqRecord import SeqRecord
kkonganti@0 19
kkonganti@0 20
# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    """Argparse formatter that keeps raw newlines in help text AND appends each option's default value."""

    pass
kkonganti@0 26
kkonganti@0 27
# HTMLParser override class to get fna.gz and gbff.gz file names out of
# the NCBI release listing page: every text node is accumulated so the
# caller can regex-search the concatenated text.
class NCBIHTMLParser(HTMLParser):
    def __init__(self, *, convert_charrefs: bool = True) -> None:
        """Initialize the parser with an empty text-node accumulator.

        Bug fix: the default used to be the Ellipsis sentinel (``...``),
        which was forwarded verbatim to ``HTMLParser.__init__`` and only
        worked because ``...`` is truthy; ``True`` is the documented
        stdlib default.
        """
        super().__init__(convert_charrefs=convert_charrefs)
        self.reset()
        self.href_data = list()

    def handle_data(self, data):
        """Collect every text node encountered while feeding HTML."""
        self.href_data.append(data)
kkonganti@0 37
kkonganti@0 38
# Download organelle FASTA and GenBank file.
def dl_mito_seqs_and_flat_files(
    url: str, suffix: re.Pattern, out: os.PathLike
) -> os.PathLike:
    """
    Save the .fna.gz / .gbff.gz file of the RefSeq mitochondrion
    release matching `suffix` into `out` and return the local,
    decompressed file path.

    Parameters:
        url: Base URL of the NCBI RefSeq mitochondrion release listing.
        suffix: Compiled regex matching the desired file name
            (fixed annotation: was `re`, the module, not a pattern).
        out: Destination directory; created if it does not exist.

    Exits with status 1 when `url` is None.
    """
    contxt = ssl.create_default_context()
    # NOTE(review): certificate verification is deliberately disabled
    # for the NCBI FTP-over-HTTPS fetch — confirm this is acceptable
    # for the deployment environment.
    contxt.check_hostname = False
    contxt.verify_mode = ssl.CERT_NONE

    if url is None:
        logging.error(
            "Please provide the base URL where .fna.gz and .gbff.gz"
            + "\nfiles for RefSeq mitochondrion can be found."
        )
        exit(1)

    if os.path.exists(out):
        # Reuse a previously downloaded, non-empty file if one matches.
        for file in os.listdir(out):
            file_path = os.path.join(out, file)

            if suffix.match(file_path) and os.path.getsize(file_path) > 0:
                logging.info(
                    f"The required mitochondrion file(s)\n[{os.path.basename(file_path)}]"
                    + " already exists.\nSkipping download from NCBI..."
                    + "\nPlease use -f to delete and overwrite."
                )
                return file_path
    else:
        os.makedirs(out)

    html_parser = NCBIHTMLParser()
    logging.info(f"Finding latest NCBI RefSeq mitochondrion release at:\n{url}")

    # Fetch the directory listing into a temp file, then feed it to the
    # HTML parser to collect the listed file names.
    with urlopen(url, context=contxt) as response:
        with tempfile.NamedTemporaryFile(delete=False) as tmp_html_file:
            shutil.copyfileobj(response, tmp_html_file)

    with open(tmp_html_file.name, "r") as html:
        html_parser.feed(html.read())

    # The first listing entry matching `suffix` is the file to download.
    file = suffix.search("".join(html_parser.href_data)).group(0)
    file_url = "/".join([url, file + ".gz"])
    file_at = os.path.join(out, file)

    logging.info(f"Found NCBI RefSeq mitochondrion file(s):\n{file_url}")
    logging.info(f"Saving to:\n{file_at}")

    # Download the .gz into a temp file, then decompress it into place.
    with tempfile.NamedTemporaryFile(delete=False) as tmp_gz:
        with urlopen(file_url, context=contxt) as response:
            tmp_gz.write(response.read())

    with open(file_at, "w") as fh:
        with gzip.open(tmp_gz.name, "rb") as web_gz:
            fh.write(web_gz.read().decode("utf-8"))

    # The `with` blocks already closed every handle (the original also
    # called close() on them redundantly); only the temp files remain.
    os.unlink(tmp_gz.name)
    os.unlink(tmp_html_file.name)

    return file_at
kkonganti@0 106
kkonganti@0 107
def get_lineages(csv: os.PathLike, cols: list) -> tuple:
    """
    Parse the output from the `ncbitax2lin` tool.

    Returns a tuple ``(lineages, raw_recs)`` where ``lineages`` maps
    strain (or species, when the strain column is empty) to the
    comma-joined lineage string, and ``raw_recs`` is the number of
    data rows read. The return annotation was fixed from ``list``
    to ``tuple`` to match what is actually returned.

    Exits with status 1 when the CSV is missing/empty, when requested
    columns are absent, or when more than 9 columns are requested.
    """
    lineages = dict()

    # Fail early on a missing or empty CSV. The original check used
    # `not (exists or getsize > 0)`, which let an empty-but-existing
    # file through and crashed on basename(None).
    if csv is None or not os.path.exists(csv) or os.path.getsize(csv) == 0:
        logging.error(
            f"The CSV file [{os.path.basename(csv) if csv else csv}] is empty or does not exist!"
        )
        exit(1)

    logging.info(f"Indexing {os.path.basename(csv)}...")

    with open(csv, "r") as csv_fh:
        header_cols = csv_fh.readline().strip().split(",")
        # Indices (in file order) of the columns the caller asked for.
        user_req_cols = [
            tcol_i for tcol_i, tcol in enumerate(header_cols) if tcol in cols
        ]
        cols_not_found = [tcol for tcol in cols if tcol not in header_cols]
        raw_recs = 0

        if len(cols_not_found) > 0:
            logging.error(
                f"The following columns do not exist in the"
                + f"\nCSV file [ {os.path.basename(csv)} ]:\n"
                + "".join(cols_not_found)
            )
            exit(1)
        elif len(user_req_cols) > 9:
            logging.error(
                f"Only a total of 9 columns are needed!"
                + "\ntax_id,kindom,phylum,class,order,family,genus,species,strain"
            )
            exit(1)

        for tax in csv_fh:
            raw_recs += 1
            lcols = tax.strip().split(",")

            # Key on the strain column (index 8) when present, otherwise
            # fall back to species (index 7) with an empty strain slot.
            if bool(lcols[user_req_cols[8]]):
                lineages[lcols[user_req_cols[8]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:]]
                )
            elif bool(lcols[user_req_cols[7]]):
                lineages[lcols[user_req_cols[7]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:8]] + [str()]
                )

    # `with` closes the handle; the explicit close() was redundant.
    return lineages, raw_recs
kkonganti@0 160
kkonganti@0 161
def from_genbank(gbk: os.PathLike, min_len: int) -> dict:
    """
    Parse a GenBank file with BioPython and return an accession ->
    organism mapping for records at least `min_len` bases long.

    Exits with status 1 when the file is missing or empty.
    """
    accs2orgs = dict()

    # Guard before os.path.getsize(): the original `or`-based check
    # raised FileNotFoundError for a nonexistent path instead of
    # logging the message below.
    if not os.path.exists(gbk) or os.path.getsize(gbk) == 0:
        logging.info(
            f"The GenBank file [{os.path.basename(gbk)}] does not exist"
            + "\nor is of size 0."
        )
        exit(1)

    logging.info(f"Indexing {os.path.basename(gbk)}...")

    for record in SeqIO.parse(gbk, "genbank"):
        # Skip records shorter than the minimum; map the rest by
        # accession (record.id) to the ORGANISM annotation.
        if len(record.seq) >= min_len:
            accs2orgs[record.id] = record.annotations["organism"]

    return accs2orgs
kkonganti@0 187
kkonganti@0 188
def from_genbank_alt(gbk: os.PathLike) -> dict:
    """
    Parse a GenBank flat file line-by-line and return an accession ->
    organism mapping without using BioPython's GenBank Scanner.

    Exits with status 1 when the file is missing/empty, when an
    accession carries more than one ORGANISM line, or when the number
    of accessions and organisms parsed do not match.
    """
    accs2orgs = dict()
    accs = dict()
    orgs = dict()
    acc = False
    acc_pat = re.compile(r"^VERSION\s+(.+)")
    org_pat = re.compile(r"^\s+ORGANISM\s+(.+)")

    # Guard before os.path.getsize(): the original `or`-based check
    # raised FileNotFoundError for a nonexistent path instead of
    # logging the message below.
    if not os.path.exists(gbk) or os.path.getsize(gbk) == 0:
        logging.info(
            f"The GenBank file [{os.path.basename(gbk)}] does not exist"
            + "\nor is of size 0."
        )
        exit(1)

    logging.info(
        f"Indexing {os.path.basename(gbk)} without using\nBioPython's GenBank Scanner..."
    )

    with open(gbk, "r") as gbk_fh:
        for line in gbk_fh:
            line = line.rstrip()
            # A VERSION line starts a record and yields its accession.
            if line.startswith("VERSION") and acc_pat.match(line):
                acc = acc_pat.match(line).group(1)
                accs[acc] = 1
            # The (indented) ORGANISM line belongs to the current accession.
            if org_pat.match(line):
                if acc and acc not in orgs.keys():
                    orgs[acc] = org_pat.match(line).group(1)
                elif acc and acc in orgs.keys():
                    logging.error(f"Duplicate VERSION line: {acc}")
                    exit(1)

        # Every accession must have exactly one organism, and vice versa.
        if len(accs.keys()) != len(orgs.keys()):
            logging.error(
                f"Got unequal number of organisms ({len(orgs.keys())})\n"
                + f"and accessions ({len(accs.keys())})"
            )
            exit(1)
        else:
            for acc in accs.keys():
                if acc not in orgs.keys():
                    # Typo fixed in the message below (was "ORAGANISM").
                    logging.error(f"ORGANISM not found for accession: {acc}")
                    exit(1)
                accs2orgs[acc] = orgs[acc]

    # `with` closes the handle; the explicit close() was redundant.
    return accs2orgs
kkonganti@0 240
kkonganti@0 241
def write_fasta(seq: str, id: str, basedir: os.PathLike, suffix: str) -> None:
    """
    Write a single FASTA record (no description line content beyond the
    id) to ``<basedir>/<id><suffix>``.
    """
    out_path = os.path.join(basedir, id + suffix)
    record = SeqRecord(Seq(seq), id=id, description=str())
    SeqIO.write(record, out_path, "fasta")
kkonganti@0 251
kkonganti@0 252
# Main
def main() -> None:
    """
    This script takes:
    1. Downloads the RefSeq Mitochrondrial GenBank and FASTA format files.
    2. Takes as input and output .csv.gz or .csv file generated by `ncbitax2lin`.

    and then generates a folder containing individual FASTA sequence files
    per organelle, and a corresponding lineage file in CSV format.
    """

    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    # Debug print.
    # NOTE(review): `ppp` is configured but never used below.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    # The docstring above doubles as the --help description.
    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-csv",
        dest="csv",
        default=False,
        required=True,
        help="Absolute UNIX path to .csv or .csv.gz file which is generated "
        + "\nby the `ncbitax2lin` tool.",
    )
    parser.add_argument(
        "-cols",
        dest="lineage_cols",
        default="tax_id,superkingdom,phylum,class,order,family,genus,species,strain",
        required=False,
        help="Taxonomic lineage will be built using these columns from the output of"
        + "\n`ncbitax2lin` tool.",
    )
    parser.add_argument(
        "-url",
        dest="url",
        default="https://ftp.ncbi.nlm.nih.gov/refseq/release/mitochondrion",
        required=False,
        help="Base URL from where NCBI RefSeq mitochondrion files will be downloaded\nfrom.",
    )
    parser.add_argument(
        "-out",
        dest="out_folder",
        default=os.path.join(os.getcwd(), "organelles"),
        required=False,
        help="By default, the output is written to this folder.",
    )
    parser.add_argument(
        "-f",
        dest="force_write_out",
        default=False,
        action="store_true",
        required=False,
        help="Force overwrite output directory contents.",
    )
    parser.add_argument(
        "--fna-suffix",
        dest="fna_suffix",
        default=".fna",
        required=False,
        help="Suffix of the individual organelle FASTA files that will be saved.",
    )
    parser.add_argument(
        "-ml",
        dest="fa_min_len",
        default=200,
        required=False,
        help="Minimum length of the FASTA sequence for it to be considered for"
        + "\nfurther processing",
    )
    parser.add_argument(
        "--skip-per-fa",
        dest="skip_per_fa",
        default=False,
        required=False,
        action="store_true",
        help="Do not generate per sequence FASTA file.",
    )
    parser.add_argument(
        "--alt-gb-parser",
        dest="alt_gb_parser",
        default=False,
        required=False,
        action="store_true",
        help="Use alternate GenBank parser instead of BioPython's.",
    )

    # Parse defaults
    args = parser.parse_args()
    csv = args.csv
    out = args.out_folder
    overwrite = args.force_write_out
    fna_suffix = args.fna_suffix
    url = args.url
    tax_cols = args.lineage_cols
    skip_per_fa = args.skip_per_fa
    alt_gb_parser = args.alt_gb_parser
    min_len = int(args.fa_min_len)
    # -cols value must be a bare comma-separated word list (validated below).
    tcols_pat = re.compile(r"^[\w\,]+?\w$")
    # Patterns that pick the genomic .fna / .gbff entries out of the
    # NCBI release listing (and out of the local output folder).
    mito_fna_suffix = re.compile(r".*?\.genomic\.fna")
    mito_gbff_suffix = re.compile(r".*?\.genomic\.gbff")
    final_lineages = os.path.join(out, "lineages.csv")
    lineages_not_found = os.path.join(out, "lineages_not_found.csv")
    base_fasta_dir = os.path.join(out, "fasta")

    # Basic checks
    # Without -f an existing output folder is only warned about (cached
    # downloads inside it will be reused); with -f it is deleted first.
    if not overwrite and os.path.exists(out):
        logging.warning(
            f"Output destination [{os.path.basename(out)}] already exists!"
            + "\nPlease use -f to delete and overwrite."
        )
    elif overwrite and os.path.exists(out):
        logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
        shutil.rmtree(out)

    if not tcols_pat.match(tax_cols):
        logging.error(
            f"Supplied columns' names {tax_cols} should only have words (alphanumeric) separated by a comma."
        )
        exit(1)
    else:
        tax_cols = re.sub("\n", "", tax_cols).split(",")

    # Get .fna and .gbk files
    fna = dl_mito_seqs_and_flat_files(url, mito_fna_suffix, out)
    gbk = dl_mito_seqs_and_flat_files(url, mito_gbff_suffix, out)

    # Get taxonomy from ncbitax2lin
    lineages, raw_recs = get_lineages(csv, tax_cols)

    # Get parsed organisms and latest accession from GenBank file.
    if alt_gb_parser:
        accs2orgs = from_genbank_alt(gbk)
    else:
        accs2orgs = from_genbank(gbk, min_len)

    # # Finally, read FASTA and create individual FASTA if lineage exists.
    logging.info(f"Creating new sequences and lineages...")

    l_fh = open(final_lineages, "w")
    ln_fh = open(lineages_not_found, "w")
    l_fh.write(
        "identifiers,superkingdom,phylum,class,order,family,genus,species,strain\n"
    )
    ln_fh.write("fna_id,gbk_org\n")
    passed_lookup = 0
    failed_lookup = 0
    gbk_recs_missing = 0
    skipped_len_short = 0

    if not os.path.exists(base_fasta_dir):
        os.makedirs(base_fasta_dir)

    for record in SeqIO.parse(fna, "fasta"):
        # Skip short sequences; skip sequences with no GenBank record.
        if len(record.seq) < min_len:
            skipped_len_short += 1
            continue
        elif record.id in accs2orgs.keys():
            org_words = accs2orgs[record.id].split(" ")
        else:
            gbk_recs_missing += 1
            continue

        # "Genus species" is the first two words of the organism name
        # (the whole name when it has two words or fewer).
        genus_species = (
            " ".join(org_words[0:2]) if len(org_words) > 2 else " ".join(org_words[0:])
        )

        if not skip_per_fa:
            write_fasta(record.seq, record.id, base_fasta_dir, fna_suffix)

        # Lineage lookup order: full organism name, then "Genus species"
        # (re-appending the organism name as the strain field), then a
        # hand-built minimal row recorded in lineages_not_found.csv.
        if record.id in accs2orgs.keys() and accs2orgs[record.id] in lineages.keys():
            l_fh.write(",".join([record.id, lineages[accs2orgs[record.id]]]) + "\n")
            passed_lookup += 1
        elif record.id in accs2orgs.keys() and genus_species in lineages.keys():
            if len(org_words) > 2:
                l_fh.write(
                    ",".join(
                        [
                            record.id,
                            lineages[genus_species].rstrip(","),
                            accs2orgs[record.id],
                        ]
                    )
                    + "\n"
                )
            else:
                l_fh.write(",".join([record.id, lineages[genus_species]]) + "\n")
            passed_lookup += 1
        else:
            # No lineage found: emit empty kingdom..family fields plus
            # whatever genus/species/strain can be derived from the name.
            if len(org_words) > 2:
                l_fh.write(
                    ",".join(
                        [
                            record.id,
                            "",
                            "",
                            "",
                            "",
                            "",
                            org_words[0],
                            org_words[0] + " " + org_words[1],
                            accs2orgs[record.id],
                        ]
                    )
                    + "\n"
                )
            else:
                l_fh.write(
                    ",".join(
                        [
                            record.id,
                            "",
                            "",
                            "",
                            "",
                            "",
                            org_words[0],
                            accs2orgs[record.id],
                            "",
                        ]
                    )
                    + "\n"
                )
            ln_fh.write(",".join([record.id, accs2orgs[record.id]]) + "\n")
            failed_lookup += 1

    # Summary of what was read and written.
    logging.info(
        f"No. of raw records present in `ncbitax2lin` [{os.path.basename(csv)}]: {raw_recs}"
        + f"\nNo. of valid records collected from `ncbitax2lin` [{os.path.basename(csv)}]: {len(lineages.keys())}"
        + f"\nNo. of sequences skipped (Sequence length < {min_len}): {skipped_len_short}"
        + f"\nNo. of records in FASTA [{os.path.basename(fna)}]: {passed_lookup + failed_lookup}"
        + f"\nNo. of records in GenBank [{os.path.basename(gbk)}]: {len(accs2orgs.keys())}"
        + f"\nNo. of FASTA records for which new lineages were created: {passed_lookup}"
        + f"\nNo. of FASTA records for which only genus, species and/or strain information were created: {failed_lookup}"
        + f"\nNo. of FASTA records for which no GenBank records exist: {gbk_recs_missing}"
    )

    # Sanity check: every GenBank accession must have produced a lineage
    # row. NOTE(review): the two values in this message look swapped
    # relative to the wording — confirm intended phrasing.
    if (passed_lookup + failed_lookup) != len(accs2orgs.keys()):
        logging.error(
            f"The number of FASTA records written [{len(accs2orgs.keys())}]"
            + f"\nis not equal to number of lineages created [{passed_lookup + failed_lookup}]!"
        )
        exit(1)
    else:
        logging.info("Succesfully created lineages and FASTA records! Done!!")

    l_fh.close()
    ln_fh.close()
kkonganti@0 515
kkonganti@0 516
# Script entry point.
if __name__ == "__main__":
    main()