kkonganti@0
|
1 #!/usr/bin/env python3
|
kkonganti@0
|
2
|
kkonganti@0
|
3 import argparse
|
kkonganti@0
|
4 import gzip
|
kkonganti@0
|
5 import inspect
|
kkonganti@0
|
6 import logging
|
kkonganti@0
|
7 import os
|
kkonganti@0
|
8 import pprint
|
kkonganti@0
|
9 import re
|
kkonganti@0
|
10 import shutil
|
kkonganti@0
|
11 from collections import defaultdict
|
kkonganti@0
|
12 from typing import BinaryIO, TextIO, Union
|
kkonganti@0
|
13
|
kkonganti@0
|
14 from Bio import SeqIO
|
kkonganti@0
|
15 from Bio.Seq import Seq
|
kkonganti@0
|
16 from Bio.SeqRecord import SeqRecord
|
kkonganti@0
|
17
|
kkonganti@0
|
18
|
kkonganti@0
|
# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    """Help formatter that keeps raw text AND shows argument defaults."""
|
kkonganti@0
|
24
|
kkonganti@0
|
25
|
kkonganti@0
|
def get_lineages(csv: os.PathLike, cols: list) -> tuple:
    """
    Parse the output from the `ncbitax2lin` tool and return a dict of
    lineages plus the raw record count.

    Args:
        csv: Path to the CSV file produced by `ncbitax2lin`.
        cols: The 9 requested column names — tax_id first, then the 8
            lineage ranks ending with species and strain.

    Returns:
        (lineages, raw_recs): dict keyed by strain (or by species when
        the strain field is empty) mapping to a comma-joined lineage
        string, and the number of raw data rows read.
    """
    lineages = dict()
    # Fail early when the path is None, missing, or an empty file.
    # BUGFIX: the original `not (exists or getsize > 0)` short-circuited to
    # False for ANY existing file (empty files slipped through) and raised
    # FileNotFoundError on missing paths before the error could be logged.
    if csv is None or not os.path.exists(csv) or os.path.getsize(csv) == 0:
        logging.error(
            f"The CSV file [{os.path.basename(csv)}] is empty or does not exist!"
        )
        exit(1)

    logging.info(f"Indexing {os.path.basename(csv)}...")

    with open(csv, "r") as csv_fh:
        header_cols = csv_fh.readline().strip().split(",")
        # Indices (in header order) of the columns the user asked for.
        user_req_cols = [
            tcol_i for tcol_i, tcol in enumerate(header_cols) if tcol in cols
        ]
        cols_not_found = [tcol for tcol in cols if tcol not in header_cols]
        raw_recs = 0

        if len(cols_not_found) > 0:
            logging.error(
                f"The following columns do not exist in the"
                + f"\nCSV file [ {os.path.basename(csv)} ]:\n"
                + "".join(cols_not_found)
            )
            exit(1)
        elif len(user_req_cols) > 9:
            logging.error(
                f"Only a total of 9 columns are needed!"
                + "\ntax_id,kindom,phylum,class,order,family,genus,species,strain"
            )
            exit(1)

        for tax in csv_fh:
            raw_recs += 1
            lcols = tax.strip().split(",")

            # Key on strain (index 8) when present; otherwise fall back to
            # species (index 7) with an empty strain field appended so the
            # lineage string always has the same number of fields.
            if bool(lcols[user_req_cols[8]]):
                lineages[lcols[user_req_cols[8]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:]]
                )
            elif bool(lcols[user_req_cols[7]]):
                lineages[lcols[user_req_cols[7]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:8]] + [str()]
                )

    # The `with` block closes the handle; no explicit close() needed.
    return lineages, raw_recs
|
kkonganti@0
|
78
|
kkonganti@0
|
79
|
kkonganti@0
|
def write_fasta(recs: list, basedir: os.PathLike, name: str, suffix: str) -> None:
    """
    Write the given sequence records (with no description) in FASTA
    format to `basedir/name + suffix`.
    """
    out_path = os.path.join(basedir, name + suffix)
    SeqIO.write(recs, out_path, "fasta")
|
kkonganti@0
|
89
|
kkonganti@0
|
90
|
kkonganti@0
|
def check_and_get_cols(pat: re.Pattern, cols: str, delim: str) -> list:
    """
    Check that the supplied column-name string matches the pattern and
    return the individual column names.

    Args:
        pat: Compiled regex the raw column string must match.
            (BUGFIX: annotation was `re` — the module itself, not a type.)
        cols: Delimited string of column names.
        delim: Delimiter separating the column names.

    Returns:
        List of column names split on `delim`.
    """
    if not pat.match(cols):
        logging.error(
            f"Supplied columns' names {cols} should only have words"
            f"\n(alphanumeric) separated by: {delim}."
        )
        exit(1)
    # Strip any stray newlines before splitting into column names.
    return re.sub("\n", "", cols).split(delim)
|
kkonganti@0
|
106
|
kkonganti@0
|
107
|
kkonganti@0
|
def parse_tsv(fh: Union[TextIO, BinaryIO], tcols: list, delim: str) -> tuple:
    """
    Parse the BOLD TSV file and collect per-species FASTA records.

    Args:
        fh: Open handle to the TSV file (plain or gzip-wrapped text);
            it is closed before returning.
        tcols: Column names to extract; must appear in the header in the
            order id, genus, species, strain, sequence.
        delim: Field delimiter of the TSV file.

    Returns:
        (records, sp2accs, raw_recs): per-species SeqRecord lists,
        per-species accession ids, and the raw data row count.
    """
    records, sp2accs = (defaultdict(list), defaultdict(list))
    header = fh.readline().strip().split(delim)
    raw_recs = 0

    if not all(col in header for col in tcols):
        logging.error(
            "The following columns were not found in the"
            + f"\nheader row of file {os.path.basename(fh.name)}\n"
            + "\n".join([ele for ele in tcols if ele not in header])
        )
        # BUGFIX: bail out here — the original fell through and crashed on
        # the 5-way unpack below with an unrelated ValueError.
        exit(1)

    # Positions (in header order) of the five requested columns.
    id_i, genus_i, species_i, strain_i, seq_i = [
        i for i, ele in enumerate(header) if ele in tcols
    ]

    for record in fh:
        raw_recs += 1
        # Split the row once instead of once per field.
        fields = record.strip().split(delim)

        rec_id = fields[id_i]
        # Normalize path-like separators in species names to "-" so they
        # are safe to use in file names later.
        species = re.sub(r"[\/\\]+", "-", fields[species_i])
        # BUGFIX: re.sub()'s 4th positional argument is `count`, not
        # `flags` — the original passed re.IGNORECASE (== 2) as count,
        # silently stripping at most 2 non-ATGC runs per sequence and
        # losing case-insensitivity.
        seq = re.sub(r"[^ATGC]+", "", fields[seq_i], flags=re.IGNORECASE)

        # Skip rows whose species is a literal None/Null placeholder.
        if re.match(r"None|Null", species, re.IGNORECASE):
            continue

        records[species].append(SeqRecord(Seq(seq), id=rec_id, description=str()))
        sp2accs[species].append(rec_id)

    logging.info(f"Collected FASTA records for {len(records.keys())} species'.")
    fh.close()
    return records, sp2accs, raw_recs
|
kkonganti@0
|
154
|
kkonganti@0
|
155
|
kkonganti@0
|
# Main
def main() -> None:
    """
    This script takes:
    1. The TSV file from BOLD systems,
    2. Takes as input a .csv file generated by `ncbitax2lin`.

    and then generates a folder containing individual FASTA sequence files
    per species. This is only possible if the full taxonomy of the barcode
    sequence is present in the FASTA header.
    """

    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\r\r",
        level=logging.DEBUG,
    )

    # Debug print.
    # NOTE(review): `ppp` is never used below — presumably kept for ad-hoc
    # debugging via ppp.pprint(...); confirm before removing.
    ppp = pprint.PrettyPrinter(width=55)
    # Program name as shown in --help, derived from this script's filename.
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-tsv",
        dest="tsv",
        default=False,
        required=True,
        help="Absolute UNIX path to the TSV file from BOLD systems"
        + "\nin uncompressed TXT format.",
    )
    required.add_argument(
        "-csv",
        dest="csv",
        default=False,
        required=True,
        help="Absolute UNIX path to .csv or .csv.gz file which is generated "
        + "\nby the `ncbitax2lin` tool.",
    )
    parser.add_argument(
        "-out",
        dest="out_folder",
        default=os.path.join(os.getcwd(), "species"),
        required=False,
        help="By default, the output is written to this\nfolder.",
    )
    parser.add_argument(
        "-f",
        dest="force_write_out",
        default=False,
        action="store_true",
        required=False,
        help="Force overwrite output directory contents.",
    )
    parser.add_argument(
        "-suffix",
        dest="fna_suffix",
        default=".fna",
        required=False,
        help="Suffix of the individual species FASTA files\nthat will be saved.",
    )
    parser.add_argument(
        "-ccols",
        dest="csv_cols",
        default="tax_id,superkingdom,phylum,class,order,family,genus,species,strain",
        required=False,
        help="Taxonomic lineage will be built using these columns from the output of"
        + "\n`ncbitax2lin`\ntool.",
    )
    parser.add_argument(
        "-ccols-sep",
        dest="csv_delim",
        default=",",
        required=False,
        help="The delimitor of the fields in the CSV file.",
    )
    parser.add_argument(
        "-tcols",
        dest="tsv_cols",
        default="processid\tgenus\tspecies\tsubspecies\tnucraw",
        required=False,
        help="For each species, the nucletide sequences will be\naggregated.",
    )
    parser.add_argument(
        "-tcols-sep",
        dest="tsv_delim",
        default="\t",
        required=False,
        help="The delimitor of the fields in the TSV file.",
    )

    # Parse defaults
    args = parser.parse_args()
    tsv = args.tsv
    csv = args.csv
    csep = args.csv_delim
    tsep = args.tsv_delim
    csv_cols = args.csv_cols
    tsv_cols = args.tsv_cols
    out = args.out_folder
    overwrite = args.force_write_out
    fna_suffix = args.fna_suffix
    # Column strings must be word characters separated only by the delimiter
    # (the delimiter is escaped into the character class via the f-string).
    ccols_pat = re.compile(f"^[\w\{csep}]+?\w$")
    tcols_pat = re.compile(f"^[\w\{tsep}]+?\w$")
    final_lineages = os.path.join(out, "lineages.csv")
    lineages_not_found = os.path.join(out, "lineages_not_found.csv")
    base_fasta_dir = os.path.join(out, "fasta")

    # Basic checks
    # NOTE(review): this warning does not exit — processing continues and the
    # output files are opened with mode "w" further down, so existing
    # contents can still be overwritten even without -f. Confirm intent.
    if not overwrite and os.path.exists(out):
        logging.warning(
            f"Output destination [{os.path.basename(out)}] already exists!"
            + "\nPlease use -f to delete and overwrite."
        )
    elif overwrite and os.path.exists(out):
        logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
        shutil.rmtree(out)

    # Validate user requested columns
    passed_ccols = check_and_get_cols(ccols_pat, csv_cols, csep)
    passed_tcols = check_and_get_cols(tcols_pat, tsv_cols, tsep)

    # Get taxonomy from ncbitax2lin
    lineages, raw_recs = get_lineages(csv, passed_ccols)

    # Finally, read BOLD tsv if lineage exists.
    logging.info(f"Creating new squences per species...")

    if not os.path.exists(out):
        os.makedirs(out)

    # Try the TSV as gzip first; on BadGzipFile fall back to plain text.
    try:
        gz_fh = gzip.open(tsv, "rt")
        records, sp2accs, traw_recs = parse_tsv(gz_fh, passed_tcols, tsep)
    except gzip.BadGzipFile:
        logging.info(f"Input TSV file {os.path.basename(tsv)} is not in\nGZIP format.")
        txt_fh = open(tsv, "r")
        records, sp2accs, traw_recs = parse_tsv(txt_fh, passed_tcols, tsep)

    # Counters for the summary report at the end.
    passed_tax_check = 0
    failed_tax_check = 0
    fasta_recs_written = 0
    # NOTE(review): these handles are never explicitly closed; the data is
    # flushed only at interpreter exit. Consider `with` blocks.
    l_fh = open(final_lineages, "w")
    ln_fh = open(lineages_not_found, "w")
    l_fh.write(
        "identifiers,superkingdom,phylum,class,order,family,genus,species,strain\n"
    )
    ln_fh.write("fna_id,parsed_org\n")

    if not os.path.exists(base_fasta_dir):
        os.makedirs(base_fasta_dir)

    # For each species: write one FASTA file, then one lineage row per id.
    for genus_species in records.keys():
        fasta_recs_written += len(records[genus_species])
        write_fasta(
            records[genus_species],
            base_fasta_dir,
            "_".join(genus_species.split(" ")),
            fna_suffix,
        )
        org_words = genus_species.split(" ")

        for id in sp2accs[genus_species]:
            # Full lineage known from ncbitax2lin for this key.
            if genus_species in lineages.keys():
                this_line = ",".join([id, lineages[genus_species]]) + "\n"

                # More than two words => name includes strain info; put the
                # full name into the strain column instead of the empty one.
                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [id, lineages[genus_species].rstrip(","), genus_species]
                        )
                        + "\n"
                    )

                l_fh.write(this_line)
                passed_tax_check += 1
            else:
                # No lineage found: emit a row with empty higher ranks and
                # only genus/species (and strain, when present) filled in.
                this_line = (
                    ",".join(
                        [
                            id,
                            "",
                            "",
                            "",
                            "",
                            "",
                            org_words[0],
                            genus_species,
                            "",
                        ]
                    )
                    + "\n"
                )
                if len(org_words) > 2:
                    this_line = (
                        ",".join(
                            [
                                id,
                                "",
                                "",
                                "",
                                "",
                                "",
                                org_words[0],
                                org_words[0] + " " + org_words[1],
                                genus_species,
                            ]
                        )
                        + "\n"
                    )
                l_fh.write(this_line)
                ln_fh.write(",".join([id, genus_species]) + "\n")
                failed_tax_check += 1

    # Summary of everything read and written.
    logging.info(
        f"No. of raw records present in `ncbitax2lin` [{os.path.basename(csv)}]: {raw_recs}"
        + f"\nNo. of valid records collected from `ncbitax2lin` [{os.path.basename(csv)}]: {len(lineages.keys())}"
        + f"\nNo. of raw records in TSV [{os.path.basename(tsv)}]: {traw_recs}"
        + f"\nNo. of valid records in TSV [{os.path.basename(tsv)}]: {passed_tax_check + failed_tax_check}"
        + f"\nNo. of FASTA records for which new lineages were created: {passed_tax_check}"
        + f"\nNo. of FASTA records for which only genus, species and/or strain information were created: {failed_tax_check}"
    )

    # Sanity check: every FASTA record must have produced a lineage row.
    if (passed_tax_check + failed_tax_check) != fasta_recs_written:
        logging.error(
            f"The number of input FASTA records [{fasta_recs_written}]"
            + f"\nis not equal to number of lineages created [{passed_tax_check + failed_tax_check}]!"
        )
        exit(1)
    else:
        logging.info("Succesfully created lineages and FASTA records! Done!!")
|
kkonganti@0
|
396
|
kkonganti@0
|
397
|
kkonganti@0
|
# Script entry point.
if __name__ == "__main__":
    main()
|
kkonganti@0
|
400
|
kkonganti@0
|
401 # ~/apps/nowayout/bin/gen_per_species_fa_from_bold.py -tsv BOLD_Public.05-Feb-2024.tsv -csv ../tax.csv ─╯
|
kkonganti@0
|
402
|
kkonganti@0
|
403 # =======================================================
|
kkonganti@0
|
404 # 2024-02-08 21:37:28,541 - INFO
|
kkonganti@0
|
405 # =======================================================
|
kkonganti@0
|
406 # Indexing tax.csv...
|
kkonganti@0
|
407
|
kkonganti@0
|
408 # =======================================================
|
kkonganti@0
|
409 # 2024-02-08 21:38:06,567 - INFO
|
kkonganti@0
|
410 # =======================================================
|
kkonganti@0
|
411 # Creating new squences per species...
|
kkonganti@0
|
412
|
kkonganti@0
|
413 # =======================================================
|
kkonganti@0
|
414 # 2024-02-08 21:38:06,572 - INFO
|
kkonganti@0
|
415 # =======================================================
|
kkonganti@0
|
416 # Input TSV file BOLD_Public.05-Feb-2024.tsv is not in
|
kkonganti@0
|
417 # GZIP format.
|
kkonganti@0
|
418
|
kkonganti@0
|
419 # =======================================================
|
kkonganti@0
|
420 # 2024-02-08 22:01:04,554 - INFO
|
kkonganti@0
|
421 # =======================================================
|
kkonganti@0
|
422 # Collected FASTA records for 497421 species'.
|
kkonganti@0
|
423
|
kkonganti@0
|
424 # =======================================================
|
kkonganti@0
|
425 # 2024-02-08 22:24:35,000 - INFO
|
kkonganti@0
|
426 # =======================================================
|
kkonganti@0
|
427 # No. of raw records present in `ncbitax2lin` [tax.csv]: 2550767
|
kkonganti@0
|
428 # No. of valid records collected from `ncbitax2lin` [tax.csv]: 2134980
|
kkonganti@0
|
429 # No. of raw records in TSV [BOLD_Public.05-Feb-2024.tsv]: 9735210
|
kkonganti@0
|
430 # No. of valid records in TSV [BOLD_Public.05-Feb-2024.tsv]: 4988323
|
kkonganti@0
|
431 # No. of FASTA records for which new lineages were created: 4069202
|
kkonganti@0
|
432 # No. of FASTA records for which only genus, species and/or strain information were created: 919121
|
kkonganti@0
|
433
|
kkonganti@0
|
434 # =======================================================
|
kkonganti@0
|
435 # 2024-02-08 22:24:35,001 - INFO
|
kkonganti@0
|
436 # =======================================================
|
kkonganti@0
|
437 # Succesfully created lineages and FASTA records! Done!!
|