#!/usr/bin/env python3

import argparse
import gzip
import inspect
import logging
import os
import pprint
import re
import shutil
import ssl
import tempfile
from html.parser import HTMLParser
from urllib.request import urlopen

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


# Multiple inheritance for pretty printing of help text.
class MultiArgFormatClasses(
    argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
):
    pass


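# A minimal sketch of what the combined formatter buys us (the parser here
# is hypothetical, not part of this script): RawTextHelpFormatter preserves
# the embedded "\n" line breaks used in the help strings below, while
# ArgumentDefaultsHelpFormatter appends "(default: ...)" to each option.
#
#   p = argparse.ArgumentParser(formatter_class=MultiArgFormatClasses)
#   p.add_argument("-x", default=1, help="First line.\nSecond line.")
#   p.print_help()  # keeps both lines and shows "(default: 1)"

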
# HTMLParser override class to get fna.gz and gbff.gz
class NCBIHTMLParser(HTMLParser):
    def __init__(self, *, convert_charrefs: bool = True) -> None:
        super().__init__(convert_charrefs=convert_charrefs)
        self.reset()
        self.href_data = list()

    def handle_data(self, data):
        self.href_data.append(data)


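# Usage sketch (the HTML snippet is a made-up stand-in for an NCBI
# directory listing entry): handle_data() collects the visible link text,
# and the suffix regex used later picks the release file name out of it.
#
#   hp = NCBIHTMLParser()
#   hp.feed('<a href="mitochondrion.1.genomic.fna.gz">mitochondrion.1.genomic.fna.gz</a>')
#   re.compile(r".*?\.genomic\.fna").search("".join(hp.href_data)).group(0)
#   # -> 'mitochondrion.1.genomic.fna' (".gz" is appended back for the download URL)

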
# Download organelle FASTA and GenBank file.
def dl_mito_seqs_and_flat_files(
    url: str, suffix: re.Pattern, out: os.PathLike
) -> os.PathLike:
    """
    Save the .fna.gz and .gbff.gz files of the RefSeq
    mitochondrion release and return the path to the
    decompressed copy.
    """
    contxt = ssl.create_default_context()
    contxt.check_hostname = False
    contxt.verify_mode = ssl.CERT_NONE

    if url is None:
        logging.error(
            "Please provide the base URL where .fna.gz and .gbff.gz"
            + "\nfiles for RefSeq mitochondrion can be found."
        )
        exit(1)

    if os.path.exists(out):
        for file in os.listdir(out):
            file_path = os.path.join(out, file)

            if suffix.match(file_path) and os.path.getsize(file_path) > 0:
                logging.info(
                    f"The required mitochondrion file(s)\n[{os.path.basename(file_path)}]"
                    + " already exists.\nSkipping download from NCBI..."
                    + "\nPlease use -f to delete and overwrite."
                )
                return file_path
    else:
        os.makedirs(out)

    html_parser = NCBIHTMLParser()
    logging.info(f"Finding latest NCBI RefSeq mitochondrion release at:\n{url}")

    with urlopen(url, context=contxt) as response:
        with tempfile.NamedTemporaryFile(delete=False) as tmp_html_file:
            shutil.copyfileobj(response, tmp_html_file)

    with open(tmp_html_file.name, "r") as html:
        html_parser.feed("".join(html.readlines()))

    file = suffix.search("".join(html_parser.href_data)).group(0)
    file_url = "/".join([url, file + ".gz"])
    file_at = os.path.join(out, file)

    logging.info(f"Found NCBI RefSeq mitochondrion file(s):\n{file_url}")

    logging.info(f"Saving to:\n{file_at}")

    with tempfile.NamedTemporaryFile(delete=False) as tmp_gz:
        with urlopen(file_url, context=contxt) as response:
            tmp_gz.write(response.read())

    with open(file_at, "w") as fh:
        with gzip.open(tmp_gz.name, "rb") as web_gz:
            fh.write(web_gz.read().decode("utf-8"))

    # The `with` blocks close their handles; only the temporary files
    # need to be cleaned up here.
    os.unlink(tmp_gz.name)
    os.unlink(tmp_html_file.name)

    return file_at


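# Usage sketch (the URL is the script's own default; the returned path is
# illustrative):
#
#   fna = dl_mito_seqs_and_flat_files(
#       "https://ftp.ncbi.nlm.nih.gov/refseq/release/mitochondrion",
#       re.compile(r".*?\.genomic\.fna"),
#       "organelles",
#   )
#   # -> e.g. "organelles/mitochondrion.1.genomic.fna"; on a re-run the
#   # download is skipped if a non-empty matching file already exists.

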
def get_lineages(csv: os.PathLike, cols: list) -> tuple:
    """
    Parse the output of the `ncbitax2lin` tool and return
    a dict of lineages keyed by strain (or by species when
    no strain is present), plus the number of raw records.
    """
    lineages = dict()
    if csv is None or not os.path.exists(csv) or os.path.getsize(csv) == 0:
        logging.error(
            f"The CSV file [{os.path.basename(csv)}] is empty or does not exist!"
        )
        exit(1)

    logging.info(f"Indexing {os.path.basename(csv)}...")

    # `ncbitax2lin` can emit either .csv or .csv.gz; "rt" mode works for
    # both the plain and the gzipped opener.
    opener = gzip.open if str(csv).endswith(".gz") else open

    with opener(csv, "rt") as csv_fh:
        header_cols = csv_fh.readline().strip().split(",")
        user_req_cols = [
            tcol_i for tcol_i, tcol in enumerate(header_cols) if tcol in cols
        ]
        cols_not_found = [tcol for tcol in cols if tcol not in header_cols]
        raw_recs = 0

        if len(cols_not_found) > 0:
            logging.error(
                "The following columns do not exist in the"
                + f"\nCSV file [ {os.path.basename(csv)} ]:\n"
                + ",".join(cols_not_found)
            )
            exit(1)
        elif len(user_req_cols) > 9:
            logging.error(
                "Only a total of 9 columns are needed!"
                + "\ntax_id,superkingdom,phylum,class,order,family,genus,species,strain"
            )
            exit(1)

        for tax in csv_fh:
            raw_recs += 1
            lcols = tax.strip().split(",")

            if bool(lcols[user_req_cols[8]]):
                lineages[lcols[user_req_cols[8]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:]]
                )
            elif bool(lcols[user_req_cols[7]]):
                lineages[lcols[user_req_cols[7]]] = ",".join(
                    [lcols[l] for l in user_req_cols[1:8]] + [str()]
                )

    return lineages, raw_recs


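# Sketch of the expected `ncbitax2lin` rows and the resulting mapping
# (the row below is illustrative, not taken from a real release):
#
#   tax_id,superkingdom,phylum,class,order,family,genus,species,strain
#   562,Bacteria,Pseudomonadota,...,Escherichia,Escherichia coli,E. coli K-12
#
#   lineages, raw_recs = get_lineages("ncbitax2lin.csv.gz", tax_cols)
#   # lineages["E. coli K-12"] -> "Bacteria,...,Escherichia coli,E. coli K-12"
#   # Rows with an empty strain column fall back to the species name as key.

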
def from_genbank(gbk: os.PathLike, min_len: int) -> dict:
    """
    Parse the GenBank file and return a mapping of latest
    accession to organism.
    """
    accs2orgs = dict()

    if not os.path.exists(gbk) or os.path.getsize(gbk) == 0:
        logging.error(
            f"The GenBank file [{os.path.basename(gbk)}] does not exist"
            + "\nor is of size 0."
        )
        exit(1)

    logging.info(f"Indexing {os.path.basename(gbk)}...")

    for record in SeqIO.parse(gbk, "genbank"):
        if len(record.seq) < min_len:
            continue
        accs2orgs[record.id] = record.annotations["organism"]

    return accs2orgs


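# Illustrative result (NC_012920.1 really is the human mitochondrion
# RefSeq accession; treat the file name as an assumption):
#
#   accs2orgs = from_genbank("organelles/mitochondrion.1.genomic.gbff", 200)
#   # accs2orgs["NC_012920.1"] -> "Homo sapiens"

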
def from_genbank_alt(gbk: os.PathLike) -> dict:
    """
    Parse the GenBank file and return a mapping of latest
    accession to organism without using Biopython's
    GenBank Scanner.
    """
    accs2orgs = dict()
    accs = dict()
    orgs = dict()
    acc = False
    acc_pat = re.compile(r"^VERSION\s+(.+)")
    org_pat = re.compile(r"^\s+ORGANISM\s+(.+)")

    if not os.path.exists(gbk) or os.path.getsize(gbk) == 0:
        logging.error(
            f"The GenBank file [{os.path.basename(gbk)}] does not exist"
            + "\nor is of size 0."
        )
        exit(1)

    logging.info(
        f"Indexing {os.path.basename(gbk)} without using\nBiopython's GenBank Scanner..."
    )

    with open(gbk, "r") as gbk_fh:
        for line in gbk_fh:
            line = line.rstrip()
            if line.startswith("VERSION") and acc_pat.match(line):
                acc = acc_pat.match(line).group(1)
                accs[acc] = 1
            if org_pat.match(line):
                if acc and acc not in orgs.keys():
                    orgs[acc] = org_pat.match(line).group(1)
                elif acc and acc in orgs.keys():
                    logging.error(f"Duplicate VERSION line: {acc}")
                    exit(1)

    if len(accs.keys()) != len(orgs.keys()):
        logging.error(
            f"Got unequal number of organisms ({len(orgs.keys())})\n"
            + f"and accessions ({len(accs.keys())})"
        )
        exit(1)

    for acc in accs.keys():
        if acc not in orgs.keys():
            logging.error(f"ORGANISM not found for accession: {acc}")
            exit(1)
        accs2orgs[acc] = orgs[acc]

    return accs2orgs


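# The two regexes above target flat-file records shaped like this
# (abridged to the two relevant lines of the GenBank format):
#
#   VERSION     NC_012920.1
#     ORGANISM  Homo sapiens
#
# acc_pat captures the accession from the unindented VERSION line and
# org_pat captures the organism from the indented ORGANISM line below it.

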
def write_fasta(seq: str, id: str, basedir: os.PathLike, suffix: str) -> None:
    """
    Write a sequence with no description to the specified file.
    """
    SeqIO.write(
        # Round-trip through str so both str and Bio.Seq.Seq inputs work.
        SeqRecord(Seq(str(seq)), id=id, description=str()),
        os.path.join(basedir, id + suffix),
        "fasta",
    )


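# Usage sketch (values are illustrative):
#
#   write_fasta("ATGC" * 100, "NC_000000.1", "organelles/fasta", ".fna")
#   # -> organelles/fasta/NC_000000.1.fna containing ">NC_000000.1" and the
#   #    sequence wrapped at Biopython's default of 60 columns per line.

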
# Main
def main() -> None:
    """
    This script:
    1. Downloads the RefSeq mitochondrial GenBank and FASTA format files.
    2. Takes as input the .csv or .csv.gz file generated by `ncbitax2lin`.

    It then generates a folder containing individual FASTA sequence files
    per organelle, and a corresponding lineage file in CSV format.
    """

    # Set logging.
    logging.basicConfig(
        format="\n"
        + "=" * 55
        + "\n%(asctime)s - %(levelname)s\n"
        + "=" * 55
        + "\n%(message)s\n\n",
        level=logging.DEBUG,
    )

    # Debug print.
    ppp = pprint.PrettyPrinter(width=55)
    prog_name = os.path.basename(inspect.stack()[0].filename)

    parser = argparse.ArgumentParser(
        prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
    )

    required = parser.add_argument_group("required arguments")

    required.add_argument(
        "-csv",
        dest="csv",
        default=None,
        required=True,
        help="Absolute UNIX path to the .csv or .csv.gz file which is generated "
        + "\nby the `ncbitax2lin` tool.",
    )
    parser.add_argument(
        "-cols",
        dest="lineage_cols",
        default="tax_id,superkingdom,phylum,class,order,family,genus,species,strain",
        required=False,
        help="Taxonomic lineage will be built using these columns from the output of"
        + "\nthe `ncbitax2lin` tool.",
    )
    parser.add_argument(
        "-url",
        dest="url",
        default="https://ftp.ncbi.nlm.nih.gov/refseq/release/mitochondrion",
        required=False,
        help="Base URL from where NCBI RefSeq mitochondrion files will be downloaded.",
    )
    parser.add_argument(
        "-out",
        dest="out_folder",
        default=os.path.join(os.getcwd(), "organelles"),
        required=False,
        help="By default, the output is written to this folder.",
    )
    parser.add_argument(
        "-f",
        dest="force_write_out",
        default=False,
        action="store_true",
        required=False,
        help="Force overwrite of the output directory contents.",
    )
    parser.add_argument(
        "--fna-suffix",
        dest="fna_suffix",
        default=".fna",
        required=False,
        help="Suffix of the individual organelle FASTA files that will be saved.",
    )
    parser.add_argument(
        "-ml",
        dest="fa_min_len",
        default=200,
        required=False,
        help="Minimum length of a FASTA sequence for it to be considered for"
        + "\nfurther processing.",
    )
    parser.add_argument(
        "--skip-per-fa",
        dest="skip_per_fa",
        default=False,
        required=False,
        action="store_true",
        help="Do not generate per-sequence FASTA files.",
    )
    parser.add_argument(
        "--alt-gb-parser",
        dest="alt_gb_parser",
        default=False,
        required=False,
        action="store_true",
        help="Use the alternate GenBank parser instead of Biopython's.",
    )

    # Parse arguments.
    args = parser.parse_args()
    csv = args.csv
    out = args.out_folder
    overwrite = args.force_write_out
    fna_suffix = args.fna_suffix
    url = args.url
    tax_cols = args.lineage_cols
    skip_per_fa = args.skip_per_fa
    alt_gb_parser = args.alt_gb_parser
    min_len = int(args.fa_min_len)
    tcols_pat = re.compile(r"^[\w\,]+?\w$")
    mito_fna_suffix = re.compile(r".*?\.genomic\.fna")
    mito_gbff_suffix = re.compile(r".*?\.genomic\.gbff")
    final_lineages = os.path.join(out, "lineages.csv")
    lineages_not_found = os.path.join(out, "lineages_not_found.csv")
    base_fasta_dir = os.path.join(out, "fasta")

    # Basic checks.
    if not overwrite and os.path.exists(out):
        logging.warning(
            f"Output destination [{os.path.basename(out)}] already exists!"
            + "\nPlease use -f to delete and overwrite."
        )
    elif overwrite and os.path.exists(out):
        logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
        shutil.rmtree(out)

    if not tcols_pat.match(tax_cols):
        logging.error(
            f"Supplied column names {tax_cols} should only have words (alphanumeric) separated by commas."
        )
        exit(1)
    else:
        tax_cols = re.sub("\n", "", tax_cols).split(",")

    # Get the .fna and .gbff files.
    fna = dl_mito_seqs_and_flat_files(url, mito_fna_suffix, out)
    gbk = dl_mito_seqs_and_flat_files(url, mito_gbff_suffix, out)

    # Get taxonomy from `ncbitax2lin`.
    lineages, raw_recs = get_lineages(csv, tax_cols)

    # Get the parsed organisms and latest accessions from the GenBank file.
    if alt_gb_parser:
        accs2orgs = from_genbank_alt(gbk)
    else:
        accs2orgs = from_genbank(gbk, min_len)

    # Finally, read the FASTA and create individual FASTA files where a lineage exists.
    logging.info("Creating new sequences and lineages...")

    l_fh = open(final_lineages, "w")
    ln_fh = open(lineages_not_found, "w")
    l_fh.write(
        "identifiers,superkingdom,phylum,class,order,family,genus,species,strain\n"
    )
    ln_fh.write("fna_id,gbk_org\n")
    passed_lookup = 0
    failed_lookup = 0
    gbk_recs_missing = 0
    skipped_len_short = 0

    if not os.path.exists(base_fasta_dir):
        os.makedirs(base_fasta_dir)

    for record in SeqIO.parse(fna, "fasta"):
        if len(record.seq) < min_len:
            skipped_len_short += 1
            continue
        elif record.id in accs2orgs.keys():
            org_words = accs2orgs[record.id].split(" ")
        else:
            gbk_recs_missing += 1
            continue

        genus_species = (
            " ".join(org_words[0:2]) if len(org_words) > 2 else " ".join(org_words[0:])
        )

        if not skip_per_fa:
            write_fasta(record.seq, record.id, base_fasta_dir, fna_suffix)

        if record.id in accs2orgs.keys() and accs2orgs[record.id] in lineages.keys():
            l_fh.write(",".join([record.id, lineages[accs2orgs[record.id]]]) + "\n")
            passed_lookup += 1
        elif record.id in accs2orgs.keys() and genus_species in lineages.keys():
            if len(org_words) > 2:
                l_fh.write(
                    ",".join(
                        [
                            record.id,
                            lineages[genus_species].rstrip(","),
                            accs2orgs[record.id],
                        ]
                    )
                    + "\n"
                )
            else:
                l_fh.write(",".join([record.id, lineages[genus_species]]) + "\n")
            passed_lookup += 1
        else:
            if len(org_words) > 2:
                l_fh.write(
                    ",".join(
                        [
                            record.id,
                            "",
                            "",
                            "",
                            "",
                            "",
                            org_words[0],
                            org_words[0] + " " + org_words[1],
                            accs2orgs[record.id],
                        ]
                    )
                    + "\n"
                )
            else:
                l_fh.write(
                    ",".join(
                        [
                            record.id,
                            "",
                            "",
                            "",
                            "",
                            "",
                            org_words[0],
                            accs2orgs[record.id],
                            "",
                        ]
                    )
                    + "\n"
                )
            ln_fh.write(",".join([record.id, accs2orgs[record.id]]) + "\n")
            failed_lookup += 1

    logging.info(
        f"No. of raw records present in `ncbitax2lin` [{os.path.basename(csv)}]: {raw_recs}"
        + f"\nNo. of valid records collected from `ncbitax2lin` [{os.path.basename(csv)}]: {len(lineages.keys())}"
        + f"\nNo. of sequences skipped (sequence length < {min_len}): {skipped_len_short}"
        + f"\nNo. of records in FASTA [{os.path.basename(fna)}]: {passed_lookup + failed_lookup}"
        + f"\nNo. of records in GenBank [{os.path.basename(gbk)}]: {len(accs2orgs.keys())}"
        + f"\nNo. of FASTA records for which new lineages were created: {passed_lookup}"
        + f"\nNo. of FASTA records for which only genus, species and/or strain information was written: {failed_lookup}"
        + f"\nNo. of FASTA records for which no GenBank records exist: {gbk_recs_missing}"
    )

    if (passed_lookup + failed_lookup) != len(accs2orgs.keys()):
        logging.error(
            f"The number of FASTA records written [{passed_lookup + failed_lookup}]"
            + f"\nis not equal to the number of GenBank records [{len(accs2orgs.keys())}]!"
        )
        exit(1)
    else:
        logging.info("Successfully created lineages and FASTA records! Done!!")

    l_fh.close()
    ln_fh.close()


if __name__ == "__main__":
    main()