annotate 0.5.0/bin/create_fasta_and_lineages.py @ 0:3c767f9cfd88 draft default tip

planemo upload
author galaxytrakr
date Fri, 29 May 2026 13:37:56 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env python3
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
2
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
3 import argparse
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
4 import gzip
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
5 import inspect
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
6 import logging
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
7 import os
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
8 import pprint
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
9 import re
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
10 import shutil
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
11 import ssl
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
12 import tempfile
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
13 from html.parser import HTMLParser
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
14 from urllib.request import urlopen
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
15
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
16 from Bio import SeqIO
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
17 from Bio.Seq import Seq
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
18 from Bio.SeqRecord import SeqRecord
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
19
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
20
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
21 # Multiple inheritence for pretty printing of help text.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
22 class MultiArgFormatClasses(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
23 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
24 ):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
25 pass
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
26
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
27
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
28 # HTMLParser override class to get fna.gz and gbff.gz
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
29 class NCBIHTMLParser(HTMLParser):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
30 def __init__(self, *, convert_charrefs: bool = ...) -> None:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
31 super().__init__(convert_charrefs=convert_charrefs)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
32 self.reset()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
33 self.href_data = list()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
34
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
35 def handle_data(self, data):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
36 self.href_data.append(data)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
37
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
38
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
39 # Download organelle FASTA and GenBank file.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
40 def dl_mito_seqs_and_flat_files(url: str, suffix: re, out: os.PathLike) -> os.PathLike:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
41 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
42 Method to save .fna.gz and .gbff.gz files for the
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
43 RefSeq mitochondrion release.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
44 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
45 contxt = ssl.create_default_context()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
46 contxt.check_hostname = False
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
47 contxt.verify_mode = ssl.CERT_NONE
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
48
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
49 if url == None:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
50 logging.error(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
51 "Please provide the base URL where .fna.gz and .gbff.gz"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
52 + "\nfiles for RefSeq mitochondrion can be found."
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
53 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
54 exit(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
55
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
56 if os.path.exists(out):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
57 for file in os.listdir(out):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
58 file_path = os.path.join(out, file)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
59
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
60 if suffix.match(file_path) and os.path.getsize(file_path) > 0:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
61 logging.info(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
62 f"The required mitochondrion file(s)\n[{os.path.basename(file_path)}]"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
63 + " already exists.\nSkipping download from NCBI..."
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
64 + "\nPlease use -f to delete and overwrite."
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
65 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
66 return file_path
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
67 else:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
68 os.makedirs(out)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
69
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
70 html_parser = NCBIHTMLParser()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
71 logging.info(f"Finding latest NCBI RefSeq mitochondrion release at:\n{url}")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
72
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
73 with urlopen(url, context=contxt) as response:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
74 with tempfile.NamedTemporaryFile(delete=False) as tmp_html_file:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
75 shutil.copyfileobj(response, tmp_html_file)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
76
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
77 with open(tmp_html_file.name, "r") as html:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
78 html_parser.feed("".join(html.readlines()))
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
79
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
80 file = suffix.search("".join(html_parser.href_data)).group(0)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
81 file_url = "/".join([url, file + ".gz"])
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
82 file_at = os.path.join(out, file)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
83
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
84 logging.info(f"Found NCBI RefSeq mitochondrian file(s):\n{file_url}")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
85
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
86 logging.info(f"Saving to:\n{file_at}")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
87
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
88 with tempfile.NamedTemporaryFile(delete=False) as tmp_gz:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
89 with urlopen(file_url, context=contxt) as response:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
90 tmp_gz.write(response.read())
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
91
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
92 with open(file_at, "w") as fh:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
93 with gzip.open(tmp_gz.name, "rb") as web_gz:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
94 fh.write(web_gz.read().decode("utf-8"))
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
95
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
96 html.close()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
97 tmp_gz.close()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
98 tmp_html_file.close()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
99 os.unlink(tmp_gz.name)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
100 os.unlink(tmp_html_file.name)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
101 fh.close()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
102 web_gz.close()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
103 response.close()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
104
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
105 return file_at
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
106
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
107
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
108 def get_lineages(csv: os.PathLike, cols: list) -> list:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
109 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
110 Parse the output from `ncbitax2lin` tool and
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
111 return a dict of lineages where the key is
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
112 genusspeciesstrain.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
113 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
114 lineages = dict()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
115 if csv == None or not (os.path.exists(csv) or os.path.getsize(csv) > 0):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
116 logging.error(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
117 f"The CSV file [{os.path.basename(csv)}] is empty or does not exist!"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
118 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
119 exit(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
120
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
121 logging.info(f"Indexing {os.path.basename(csv)}...")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
122
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
123 with open(csv, "r") as csv_fh:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
124 header_cols = csv_fh.readline().strip().split(",")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
125 user_req_cols = [
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
126 tcol_i for tcol_i, tcol in enumerate(header_cols) if tcol in cols
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
127 ]
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
128 cols_not_found = [tcol for tcol in cols if tcol not in header_cols]
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
129 raw_recs = 0
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
130
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
131 if len(cols_not_found) > 0:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
132 logging.error(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
133 f"The following columns do not exist in the"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
134 + f"\nCSV file [ {os.path.basename(csv)} ]:\n"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
135 + "".join(cols_not_found)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
136 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
137 exit(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
138 elif len(user_req_cols) > 9:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
139 logging.error(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
140 f"Only a total of 9 columns are needed!"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
141 + "\ntax_id,kindom,phylum,class,order,family,genus,species,strain"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
142 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
143 exit(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
144
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
145 for tax in csv_fh:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
146 raw_recs += 1
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
147 lcols = tax.strip().split(",")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
148
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
149 if bool(lcols[user_req_cols[8]]):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
150 lineages[lcols[user_req_cols[8]]] = ",".join(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
151 [re.sub(r"[\,\"]", "", lcols[l]) for l in user_req_cols[1:]]
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
152 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
153 elif bool(lcols[user_req_cols[7]]):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
154 lineages[lcols[user_req_cols[7]]] = ",".join(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
155 [re.sub(r"[\,\"]", "", lcols[l]) for l in user_req_cols[1:8]]
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
156 + [str()]
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
157 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
158
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
159 if lcols[7] == "Rondeletia bicolor Goode & Bean":
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
160 print(lineages[lcols[8]])
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
161 exit(0)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
162
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
163 csv_fh.close()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
164 return lineages, raw_recs
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
165
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
166
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
167 def from_genbank(gbk: os.PathLike, min_len: int) -> dict:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
168 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
169 Method to parse GenBank file and return
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
170 organism to latest accession mapping.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
171 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
172 accs2orgs = dict()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
173 sanitize_pat = re.compile(r"[\,\"]")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
174
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
175 if not (os.path.exists(gbk) or os.path.getsize(gbk) > 0):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
176 logging.info(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
177 f"The GenBank file [{os.path.basename(gbk)}] does not exist"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
178 + "\nor is of size 0."
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
179 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
180 exit(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
181
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
182 logging.info(f"Indexing {os.path.basename(gbk)}...")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
183
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
184 # a = open("./_accs", "w")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
185 try:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
186 for record in SeqIO.parse(gbk, "genbank"):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
187 if len(record.seq) < min_len:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
188 continue
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
189 else:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
190 # a.write(f"{record.id}\n")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
191 accs2orgs[record.id] = sanitize_pat.sub(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
192 "", record.annotations["organism"]
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
193 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
194 except Exception as e:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
195 logging.error(f"Error occured around GenBank ID: {record.id}")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
196 logging.error(f"Error: {e}")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
197 exit(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
198
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
199 return accs2orgs
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
200
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
201
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
202 def from_genbank_alt(gbk: os.PathLike) -> dict:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
203 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
204 Method to parse GenBank file and return
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
205 organism to latest accession mapping without
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
206 using BioPython's GenBank Scanner
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
207 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
208 accs2orgs = dict()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
209 accs = dict()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
210 orgs = dict()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
211 acc = False
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
212 acc_pat = re.compile(r"^VERSION\s+(.+)")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
213 org_pat = re.compile(r"^\s+ORGANISM\s+(.+)")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
214 sanitize_pat = re.compile(r"[\,\"]")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
215
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
216 if not (os.path.exists(gbk) or os.path.getsize(gbk) > 0):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
217 logging.info(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
218 f"The GenBank file [{os.path.basename(gbk)}] does not exist"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
219 + "\nor is of size 0."
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
220 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
221 exit(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
222
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
223 logging.info(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
224 f"Indexing {os.path.basename(gbk)} without using\nBioPython's GenBank Scanner..."
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
225 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
226
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
227 with open(gbk, "r") as gbk_fh:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
228 for line in gbk_fh:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
229 line = line.rstrip()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
230 if line.startswith("VERSION") and acc_pat.match(line):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
231 acc = acc_pat.match(line).group(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
232 accs[acc] = 1
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
233 if org_pat.match(line):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
234 if acc and acc not in orgs.keys():
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
235 orgs[acc] = sanitize_pat.sub("", org_pat.match(line).group(1))
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
236 elif acc and acc in orgs.keys():
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
237 logging.error(f"Duplicate VERSION line: {acc}")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
238 exit(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
239 if len(accs.keys()) != len(orgs.keys()):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
240 logging.error(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
241 f"Got unequal number of organisms ({len(orgs.keys())})\n"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
242 + f"and accessions ({len(accs.keys())})"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
243 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
244 exit(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
245 else:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
246 for acc in accs.keys():
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
247 if acc not in orgs.keys():
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
248 logging.error(f"ORAGANISM not found for accession: {acc}")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
249 exit(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
250 accs2orgs[acc] = orgs[acc]
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
251
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
252 gbk_fh.close()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
253 return accs2orgs
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
254
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
255
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
256 def write_fasta(seq: str, id: str, basedir: os.PathLike, suffix: str) -> None:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
257 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
258 Write sequence with no description to specified file.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
259 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
260 SeqIO.write(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
261 SeqRecord(Seq(seq), id=id, description=str()),
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
262 os.path.join(basedir, id + suffix),
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
263 "fasta",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
264 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
265
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
266
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
267 # Main
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
268 def main() -> None:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
269 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
270 This script takes:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
271 1. Downloads the RefSeq Mitochrondrial GenBank and FASTA format files.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
272 2. Takes as input and output .csv.gz or .csv file generated by `ncbitax2lin`.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
273
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
274 and then generates a folder containing individual FASTA sequence files
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
275 per organelle, and a corresponding lineage file in CSV format.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
276 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
277
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
278 # Set logging.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
279 logging.basicConfig(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
280 format="\n"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
281 + "=" * 55
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
282 + "\n%(asctime)s - %(levelname)s\n"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
283 + "=" * 55
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
284 + "\n%(message)s\n\n",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
285 level=logging.DEBUG,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
286 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
287
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
288 # Debug print.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
289 ppp = pprint.PrettyPrinter(width=55)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
290 prog_name = os.path.basename(inspect.stack()[0].filename)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
291
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
292 parser = argparse.ArgumentParser(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
293 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
294 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
295
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
296 required = parser.add_argument_group("required arguments")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
297
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
298 required.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
299 "-csv",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
300 dest="csv",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
301 default=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
302 required=True,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
303 help="Absolute UNIX path to .csv or .csv.gz file which is generated "
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
304 + "\nby the `ncbitax2lin` tool.",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
305 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
306 parser.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
307 "-cols",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
308 dest="lineage_cols",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
309 default="tax_id,domain,phylum,class,order,family,genus,species,strain",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
310 required=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
311 help="Taxonomic lineage will be built using these columns from the output of"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
312 + "\n`ncbitax2lin` tool.",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
313 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
314 parser.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
315 "-url",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
316 dest="url",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
317 default="https://ftp.ncbi.nlm.nih.gov/refseq/release/mitochondrion",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
318 required=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
319 help="Base URL from where NCBI RefSeq mitochondrion files will be downloaded\nfrom.",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
320 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
321 parser.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
322 "-out",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
323 dest="out_folder",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
324 default=os.path.join(os.getcwd(), "organelles"),
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
325 required=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
326 help="By default, the output is written to this folder.",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
327 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
328 parser.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
329 "-f",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
330 dest="force_write_out",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
331 default=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
332 action="store_true",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
333 required=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
334 help="Force overwrite output directory contents.",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
335 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
336 parser.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
337 "--fna-suffix",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
338 dest="fna_suffix",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
339 default=".fna",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
340 required=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
341 help="Suffix of the individual organelle FASTA files that will be saved.",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
342 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
343 parser.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
344 "-ml",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
345 dest="fa_min_len",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
346 default=200,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
347 required=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
348 help="Minimum length of the FASTA sequence for it to be considered for"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
349 + "\nfurther processing",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
350 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
351 parser.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
352 "--gen-per-fa",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
353 dest="gen_per_fa",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
354 default=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
355 required=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
356 action="store_true",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
357 help="Generate per sequence FASTA file.",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
358 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
359 parser.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
360 "--alt-gb-parser",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
361 dest="alt_gb_parser",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
362 default=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
363 required=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
364 action="store_true",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
365 help="Use alternate GenBank parser instead of BioPython's.",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
366 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
367
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
368 # Parse defaults
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
369 args = parser.parse_args()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
370 csv = args.csv
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
371 out = args.out_folder
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
372 overwrite = args.force_write_out
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
373 fna_suffix = args.fna_suffix
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
374 url = args.url
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
375 tax_cols = args.lineage_cols
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
376 gen_per_fa = args.gen_per_fa
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
377 alt_gb_parser = args.alt_gb_parser
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
378 min_len = int(args.fa_min_len)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
379 tcols_pat = re.compile(r"^[\w\,]+?\w$")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
380 mito_fna_suffix = re.compile(r".*?\.genomic\.fna")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
381 mito_gbff_suffix = re.compile(r".*?\.genomic\.gbff")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
382 final_lineages = os.path.join(out, "lineages.csv")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
383 lineages_not_found = os.path.join(out, "lineages_not_found.csv")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
384 base_fasta_dir = os.path.join(out, "fasta")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
385
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
386 # Basic checks
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
387 if not overwrite and os.path.exists(out):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
388 logging.warning(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
389 f"Output destination [{os.path.basename(out)}] already exists!"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
390 + "\nPlease use -f to delete and overwrite."
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
391 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
392 elif overwrite and os.path.exists(out):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
393 logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
394 shutil.rmtree(out)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
395
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
396 if not tcols_pat.match(tax_cols):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
397 logging.error(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
398 f"Supplied columns' names {tax_cols} should only have words (alphanumeric) separated by a comma."
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
399 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
400 exit(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
401 else:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
402 tax_cols = re.sub("\n", "", tax_cols).split(",")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
403
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
404 # Get .fna and .gbk files
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
405 fna = dl_mito_seqs_and_flat_files(url, mito_fna_suffix, out)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
406 gbk = dl_mito_seqs_and_flat_files(url, mito_gbff_suffix, out)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
407
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
408 # Get taxonomy from ncbitax2lin
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
409 lineages, raw_recs = get_lineages(csv, tax_cols)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
410
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
411 # Get parsed organisms and latest accession from GenBank file.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
412 if alt_gb_parser:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
413 accs2orgs = from_genbank_alt(gbk)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
414 else:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
415 accs2orgs = from_genbank(gbk, min_len)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
416
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
417 # # Finally, read FASTA and create individual FASTA if lineage exists.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
418 logging.info(f"Creating new sequences and lineages...")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
419
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
420 l_fh = open(final_lineages, "w")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
421 ln_fh = open(lineages_not_found, "w")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
422 l_fh.write(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
423 "identifiers,superkingdom,phylum,class,order,family,genus,species,strain\n"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
424 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
425 ln_fh.write("fna_id,gbk_org\n")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
426 passed_lookup = 0
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
427 failed_lookup = 0
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
428 gbk_recs_missing = 0
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
429 skipped_len_short = 0
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
430
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
431 if gen_per_fa and not os.path.exists(base_fasta_dir):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
432 os.makedirs(base_fasta_dir)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
433
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
434 for record in SeqIO.parse(fna, "fasta"):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
435 if len(record.seq) < min_len:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
436 skipped_len_short += 1
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
437 continue
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
438 elif record.id in accs2orgs.keys():
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
439 org_words = accs2orgs[record.id].split(" ")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
440 else:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
441 gbk_recs_missing += 1
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
442 continue
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
443
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
444 genus_species = (
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
445 " ".join(org_words[0:2]) if len(org_words) > 2 else " ".join(org_words[0:])
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
446 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
447
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
448 if gen_per_fa:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
449 write_fasta(record.seq, record.id, base_fasta_dir, fna_suffix)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
450
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
451 if record.id in accs2orgs.keys() and accs2orgs[record.id] in lineages.keys():
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
452 l_fh.write(",".join([record.id, lineages[accs2orgs[record.id]]]) + "\n")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
453 passed_lookup += 1
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
454 elif record.id in accs2orgs.keys() and genus_species in lineages.keys():
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
455 if len(org_words) > 2:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
456 l_fh.write(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
457 ",".join(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
458 [
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
459 record.id,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
460 lineages[genus_species].rstrip(","),
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
461 accs2orgs[record.id],
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
462 ]
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
463 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
464 + "\n"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
465 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
466 else:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
467 l_fh.write(",".join([record.id, lineages[genus_species]]) + "\n")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
468 passed_lookup += 1
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
469 else:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
470 if len(org_words) > 2:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
471 l_fh.write(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
472 ",".join(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
473 [
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
474 record.id,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
475 "",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
476 "",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
477 "",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
478 "",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
479 "",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
480 org_words[0],
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
481 org_words[0] + " " + org_words[1],
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
482 accs2orgs[record.id],
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
483 ]
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
484 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
485 + "\n"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
486 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
487 else:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
488 l_fh.write(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
489 ",".join(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
490 [
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
491 record.id,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
492 "",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
493 "",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
494 "",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
495 "",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
496 "",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
497 org_words[0],
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
498 accs2orgs[record.id],
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
499 "",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
500 ]
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
501 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
502 + "\n"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
503 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
504 ln_fh.write(",".join([record.id, accs2orgs[record.id]]) + "\n")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
505 failed_lookup += 1
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
506
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
507 logging.info(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
508 f"No. of raw records present in `ncbitax2lin` [{os.path.basename(csv)}]: {raw_recs}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
509 + f"\nNo. of valid records collected from `ncbitax2lin` [{os.path.basename(csv)}]: {len(lineages.keys())}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
510 + f"\nNo. of sequences skipped (Sequence length < {min_len}): {skipped_len_short}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
511 + f"\nNo. of records in FASTA [{os.path.basename(fna)}]: {passed_lookup + failed_lookup}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
512 + f"\nNo. of records in GenBank [{os.path.basename(gbk)}]: {len(accs2orgs.keys())}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
513 + f"\nNo. of FASTA records for which new lineages were created: {passed_lookup}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
514 + f"\nNo. of FASTA records for which only genus, species and/or strain information were created: {failed_lookup}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
515 + f"\nNo. of FASTA records for which no GenBank records exist: {gbk_recs_missing}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
516 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
517
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
518 if (passed_lookup + failed_lookup) != len(accs2orgs.keys()):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
519 logging.error(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
520 f"The number of FASTA records written [{len(accs2orgs.keys())}]"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
521 + f"\nis not equal to number of lineages created [{passed_lookup + failed_lookup}]!"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
522 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
523 exit(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
524 else:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
525 logging.info("Succesfully created lineages and FASTA records! Done!!")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
526
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
527 l_fh.close()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
528 ln_fh.close()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
529
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
530
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
531 if __name__ == "__main__":
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
532 main()