kkonganti@0
|
1 #!/usr/bin/env python3
|
kkonganti@0
|
2
|
kkonganti@0
|
3 import argparse
|
kkonganti@0
|
4 import gzip
|
kkonganti@0
|
5 import inspect
|
kkonganti@0
|
6 import logging
|
kkonganti@0
|
7 import os
|
kkonganti@0
|
8 import pprint
|
kkonganti@0
|
9 import re
|
kkonganti@0
|
10 import shutil
|
kkonganti@0
|
11 from collections import defaultdict
|
kkonganti@0
|
12 from typing import BinaryIO, TextIO, Union
|
kkonganti@0
|
13
|
kkonganti@0
|
14 from Bio import SeqIO
|
kkonganti@0
|
15 from Bio.Seq import Seq
|
kkonganti@0
|
16 from Bio.SeqRecord import SeqRecord
|
kkonganti@0
|
17
|
kkonganti@0
|
18
|
kkonganti@0
|
19 # Multiple inheritence for pretty printing of help text.
|
kkonganti@0
|
20 class MultiArgFormatClasses(
|
kkonganti@0
|
21 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
|
kkonganti@0
|
22 ):
|
kkonganti@0
|
23 pass
|
kkonganti@0
|
24
|
kkonganti@0
|
25
|
kkonganti@0
|
26 def get_lineages(csv: os.PathLike) -> defaultdict:
|
kkonganti@0
|
27 """
|
kkonganti@0
|
28 Parse the lineages.csv file and store a list of
|
kkonganti@0
|
29 accessions.
|
kkonganti@0
|
30 """
|
kkonganti@0
|
31 lineages = dict()
|
kkonganti@0
|
32 if csv == None or not (os.path.exists(csv) or os.path.getsize(csv) > 0):
|
kkonganti@0
|
33 logging.error(
|
kkonganti@0
|
34 f"The CSV file [{os.path.basename(csv)}] is empty or does not exist!"
|
kkonganti@0
|
35 )
|
kkonganti@0
|
36 exit(1)
|
kkonganti@0
|
37
|
kkonganti@0
|
38 logging.info(f"Indexing {os.path.basename(csv)}...")
|
kkonganti@0
|
39
|
kkonganti@0
|
40 with open(csv, "r") as csv_fh:
|
kkonganti@0
|
41 _ = csv_fh.readline().strip().split(",")
|
kkonganti@0
|
42 for line in csv_fh:
|
kkonganti@0
|
43 cols = line.strip().split(",")
|
kkonganti@0
|
44
|
kkonganti@0
|
45 if len(cols) < 9:
|
kkonganti@0
|
46 logging.error(
|
kkonganti@0
|
47 f"The CSV file {os.path.basename(csv)} should have a mandatory 9 columns."
|
kkonganti@0
|
48 + "\n\nEx: identifiers,superkingdom,phylum,class,order,family,genus,species,strain"
|
kkonganti@0
|
49 + "\nAB211151.1,Eukaryota,Arthropoda,Malacostraca,Decapoda,Majidae,Chionoecetes,Chionoecetes opilio,"
|
kkonganti@0
|
50 + f"\n\nGot:\n{line}"
|
kkonganti@0
|
51 )
|
kkonganti@0
|
52 exit(1)
|
kkonganti@0
|
53
|
kkonganti@0
|
54 lineages[cols[0]] = re.sub(r"\W+", "-", "_".join(cols[7].split(" ")))
|
kkonganti@0
|
55
|
kkonganti@0
|
56 csv_fh.close()
|
kkonganti@0
|
57 return lineages
|
kkonganti@0
|
58
|
kkonganti@0
|
59
|
kkonganti@0
|
60 def write_fasta(recs: list, basedir: os.PathLike, name: str, suffix: str) -> None:
|
kkonganti@0
|
61 """
|
kkonganti@0
|
62 Write sequence with no description to a specified file.
|
kkonganti@0
|
63 """
|
kkonganti@0
|
64 SeqIO.write(
|
kkonganti@0
|
65 recs,
|
kkonganti@0
|
66 os.path.join(basedir, name + suffix),
|
kkonganti@0
|
67 "fasta",
|
kkonganti@0
|
68 )
|
kkonganti@0
|
69
|
kkonganti@0
|
70
|
kkonganti@0
|
71 def parse_fasta(fh: Union[TextIO, BinaryIO], sp2accs: dict) -> list:
|
kkonganti@0
|
72 """
|
kkonganti@0
|
73 Parse the sequences and create per species FASTA record.
|
kkonganti@0
|
74 """
|
kkonganti@0
|
75 records = defaultdict()
|
kkonganti@0
|
76 logging.info("")
|
kkonganti@0
|
77
|
kkonganti@0
|
78 for record in SeqIO.parse(fh, "fasta"):
|
kkonganti@0
|
79
|
kkonganti@0
|
80 id = record.id
|
kkonganti@0
|
81 seq = record.seq
|
kkonganti@0
|
82
|
kkonganti@0
|
83 if id in sp2accs.keys():
|
kkonganti@0
|
84 records.setdefault(sp2accs[id], []).append(
|
kkonganti@0
|
85 SeqRecord(Seq(seq), id=id, description=str())
|
kkonganti@0
|
86 )
|
kkonganti@0
|
87 else:
|
kkonganti@0
|
88 print(f"Lineage row does not exist for accession: {id}")
|
kkonganti@0
|
89
|
kkonganti@0
|
90 logging.info(f"Collected FASTA records for {len(records.keys())} species'.")
|
kkonganti@0
|
91 fh.close()
|
kkonganti@0
|
92 return records
|
kkonganti@0
|
93
|
kkonganti@0
|
94
|
kkonganti@0
|
95 # Main
|
kkonganti@0
|
96 def main() -> None:
|
kkonganti@0
|
97 """
|
kkonganti@0
|
98 This script takes:
|
kkonganti@0
|
99 1. The FASTA file and,
|
kkonganti@0
|
100 2. Takes the corresponding lineages.csv file and,
|
kkonganti@0
|
101
|
kkonganti@0
|
102 then generates a folder containing individual FASTA sequence files
|
kkonganti@0
|
103 per species.
|
kkonganti@0
|
104 """
|
kkonganti@0
|
105
|
kkonganti@0
|
106 # Set logging.
|
kkonganti@0
|
107 logging.basicConfig(
|
kkonganti@0
|
108 format="\n"
|
kkonganti@0
|
109 + "=" * 55
|
kkonganti@0
|
110 + "\n%(asctime)s - %(levelname)s\n"
|
kkonganti@0
|
111 + "=" * 55
|
kkonganti@0
|
112 + "\n%(message)s\r\r",
|
kkonganti@0
|
113 level=logging.DEBUG,
|
kkonganti@0
|
114 )
|
kkonganti@0
|
115
|
kkonganti@0
|
116 # Debug print.
|
kkonganti@0
|
117 ppp = pprint.PrettyPrinter(width=55)
|
kkonganti@0
|
118 prog_name = os.path.basename(inspect.stack()[0].filename)
|
kkonganti@0
|
119
|
kkonganti@0
|
120 parser = argparse.ArgumentParser(
|
kkonganti@0
|
121 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
|
kkonganti@0
|
122 )
|
kkonganti@0
|
123
|
kkonganti@0
|
124 required = parser.add_argument_group("required arguments")
|
kkonganti@0
|
125
|
kkonganti@0
|
126 required.add_argument(
|
kkonganti@0
|
127 "-fa",
|
kkonganti@0
|
128 dest="fna",
|
kkonganti@0
|
129 default=False,
|
kkonganti@0
|
130 required=True,
|
kkonganti@0
|
131 help="Absolute UNIX path to the FASTA file that corresponds"
|
kkonganti@0
|
132 + "\nto the lineages.csv file.",
|
kkonganti@0
|
133 )
|
kkonganti@0
|
134 required.add_argument(
|
kkonganti@0
|
135 "-csv",
|
kkonganti@0
|
136 dest="csv",
|
kkonganti@0
|
137 default=False,
|
kkonganti@0
|
138 required=True,
|
kkonganti@0
|
139 help="Absolute UNIX path to lineages.csv which has a guaranteed 9 "
|
kkonganti@0
|
140 + "\ncolumns with the first being an accession.",
|
kkonganti@0
|
141 )
|
kkonganti@0
|
142 parser.add_argument(
|
kkonganti@0
|
143 "-out",
|
kkonganti@0
|
144 dest="out_folder",
|
kkonganti@0
|
145 default=os.path.join(os.getcwd(), "species"),
|
kkonganti@0
|
146 required=False,
|
kkonganti@0
|
147 help="By default, the output is written to this\nfolder.",
|
kkonganti@0
|
148 )
|
kkonganti@0
|
149 parser.add_argument(
|
kkonganti@0
|
150 "-f",
|
kkonganti@0
|
151 dest="force_write_out",
|
kkonganti@0
|
152 default=False,
|
kkonganti@0
|
153 action="store_true",
|
kkonganti@0
|
154 required=False,
|
kkonganti@0
|
155 help="Force overwrite output directory contents.",
|
kkonganti@0
|
156 )
|
kkonganti@0
|
157 parser.add_argument(
|
kkonganti@0
|
158 "-suffix",
|
kkonganti@0
|
159 dest="fna_suffix",
|
kkonganti@0
|
160 default=".fna",
|
kkonganti@0
|
161 required=False,
|
kkonganti@0
|
162 help="Suffix of the individual species FASTA files\nthat will be saved.",
|
kkonganti@0
|
163 )
|
kkonganti@0
|
164
|
kkonganti@0
|
165 # Parse defaults
|
kkonganti@0
|
166 args = parser.parse_args()
|
kkonganti@0
|
167 csv = args.csv
|
kkonganti@0
|
168 fna = args.fna
|
kkonganti@0
|
169 out = args.out_folder
|
kkonganti@0
|
170 overwrite = args.force_write_out
|
kkonganti@0
|
171 fna_suffix = args.fna_suffix
|
kkonganti@0
|
172
|
kkonganti@0
|
173 # Basic checks
|
kkonganti@0
|
174 if not overwrite and os.path.exists(out):
|
kkonganti@0
|
175 logging.warning(
|
kkonganti@0
|
176 f"Output destination [{os.path.basename(out)}] already exists!"
|
kkonganti@0
|
177 + "\nPlease use -f to delete and overwrite."
|
kkonganti@0
|
178 )
|
kkonganti@0
|
179 elif overwrite and os.path.exists(out):
|
kkonganti@0
|
180 logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
|
kkonganti@0
|
181 shutil.rmtree(out)
|
kkonganti@0
|
182
|
kkonganti@0
|
183 # Get taxonomy from ncbitax2lin
|
kkonganti@0
|
184 lineages = get_lineages(csv)
|
kkonganti@0
|
185
|
kkonganti@0
|
186 logging.info(f"Creating new squences per species...")
|
kkonganti@0
|
187
|
kkonganti@0
|
188 if not os.path.exists(out):
|
kkonganti@0
|
189 os.makedirs(out)
|
kkonganti@0
|
190
|
kkonganti@0
|
191 try:
|
kkonganti@0
|
192 gz_fh = gzip.open(fna, "rt")
|
kkonganti@0
|
193 fa_recs = parse_fasta(gz_fh, lineages)
|
kkonganti@0
|
194 except gzip.BadGzipFile:
|
kkonganti@0
|
195 logging.info(
|
kkonganti@0
|
196 f"Input FASTA file {os.path.basename(csv)} is not in\nGZIP format."
|
kkonganti@0
|
197 )
|
kkonganti@0
|
198 txt_fh = open(fna, "r")
|
kkonganti@0
|
199 fa_recs = parse_fasta(txt_fh, lineages)
|
kkonganti@0
|
200 finally:
|
kkonganti@0
|
201 logging.info("Assigned FASTA records per species...")
|
kkonganti@0
|
202
|
kkonganti@0
|
203 logging.info("Writing FASTA records per species...")
|
kkonganti@0
|
204
|
kkonganti@0
|
205 for sp in fa_recs.keys():
|
kkonganti@0
|
206 write_fasta(fa_recs[sp], out, sp, fna_suffix)
|
kkonganti@0
|
207
|
kkonganti@0
|
208
|
kkonganti@0
|
209 if __name__ == "__main__":
|
kkonganti@0
|
210 main()
|
kkonganti@0
|
211
|
kkonganti@0
|
212 # ~/apps/nowayout/bin/gen_per_species_fa_from_bold.py -tsv BOLD_Public.05-Feb-2024.tsv -csv ../tax.csv ─╯
|
kkonganti@0
|
213
|
kkonganti@0
|
214 # =======================================================
|
kkonganti@0
|
215 # 2024-02-08 21:37:28,541 - INFO
|
kkonganti@0
|
216 # =======================================================
|
kkonganti@0
|
217 # Indexing tax.csv...
|
kkonganti@0
|
218
|
kkonganti@0
|
219 # =======================================================
|
kkonganti@0
|
220 # 2024-02-08 21:38:06,567 - INFO
|
kkonganti@0
|
221 # =======================================================
|
kkonganti@0
|
222 # Creating new squences per species...
|
kkonganti@0
|
223
|
kkonganti@0
|
224 # =======================================================
|
kkonganti@0
|
225 # 2024-02-08 21:38:06,572 - INFO
|
kkonganti@0
|
226 # =======================================================
|
kkonganti@0
|
227 # Input TSV file BOLD_Public.05-Feb-2024.tsv is not in
|
kkonganti@0
|
228 # GZIP format.
|
kkonganti@0
|
229
|
kkonganti@0
|
230 # =======================================================
|
kkonganti@0
|
231 # 2024-02-08 22:01:04,554 - INFO
|
kkonganti@0
|
232 # =======================================================
|
kkonganti@0
|
233 # Collected FASTA records for 497421 species'.
|
kkonganti@0
|
234
|
kkonganti@0
|
235 # =======================================================
|
kkonganti@0
|
236 # 2024-02-08 22:24:35,000 - INFO
|
kkonganti@0
|
237 # =======================================================
|
kkonganti@0
|
238 # No. of raw records present in `ncbitax2lin` [tax.csv]: 2550767
|
kkonganti@0
|
239 # No. of valid records collected from `ncbitax2lin` [tax.csv]: 2134980
|
kkonganti@0
|
240 # No. of raw records in TSV [BOLD_Public.05-Feb-2024.tsv]: 9735210
|
kkonganti@0
|
241 # No. of valid records in TSV [BOLD_Public.05-Feb-2024.tsv]: 4988323
|
kkonganti@0
|
242 # No. of FASTA records for which new lineages were created: 4069202
|
kkonganti@0
|
243 # No. of FASTA records for which only genus, species and/or strain information were created: 919121
|
kkonganti@0
|
244
|
kkonganti@0
|
245 # =======================================================
|
kkonganti@0
|
246 # 2024-02-08 22:24:35,001 - INFO
|
kkonganti@0
|
247 # =======================================================
|
kkonganti@0
|
248 # Succesfully created lineages and FASTA records! Done!!
|