comparison 0.5.0/bin/gen_per_species_fa_from_lin.py @ 0:97cd2f532efe

planemo upload
author kkonganti
date Mon, 31 Mar 2025 14:50:40 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:97cd2f532efe
1 #!/usr/bin/env python3
2
3 import argparse
4 import gzip
5 import inspect
6 import logging
7 import os
8 import pprint
9 import re
10 import shutil
11 from collections import defaultdict
12 from typing import BinaryIO, TextIO, Union
13
14 from Bio import SeqIO
15 from Bio.Seq import Seq
16 from Bio.SeqRecord import SeqRecord
17
18
19 # Multiple inheritence for pretty printing of help text.
20 class MultiArgFormatClasses(
21 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
22 ):
23 pass
24
25
26 def get_lineages(csv: os.PathLike) -> defaultdict:
27 """
28 Parse the lineages.csv file and store a list of
29 accessions.
30 """
31 lineages = dict()
32 if csv == None or not (os.path.exists(csv) or os.path.getsize(csv) > 0):
33 logging.error(
34 f"The CSV file [{os.path.basename(csv)}] is empty or does not exist!"
35 )
36 exit(1)
37
38 logging.info(f"Indexing {os.path.basename(csv)}...")
39
40 with open(csv, "r") as csv_fh:
41 _ = csv_fh.readline().strip().split(",")
42 for line in csv_fh:
43 cols = line.strip().split(",")
44
45 if len(cols) < 9:
46 logging.error(
47 f"The CSV file {os.path.basename(csv)} should have a mandatory 9 columns."
48 + "\n\nEx: identifiers,superkingdom,phylum,class,order,family,genus,species,strain"
49 + "\nAB211151.1,Eukaryota,Arthropoda,Malacostraca,Decapoda,Majidae,Chionoecetes,Chionoecetes opilio,"
50 + f"\n\nGot:\n{line}"
51 )
52 exit(1)
53
54 lineages[cols[0]] = re.sub(r"\W+", "-", "_".join(cols[7].split(" ")))
55
56 csv_fh.close()
57 return lineages
58
59
60 def write_fasta(recs: list, basedir: os.PathLike, name: str, suffix: str) -> None:
61 """
62 Write sequence with no description to a specified file.
63 """
64 SeqIO.write(
65 recs,
66 os.path.join(basedir, name + suffix),
67 "fasta",
68 )
69
70
71 def parse_fasta(fh: Union[TextIO, BinaryIO], sp2accs: dict) -> list:
72 """
73 Parse the sequences and create per species FASTA record.
74 """
75 records = defaultdict()
76 logging.info("")
77
78 for record in SeqIO.parse(fh, "fasta"):
79
80 id = record.id
81 seq = record.seq
82
83 if id in sp2accs.keys():
84 records.setdefault(sp2accs[id], []).append(
85 SeqRecord(Seq(seq), id=id, description=str())
86 )
87 else:
88 print(f"Lineage row does not exist for accession: {id}")
89
90 logging.info(f"Collected FASTA records for {len(records.keys())} species'.")
91 fh.close()
92 return records
93
94
95 # Main
96 def main() -> None:
97 """
98 This script takes:
99 1. The FASTA file and,
100 2. Takes the corresponding lineages.csv file and,
101
102 then generates a folder containing individual FASTA sequence files
103 per species.
104 """
105
106 # Set logging.
107 logging.basicConfig(
108 format="\n"
109 + "=" * 55
110 + "\n%(asctime)s - %(levelname)s\n"
111 + "=" * 55
112 + "\n%(message)s\r\r",
113 level=logging.DEBUG,
114 )
115
116 # Debug print.
117 ppp = pprint.PrettyPrinter(width=55)
118 prog_name = os.path.basename(inspect.stack()[0].filename)
119
120 parser = argparse.ArgumentParser(
121 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
122 )
123
124 required = parser.add_argument_group("required arguments")
125
126 required.add_argument(
127 "-fa",
128 dest="fna",
129 default=False,
130 required=True,
131 help="Absolute UNIX path to the FASTA file that corresponds"
132 + "\nto the lineages.csv file.",
133 )
134 required.add_argument(
135 "-csv",
136 dest="csv",
137 default=False,
138 required=True,
139 help="Absolute UNIX path to lineages.csv which has a guaranteed 9 "
140 + "\ncolumns with the first being an accession.",
141 )
142 parser.add_argument(
143 "-out",
144 dest="out_folder",
145 default=os.path.join(os.getcwd(), "species"),
146 required=False,
147 help="By default, the output is written to this\nfolder.",
148 )
149 parser.add_argument(
150 "-f",
151 dest="force_write_out",
152 default=False,
153 action="store_true",
154 required=False,
155 help="Force overwrite output directory contents.",
156 )
157 parser.add_argument(
158 "-suffix",
159 dest="fna_suffix",
160 default=".fna",
161 required=False,
162 help="Suffix of the individual species FASTA files\nthat will be saved.",
163 )
164
165 # Parse defaults
166 args = parser.parse_args()
167 csv = args.csv
168 fna = args.fna
169 out = args.out_folder
170 overwrite = args.force_write_out
171 fna_suffix = args.fna_suffix
172
173 # Basic checks
174 if not overwrite and os.path.exists(out):
175 logging.warning(
176 f"Output destination [{os.path.basename(out)}] already exists!"
177 + "\nPlease use -f to delete and overwrite."
178 )
179 elif overwrite and os.path.exists(out):
180 logging.info(f"Overwrite requested. Deleting {os.path.basename(out)}...")
181 shutil.rmtree(out)
182
183 # Get taxonomy from ncbitax2lin
184 lineages = get_lineages(csv)
185
186 logging.info(f"Creating new squences per species...")
187
188 if not os.path.exists(out):
189 os.makedirs(out)
190
191 try:
192 gz_fh = gzip.open(fna, "rt")
193 fa_recs = parse_fasta(gz_fh, lineages)
194 except gzip.BadGzipFile:
195 logging.info(
196 f"Input FASTA file {os.path.basename(csv)} is not in\nGZIP format."
197 )
198 txt_fh = open(fna, "r")
199 fa_recs = parse_fasta(txt_fh, lineages)
200 finally:
201 logging.info("Assigned FASTA records per species...")
202
203 logging.info("Writing FASTA records per species...")
204
205 for sp in fa_recs.keys():
206 write_fasta(fa_recs[sp], out, sp, fna_suffix)
207
208
209 if __name__ == "__main__":
210 main()
211
212 # ~/apps/nowayout/bin/gen_per_species_fa_from_bold.py -tsv BOLD_Public.05-Feb-2024.tsv -csv ../tax.csv ─╯
213
214 # =======================================================
215 # 2024-02-08 21:37:28,541 - INFO
216 # =======================================================
217 # Indexing tax.csv...
218
219 # =======================================================
220 # 2024-02-08 21:38:06,567 - INFO
221 # =======================================================
222 # Creating new squences per species...
223
224 # =======================================================
225 # 2024-02-08 21:38:06,572 - INFO
226 # =======================================================
227 # Input TSV file BOLD_Public.05-Feb-2024.tsv is not in
228 # GZIP format.
229
230 # =======================================================
231 # 2024-02-08 22:01:04,554 - INFO
232 # =======================================================
233 # Collected FASTA records for 497421 species'.
234
235 # =======================================================
236 # 2024-02-08 22:24:35,000 - INFO
237 # =======================================================
238 # No. of raw records present in `ncbitax2lin` [tax.csv]: 2550767
239 # No. of valid records collected from `ncbitax2lin` [tax.csv]: 2134980
240 # No. of raw records in TSV [BOLD_Public.05-Feb-2024.tsv]: 9735210
241 # No. of valid records in TSV [BOLD_Public.05-Feb-2024.tsv]: 4988323
242 # No. of FASTA records for which new lineages were created: 4069202
243 # No. of FASTA records for which only genus, species and/or strain information were created: 919121
244
245 # =======================================================
246 # 2024-02-08 22:24:35,001 - INFO
247 # =======================================================
248 # Succesfully created lineages and FASTA records! Done!!