annotate 0.5.0/bin/remove_dup_fasta_ids.py @ 0:3c767f9cfd88 draft default tip

planemo upload
author galaxytrakr
date Fri, 29 May 2026 13:37:56 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env python3
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
2
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
3 import argparse
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
4 import gzip
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
5 import inspect
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
6 import logging
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
7 import os
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
8 import pprint
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
9 import shutil
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
10 from typing import BinaryIO, TextIO, Union
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
11
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
12 from Bio import SeqIO
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
13 from Bio.Seq import Seq
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
14 from Bio.SeqRecord import SeqRecord
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
15 from genericpath import isdir
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
16
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
17
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
18 # Multiple inheritence for pretty printing of help text.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
19 class MultiArgFormatClasses(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
20 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
21 ):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
22 pass
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
23
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
24
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
25 def write_fasta(seq: str, id: str, fh: Union[TextIO, BinaryIO]) -> None:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
26 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
27 Write sequence with no description to specified file.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
28 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
29 SeqIO.write(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
30 SeqRecord(Seq(seq), id=id, description=str()),
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
31 fh,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
32 "fasta",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
33 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
34
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
35
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
36 # Main
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
37 def main() -> None:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
38 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
39 This script takes:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
40 1. A FASTA file in gzip or non-gzip (ASCII TXT) format and
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
41
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
42 and then generates a new FASTA file with duplicate FASTA IDs replaced
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
43 with a unique ID.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
44 """
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
45
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
46 # Set logging.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
47 logging.basicConfig(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
48 format="\n"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
49 + "=" * 55
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
50 + "\n%(asctime)s - %(levelname)s\n"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
51 + "=" * 55
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
52 + "\n%(message)s\n\n",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
53 level=logging.DEBUG,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
54 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
55
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
56 # Debug print.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
57 ppp = pprint.PrettyPrinter(width=55)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
58 prog_name = os.path.basename(inspect.stack()[0].filename)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
59
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
60 parser = argparse.ArgumentParser(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
61 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
62 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
63
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
64 required = parser.add_argument_group("required arguments")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
65
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
66 required.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
67 "-fna",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
68 dest="fna",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
69 default=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
70 required=True,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
71 help="Absolute UNIX path to .fna or .fna.gz file.",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
72 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
73 parser.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
74 "-lin",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
75 dest="lineages",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
76 default=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
77 required=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
78 help="Absolute UNIX path to lineages.csv file for which the"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
79 + "\nthe duplicate IDs will be made unique corresponding to"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
80 + "\nthe FASTA IDs",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
81 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
82 parser.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
83 "-outdir",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
84 dest="out_folder",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
85 default=os.getcwd(),
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
86 required=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
87 help="By default, the output is written to this\nfolder.",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
88 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
89 parser.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
90 "-f",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
91 dest="force_write_out",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
92 default=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
93 action="store_true",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
94 required=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
95 help="Force overwrite the output file.",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
96 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
97 parser.add_argument(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
98 "--fna-suffix",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
99 dest="fna_suffix",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
100 default=".fna",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
101 required=False,
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
102 help="Suffix of the output FASTA file.",
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
103 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
104
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
105 # Parse defaults
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
106 args = parser.parse_args()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
107 fna = args.fna
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
108 lineages = args.lineages
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
109 outdir = args.out_folder
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
110 overwrite = args.force_write_out
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
111 fna_suffix = args.fna_suffix
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
112 new_fna = os.path.join(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
113 outdir, os.path.basename(fna).split(".")[0] + "_dedup_ids" + fna_suffix
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
114 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
115 lin_header = False
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
116 new_lin = False
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
117 seen_ids = dict()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
118 seen_lineages = dict()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
119
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
120 # Basic checks
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
121 if not overwrite and os.path.exists(new_fna):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
122 logging.warning(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
123 f"Output destination [{os.path.basename(new_fna)}] already exists!"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
124 + "\nPlease use -f to delete and overwrite."
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
125 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
126 elif overwrite and os.path.exists(new_fna):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
127 logging.info(f"Overwrite requested. Deleting {os.path.basename(new_fna)}...")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
128 if os.path.isdir(new_fna):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
129 shutil.rmtree(new_fna)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
130 else:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
131 os.remove(new_fna)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
132
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
133 # Prepare for writing
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
134 new_fna_fh = open(new_fna, "+at")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
135
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
136 # If lineages file is mentioned, index it.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
137 if lineages and os.path.exists(lineages) and os.path.getsize(lineages) > 0:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
138 new_lin = os.path.join(os.getcwd(), os.path.basename(lineages) + "_dedup.csv")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
139 new_lin_fh = open(new_lin, "w")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
140 with open(lineages, "r") as l_fh:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
141 lin_header = l_fh.readline()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
142 for line in l_fh:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
143 cols = line.strip().split(",")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
144 if len(cols) < 9:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
145 logging.error(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
146 f"The row in the lineages file {os.path.basename(lineages)}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
147 + f"\ndoes not have 9 required columns: {len(cols)}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
148 + f"\n\n{lin_header.strip()}\n{line.strip()}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
149 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
150 exit(1)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
151 elif len(cols) > 9:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
152 logging.info(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
153 f"The row in the lineages file {os.path.basename(lineages)}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
154 + f"\nhas more than 9 required columns: {len(cols)}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
155 + f"\nRetaining only 9 columns of the following 10 columns."
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
156 + f"\n\n{lin_header.strip()}\n{line.strip()}"
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
157 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
158
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
159 if cols[0] not in seen_lineages.keys():
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
160 seen_lineages[cols[0]] = ",".join(cols[1:9])
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
161
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
162 new_lin_fh.write(lin_header)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
163 l_fh.close()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
164
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
165 # Read FASTA and create unique FASTA IDs.
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
166 logging.info(f"Creating new FASTA with unique IDs.")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
167 try:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
168 fna_fh = gzip.open(fna, "rt")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
169 _ = fna_fh.readline()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
170 except gzip.BadGzipFile:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
171 logging.info(
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
172 f"Input FASTA file {os.path.basename(fna)} is not in\nGZIP format."
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
173 + "\nAttempting text parsing."
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
174 )
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
175 fna_fh = open(fna, "r")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
176
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
177 for record in SeqIO.parse(fna_fh, format="fasta"):
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
178 seq_id = record.id
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
179
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
180 if record.id not in seen_ids.keys():
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
181 seen_ids[record.id] = 1
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
182 else:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
183 seen_ids[record.id] += 1
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
184
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
185 if seen_ids[seq_id] > 1:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
186 seq_id = str(record.id) + str(seen_ids[record.id])
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
187
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
188 if new_lin:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
189 new_lin_fh.write(",".join([seq_id, seen_lineages[record.id]]) + "\n")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
190
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
191 write_fasta(record.seq, seq_id, new_fna_fh)
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
192
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
193 if new_lin:
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
194 new_lin_fh.close()
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
195
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
196 logging.info("Done!")
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
197
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
198
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
199 if __name__ == "__main__":
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
200
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
201 main()