Mercurial > repos > rliterman > csp2
comparison CSP2/bin/screenSNPDiffs.py @ 0:01431fa12065
"planemo upload"
author | rliterman |
---|---|
date | Mon, 02 Dec 2024 10:40:55 -0500 |
parents | |
children | 792274118b2e |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:01431fa12065 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import sys | |
4 import os | |
5 import pandas as pd | |
6 import datetime | |
7 from pybedtools import BedTool,helpers | |
8 import concurrent.futures | |
9 import time | |
10 import uuid | |
11 import traceback | |
12 import shutil | |
13 import argparse | |
14 | |
15 | |
16 def fetchHeaders(snpdiffs_file): | |
17 | |
18 with open(snpdiffs_file, 'r') as file: | |
19 top_line = file.readline().strip().split('\t')[1:] | |
20 | |
21 header_cols = [item.split(':')[0] for item in top_line] | |
22 header_vals = [item.split(':')[1] for item in top_line] | |
23 | |
24 header_data = pd.DataFrame(columns = header_cols) | |
25 header_data.loc[0] = header_vals | |
26 header_data.loc[:, 'File_Path'] = snpdiffs_file | |
27 | |
28 return header_data | |
29 | |
30 def processBED(bed_rows,snpdiffs_orientation): | |
31 | |
32 bed_columns = ['Ref_Contig','Ref_Start','Ref_End','Ref_Length','Ref_Aligned', | |
33 'Query_Contig','Query_Start','Query_End','Query_Length','Query_Aligned', | |
34 'Perc_Iden'] | |
35 | |
36 reverse_columns = ['Query_Contig','Query_Start','Query_End','Query_Length','Query_Aligned', | |
37 'Ref_Contig','Ref_Start','Ref_End','Ref_Length','Ref_Aligned', | |
38 'Perc_Iden'] | |
39 | |
40 int_columns = ['Ref_Start', 'Ref_End', 'Ref_Length', 'Ref_Aligned', | |
41 'Query_Start', 'Query_End', 'Query_Length', 'Query_Aligned'] | |
42 | |
43 float_columns = ['Perc_Iden'] | |
44 | |
45 if len(bed_rows) > 0: | |
46 | |
47 bed_df = pd.DataFrame(bed_rows, columns=bed_columns) | |
48 | |
49 # Swap columns if reversed | |
50 if snpdiffs_orientation == -1: | |
51 bed_df = bed_df[reverse_columns].copy() | |
52 bed_df.columns = bed_columns | |
53 | |
54 # Remove any rows where Query_Contig or Ref_Contig == "." (Unaligned) | |
55 covered_bed_df = bed_df[(bed_df['Ref_Start'] != ".") & (bed_df['Query_Start'] != ".")].copy() | |
56 | |
57 if covered_bed_df.shape[0] > 0: | |
58 for col in int_columns: | |
59 covered_bed_df.loc[:, col] = covered_bed_df.loc[:, col].astype(float).astype(int) | |
60 for col in float_columns: | |
61 covered_bed_df.loc[:, col] = covered_bed_df.loc[:, col].astype(float) | |
62 return covered_bed_df | |
63 else: | |
64 return pd.DataFrame(columns=bed_columns) | |
65 else: | |
66 return pd.DataFrame(columns=bed_columns) | |
67 | |
68 def processSNPs(snp_rows,snpdiffs_orientation): | |
69 | |
70 snp_columns = ['Ref_Contig','Start_Ref','Ref_Pos', | |
71 'Query_Contig','Start_Query','Query_Pos', | |
72 'Ref_Loc','Query_Loc', | |
73 'Ref_Start','Ref_End', | |
74 'Query_Start','Query_End', | |
75 'Ref_Base','Query_Base', | |
76 'Dist_to_Ref_End','Dist_to_Query_End', | |
77 'Ref_Aligned','Query_Aligned', | |
78 'Query_Direction','Perc_Iden','Cat'] | |
79 | |
80 reverse_columns = ['Query_Contig','Start_Query','Query_Pos', | |
81 'Ref_Contig','Start_Ref','Ref_Pos', | |
82 'Query_Loc','Ref_Loc', | |
83 'Query_Start','Query_End', | |
84 'Ref_Start','Ref_End', | |
85 'Query_Base','Ref_Base', | |
86 'Dist_to_Query_End','Dist_to_Ref_End', | |
87 'Query_Aligned','Ref_Aligned', | |
88 'Query_Direction','Perc_Iden','Cat'] | |
89 | |
90 return_columns = ['Ref_Contig','Start_Ref','Ref_Pos', | |
91 'Query_Contig','Start_Query','Query_Pos', | |
92 'Ref_Loc','Query_Loc', | |
93 'Ref_Start','Ref_End', | |
94 'Query_Start','Query_End', | |
95 'Ref_Base','Query_Base', | |
96 'Dist_to_Ref_End','Dist_to_Query_End', | |
97 'Ref_Aligned','Query_Aligned', | |
98 'Perc_Iden','Cat'] | |
99 | |
100 reverse_complement = {'A':'T','T':'A','G':'C','C':'G', | |
101 'a':'T','t':'A','c':'G','g':'C'} | |
102 | |
103 # Columns to convert to integer | |
104 int_columns = ['Start_Ref', 'Ref_Pos', 'Start_Query', 'Query_Pos', | |
105 'Dist_to_Ref_End', 'Dist_to_Query_End', 'Ref_Aligned', 'Query_Aligned'] | |
106 | |
107 # Columns to convert to float | |
108 float_columns = ['Perc_Iden'] | |
109 | |
110 if len(snp_rows) > 0: | |
111 snp_df = pd.DataFrame(snp_rows, columns= snp_columns).copy() | |
112 | |
113 if snpdiffs_orientation == -1: | |
114 snp_df = snp_df[reverse_columns].copy() | |
115 snp_df.columns = snp_columns | |
116 | |
117 # Replace Query_Base and Reference_Base with reverse complement if Query_Direction is -1 and base is in ['A','T','G','C','a','c','t','g'] | |
118 snp_df.loc[snp_df['Query_Direction'] == '-1','Query_Base'] = snp_df.loc[snp_df['Query_Direction'] == '-1','Query_Base'].apply(lambda x: reverse_complement[x] if x in reverse_complement else x) | |
119 snp_df.loc[snp_df['Query_Direction'] == '-1','Ref_Base'] = snp_df.loc[snp_df['Query_Direction'] == '-1','Ref_Base'].apply(lambda x: reverse_complement[x] if x in reverse_complement else x) | |
120 | |
121 | |
122 for col in int_columns: | |
123 snp_df.loc[:, col] = snp_df.loc[:, col].astype(float).astype(int) | |
124 for col in float_columns: | |
125 snp_df.loc[:, col] = snp_df.loc[:, col].astype(float) | |
126 | |
127 else: | |
128 snp_df = pd.DataFrame(columns = return_columns) | |
129 | |
130 return snp_df[return_columns] | |
131 | |
132 def swapHeader(header_data): | |
133 | |
134 raw_header_cols = [x for x in header_data.columns] | |
135 reverse_header_cols = [item.replace('Reference', 'temp').replace('Query', 'Reference').replace('temp', 'Query') for item in raw_header_cols] | |
136 reversed_header_data = header_data[reverse_header_cols].copy() | |
137 reversed_header_data.columns = raw_header_cols | |
138 | |
139 return reversed_header_data | |
140 | |
141 def parseSNPDiffs(snpdiffs_file,snpdiffs_orientation): | |
142 | |
143 bed_rows = [] | |
144 snp_rows = [] | |
145 | |
146 with open(snpdiffs_file, 'r') as file: | |
147 lines = file.readlines() | |
148 | |
149 for line in lines: | |
150 if line[0:2] == "#\t": | |
151 pass | |
152 elif line[0:3] == "##\t": | |
153 bed_rows.append(line.strip().split("\t")[1:]) | |
154 else: | |
155 snp_rows.append(line.strip().split("\t")) | |
156 | |
157 bed_df = processBED(bed_rows,snpdiffs_orientation) | |
158 snp_df = processSNPs(snp_rows,snpdiffs_orientation) | |
159 return (bed_df,snp_df) | |
160 | |
161 def calculate_total_length(bedtool): | |
162 return sum(len(interval) for interval in bedtool) | |
163 | |
164 def filterSNPs(raw_snp_df,bed_df,log_file, min_len, min_iden, ref_edge, query_edge, density_windows, max_snps): | |
165 | |
166 if temp_dir != "": | |
167 helpers.set_tempdir(temp_dir) | |
168 | |
169 # Grab raw data | |
170 total_snp_count = raw_snp_df.shape[0] | |
171 | |
172 # Get unique SNPs relative to the reference genome | |
173 unique_ref_snps = raw_snp_df['Ref_Loc'].unique() | |
174 unique_snp_count = len(unique_ref_snps) | |
175 | |
176 snp_tally_df = pd.DataFrame() | |
177 | |
178 with open(log_file,"a+") as log: | |
179 log.write(f"\n\t- Raw SNP + indel count: {total_snp_count}\n") | |
180 log.write(f"\n\t- Unique SNP positions in reference genome: {unique_snp_count}\n") | |
181 | |
182 # Set all sites to SNP | |
183 raw_snp_df['Filter_Cat'] = "SNP" | |
184 | |
185 # Filter out SNPs based on --min_len and --min_iden | |
186 reject_length = raw_snp_df.loc[(raw_snp_df['Ref_Aligned'] < min_len) & (raw_snp_df['Perc_Iden'] >= min_iden)].copy() | |
187 if reject_length.shape[0] > 0: | |
188 with open(log_file,"a+") as log: | |
189 log.write(f"\t\t- Purged (Alignment Length): {reject_length.shape[0]}\n") | |
190 reject_length['Filter_Cat'] = "Purged_Length" | |
191 snp_tally_df = pd.concat([snp_tally_df,reject_length]).reset_index(drop=True) | |
192 | |
193 reject_iden = raw_snp_df.loc[(raw_snp_df['Ref_Aligned'] >= min_len) & (raw_snp_df['Perc_Iden'] < min_iden)].copy() | |
194 if reject_iden.shape[0] > 0: | |
195 with open(log_file,"a+") as log: | |
196 log.write(f"\t\t- Purged (Alignment Identity): {reject_iden.shape[0]}\n") | |
197 reject_iden['Filter_Cat'] = "Purged_Identity" | |
198 snp_tally_df = pd.concat([snp_tally_df,reject_iden]).reset_index(drop=True) | |
199 | |
200 reject_lenIden = raw_snp_df.loc[(raw_snp_df['Ref_Aligned'] < min_len) & (raw_snp_df['Perc_Iden'] < min_iden)].copy() | |
201 if reject_lenIden.shape[0] > 0: | |
202 with open(log_file,"a+") as log: | |
203 log.write(f"\t\t- Purged (Alignment Length + Identity): {reject_lenIden.shape[0]}\n") | |
204 reject_lenIden['Filter_Cat'] = "Purged_LengthIdentity" | |
205 snp_tally_df = pd.concat([snp_tally_df,reject_lenIden]).reset_index(drop=True) | |
206 | |
207 pass_filter = raw_snp_df.loc[(raw_snp_df['Ref_Aligned'] >= min_len) & (raw_snp_df['Perc_Iden'] >= min_iden)].copy().reset_index(drop=True) | |
208 | |
209 # Invalid processing | |
210 reject_invalid = pass_filter[pass_filter['Cat'] == "Invalid"].copy() | |
211 if reject_invalid.shape[0] > 0: | |
212 with open(log_file,"a+") as log: | |
213 log.write(f"\t\t- Purged (Invalid Base): {reject_invalid.shape[0]}\n") | |
214 reject_invalid['Filter_Cat'] = "Purged_Invalid" | |
215 snp_tally_df = pd.concat([snp_tally_df,reject_invalid]).reset_index(drop=True) | |
216 pass_filter = pass_filter.loc[pass_filter['Cat'] != "Invalid"].copy() | |
217 | |
218 # Indel processing | |
219 reject_indel = pass_filter[pass_filter['Cat'] == "Indel"].copy() | |
220 if reject_indel.shape[0] > 0: | |
221 with open(log_file,"a+") as log: | |
222 log.write(f"\t\t- Purged (Indel): {reject_indel.shape[0]}\n") | |
223 reject_indel['Filter_Cat'] = "Purged_Indel" | |
224 snp_tally_df = pd.concat([snp_tally_df,reject_indel]).reset_index(drop=True) | |
225 pass_filter = pass_filter.loc[pass_filter['Cat'] != "Indel"].copy() | |
226 | |
227 # Check for heterozygous SNPs | |
228 check_heterozygous = pass_filter.groupby('Ref_Loc').filter(lambda x: x['Query_Base'].nunique() > 1) | |
229 if check_heterozygous.shape[0] > 0: | |
230 reject_heterozygous = pass_filter.loc[pass_filter['Ref_Loc'].isin(check_heterozygous['Ref_Loc'])].copy() | |
231 reject_heterozygous['Filter_Cat'] = "Purged_Heterozygous" | |
232 with open(log_file,"a+") as log: | |
233 log.write(f"\t\t- Purged (Heterozygotes): {reject_heterozygous.shape[0]}\n") | |
234 snp_tally_df = pd.concat([snp_tally_df,reject_heterozygous]).reset_index(drop=True) | |
235 pass_filter = pass_filter.loc[~pass_filter['Ref_Loc'].isin(check_heterozygous['Ref_Loc'])].copy() | |
236 | |
237 # Check for duplicate SNPs and take the longest, best hit | |
238 check_duplicates = pass_filter.groupby('Ref_Loc').filter(lambda x: x.shape[0] > 1) | |
239 if check_duplicates.shape[0] > 0: | |
240 reject_duplicate = pass_filter.loc[pass_filter['Ref_Loc'].isin(check_duplicates['Ref_Loc'])].copy() | |
241 pass_filter = pass_filter.loc[~pass_filter['Ref_Loc'].isin(check_duplicates['Ref_Loc'])].copy() | |
242 | |
243 best_snp = reject_duplicate.groupby('Ref_Loc').apply(lambda x: x.sort_values(by=['Ref_Aligned', 'Perc_Iden'], ascending=[False, False]).head(1)) | |
244 pass_filter = pd.concat([pass_filter,best_snp]).reset_index(drop=True) | |
245 | |
246 dup_snps = reject_duplicate[~reject_duplicate.apply(lambda x: x in best_snp, axis=1)] | |
247 dup_snps['Filter_Cat'] = "Purged_Duplicate" | |
248 | |
249 snp_tally_df = pd.concat([snp_tally_df,dup_snps]).reset_index(drop=True) | |
250 | |
251 with open(log_file,"a+") as log: | |
252 log.write(f"\t\t- Purged (Duplicates): {dup_snps.shape[0]}\n") | |
253 | |
254 # Assert that Ref_Loc and Query_Loc are unique in pass_filter | |
255 helpers.cleanup(verbose=False,remove_all = False) | |
256 assert pass_filter['Ref_Loc'].nunique() == pass_filter.shape[0] | |
257 assert pass_filter['Query_Loc'].nunique() == pass_filter.shape[0] | |
258 | |
259 # Density filtering | |
260 density_locs = [] | |
261 ref_locs = pass_filter['Ref_Loc'].tolist() | |
262 | |
263 if len(density_windows) == 0: | |
264 with open(log_file,"a+") as log: | |
265 log.write("\n\t- Density filtering disabled...\n") | |
266 elif len(ref_locs) > 0: | |
267 density_df = pd.DataFrame([item.split('/') for item in ref_locs], columns=['Ref_Contig','Ref_End']) | |
268 density_df['Ref_Start'] = density_df['Ref_End'].astype(float).astype(int) - 1 | |
269 density_bed = BedTool.from_dataframe(density_df[['Ref_Contig','Ref_Start','Ref_End']]) | |
270 | |
271 # For each density window, remove all SNPs that fall in a window with > max_snps | |
272 for i in range(0,len(density_windows)): | |
273 window_df = density_bed.window(density_bed,c=True, w=density_windows[i]).to_dataframe() | |
274 problematic_windows = window_df[window_df['name'] > max_snps[i]].copy() | |
275 if not problematic_windows.empty: | |
276 temp_locs = [] | |
277 for _, row in problematic_windows.iterrows(): | |
278 purge_window_df = window_df[window_df['chrom'] == row['chrom']].copy() | |
279 purge_window_df['Dist'] = abs(purge_window_df['end'] - row['end']) | |
280 window_snps = purge_window_df.sort_values(by=['Dist'],ascending=True).head(row['name']) | |
281 temp_locs = temp_locs + ["/".join([str(x[0]),str(x[1])]) for x in list(zip(window_snps.chrom, window_snps.end))] | |
282 density_locs.extend(list(set(temp_locs))) | |
283 | |
284 density_locs = list(set(density_locs)) | |
285 reject_density = pass_filter[pass_filter['Ref_Loc'].isin(density_locs)].copy() | |
286 | |
287 if reject_density.shape[0] > 0: | |
288 with open(log_file,"a+") as log: | |
289 log.write(f"\t\t- Purged (Density): {reject_density.shape[0]}\n") | |
290 reject_density['Filter_Cat'] = "Purged_Density" | |
291 snp_tally_df = pd.concat([snp_tally_df,reject_density]).reset_index(drop=True) | |
292 pass_filter = pass_filter[~pass_filter['Ref_Loc'].isin(density_locs)].copy() | |
293 | |
294 reject_query_edge = pass_filter[(pass_filter['Dist_to_Query_End'] < query_edge) & (pass_filter['Dist_to_Ref_End'] >= ref_edge)].copy() | |
295 reject_ref_edge = pass_filter[(pass_filter['Dist_to_Ref_End'] < ref_edge) & (pass_filter['Dist_to_Query_End'] >= query_edge)].copy() | |
296 reject_both_edge = pass_filter[(pass_filter['Dist_to_Query_End'] < query_edge) & (pass_filter['Dist_to_Ref_End'] < ref_edge)].copy() | |
297 | |
298 if reject_query_edge.shape[0] > 0: | |
299 with open(log_file,"a+") as log: | |
300 log.write(f"\t\t- Purged (Query Edge): {reject_query_edge.shape[0]}\n") | |
301 reject_query_edge['Filter_Cat'] = "Filtered_Query_Edge" | |
302 snp_tally_df = pd.concat([snp_tally_df,reject_query_edge]).reset_index(drop=True) | |
303 | |
304 if reject_ref_edge.shape[0] > 0: | |
305 with open(log_file,"a+") as log: | |
306 log.write(f"\t\t- Purged (Ref Edge): {reject_ref_edge.shape[0]}\n") | |
307 reject_ref_edge['Filter_Cat'] = "Filtered_Ref_Edge" | |
308 snp_tally_df = pd.concat([snp_tally_df,reject_ref_edge]).reset_index(drop=True) | |
309 | |
310 if reject_both_edge.shape[0] > 0: | |
311 with open(log_file,"a+") as log: | |
312 log.write(f"\t\t- Purged (Both Edge): {reject_both_edge.shape[0]}\n") | |
313 reject_both_edge['Filter_Cat'] = "Filtered_Both_Edge" | |
314 snp_tally_df = pd.concat([snp_tally_df,reject_both_edge]).reset_index(drop=True) | |
315 | |
316 pass_filter = pass_filter[(pass_filter['Dist_to_Query_End'] >= query_edge) & (pass_filter['Dist_to_Ref_End'] >= ref_edge)].copy() | |
317 | |
318 helpers.cleanup(verbose=False,remove_all = False) | |
319 | |
320 assert snp_tally_df.shape[0] + pass_filter.shape[0] == total_snp_count | |
321 return_df = pd.concat([pass_filter,snp_tally_df]).reset_index(drop=True).sort_values(by=['Ref_Loc']) | |
322 | |
323 return return_df.drop(columns=['Cat']).rename({'Filter_Cat':'Cat'}, axis=1) | |
324 | |
325 | |
326 def screenSNPDiffs(snpdiffs_file,trim_name, min_cov, min_len, min_iden, ref_edge, query_edge, density_windows, max_snps,ref_ids): | |
327 | |
328 if temp_dir != "": | |
329 helpers.set_tempdir(temp_dir) | |
330 | |
331 screen_start_time = time.time() | |
332 | |
333 # Set CSP2 variables to NA | |
334 csp2_screen_snps = purged_length = purged_identity = purged_invalid = purged_indel = purged_lengthIdentity = purged_duplicate = purged_het = purged_density = filtered_ref_edge = filtered_query_edge = filtered_both_edge = "NA" | |
335 | |
336 # Ensure snpdiffs file exists | |
337 if not os.path.exists(snpdiffs_file) or not snpdiffs_file.endswith('.snpdiffs'): | |
338 run_failed = True | |
339 sys.exit(f"Invalid snpdiffs file provided: {snpdiffs_file}") | |
340 | |
341 # Ensure header can be read in | |
342 try: | |
343 header_data = fetchHeaders(snpdiffs_file) | |
344 header_query = header_data['Query_ID'][0].replace(trim_name,'') | |
345 header_ref = header_data['Reference_ID'][0].replace(trim_name,'') | |
346 except: | |
347 run_failed = True | |
348 sys.exit(f"Error reading headers from snpdiffs file: {snpdiffs_file}") | |
349 | |
350 # Check snpdiffs orientation | |
351 if ref_ids == []: | |
352 snpdiffs_orientation = 1 | |
353 query_id = header_query | |
354 reference_id = header_ref | |
355 elif (header_query not in ref_ids) and (header_ref in ref_ids): | |
356 snpdiffs_orientation = 1 | |
357 query_id = header_query | |
358 reference_id = header_ref | |
359 elif (header_query in ref_ids) and (header_ref not in ref_ids): | |
360 snpdiffs_orientation = -1 | |
361 query_id = header_ref | |
362 reference_id = header_query | |
363 header_data = swapHeader(header_data) | |
364 else: | |
365 snpdiffs_orientation = 2 | |
366 query_id = header_query | |
367 reference_id = header_ref | |
368 | |
369 # Establish log file | |
370 log_file = f"{log_dir}/{query_id}__vs__{reference_id}.log" | |
371 with open(log_file,"w+") as log: | |
372 log.write("Screening Analysis\n") | |
373 log.write(f"Query Isolate: {query_id}\n") | |
374 log.write(f"Reference Isolate: {reference_id}\n") | |
375 log.write(str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))+"\n") | |
376 log.write("-------------------------------------------------------\n\n") | |
377 if ref_ids == []: | |
378 log.write("\t- No explicit references set, processing in forward orientation\n") | |
379 log.write("-------------------------------------------------------\n\n") | |
380 elif snpdiffs_orientation == 1: | |
381 log.write("\t- SNPDiffs file is in the forward orientation\n") | |
382 log.write("-------------------------------------------------------\n\n") | |
383 elif snpdiffs_orientation == -1: | |
384 log.write("\t- SNPDiffs file is in the reverse orientation\n") | |
385 log.write("-------------------------------------------------------\n\n") | |
386 else: | |
387 snpdiffs_orientation = 1 | |
388 log.write("\t- SNPDiffs file not contain a reference and non-reference sample, processing in forward orientation\n") | |
389 log.write("-------------------------------------------------------\n\n") | |
390 | |
391 | |
392 # Set variables from header data | |
393 raw_snps = int(header_data['SNPs'][0]) | |
394 raw_indels = int(header_data['Indels'][0]) | |
395 raw_invalid = int(header_data['Invalid'][0]) | |
396 | |
397 kmer_similarity = float(header_data['Kmer_Similarity'][0]) | |
398 shared_kmers = int(header_data['Shared_Kmers'][0]) | |
399 query_unique_kmers = int(header_data['Query_Unique_Kmers'][0]) | |
400 reference_unique_kmers = int(header_data['Reference_Unique_Kmers'][0]) | |
401 mummer_gsnps = int(header_data['gSNPs'][0]) | |
402 mummer_gindels = int(header_data['gIndels'][0]) | |
403 | |
404 query_bases = int(header_data['Query_Assembly_Bases'][0]) | |
405 reference_bases = int(header_data['Reference_Assembly_Bases'][0]) | |
406 | |
407 query_contigs = int(header_data['Query_Contig_Count'][0]) | |
408 reference_contigs = int(header_data['Reference_Contig_Count'][0]) | |
409 | |
410 raw_query_percent_aligned = float(header_data['Query_Percent_Aligned'][0]) | |
411 raw_ref_percent_aligned = float(header_data['Reference_Percent_Aligned'][0]) | |
412 | |
413 # If the reference is not covered by at least min_cov, STOP | |
414 if raw_ref_percent_aligned < min_cov: | |
415 query_percent_aligned = raw_query_percent_aligned | |
416 reference_percent_aligned = raw_ref_percent_aligned | |
417 screen_category = "Low_Coverage" | |
418 with open(log_file,"a+") as log: | |
419 log.write(f"\t- Reference genome coverage: {raw_ref_percent_aligned}% \n") | |
420 log.write(f"\t- Query covers less than --min_cov ({min_cov}%)...Screen halted...\n") | |
421 log.write("-------------------------------------------------------\n\n") | |
422 | |
423 elif raw_snps + raw_indels + raw_invalid > 10000: | |
424 query_percent_aligned = raw_query_percent_aligned | |
425 reference_percent_aligned = raw_ref_percent_aligned | |
426 screen_category = "SNP_Cutoff" | |
427 with open(log_file,"a+") as log: | |
428 log.write(f"\t- {raw_snps} detected...\n") | |
429 log.write("\t- > 10,000 SNPs, indels, or invalid sites detected by MUMmer...Screen halted...\n") | |
430 log.write("-------------------------------------------------------\n\n") | |
431 | |
432 else: | |
433 | |
434 ##### 02: Read in BED/SNP data ##### | |
435 with open(log_file,"a+") as log: | |
436 log.write("Step 1: Reading in snpdiffs BED/SNP data...") | |
437 try: | |
438 bed_df,snp_df = parseSNPDiffs(snpdiffs_file,snpdiffs_orientation) | |
439 with open(log_file,"a+") as log: | |
440 log.write("Done!\n") | |
441 log.write("-------------------------------------------------------\n\n") | |
442 except: | |
443 with open(log_file,"a+") as log: | |
444 log.write(f"Error reading BED/SNP data from file: {snpdiffs_file}") | |
445 run_failed = True | |
446 sys.exit(f"Error reading BED/SNP data from file: {snpdiffs_file}") | |
447 | |
448 ##### 03: Filter genome overlaps ##### | |
449 with open(log_file,"a+") as log: | |
450 log.write("Step 2: Filtering for short overlaps and low percent identity...") | |
451 | |
452 good_bed_df = bed_df[(bed_df['Ref_Aligned'] >= min_len) & (bed_df['Perc_Iden'] >= min_iden)].copy() | |
453 | |
454 if good_bed_df.shape[0] == 0: | |
455 screen_category = "Low_Quality_Coverage" | |
456 with open(log_file,"a+") as log: | |
457 log.write(f"\n\t- After filtering based on --min_len ({min_len}) and --min_iden ({min_iden}) , no valid alignments remain...Screen halted...\n") | |
458 log.write("-------------------------------------------------------\n\n") | |
459 | |
460 | |
461 else: | |
462 # Create a BED file for alignments that pass basic QC | |
463 good_query_bed_df = good_bed_df[['Query_Contig','Query_Start','Query_End']].copy() | |
464 good_reference_bed_df = good_bed_df[['Ref_Contig','Ref_Start','Ref_End']].copy() | |
465 | |
466 good_query_aligned = calculate_total_length(BedTool.from_dataframe(good_query_bed_df).sort().merge()) | |
467 good_reference_aligned = calculate_total_length(BedTool.from_dataframe(good_reference_bed_df).sort().merge()) | |
468 | |
469 query_percent_aligned = (good_query_aligned / query_bases) * 100 | |
470 reference_percent_aligned = (good_reference_aligned / reference_bases) * 100 | |
471 | |
472 if reference_percent_aligned < min_cov: | |
473 screen_category = "Low_Quality_Coverage" | |
474 with open(log_file,"a+") as log: | |
475 log.write(f"\n\t- Raw reference genome coverage was {raw_ref_percent_aligned}% \n") | |
476 log.write(f"\t- After filtering based on --min_len ({min_len}) and --min_iden ({min_iden}), reference genome coverage was {reference_percent_aligned:.2f}% \n") | |
477 log.write(f"\t- Query covers less than --min_cov ({min_cov}%) of reference after filtering...Screen halted...\n") | |
478 log.write("-------------------------------------------------------\n\n") | |
479 | |
480 else: | |
481 screen_category = "Pass" | |
482 with open(log_file,"a+") as log: | |
483 log.write("Done!\n") | |
484 log.write(f"\t- Raw reference genome coverage was {raw_ref_percent_aligned}% \n") | |
485 log.write(f"\t- After filtering based on --min_len ({min_len}) and --min_iden ({min_iden}), reference genome coverage was {reference_percent_aligned:.2f}% \n") | |
486 log.write("-------------------------------------------------------\n\n") | |
487 | |
488 | |
489 # Filter SNPs | |
490 with open(log_file,"a+") as log: | |
491 log.write("Step 3: Filtering SNPs to get final SNP distances...") | |
492 | |
493 if raw_snps == 0: | |
494 csp2_screen_snps = purged_length = purged_identity = purged_lengthIdentity = purged_indel = purged_invalid = purged_duplicate = purged_het = purged_density = filtered_ref_edge = filtered_query_edge = filtered_both_edge = 0 | |
495 with open(log_file,"a+") as log: | |
496 log.write("Done!\n") | |
497 log.write("\t- No SNPs detected in MUMmer output, no filtering required\n") | |
498 log.write("-------------------------------------------------------\n\n") | |
499 | |
500 else: | |
501 filtered_snp_df = filterSNPs(snp_df,bed_df,log_file, min_len, min_iden, ref_edge, query_edge, density_windows, max_snps) | |
502 | |
503 # Write filtered SNP data to file | |
504 snp_file = log_file.replace(".log","_SNPs.tsv") | |
505 filtered_snp_df.to_csv(snp_file, sep="\t", index=False) | |
506 | |
507 csp2_screen_snps = filtered_snp_df[filtered_snp_df.Cat == "SNP"].shape[0] | |
508 | |
509 purged_length = filtered_snp_df[filtered_snp_df.Cat == "Purged_Length"].shape[0] | |
510 purged_identity = filtered_snp_df[filtered_snp_df.Cat == "Purged_Identity"].shape[0] | |
511 purged_lengthIdentity = filtered_snp_df[filtered_snp_df.Cat == "Purged_LengthIdentity"].shape[0] | |
512 purged_invalid = filtered_snp_df[filtered_snp_df.Cat == "Purged_Invalid"].shape[0] | |
513 purged_indel = filtered_snp_df[filtered_snp_df.Cat == "Purged_Indel"].shape[0] | |
514 purged_het = filtered_snp_df[filtered_snp_df.Cat == "Purged_Heterozygous"].shape[0] | |
515 purged_duplicate = filtered_snp_df[filtered_snp_df.Cat == "Purged_Duplicate"].shape[0] | |
516 purged_density = filtered_snp_df[filtered_snp_df.Cat == "Purged_Density"].shape[0] | |
517 filtered_query_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Query_Edge"].shape[0] | |
518 filtered_ref_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Ref_Edge"].shape[0] | |
519 filtered_both_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Both_Edge"].shape[0] | |
520 | |
521 with open(log_file,"a+") as log: | |
522 log.write("Done!\n") | |
523 log.write(f"\t- {csp2_screen_snps} SNPs detected between {query_id} and {reference_id} after filtering\n") | |
524 log.write(f"\t- SNP data saved to {snp_file}\n") | |
525 log.write("-------------------------------------------------------\n\n") | |
526 | |
527 screen_end_time = time.time() | |
528 helpers.cleanup(verbose=False, remove_all=False) | |
529 | |
530 with open(log_file,"a+") as log: | |
531 log.write(f"Screening Time: {screen_end_time - screen_start_time:.2f} seconds\n") | |
532 | |
533 # Clean up pybedtools temp | |
534 helpers.cleanup(verbose=False, remove_all=False) | |
535 | |
536 return [str(item) for item in [query_id,reference_id,screen_category,csp2_screen_snps, | |
537 f"{query_percent_aligned:.2f}",f"{reference_percent_aligned:.2f}", | |
538 query_contigs,query_bases,reference_contigs,reference_bases, | |
539 raw_snps,purged_length,purged_identity,purged_lengthIdentity,purged_invalid,purged_indel,purged_duplicate,purged_het,purged_density, | |
540 filtered_query_edge,filtered_ref_edge,filtered_both_edge, | |
541 kmer_similarity,shared_kmers,query_unique_kmers,reference_unique_kmers, | |
542 mummer_gsnps,mummer_gindels]] | |
543 | |
544 # Read in arguments | |
545 global run_failed | |
546 run_failed = False | |
547 | |
548 parser = argparse.ArgumentParser() | |
549 parser.add_argument("--snpdiffs_file", help="Path to the file containing SNP diffs") | |
550 parser.add_argument("--log_dir", help="Path to the log directory") | |
551 parser.add_argument("--min_cov", type=float, help="Minimum coverage") | |
552 parser.add_argument("--min_len", type=int, help="Minimum length") | |
553 parser.add_argument("--min_iden", type=float, help="Minimum identity") | |
554 parser.add_argument("--ref_edge", type=int, help="Reference edge") | |
555 parser.add_argument("--query_edge", type=int, help="Query edge") | |
556 parser.add_argument("--density_windows", help="Density windows (comma-separated)") | |
557 parser.add_argument("--max_snps", help="Maximum SNPs (comma-separated)") | |
558 parser.add_argument("--trim_name", help="Trim name") | |
559 parser.add_argument("--output_file", help="Output file") | |
560 parser.add_argument("--ref_id", help="Reference IDs file") | |
561 parser.add_argument("--tmp_dir", help="TMP dir") | |
562 | |
563 args = parser.parse_args() | |
564 | |
565 snpdiffs_list = [line.strip() for line in open(args.snpdiffs_file, 'r')] | |
566 snpdiffs_list = [line for line in snpdiffs_list if line] | |
567 for snpdiffs_file in snpdiffs_list: | |
568 if not os.path.exists(snpdiffs_file): | |
569 run_failed = True | |
570 sys.exit("Error: File does not exist: " + snpdiffs_file) | |
571 | |
572 snpdiffs_list = list(set(snpdiffs_list)) | |
573 | |
574 log_dir = os.path.normpath(os.path.abspath(args.log_dir)) | |
575 | |
576 min_cov = args.min_cov | |
577 min_len = args.min_len | |
578 min_iden = args.min_iden | |
579 | |
580 ref_edge = args.ref_edge | |
581 query_edge = args.query_edge | |
582 | |
583 input_density = args.density_windows | |
584 input_maxsnps = args.max_snps | |
585 | |
586 if input_density == "0": | |
587 density_windows = [] | |
588 max_snps = [] | |
589 else: | |
590 density_windows = [int(x) for x in args.density_windows.split(",")] | |
591 max_snps = [int(x) for x in args.max_snps.split(",")] | |
592 assert len(density_windows) == len(max_snps) | |
593 | |
594 trim_name = args.trim_name | |
595 | |
596 output_file = os.path.abspath(args.output_file) | |
597 | |
598 if os.stat(args.ref_id).st_size == 0: | |
599 ref_ids = [] | |
600 else: | |
601 ref_ids = [line.strip() for line in open(args.ref_id, 'r')] | |
602 | |
603 global temp_dir | |
604 if args.tmp_dir != "": | |
605 random_temp_id = str(uuid.uuid4()) | |
606 temp_dir = f"{os.path.normpath(os.path.abspath(args.tmp_dir))}/{random_temp_id}" | |
607 try: | |
608 os.mkdir(temp_dir) | |
609 helpers.set_tempdir(temp_dir) | |
610 except OSError as e: | |
611 run_failed = True | |
612 print(f"Error: Failed to create directory '{temp_dir}': {e}") | |
613 else: | |
614 temp_dir = "" | |
615 | |
616 try: | |
617 with concurrent.futures.ProcessPoolExecutor() as executor: | |
618 results = [executor.submit(screenSNPDiffs,snp_diff_file,trim_name, min_cov, min_len, min_iden, ref_edge, query_edge, density_windows, max_snps,ref_ids) for snp_diff_file in snpdiffs_list] | |
619 | |
620 # Clean up pybedtools temp | |
621 helpers.cleanup(verbose=False,remove_all = False) | |
622 | |
623 # Combine results into a dataframe | |
624 output_columns = ['Query_ID','Reference_ID','Screen_Category','CSP2_Screen_SNPs', | |
625 'Query_Percent_Aligned','Reference_Percent_Aligned', | |
626 'Query_Contigs','Query_Bases','Reference_Contigs','Reference_Bases', | |
627 'Raw_SNPs','Purged_Length','Purged_Identity','Purged_LengthIdentity','Purged_Invalid','Purged_Indel','Purged_Duplicate','Purged_Het','Purged_Density', | |
628 'Filtered_Query_Edge','Filtered_Ref_Edge','Filtered_Both_Edge', | |
629 'Kmer_Similarity','Shared_Kmers','Query_Unique_Kmers','Reference_Unique_Kmers', | |
630 'MUMmer_gSNPs','MUMmer_gIndels'] | |
631 | |
632 results_df = pd.DataFrame([item.result() for item in results], columns = output_columns) | |
633 results_df.to_csv(output_file, sep="\t", index=False) | |
634 except: | |
635 run_failed = True | |
636 print("Exception occurred:\n", traceback.format_exc()) | |
637 finally: | |
638 helpers.cleanup(verbose=False, remove_all=False) | |
639 if temp_dir != "": | |
640 shutil.rmtree(temp_dir) | |
641 if run_failed: | |
642 sys.exit(1) | |
643 | |
644 | |
645 | |
646 | |
647 |