rliterman@0
|
1 #!/usr/bin/env python3
|
rliterman@0
|
2
|
rliterman@0
|
3 import sys
|
rliterman@0
|
4 import os
|
rliterman@0
|
5 import pandas as pd
|
rliterman@0
|
6 import datetime
|
rliterman@0
|
7 from pybedtools import BedTool,helpers
|
rliterman@0
|
8 import concurrent.futures
|
rliterman@0
|
9 import time
|
rliterman@0
|
10 import uuid
|
rliterman@0
|
11 import traceback
|
rliterman@0
|
12 import shutil
|
rliterman@0
|
13 import argparse
|
rliterman@0
|
14
|
rliterman@0
|
15
|
rliterman@0
|
def fetchHeaders(snpdiffs_file):
    """Read the first line of a .snpdiffs file and return its header fields as a one-row DataFrame.

    The header line is tab-delimited; the leading field is skipped and each
    remaining field is a "Key:Value" pair. A 'File_Path' column recording the
    source file path is appended.

    Args:
        snpdiffs_file: Path to the .snpdiffs file.

    Returns:
        pd.DataFrame with a single row; columns are the header keys plus 'File_Path'.
    """
    with open(snpdiffs_file, 'r') as file:
        top_line = file.readline().strip().split('\t')[1:]

    # Split on the FIRST ':' only, so values that themselves contain ':'
    # (file paths, timestamps) are not truncated.
    header_cols = [item.split(':', 1)[0] for item in top_line]
    header_vals = [item.split(':', 1)[1] for item in top_line]

    header_data = pd.DataFrame(columns=header_cols)
    header_data.loc[0] = header_vals
    header_data.loc[:, 'File_Path'] = snpdiffs_file

    return header_data
|
rliterman@0
|
29
|
rliterman@0
|
def processBED(bed_rows, snpdiffs_orientation):
    """Convert raw BED rows from a snpdiffs file into a typed, reference-first DataFrame.

    When the snpdiffs file is reversed (snpdiffs_orientation == -1) the
    query/reference column groups are swapped so the result is always
    reference-oriented. Unaligned rows (start coordinate == ".") are dropped,
    coordinate/length columns are cast to int, and Perc_Iden to float.

    Args:
        bed_rows: List of 11-field alignment rows parsed from the '##' lines.
        snpdiffs_orientation: 1 for forward, -1 for reversed files.

    Returns:
        pd.DataFrame of aligned regions (possibly empty) with typed columns.
    """
    forward_cols = ['Ref_Contig', 'Ref_Start', 'Ref_End', 'Ref_Length', 'Ref_Aligned',
                    'Query_Contig', 'Query_Start', 'Query_End', 'Query_Length', 'Query_Aligned',
                    'Perc_Iden']

    swapped_cols = ['Query_Contig', 'Query_Start', 'Query_End', 'Query_Length', 'Query_Aligned',
                    'Ref_Contig', 'Ref_Start', 'Ref_End', 'Ref_Length', 'Ref_Aligned',
                    'Perc_Iden']

    int_cols = ['Ref_Start', 'Ref_End', 'Ref_Length', 'Ref_Aligned',
                'Query_Start', 'Query_End', 'Query_Length', 'Query_Aligned']

    float_cols = ['Perc_Iden']

    # No alignment rows at all -> empty, but correctly-labelled, frame
    if not bed_rows:
        return pd.DataFrame(columns=forward_cols)

    bed_df = pd.DataFrame(bed_rows, columns=forward_cols)

    # Reversed files carry query data in the reference slots; swap them back
    if snpdiffs_orientation == -1:
        bed_df = bed_df[swapped_cols].copy()
        bed_df.columns = forward_cols

    # Drop unaligned rows, marked by "." in either start coordinate
    aligned_mask = (bed_df['Ref_Start'] != ".") & (bed_df['Query_Start'] != ".")
    covered_bed_df = bed_df[aligned_mask].copy()

    if covered_bed_df.empty:
        return pd.DataFrame(columns=forward_cols)

    # Cast via float first so strings like "12.0" survive the int conversion
    for col in int_cols:
        covered_bed_df.loc[:, col] = covered_bed_df.loc[:, col].astype(float).astype(int)
    for col in float_cols:
        covered_bed_df.loc[:, col] = covered_bed_df.loc[:, col].astype(float)

    return covered_bed_df
|
rliterman@0
|
67
|
rliterman@0
|
def processSNPs(snp_rows, snpdiffs_orientation):
    """Convert raw SNP rows from a snpdiffs file into a typed, reference-first DataFrame.

    When the snpdiffs file is reversed (snpdiffs_orientation == -1) the
    query/reference column groups are swapped so output is always
    reference-oriented. For calls on the reverse strand of the query
    (Query_Direction == '-1') both bases are complemented. Numeric columns
    are cast to int/float, and Query_Direction is dropped from the result.

    Args:
        snp_rows: List of 21-field SNP/indel rows parsed from the snpdiffs file.
        snpdiffs_orientation: 1 for forward, -1 for reversed files.

    Returns:
        pd.DataFrame (possibly empty) restricted to the return column set.
    """
    snp_columns = ['Ref_Contig', 'Start_Ref', 'Ref_Pos',
                   'Query_Contig', 'Start_Query', 'Query_Pos',
                   'Ref_Loc', 'Query_Loc',
                   'Ref_Start', 'Ref_End',
                   'Query_Start', 'Query_End',
                   'Ref_Base', 'Query_Base',
                   'Dist_to_Ref_End', 'Dist_to_Query_End',
                   'Ref_Aligned', 'Query_Aligned',
                   'Query_Direction', 'Perc_Iden', 'Cat']

    reverse_columns = ['Query_Contig', 'Start_Query', 'Query_Pos',
                       'Ref_Contig', 'Start_Ref', 'Ref_Pos',
                       'Query_Loc', 'Ref_Loc',
                       'Query_Start', 'Query_End',
                       'Ref_Start', 'Ref_End',
                       'Query_Base', 'Ref_Base',
                       'Dist_to_Query_End', 'Dist_to_Ref_End',
                       'Query_Aligned', 'Ref_Aligned',
                       'Query_Direction', 'Perc_Iden', 'Cat']

    return_columns = ['Ref_Contig', 'Start_Ref', 'Ref_Pos',
                      'Query_Contig', 'Start_Query', 'Query_Pos',
                      'Ref_Loc', 'Query_Loc',
                      'Ref_Start', 'Ref_End',
                      'Query_Start', 'Query_End',
                      'Ref_Base', 'Query_Base',
                      'Dist_to_Ref_End', 'Dist_to_Query_End',
                      'Ref_Aligned', 'Query_Aligned',
                      'Perc_Iden', 'Cat']

    # NOTE: lowercase inputs complement to UPPERCASE, matching the original table
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
                  'a': 'T', 't': 'A', 'c': 'G', 'g': 'C'}

    # Columns to convert to integer
    int_columns = ['Start_Ref', 'Ref_Pos', 'Start_Query', 'Query_Pos',
                   'Dist_to_Ref_End', 'Dist_to_Query_End', 'Ref_Aligned', 'Query_Aligned']

    # Columns to convert to float
    float_columns = ['Perc_Iden']

    if not snp_rows:
        return pd.DataFrame(columns=return_columns)

    snp_df = pd.DataFrame(snp_rows, columns=snp_columns).copy()

    # Reversed files carry query data in the reference slots; swap them back
    if snpdiffs_orientation == -1:
        snp_df = snp_df[reverse_columns].copy()
        snp_df.columns = snp_columns

    # Complement both bases for reverse-strand query hits; unknown symbols
    # (e.g. '.') pass through unchanged
    on_reverse_strand = snp_df['Query_Direction'] == '-1'
    for base_col in ('Query_Base', 'Ref_Base'):
        snp_df.loc[on_reverse_strand, base_col] = (
            snp_df.loc[on_reverse_strand, base_col].map(lambda b: complement.get(b, b)))

    # Cast via float first so strings like "12.0" survive the int conversion
    for col in int_columns:
        snp_df.loc[:, col] = snp_df.loc[:, col].astype(float).astype(int)
    for col in float_columns:
        snp_df.loc[:, col] = snp_df.loc[:, col].astype(float)

    return snp_df[return_columns]
|
rliterman@0
|
131
|
rliterman@0
|
def swapHeader(header_data):
    """Return a copy of the header DataFrame with Query/Reference fields exchanged.

    Every column name containing 'Reference' trades places with its 'Query'
    counterpart (via a 'temp' placeholder so the two renames don't collide),
    the data is re-selected in that swapped order, and the original labels are
    reapplied — so each original column name now carries the other isolate's
    values.

    Args:
        header_data: One-row header DataFrame from fetchHeaders.

    Returns:
        pd.DataFrame with identical column names/order but swapped values.
    """
    original_cols = list(header_data.columns)
    swapped_cols = [
        name.replace('Reference', 'temp').replace('Query', 'Reference').replace('temp', 'Query')
        for name in original_cols
    ]

    swapped_data = header_data[swapped_cols].copy()
    swapped_data.columns = original_cols

    return swapped_data
|
rliterman@0
|
140
|
rliterman@0
|
def parseSNPDiffs(snpdiffs_file, snpdiffs_orientation):
    """Split a snpdiffs file into its alignment (BED) and SNP sections.

    Line prefixes distinguish the sections: lines starting with '#\\t' are the
    header (skipped here), '##\\t' lines are alignment records, and everything
    else is a SNP record. The two row sets are handed to processBED /
    processSNPs for typing and orientation handling.

    Args:
        snpdiffs_file: Path to the .snpdiffs file.
        snpdiffs_orientation: 1 for forward, -1 for reversed files.

    Returns:
        Tuple (bed_df, snp_df) of processed DataFrames.
    """
    bed_rows = []
    snp_rows = []

    with open(snpdiffs_file, 'r') as file:
        for line in file:
            if line.startswith("#\t"):
                continue  # header line; already consumed by fetchHeaders
            elif line.startswith("##\t"):
                bed_rows.append(line.strip().split("\t")[1:])
            else:
                snp_rows.append(line.strip().split("\t"))

    bed_df = processBED(bed_rows, snpdiffs_orientation)
    snp_df = processSNPs(snp_rows, snpdiffs_orientation)
    return (bed_df, snp_df)
|
rliterman@0
|
160
|
rliterman@0
|
def calculate_total_length(bedtool):
    """Return the total number of bases covered by the intervals in *bedtool*.

    Works with any iterable of objects supporting len() (e.g. a pybedtools
    BedTool of intervals).
    """
    total = 0
    for interval in bedtool:
        total += len(interval)
    return total
|
rliterman@0
|
163
|
rliterman@0
|
def filterSNPs(raw_snp_df, bed_df, log_file, min_len, min_iden, ref_edge, query_edge, density_windows, max_snps):
    """Filter raw MUMmer SNP calls down to the final screening SNP set.

    Every input row is assigned a 'Filter_Cat' label: "SNP" if it survives all
    filters, otherwise a Purged_*/Filtered_* label naming the filter that
    removed it. Counts for each category are appended to log_file. Filters are
    applied in order: alignment length (--min_len), percent identity
    (--min_iden), invalid bases, indels, heterozygous sites, duplicate hits
    (keeping the longest / most identical alignment), SNP density windows, and
    contig-edge proximity (--ref_edge / --query_edge).

    Args:
        raw_snp_df: DataFrame from processSNPs (one row per SNP/indel call).
        bed_df: Alignment BED DataFrame (currently unused; kept for API stability).
        log_file: Path of the per-comparison log to append to.
        min_len: Minimum alignment length.
        min_iden: Minimum alignment percent identity.
        ref_edge: Minimum distance from the reference contig end.
        query_edge: Minimum distance from the query contig end.
        density_windows: Window sizes (bp) for density filtering; [] disables it.
        max_snps: Parallel list; maximum SNPs tolerated per window.

    Returns:
        DataFrame of all input sites sorted by Ref_Loc, with the original 'Cat'
        column replaced by the final filter category.
    """
    # temp_dir is a module-level global set from the CLI
    if temp_dir != "":
        helpers.set_tempdir(temp_dir)

    # Grab raw data
    total_snp_count = raw_snp_df.shape[0]

    # Get unique SNPs relative to the reference genome
    unique_ref_snps = raw_snp_df['Ref_Loc'].unique()
    unique_snp_count = len(unique_ref_snps)

    snp_tally_df = pd.DataFrame()

    with open(log_file, "a+") as log:
        log.write(f"\n\t- Raw SNP + indel count: {total_snp_count}\n")
        log.write(f"\n\t- Unique SNP positions in reference genome: {unique_snp_count}\n")

    # Set all sites to SNP
    raw_snp_df['Filter_Cat'] = "SNP"

    # Filter out SNPs based on --min_len and --min_iden
    reject_length = raw_snp_df.loc[(raw_snp_df['Ref_Aligned'] < min_len) & (raw_snp_df['Perc_Iden'] >= min_iden)].copy()
    if reject_length.shape[0] > 0:
        with open(log_file, "a+") as log:
            log.write(f"\t\t- Purged (Alignment Length): {reject_length.shape[0]}\n")
        reject_length['Filter_Cat'] = "Purged_Length"
        snp_tally_df = pd.concat([snp_tally_df, reject_length]).reset_index(drop=True)

    reject_iden = raw_snp_df.loc[(raw_snp_df['Ref_Aligned'] >= min_len) & (raw_snp_df['Perc_Iden'] < min_iden)].copy()
    if reject_iden.shape[0] > 0:
        with open(log_file, "a+") as log:
            log.write(f"\t\t- Purged (Alignment Identity): {reject_iden.shape[0]}\n")
        reject_iden['Filter_Cat'] = "Purged_Identity"
        snp_tally_df = pd.concat([snp_tally_df, reject_iden]).reset_index(drop=True)

    reject_lenIden = raw_snp_df.loc[(raw_snp_df['Ref_Aligned'] < min_len) & (raw_snp_df['Perc_Iden'] < min_iden)].copy()
    if reject_lenIden.shape[0] > 0:
        with open(log_file, "a+") as log:
            log.write(f"\t\t- Purged (Alignment Length + Identity): {reject_lenIden.shape[0]}\n")
        reject_lenIden['Filter_Cat'] = "Purged_LengthIdentity"
        snp_tally_df = pd.concat([snp_tally_df, reject_lenIden]).reset_index(drop=True)

    pass_filter = raw_snp_df.loc[(raw_snp_df['Ref_Aligned'] >= min_len) & (raw_snp_df['Perc_Iden'] >= min_iden)].copy().reset_index(drop=True)

    # Invalid processing
    reject_invalid = pass_filter[pass_filter['Cat'] == "Invalid"].copy()
    if reject_invalid.shape[0] > 0:
        with open(log_file, "a+") as log:
            log.write(f"\t\t- Purged (Invalid Base): {reject_invalid.shape[0]}\n")
        reject_invalid['Filter_Cat'] = "Purged_Invalid"
        snp_tally_df = pd.concat([snp_tally_df, reject_invalid]).reset_index(drop=True)
        pass_filter = pass_filter.loc[pass_filter['Cat'] != "Invalid"].copy()

    # Indel processing
    reject_indel = pass_filter[pass_filter['Cat'] == "Indel"].copy()
    if reject_indel.shape[0] > 0:
        with open(log_file, "a+") as log:
            log.write(f"\t\t- Purged (Indel): {reject_indel.shape[0]}\n")
        reject_indel['Filter_Cat'] = "Purged_Indel"
        snp_tally_df = pd.concat([snp_tally_df, reject_indel]).reset_index(drop=True)
        pass_filter = pass_filter.loc[pass_filter['Cat'] != "Indel"].copy()

    # Check for heterozygous SNPs (same reference position, >1 query base)
    check_heterozygous = pass_filter.groupby('Ref_Loc').filter(lambda x: x['Query_Base'].nunique() > 1)
    if check_heterozygous.shape[0] > 0:
        reject_heterozygous = pass_filter.loc[pass_filter['Ref_Loc'].isin(check_heterozygous['Ref_Loc'])].copy()
        reject_heterozygous['Filter_Cat'] = "Purged_Heterozygous"
        with open(log_file, "a+") as log:
            log.write(f"\t\t- Purged (Heterozygotes): {reject_heterozygous.shape[0]}\n")
        snp_tally_df = pd.concat([snp_tally_df, reject_heterozygous]).reset_index(drop=True)
        pass_filter = pass_filter.loc[~pass_filter['Ref_Loc'].isin(check_heterozygous['Ref_Loc'])].copy()

    # Check for duplicate SNPs and take the longest, best hit
    check_duplicates = pass_filter.groupby('Ref_Loc').filter(lambda x: x.shape[0] > 1)
    if check_duplicates.shape[0] > 0:
        reject_duplicate = pass_filter.loc[pass_filter['Ref_Loc'].isin(check_duplicates['Ref_Loc'])].copy()
        pass_filter = pass_filter.loc[~pass_filter['Ref_Loc'].isin(check_duplicates['Ref_Loc'])].copy()

        # BUGFIX: the original selected non-best rows with
        # reject_duplicate.apply(lambda x: x in best_snp, axis=1), but
        # `x in best_snp` tests a row Series against best_snp's COLUMN labels
        # (always False), so every duplicate row — including the best hits that
        # were just re-added to pass_filter — was tallied as Purged_Duplicate,
        # double-counting those rows and tripping the final count assertion.
        # Select best vs. non-best rows by index instead.
        sorted_duplicates = reject_duplicate.sort_values(by=['Ref_Aligned', 'Perc_Iden'], ascending=[False, False])
        best_snp = sorted_duplicates.drop_duplicates(subset='Ref_Loc', keep='first')
        pass_filter = pd.concat([pass_filter, best_snp]).reset_index(drop=True)

        dup_snps = sorted_duplicates.loc[~sorted_duplicates.index.isin(best_snp.index)].copy()
        dup_snps['Filter_Cat'] = "Purged_Duplicate"

        snp_tally_df = pd.concat([snp_tally_df, dup_snps]).reset_index(drop=True)

        with open(log_file, "a+") as log:
            log.write(f"\t\t- Purged (Duplicates): {dup_snps.shape[0]}\n")

    # Assert that Ref_Loc and Query_Loc are unique in pass_filter
    helpers.cleanup(verbose=False, remove_all=False)
    assert pass_filter['Ref_Loc'].nunique() == pass_filter.shape[0]
    assert pass_filter['Query_Loc'].nunique() == pass_filter.shape[0]

    # Density filtering: drop SNPs in any window holding more than max_snps
    density_locs = []
    ref_locs = pass_filter['Ref_Loc'].tolist()

    if len(density_windows) == 0:
        with open(log_file, "a+") as log:
            log.write("\n\t- Density filtering disabled...\n")
    elif len(ref_locs) > 0:
        # Ref_Loc is "contig/pos"; build a 1-bp BED record per SNP
        density_df = pd.DataFrame([item.split('/') for item in ref_locs], columns=['Ref_Contig', 'Ref_End'])
        density_df['Ref_Start'] = density_df['Ref_End'].astype(float).astype(int) - 1
        density_bed = BedTool.from_dataframe(density_df[['Ref_Contig', 'Ref_Start', 'Ref_End']])

        # For each density window, remove all SNPs that fall in a window with > max_snps
        for i in range(0, len(density_windows)):
            window_df = density_bed.window(density_bed, c=True, w=density_windows[i]).to_dataframe()
            problematic_windows = window_df[window_df['name'] > max_snps[i]].copy()
            if not problematic_windows.empty:
                temp_locs = []
                for _, row in problematic_windows.iterrows():
                    purge_window_df = window_df[window_df['chrom'] == row['chrom']].copy()
                    purge_window_df['Dist'] = abs(purge_window_df['end'] - row['end'])
                    # Take the row['name'] SNPs nearest the offending window center
                    window_snps = purge_window_df.sort_values(by=['Dist'], ascending=True).head(row['name'])
                    temp_locs = temp_locs + ["/".join([str(x[0]), str(x[1])]) for x in list(zip(window_snps.chrom, window_snps.end))]
                density_locs.extend(list(set(temp_locs)))

    density_locs = list(set(density_locs))
    reject_density = pass_filter[pass_filter['Ref_Loc'].isin(density_locs)].copy()

    if reject_density.shape[0] > 0:
        with open(log_file, "a+") as log:
            log.write(f"\t\t- Purged (Density): {reject_density.shape[0]}\n")
        reject_density['Filter_Cat'] = "Purged_Density"
        snp_tally_df = pd.concat([snp_tally_df, reject_density]).reset_index(drop=True)
        pass_filter = pass_filter[~pass_filter['Ref_Loc'].isin(density_locs)].copy()

    # Edge filtering: SNPs too close to a contig end on either genome
    reject_query_edge = pass_filter[(pass_filter['Dist_to_Query_End'] < query_edge) & (pass_filter['Dist_to_Ref_End'] >= ref_edge)].copy()
    reject_ref_edge = pass_filter[(pass_filter['Dist_to_Ref_End'] < ref_edge) & (pass_filter['Dist_to_Query_End'] >= query_edge)].copy()
    reject_both_edge = pass_filter[(pass_filter['Dist_to_Query_End'] < query_edge) & (pass_filter['Dist_to_Ref_End'] < ref_edge)].copy()

    if reject_query_edge.shape[0] > 0:
        with open(log_file, "a+") as log:
            log.write(f"\t\t- Purged (Query Edge): {reject_query_edge.shape[0]}\n")
        reject_query_edge['Filter_Cat'] = "Filtered_Query_Edge"
        snp_tally_df = pd.concat([snp_tally_df, reject_query_edge]).reset_index(drop=True)

    if reject_ref_edge.shape[0] > 0:
        with open(log_file, "a+") as log:
            log.write(f"\t\t- Purged (Ref Edge): {reject_ref_edge.shape[0]}\n")
        reject_ref_edge['Filter_Cat'] = "Filtered_Ref_Edge"
        snp_tally_df = pd.concat([snp_tally_df, reject_ref_edge]).reset_index(drop=True)

    if reject_both_edge.shape[0] > 0:
        with open(log_file, "a+") as log:
            log.write(f"\t\t- Purged (Both Edge): {reject_both_edge.shape[0]}\n")
        reject_both_edge['Filter_Cat'] = "Filtered_Both_Edge"
        snp_tally_df = pd.concat([snp_tally_df, reject_both_edge]).reset_index(drop=True)

    pass_filter = pass_filter[(pass_filter['Dist_to_Query_End'] >= query_edge) & (pass_filter['Dist_to_Ref_End'] >= ref_edge)].copy()

    helpers.cleanup(verbose=False, remove_all=False)

    # Every input row must be accounted for exactly once
    assert snp_tally_df.shape[0] + pass_filter.shape[0] == total_snp_count
    return_df = pd.concat([pass_filter, snp_tally_df]).reset_index(drop=True).sort_values(by=['Ref_Loc'])

    return return_df.drop(columns=['Cat']).rename({'Filter_Cat': 'Cat'}, axis=1)
|
rliterman@0
|
324
|
rliterman@0
|
325
|
rliterman@0
|
def screenSNPDiffs(snpdiffs_file, trim_name, min_cov, min_len, min_iden, ref_edge, query_edge, density_windows, max_snps, ref_ids):
    """Screen a single .snpdiffs comparison and return its summary row.

    Reads the snpdiffs header, orients the comparison so that an isolate named
    in ref_ids is treated as the reference, applies coverage and SNP-count
    short-circuits, then parses and filters the alignment/SNP data via
    filterSNPs. Progress is written to a per-comparison log under the
    module-level log_dir.

    Args:
        snpdiffs_file: Path to the .snpdiffs file.
        trim_name: Substring trimmed from isolate IDs in the header.
        min_cov: Minimum % of the reference genome that must be aligned.
        min_len: Minimum alignment length for filtering.
        min_iden: Minimum alignment percent identity for filtering.
        ref_edge: Minimum SNP distance from reference contig ends.
        query_edge: Minimum SNP distance from query contig ends.
        density_windows: Density-filter window sizes; [] disables it.
        max_snps: Parallel list of per-window SNP caps.
        ref_ids: Isolate IDs to treat as references ([] = use file orientation).

    Returns:
        List of stringified summary fields: IDs, screen category, SNP count,
        coverage percentages, contig/base counts, per-filter tallies, and
        kmer/MUMmer statistics.
    """
    # FIX: declare the module-level flag. The original assigned
    # `run_failed = True` without this declaration, which only created a
    # dead function-local; the module global was never updated.
    global run_failed

    if temp_dir != "":
        helpers.set_tempdir(temp_dir)

    screen_start_time = time.time()

    # Set CSP2 variables to NA (they stay "NA" if the screen halts early)
    csp2_screen_snps = purged_length = purged_identity = purged_invalid = purged_indel = purged_lengthIdentity = purged_duplicate = purged_het = purged_density = filtered_ref_edge = filtered_query_edge = filtered_both_edge = "NA"

    # Ensure snpdiffs file exists
    if not os.path.exists(snpdiffs_file) or not snpdiffs_file.endswith('.snpdiffs'):
        run_failed = True
        sys.exit(f"Invalid snpdiffs file provided: {snpdiffs_file}")

    # Ensure header can be read in
    try:
        header_data = fetchHeaders(snpdiffs_file)
        header_query = header_data['Query_ID'][0].replace(trim_name, '')
        header_ref = header_data['Reference_ID'][0].replace(trim_name, '')
    except:
        run_failed = True
        sys.exit(f"Error reading headers from snpdiffs file: {snpdiffs_file}")

    # Check snpdiffs orientation against the requested reference IDs
    if ref_ids == []:
        snpdiffs_orientation = 1
        query_id = header_query
        reference_id = header_ref
    elif (header_query not in ref_ids) and (header_ref in ref_ids):
        snpdiffs_orientation = 1
        query_id = header_query
        reference_id = header_ref
    elif (header_query in ref_ids) and (header_ref not in ref_ids):
        snpdiffs_orientation = -1
        query_id = header_ref
        reference_id = header_query
        header_data = swapHeader(header_data)
    else:
        # Neither (or both) isolates are references; flagged as 2 and logged below
        snpdiffs_orientation = 2
        query_id = header_query
        reference_id = header_ref

    # Establish log file
    log_file = f"{log_dir}/{query_id}__vs__{reference_id}.log"
    with open(log_file, "w+") as log:
        log.write("Screening Analysis\n")
        log.write(f"Query Isolate: {query_id}\n")
        log.write(f"Reference Isolate: {reference_id}\n")
        log.write(str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + "\n")
        log.write("-------------------------------------------------------\n\n")
        if ref_ids == []:
            log.write("\t- No explicit references set, processing in forward orientation\n")
            log.write("-------------------------------------------------------\n\n")
        elif snpdiffs_orientation == 1:
            log.write("\t- SNPDiffs file is in the forward orientation\n")
            log.write("-------------------------------------------------------\n\n")
        elif snpdiffs_orientation == -1:
            log.write("\t- SNPDiffs file is in the reverse orientation\n")
            log.write("-------------------------------------------------------\n\n")
        else:
            snpdiffs_orientation = 1
            # (log-message typo fixed: "not contain" -> "does not contain")
            log.write("\t- SNPDiffs file does not contain a reference and non-reference sample, processing in forward orientation\n")
            log.write("-------------------------------------------------------\n\n")

    # Set variables from header data
    raw_snps = int(header_data['SNPs'][0])
    raw_indels = int(header_data['Indels'][0])
    raw_invalid = int(header_data['Invalid'][0])

    kmer_similarity = float(header_data['Kmer_Similarity'][0])
    shared_kmers = int(header_data['Shared_Kmers'][0])
    query_unique_kmers = int(header_data['Query_Unique_Kmers'][0])
    reference_unique_kmers = int(header_data['Reference_Unique_Kmers'][0])
    mummer_gsnps = int(header_data['gSNPs'][0])
    mummer_gindels = int(header_data['gIndels'][0])

    query_bases = int(header_data['Query_Assembly_Bases'][0])
    reference_bases = int(header_data['Reference_Assembly_Bases'][0])

    query_contigs = int(header_data['Query_Contig_Count'][0])
    reference_contigs = int(header_data['Reference_Contig_Count'][0])

    raw_query_percent_aligned = float(header_data['Query_Percent_Aligned'][0])
    raw_ref_percent_aligned = float(header_data['Reference_Percent_Aligned'][0])

    # If the reference is not covered by at least min_cov, STOP
    if raw_ref_percent_aligned < min_cov:
        query_percent_aligned = raw_query_percent_aligned
        reference_percent_aligned = raw_ref_percent_aligned
        screen_category = "Low_Coverage"
        with open(log_file, "a+") as log:
            log.write(f"\t- Reference genome coverage: {raw_ref_percent_aligned}% \n")
            log.write(f"\t- Query covers less than --min_cov ({min_cov}%)...Screen halted...\n")
            log.write("-------------------------------------------------------\n\n")

    # Too many raw differences for a meaningful screen
    elif raw_snps + raw_indels + raw_invalid > 10000:
        query_percent_aligned = raw_query_percent_aligned
        reference_percent_aligned = raw_ref_percent_aligned
        screen_category = "SNP_Cutoff"
        with open(log_file, "a+") as log:
            log.write(f"\t- {raw_snps} detected...\n")
            log.write("\t- > 10,000 SNPs, indels, or invalid sites detected by MUMmer...Screen halted...\n")
            log.write("-------------------------------------------------------\n\n")

    else:

        ##### 02: Read in BED/SNP data #####
        with open(log_file, "a+") as log:
            log.write("Step 1: Reading in snpdiffs BED/SNP data...")
        try:
            bed_df, snp_df = parseSNPDiffs(snpdiffs_file, snpdiffs_orientation)
            with open(log_file, "a+") as log:
                log.write("Done!\n")
                log.write("-------------------------------------------------------\n\n")
        except:
            with open(log_file, "a+") as log:
                log.write(f"Error reading BED/SNP data from file: {snpdiffs_file}")
            run_failed = True
            sys.exit(f"Error reading BED/SNP data from file: {snpdiffs_file}")

        ##### 03: Filter genome overlaps #####
        with open(log_file, "a+") as log:
            log.write("Step 2: Filtering for short overlaps and low percent identity...")

        good_bed_df = bed_df[(bed_df['Ref_Aligned'] >= min_len) & (bed_df['Perc_Iden'] >= min_iden)].copy()

        if good_bed_df.shape[0] == 0:
            # FIX: set the coverage stats here as the other halted branches do;
            # otherwise the final return raised NameError on query_percent_aligned.
            query_percent_aligned = raw_query_percent_aligned
            reference_percent_aligned = raw_ref_percent_aligned
            screen_category = "Low_Quality_Coverage"
            with open(log_file, "a+") as log:
                log.write(f"\n\t- After filtering based on --min_len ({min_len}) and --min_iden ({min_iden}) , no valid alignments remain...Screen halted...\n")
                log.write("-------------------------------------------------------\n\n")

        else:
            # Create a BED file for alignments that pass basic QC
            good_query_bed_df = good_bed_df[['Query_Contig', 'Query_Start', 'Query_End']].copy()
            good_reference_bed_df = good_bed_df[['Ref_Contig', 'Ref_Start', 'Ref_End']].copy()

            # Merge overlapping intervals so shared bases are counted once
            good_query_aligned = calculate_total_length(BedTool.from_dataframe(good_query_bed_df).sort().merge())
            good_reference_aligned = calculate_total_length(BedTool.from_dataframe(good_reference_bed_df).sort().merge())

            query_percent_aligned = (good_query_aligned / query_bases) * 100
            reference_percent_aligned = (good_reference_aligned / reference_bases) * 100

            if reference_percent_aligned < min_cov:
                screen_category = "Low_Quality_Coverage"
                with open(log_file, "a+") as log:
                    log.write(f"\n\t- Raw reference genome coverage was {raw_ref_percent_aligned}% \n")
                    log.write(f"\t- After filtering based on --min_len ({min_len}) and --min_iden ({min_iden}), reference genome coverage was {reference_percent_aligned:.2f}% \n")
                    log.write(f"\t- Query covers less than --min_cov ({min_cov}%) of reference after filtering...Screen halted...\n")
                    log.write("-------------------------------------------------------\n\n")

            else:
                screen_category = "Pass"
                with open(log_file, "a+") as log:
                    log.write("Done!\n")
                    log.write(f"\t- Raw reference genome coverage was {raw_ref_percent_aligned}% \n")
                    log.write(f"\t- After filtering based on --min_len ({min_len}) and --min_iden ({min_iden}), reference genome coverage was {reference_percent_aligned:.2f}% \n")
                    log.write("-------------------------------------------------------\n\n")

                # Filter SNPs
                with open(log_file, "a+") as log:
                    log.write("Step 3: Filtering SNPs to get final SNP distances...")

                if raw_snps == 0:
                    csp2_screen_snps = purged_length = purged_identity = purged_lengthIdentity = purged_indel = purged_invalid = purged_duplicate = purged_het = purged_density = filtered_ref_edge = filtered_query_edge = filtered_both_edge = 0
                    with open(log_file, "a+") as log:
                        log.write("Done!\n")
                        log.write("\t- No SNPs detected in MUMmer output, no filtering required\n")
                        log.write("-------------------------------------------------------\n\n")

                else:
                    filtered_snp_df = filterSNPs(snp_df, bed_df, log_file, min_len, min_iden, ref_edge, query_edge, density_windows, max_snps)

                    # Write filtered SNP data to file
                    snp_file = log_file.replace(".log", "_SNPs.tsv")
                    filtered_snp_df.to_csv(snp_file, sep="\t", index=False)

                    csp2_screen_snps = filtered_snp_df[filtered_snp_df.Cat == "SNP"].shape[0]

                    # Per-filter tallies for the summary row
                    purged_length = filtered_snp_df[filtered_snp_df.Cat == "Purged_Length"].shape[0]
                    purged_identity = filtered_snp_df[filtered_snp_df.Cat == "Purged_Identity"].shape[0]
                    purged_lengthIdentity = filtered_snp_df[filtered_snp_df.Cat == "Purged_LengthIdentity"].shape[0]
                    purged_invalid = filtered_snp_df[filtered_snp_df.Cat == "Purged_Invalid"].shape[0]
                    purged_indel = filtered_snp_df[filtered_snp_df.Cat == "Purged_Indel"].shape[0]
                    purged_het = filtered_snp_df[filtered_snp_df.Cat == "Purged_Heterozygous"].shape[0]
                    purged_duplicate = filtered_snp_df[filtered_snp_df.Cat == "Purged_Duplicate"].shape[0]
                    purged_density = filtered_snp_df[filtered_snp_df.Cat == "Purged_Density"].shape[0]
                    filtered_query_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Query_Edge"].shape[0]
                    filtered_ref_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Ref_Edge"].shape[0]
                    filtered_both_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Both_Edge"].shape[0]

                    with open(log_file, "a+") as log:
                        log.write("Done!\n")
                        log.write(f"\t- {csp2_screen_snps} SNPs detected between {query_id} and {reference_id} after filtering\n")
                        log.write(f"\t- SNP data saved to {snp_file}\n")
                        log.write("-------------------------------------------------------\n\n")

    screen_end_time = time.time()
    helpers.cleanup(verbose=False, remove_all=False)

    with open(log_file, "a+") as log:
        log.write(f"Screening Time: {screen_end_time - screen_start_time:.2f} seconds\n")

    # Clean up pybedtools temp
    helpers.cleanup(verbose=False, remove_all=False)

    return [str(item) for item in [query_id, reference_id, screen_category, csp2_screen_snps,
                                   f"{query_percent_aligned:.2f}", f"{reference_percent_aligned:.2f}",
                                   query_contigs, query_bases, reference_contigs, reference_bases,
                                   raw_snps, purged_length, purged_identity, purged_lengthIdentity, purged_invalid, purged_indel, purged_duplicate, purged_het, purged_density,
                                   filtered_query_edge, filtered_ref_edge, filtered_both_edge,
                                   kmer_similarity, shared_kmers, query_unique_kmers, reference_unique_kmers,
                                   mummer_gsnps, mummer_gindels]]
|
rliterman@0
|
543
|
rliterman@0
|
# ----------------------------------------------------------------------------
# Script entry point: parse CLI arguments, validate inputs, screen each
# .snpdiffs file in a separate process, and write one combined TSV summary.
# ----------------------------------------------------------------------------

# Read in arguments
global run_failed  # shared failure flag; worker code elsewhere in this file also sets it
run_failed = False

parser = argparse.ArgumentParser()
parser.add_argument("--snpdiffs_file", help="Path to the file containing SNP diffs")
parser.add_argument("--log_dir", help="Path to the log directory")
parser.add_argument("--min_cov", default=85, type=float, help="Minimum coverage")
parser.add_argument("--min_len", default=500, type=int, help="Minimum length")
parser.add_argument("--min_iden", default=99, type=float, help="Minimum identity")
parser.add_argument("--ref_edge", default=150, type=int, help="Reference edge")
parser.add_argument("--query_edge", default=150, type=int, help="Query edge")
parser.add_argument("--density_windows", default="1000,125,15", help="Density windows (comma-separated)")
parser.add_argument("--max_snps", default="3,2,1", help="Maximum SNPs (comma-separated)")
parser.add_argument('--trim_name', type=str, default="", help='trim name')
parser.add_argument("--output_file", help="Output file")
parser.add_argument("--ref_id", help="Reference IDs file")
parser.add_argument("--tmp_dir", default="", help="TMP dir")

args = parser.parse_args()

# Read the list of .snpdiffs paths. A context manager closes the handle
# promptly (the original left the file object open); blank lines are dropped.
with open(args.snpdiffs_file, 'r') as snpdiffs_fh:
    snpdiffs_list = [line.strip() for line in snpdiffs_fh if line.strip()]

for snpdiffs_file in snpdiffs_list:
    if not os.path.exists(snpdiffs_file):
        run_failed = True
        sys.exit("Error: File does not exist: " + snpdiffs_file)

# De-duplicate only after validation so every listed path was checked.
snpdiffs_list = list(set(snpdiffs_list))

log_dir = os.path.normpath(os.path.abspath(args.log_dir))

min_cov = args.min_cov
min_len = args.min_len
min_iden = args.min_iden

ref_edge = args.ref_edge
query_edge = args.query_edge

input_density = args.density_windows
input_maxsnps = args.max_snps

# "--density_windows 0" disables SNP-density filtering entirely.
if input_density == "0":
    density_windows = []
    max_snps = []
else:
    density_windows = [int(x) for x in args.density_windows.split(",")]
    max_snps = [int(x) for x in args.max_snps.split(",")]
    # Explicit check instead of "assert" (asserts are stripped under -O).
    if len(density_windows) != len(max_snps):
        run_failed = True
        sys.exit("Error: --density_windows and --max_snps must have the same number of comma-separated entries")

trim_name = args.trim_name

output_file = os.path.abspath(args.output_file)

# An empty --ref_id file means "no reference ID filtering".
if os.stat(args.ref_id).st_size == 0:
    ref_ids = []
else:
    with open(args.ref_id, 'r') as ref_fh:
        ref_ids = [line.strip() for line in ref_fh]

global temp_dir
if args.tmp_dir != "":
    # Give this run a unique pybedtools scratch directory under --tmp_dir.
    random_temp_id = str(uuid.uuid4())
    temp_dir = os.path.join(os.path.normpath(os.path.abspath(args.tmp_dir)), random_temp_id)
    try:
        os.mkdir(temp_dir)
        helpers.set_tempdir(temp_dir)
    except OSError as e:
        run_failed = True
        print(f"Error: Failed to create directory '{temp_dir}': {e}")
else:
    temp_dir = ""

try:
    # CPU-bound screening: fan out one process per .snpdiffs file.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = [executor.submit(screenSNPDiffs, snp_diff_file, trim_name, min_cov, min_len, min_iden, ref_edge, query_edge, density_windows, max_snps, ref_ids) for snp_diff_file in snpdiffs_list]

    # Clean up pybedtools temp
    helpers.cleanup(verbose=False, remove_all=False)

    # Combine per-comparison result rows into a single summary dataframe.
    output_columns = ['Query_ID', 'Reference_ID', 'Screen_Category', 'CSP2_Screen_SNPs',
                      'Query_Percent_Aligned', 'Reference_Percent_Aligned',
                      'Query_Contigs', 'Query_Bases', 'Reference_Contigs', 'Reference_Bases',
                      'Raw_SNPs', 'Purged_Length', 'Purged_Identity', 'Purged_LengthIdentity', 'Purged_Invalid', 'Purged_Indel', 'Purged_Duplicate', 'Purged_Het', 'Purged_Density',
                      'Filtered_Query_Edge', 'Filtered_Ref_Edge', 'Filtered_Both_Edge',
                      'Kmer_Similarity', 'Shared_Kmers', 'Query_Unique_Kmers', 'Reference_Unique_Kmers',
                      'MUMmer_gSNPs', 'MUMmer_gIndels']

    results_df = pd.DataFrame([item.result() for item in results], columns=output_columns)
    results_df.to_csv(output_file, sep="\t", index=False)
except Exception:
    # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt still
    # propagate; any worker failure is reported and flags the run as failed.
    run_failed = True
    print("Exception occurred:\n", traceback.format_exc())
finally:
    helpers.cleanup(verbose=False, remove_all=False)
    # Only remove the scratch directory if it was actually created
    # (os.mkdir above may have failed without aborting the run).
    if temp_dir != "" and os.path.isdir(temp_dir):
        shutil.rmtree(temp_dir)
    if run_failed:
        sys.exit(1)