Mercurial > repos > rliterman > csp2
comparison CSP2/bin/compileSNPResults.py @ 0:01431fa12065
"planemo upload"
author | rliterman |
---|---|
date | Mon, 02 Dec 2024 10:40:55 -0500 |
parents | |
children | 93393808f415 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:01431fa12065 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import sys | |
4 import os | |
5 import glob | |
6 import pandas as pd | |
7 from itertools import chain | |
8 import scipy.stats | |
9 import numpy as np | |
10 import datetime | |
11 import time | |
12 import argparse | |
13 | |
def getWarnings(df):
    """Assign a QC category to each row of *df* based on its Z-score.

    *df* must contain 'Measure' and 'Zscore' columns. The set of measures
    present selects which rule family applies (the branch order mirrors the
    original call sites: each caller passes one homogeneous measure family):
      - "high is bad" (e.g. Contig_Count, Preserved_Diff, *_Distance_StdDev):
        Warning when 2.5 <= Z < 3, Failure when Z >= 3.
      - "low is bad" (e.g. N50, Compare_Aligned, *_SNPs_Cocalled):
        Warning when -3 < Z <= -2.5, Failure when Z <= -3.
      - "either direction is bad" (Assembly_Bases): the high rule on |Z|.
    Rows with a missing Z-score yield np.nan.

    Returns a list of 'Pass'/'Warning'/'Failure'/np.nan, one entry per row,
    in row order. Exits the process on an unrecognized measure.
    """
    def _high_bad(z):
        # Large positive deviation from the cohort is suspicious.
        if 2.5 <= z < 3:
            return "Warning"
        if z >= 3:
            return "Failure"
        return "Pass"

    def _low_bad(z):
        # Large negative deviation from the cohort is suspicious.
        if -3 < z <= -2.5:
            return "Warning"
        if z <= -3:
            return "Failure"
        return "Pass"

    df_measures = list(set(df['Measure']))

    if 'Preserved_Diff' in df_measures:
        def _classify(measure, z):
            return _high_bad(z)
    elif 'Contig_Count' in df_measures:
        # Assembly stats: fragmented assemblies show high contig/L-metrics and
        # low N-metrics; total assembly size is suspicious in either direction.
        def _classify(measure, z):
            if measure in ("Contig_Count", "L50", "L90"):
                return _high_bad(z)
            if measure == "Assembly_Bases":
                return _high_bad(abs(z))
            if measure in ("N50", "N90"):
                return _low_bad(z)
            sys.exit(f"Unrecognized assembly measure: {measure}")
    elif ('Raw_Distance_StdDev' in df_measures) or ('Preserved_Distance_StdDev' in df_measures):
        def _classify(measure, z):
            return _high_bad(z)
    elif 'Unique_Kmers' in df_measures:
        # Alignment stats: high divergence metrics are bad, low similarity
        # metrics are bad.
        def _classify(measure, z):
            if measure in ("Align_Percent_Diff", "Unique_Kmers", "gIndels", "Missing_Kmers"):
                return _high_bad(z)
            if measure in ("Compare_Aligned", "Kmer_Similarity", "Self_Aligned", "Median_Alignment_Length"):
                return _low_bad(z)
            sys.exit(f"Unrecognized alignment measure: {measure}")
    elif ('SNPs_Cocalled' in df_measures) or ('Raw_SNPs_Cocalled' in df_measures) or ('Preserved_SNPs_Cocalled' in df_measures):
        def _classify(measure, z):
            return _low_bad(z)
    else:
        sys.exit(f"Unrecognized measure set: {df_measures}")

    # NaN check first so rows with missing scores never hard-exit on an
    # unknown measure (matches the original per-row ordering).
    return [np.nan if pd.isna(row['Zscore']) else _classify(row['Measure'], row['Zscore'])
            for _, row in df.iterrows()]
100 | |
start_time = time.time()

# Get args
parser = argparse.ArgumentParser(description='CSP2 SNP Pipeline Compiler')
parser.add_argument('--snp_dirs_file', type=str, help='Path to the file containing SNP directories')
parser.add_argument('--output_directory', type=str, help='Path to the output directory')
parser.add_argument('--isolate_data_file', type=str, help='Path to the isolate data file')
parser.add_argument('--mummer_data_file', type=str, help='Path to the MUMmer data file')
args = parser.parse_args()

# Read the SNP directory list; context manager closes the handle
# (the original `open(...)` inside the comprehension leaked it).
with open(args.snp_dirs_file, 'r') as snp_dirs_fh:
    snp_dirs = [line.strip() for line in snp_dirs_fh]

# One snp_distance_pairwise.tsv / Reference_Screening.tsv expected per SNP directory.
raw_snp_distance_files = list(chain.from_iterable([glob.glob(snp_dir + '/snp_distance_pairwise.tsv') for snp_dir in snp_dirs]))
screening_files = list(chain.from_iterable([glob.glob(snp_dir + '/Reference_Screening.tsv') for snp_dir in snp_dirs]))
114 | |
# Set paths for every output artifact under the requested output directory.
output_directory = args.output_directory
log_file = f"{output_directory}/Compilation.log"
mean_isolate_file = f"{output_directory}/Mean_Assembly_Stats.tsv"
isolate_assembly_stats_file = f"{output_directory}/Isolate_Assembly_Stats.tsv"
align_stats_file = f"{output_directory}/Isolate_Alignment_Stats.tsv"
ref_mean_summary_file = f"{output_directory}/Align_Summary_by_Reference.tsv"
snp_comparison_file = f"{output_directory}/SNP_Distance_Summary.tsv"
qc_file = f"{output_directory}/QC_Warnings_Failures.tsv"

# Start a fresh log; bail out early when no SNP pipeline output exists.
with open(log_file,"w+") as log:
    log.write("CSP2 SNP Pipeline Compiler\n")
    log.write(str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))+"\n")
    log.write("-------------------------------------------------------\n\n")
    if len(raw_snp_distance_files) == 0:
        # Grammar fix in the log message: "cannot detected" -> "cannot detect"
        log.write("\t- CSP2 SNP Pipeline Compiler cannot detect any SNP pipeline output files\n")
        log.write("\t- Compiler stopping...\n")
        sys.exit(0)
133 | |
# Load per-isolate metadata and the raw MUMmer comparison table.
isolate_data = pd.read_csv(args.isolate_data_file, sep="\t")
raw_isolate_count = len(isolate_data.index)

mummer_data = pd.read_csv(args.mummer_data_file, sep="\t")

# Get reference IDs
reference_ids = list(set(isolate_data[isolate_data['Isolate_Type'] == "Reference"]['Isolate_ID']))
raw_ref_count = len(reference_ids)

def _load_distance_frames(distance_files):
    # Each distance file lives in a directory named after its reference;
    # tag every row with that directory name.
    loaded = []
    for distance_file in distance_files:
        frame = pd.read_csv(distance_file, sep='\t')
        frame = frame.assign(Reference_ID=os.path.basename(os.path.dirname(distance_file)))
        loaded.append(frame)
    combined = pd.concat(loaded)
    # Order-independent pair key "A;B" so (A,B) and (B,A) collapse together.
    combined['Comparison'] = combined.apply(
        lambda row: ';'.join(sorted([str(row['Query_1']), str(row['Query_2'])])), axis=1)
    return combined

raw_snp_distance_df = _load_distance_frames(raw_snp_distance_files)

# Check for preserved data
preserved_snp_distance_files = list(chain.from_iterable([glob.glob(snp_dir + '/snp_distance_pairwise_preserved.tsv') for snp_dir in snp_dirs]))
has_preserved = len(preserved_snp_distance_files) > 0
if has_preserved:
    preserved_snp_distance_df = _load_distance_frames(preserved_snp_distance_files)

screening_df = pd.concat([pd.read_csv(file, sep='\t').assign(Reference_ID=os.path.basename(os.path.dirname(file))) for file in screening_files])

# Every isolate that appears on either side of any raw comparison.
snp_isolates = list(set(raw_snp_distance_df['Query_1'].tolist() + raw_snp_distance_df['Query_2'].tolist()))
158 | |
with open(log_file,"a+") as log:
    log.write(f"- Detected SNP distance data for {len(snp_isolates)} isolates out of {raw_isolate_count} total isolates analyzed\n")
    # Guard clause: nothing meaningful can be compiled from <= 2 isolates.
    if len(snp_isolates) <= 2:
        log.write("\t- CSP2 SNP Pipeline Compiler cannot do much with 2 or fewer isolates\n")
        log.write("\t- Compiler stopping...\n")
        sys.exit(0)

    # Collect query-reference pairs that failed reference screening;
    # failed_comparisons is consumed later when pruning the MUMmer data.
    failed_combinations = screening_df.loc[screening_df['Screen_Category'] != "Pass"]
    failed_comparisons = []
    if not failed_combinations.empty:
        reference_query_dict = failed_combinations.groupby('Reference_ID')['Query_ID'].apply(list).to_dict()
        for _, fail_row in failed_combinations.iterrows():
            failed_comparisons.append(";".join(sorted([fail_row['Query_ID'], fail_row['Reference_ID']])))
        log.write("\n- The following query-reference alignments did not pass QC\n")
        for ref_id, query_ids in reference_query_dict.items():
            log.write(f"\nReference {ref_id}\n{', '.join(map(str, query_ids))}\n")

# Prune isolate data down to isolates that actually have SNP distances.
isolate_data = isolate_data.loc[isolate_data['Isolate_ID'].isin(snp_isolates)]
reference_ids = [x for x in reference_ids if x in snp_isolates]
ref_count = len(reference_ids)
with open(log_file,"a+") as log:
    log.write(f"- Detected SNP distance data for {ref_count} reference isolates out of {raw_ref_count} total reference isolates analyzed\n")
    for ref in reference_ids:
        log.write(f"\t- {ref}\n")
185 | |
# Prune MUMmer data
# Pair key is order-independent: "A;B" for the sorted (Query_ID, Reference_ID) pair.
mummer_data['Comparison'] = mummer_data.apply(lambda row: ';'.join(sorted([str(row['Query_ID']), str(row['Reference_ID'])])), axis=1)
# Drop comparisons that failed reference screening; keep only columns used downstream.
mummer_data = mummer_data.loc[~mummer_data['Comparison'].isin(failed_comparisons), ['SNPDiffs_File','Query_ID','Reference_ID','Comparison','Reference_Percent_Aligned','Query_Percent_Aligned','Median_Alignment_Length','Kmer_Similarity','Reference_Unique_Kmers','Query_Unique_Kmers','gIndels']]
# Percent difference between the two alignment fractions, relative to the smaller one.
max_align_values = np.maximum(mummer_data['Reference_Percent_Aligned'], mummer_data['Query_Percent_Aligned'])
min_align_values = np.minimum(mummer_data['Reference_Percent_Aligned'], mummer_data['Query_Percent_Aligned'])
# NOTE(review): divides by the smaller percent aligned — yields inf/NaN if it is 0;
# presumably screening guarantees a nonzero alignment — confirm upstream.
mummer_data['Align_Percent_Diff'] = 100*((max_align_values - min_align_values)/min_align_values)
192 | |
# Run basic assembly stats
# Long format: one row per (isolate, assembly measure).
isolate_stats = isolate_data.melt(id_vars=['Isolate_ID', 'Isolate_Type'], value_vars = ['Contig_Count','Assembly_Bases','N50','N90','L50','L90'],value_name='Value',var_name = "Measure")
# Z-score each measure across all isolates, then flag outliers via getWarnings.
isolate_stats['Zscore'] = isolate_stats.groupby('Measure')['Value'].transform(scipy.stats.zscore).astype('float').round(3)
isolate_stats['QC'] = getWarnings(isolate_stats)
isolate_stats['Value'] = isolate_stats['Value'].astype('int')

# Reformat for final TSV
# Min/Max/StdDev are undefined for a single observation per isolate; left as NaN.
isolate_stats['Min'] = np.nan
isolate_stats['Max'] = np.nan
isolate_stats['StdDev'] = np.nan
isolate_stats = isolate_stats[['Isolate_ID','Isolate_Type','Measure','Min','Value','Max','StdDev','Zscore','QC']].rename(columns = {'Value':'Mean'})
isolate_stats['Count'] = 1

# Get mean values
# Cross-isolate summary per assembly measure.
isolate_mean_df = isolate_stats.groupby(by=['Measure'])['Mean'].agg(Min = 'min',Mean = "mean",Max = 'max',StdDev = 'std',Count = 'count')
isolate_mean_df['Mean'] = isolate_mean_df['Mean'].astype("int")  # NOTE(review): int cast truncates, not rounds
isolate_mean_df['StdDev'] = isolate_mean_df['StdDev'].astype("float").round(3)
with open(log_file,"a+") as log:
    log.write("- Read in and processed isolate data\n\n")
    for index, row in isolate_mean_df.iterrows():
        log.write(f"{index}:\tMin: {row['Min']}\tMean: {row['Mean']}\tMax: {row['Max']}\tStdDev: {row['StdDev']}\n")
    log.write("\n")
215 | |
# Run basic alignment stats
# Build one row per (isolate, comparison), orienting each MUMmer record so the
# focal isolate's columns come first. Frames are collected in a list and
# concatenated once (the original pd.concat inside the loop was quadratic and
# seeded with an empty DataFrame, a pattern deprecated by pandas).
mummer_cols = ['Isolate_ID', 'Compare_ID', 'Self_Aligned', 'Compare_Aligned','Align_Percent_Diff','Median_Alignment_Length', 'Kmer_Similarity', 'Unique_Kmers', 'Missing_Kmers', 'gIndels', 'SNPDiffs_File']
_mummer_rows = []

for isolate in snp_isolates:

    temp_mummer = mummer_data[(mummer_data['Query_ID'] == isolate) | (mummer_data['Reference_ID'] == isolate)].drop_duplicates(subset=['Comparison']).assign(Isolate_ID = isolate)

    for index, row in temp_mummer.iterrows():
        # Orient the record: "Self_*" columns belong to the focal isolate.
        if row['Query_ID'] == isolate:
            src_cols = ['Isolate_ID', 'Reference_ID', 'Query_Percent_Aligned', 'Reference_Percent_Aligned','Align_Percent_Diff', 'Median_Alignment_Length', 'Kmer_Similarity', 'Query_Unique_Kmers', 'Reference_Unique_Kmers', 'gIndels', 'SNPDiffs_File']
        elif row['Reference_ID'] == isolate:
            src_cols = ['Isolate_ID', 'Query_ID', 'Reference_Percent_Aligned', 'Query_Percent_Aligned', 'Align_Percent_Diff', 'Median_Alignment_Length', 'Kmer_Similarity', 'Reference_Unique_Kmers', 'Query_Unique_Kmers', 'gIndels', 'SNPDiffs_File']
        else:
            continue
        temp_isolate_mummer = row[src_cols].to_frame().T
        temp_isolate_mummer.columns = mummer_cols
        _mummer_rows.append(temp_isolate_mummer)

isolate_mummer = pd.concat(_mummer_rows) if _mummer_rows else pd.DataFrame(columns=mummer_cols)

isolate_mummer['Isolate_Type'] = isolate_mummer['Isolate_ID'].apply(lambda x: 'Reference' if x in reference_ids else 'Query')
isolate_mummer_df = isolate_mummer.melt(id_vars=['Isolate_ID','Isolate_Type'], value_vars = ['Self_Aligned','Compare_Aligned', 'Align_Percent_Diff','Median_Alignment_Length','Kmer_Similarity','Unique_Kmers','Missing_Kmers','gIndels'],value_name='Value',var_name = "Measure")
isolate_mummer_df['Value'] = isolate_mummer_df['Value'].astype("float")
# Average each measure over all of the isolate's comparisons.
isolate_mummer_df = isolate_mummer_df.groupby(by=['Isolate_ID','Isolate_Type','Measure'])['Value'].agg(Count = "count",Min = "min",Value = "mean",Max = "max",StdDev = 'std').reset_index()
isolate_mummer_df['Value'] = isolate_mummer_df['Value'].astype("float").round(2)

# Get Zscores
isolate_mummer_df['Zscore'] = isolate_mummer_df.groupby('Measure')['Value'].transform(scipy.stats.zscore).astype('float').round(3)
isolate_mummer_df['QC'] = getWarnings(isolate_mummer_df)

# Reformat for final TSV
align_stats = isolate_mummer_df[['Isolate_ID','Isolate_Type','Measure','Min','Value','Max','StdDev','Zscore','QC','Count']].copy().rename(columns = {"Value":"Mean"})
align_stats['StdDev'] = align_stats['StdDev'].astype('float').round(3)

with open(log_file,"a+") as log:
    log.write("- Read in and processed alignment data\n")
250 | |
# Process cocalled data
# Per-isolate aggregation of SNPs_Cocalled over all deduplicated comparisons.
# Fixes two defects in the original: pd.concat inside the loop (quadratic,
# seeded with an empty frame) and a seed column named 'Mean' that did not
# match the aggregation's 'Value', leaving a spurious all-NaN 'Mean' column.
def _per_isolate_cocalled(pair_df):
    # One summary row per isolate: count/min/mean/max/std of SNPs_Cocalled.
    frames = []
    for isolate in snp_isolates:
        in_pair = (pair_df['Query_1'] == isolate) | (pair_df['Query_2'] == isolate)
        temp_cocalled = pair_df[in_pair].drop_duplicates(subset=['Comparison','Reference_ID']).assign(Isolate_ID = isolate)
        frames.append(temp_cocalled.groupby(['Isolate_ID'])['SNPs_Cocalled'].agg(Count = "count", Min = "min", Value = "mean", Max = "max",StdDev = 'std').reset_index())
    return pd.concat(frames) if frames else pd.DataFrame(columns = ['Isolate_ID','Count','Min','Value','Max','StdDev'])

raw_cocalled_df = raw_snp_distance_df[['Comparison','Query_1','Query_2','Reference_ID','SNPs_Cocalled']]
isolate_cocalled_df = _per_isolate_cocalled(raw_cocalled_df)

isolate_cocalled_df['Measure'] = 'Raw_SNPs_Cocalled'
isolate_cocalled_df['Value'] = isolate_cocalled_df['Value'].astype('int')
isolate_cocalled_df['Zscore'] = isolate_cocalled_df['Value'].transform(scipy.stats.zscore).astype('float').round(3)
isolate_cocalled_df['QC'] = getWarnings(isolate_cocalled_df)

# Format for final TSV
isolate_cocalled_df['Isolate_Type'] = isolate_cocalled_df['Isolate_ID'].apply(lambda x: 'Reference' if x in reference_ids else 'Query')
isolate_cocalled_stats = isolate_cocalled_df[['Isolate_ID','Isolate_Type','Measure','Min','Value','Max','StdDev','Zscore','QC','Count']].copy().rename(columns={'Value':'Mean'})

if has_preserved:
    # Same aggregation over the preserved-distance table.
    preserved_cocalled_df = preserved_snp_distance_df[['Comparison','Query_1','Query_2','Reference_ID','SNPs_Cocalled']]
    isolate_preserved_cocalled_df = _per_isolate_cocalled(preserved_cocalled_df)

    isolate_preserved_cocalled_df['Measure'] = 'Preserved_SNPs_Cocalled'
    isolate_preserved_cocalled_df['Value'] = isolate_preserved_cocalled_df['Value'].astype('int')
    isolate_preserved_cocalled_df['Zscore'] = isolate_preserved_cocalled_df['Value'].transform(scipy.stats.zscore).astype('float').round(3)
    isolate_preserved_cocalled_df['QC'] = getWarnings(isolate_preserved_cocalled_df)

    # Format for final TSV
    isolate_preserved_cocalled_df['Isolate_Type'] = isolate_preserved_cocalled_df['Isolate_ID'].apply(lambda x: 'Reference' if x in reference_ids else 'Query')
    isolate_preserved_cocalled_df['StdDev'] = isolate_preserved_cocalled_df['StdDev'].astype('float').round(3)
    isolate_cocalled_stats = pd.concat([isolate_cocalled_stats,isolate_preserved_cocalled_df[['Isolate_ID','Isolate_Type','Measure','Min','Value','Max','StdDev','Zscore','QC','Count']].copy().rename(columns={'Value':'Mean'})])

with open(log_file,"a+") as log:
    log.write("- Processed cocalled SNP data\n")
290 | |
if has_preserved:
    # Per-pair absolute difference between preserved and raw SNP distances.
    raw_snp_df = raw_snp_distance_df[['Comparison','Query_1','Query_2','Reference_ID','SNP_Distance']].rename(columns = {'SNP_Distance':'Raw_SNP_Distance'})
    preserved_snp_df = preserved_snp_distance_df[['Comparison','Query_1','Query_2','Reference_ID','SNP_Distance']].rename(columns = {'SNP_Distance':'Preserved_SNP_Distance'})
    snp_df = pd.merge(raw_snp_df,preserved_snp_df,how="left",on=['Comparison','Query_1','Query_2','Reference_ID'])
    snp_df['Preserved_Diff'] = abs(snp_df['Preserved_SNP_Distance'] - snp_df['Raw_SNP_Distance'])

    # Collect per-isolate frames and concat once
    # (the original pd.concat inside the loop was quadratic).
    _snp_frames = []
    for isolate in snp_isolates:
        temp_snp = snp_df[(snp_df['Query_1'] == isolate) | (snp_df['Query_2'] == isolate)].drop_duplicates(subset=['Comparison','Reference_ID']).assign(Isolate_ID = isolate)
        _snp_frames.append(temp_snp.groupby(['Isolate_ID'])['Preserved_Diff'].agg(Count = "count", Min = "min", Value = "mean", Max = "max",StdDev = 'std').reset_index())
    isolate_snp_df = pd.concat(_snp_frames) if _snp_frames else pd.DataFrame(columns = ['Isolate_ID','Count','Min','Value','Max','StdDev'])

    isolate_snp_df['Measure'] = 'Preserved_Diff'
    isolate_snp_df['Value'] = isolate_snp_df['Value'].astype("float")
    isolate_snp_df['Zscore'] = isolate_snp_df['Value'].transform(scipy.stats.zscore).astype('float').round(3)
    isolate_snp_df['Value'] = isolate_snp_df['Value'].astype('float').round(3)
    isolate_snp_df['QC'] = getWarnings(isolate_snp_df)

    # Format for final TSV
    isolate_snp_df['Isolate_Type'] = isolate_snp_df['Isolate_ID'].apply(lambda x: 'Reference' if x in reference_ids else 'Query')
    isolate_snp_df['StdDev'] = isolate_snp_df['StdDev'].astype('float').round(3)
    isolate_snp_stats = isolate_snp_df[['Isolate_ID','Isolate_Type','Measure','Min','Value','Max','StdDev','Zscore','QC','Count']].copy().rename(columns={'Value':'Mean'})
    with open(log_file,"a+") as log:
        log.write("- Processed preserved SNP data\n")
else:
    with open(log_file,"a+") as log:
        log.write("- No preserved SNP data to process\n")
    # Empty shell so downstream concatenations still work without preserved data.
    isolate_snp_stats = pd.DataFrame(columns = ['Isolate_ID','Isolate_Type','Measure','Min','Value','Max','StdDev','Zscore','QC','Count'])
320 | |
# Compare SNPs across refs
if len(reference_ids) == 1:
    # Single reference: no cross-reference spread to compute; emit an empty shell
    # so downstream concatenations still work.
    isolate_stdev_stats = pd.DataFrame(columns =['Isolate_ID','Isolate_Type','Measure','Min','Mean','Max','StdDev','Zscore','QC','Count'])
    with open(log_file,"a+") as log:
        log.write("- 1 reference provided, SNP distances have no comparisons\n")
else:
    # Get comparison stats
    # For each query pair, summarize its SNP distances across all references.
    raw_comparison_df = raw_snp_distance_df.groupby(by=['Comparison'])['SNP_Distance'].agg(Count = 'count', Min = 'min', Mean = 'mean', Max = 'max', StdDev = 'std').reset_index()
    raw_comparison_df['StdDev'] = raw_comparison_df['StdDev'].astype('float').round(3)
    raw_comparison_df['Mean'] = raw_comparison_df['Mean'].astype('int')  # NOTE(review): int cast truncates, not rounds
    raw_comparison_df[['Query_1', 'Query_2']] = raw_comparison_df['Comparison'].str.split(';', expand=True)
    raw_comparison_df['SNP_Spread'] = abs(raw_comparison_df['Max'] - raw_comparison_df['Min'])

    # comparison_df is written out as SNP_Distance_Summary.tsv at the end of the script.
    comparison_df = raw_comparison_df[['Comparison','Query_1','Query_2','Mean','StdDev','Min','Max','SNP_Spread','Count']].copy()

    # Get isolate stats
    # Per isolate: how variable are its pairwise distances across references?
    isolate_stdev_df = pd.DataFrame(columns = ['Isolate_ID','Count','Min','Value','Max','StdDev'])

    for isolate in snp_isolates:
        temp_compare = raw_comparison_df[(raw_comparison_df['Query_1'] == isolate) | (raw_comparison_df['Query_2'] == isolate)].drop_duplicates(subset=['Comparison']).assign(Isolate_ID = isolate)
        temp_compare = temp_compare.groupby(by=['Isolate_ID'])['StdDev'].agg(Count = 'count', Min = 'min', Value = 'mean', Max = 'max', StdDev = 'std').reset_index()
        isolate_stdev_df = pd.concat([isolate_stdev_df,temp_compare])

    isolate_stdev_df['Measure'] = "Raw_Distance_StdDev"
    isolate_stdev_df['Value'] = isolate_stdev_df['Value'].astype("float")
    isolate_stdev_df['Zscore'] = isolate_stdev_df['Value'].transform(scipy.stats.zscore).astype('float').round(3)
    isolate_stdev_df['Value'] = isolate_stdev_df['Value'].astype('float').round(3)
    isolate_stdev_df['Isolate_Type'] = isolate_stdev_df['Isolate_ID'].apply(lambda x: 'Reference' if x in reference_ids else 'Query')
    isolate_stdev_df['QC'] = getWarnings(isolate_stdev_df)

    isolate_stdev_stats = isolate_stdev_df[['Isolate_ID','Isolate_Type','Measure','Min','Value','Max','StdDev','Zscore','QC','Count']].copy().rename(columns={'Value':'Mean'})

    if has_preserved:
        # Prefix the raw columns before merging in the preserved summaries.
        comparison_df.columns = ['Comparison','Query_1','Query_2','Raw_Mean','Raw_StdDev','Raw_Min','Raw_Max','Raw_SNP_Spread','Raw_Count']

        preserved_comparison_df = preserved_snp_distance_df.groupby(by=['Comparison'])['SNP_Distance'].agg(Preserved_Count = 'count', Preserved_Min = 'min', Preserved_Mean = 'mean', Preserved_Max = 'max', Preserved_StdDev = 'std').reset_index()
        preserved_comparison_df['Preserved_StdDev'] = preserved_comparison_df['Preserved_StdDev'].astype('float').round(3)
        preserved_comparison_df['Preserved_Mean'] = preserved_comparison_df['Preserved_Mean'].astype('int')
        preserved_comparison_df[['Query_1', 'Query_2']] = preserved_comparison_df['Comparison'].str.split(';', expand=True)
        preserved_comparison_df['Preserved_SNP_Spread'] = abs(preserved_comparison_df['Preserved_Max'] - preserved_comparison_df['Preserved_Min'])

        comparison_df = comparison_df.merge(preserved_comparison_df,how = "left", on=['Comparison','Query_1','Query_2'])
        comparison_df['Mean_Preserved_Diff'] = abs(comparison_df['Preserved_Mean'] - comparison_df['Raw_Mean'])
        comparison_df = comparison_df[['Query_1','Query_2','Raw_Mean','Preserved_Mean','Mean_Preserved_Diff','Raw_StdDev','Preserved_StdDev','Raw_SNP_Spread','Preserved_SNP_Spread','Raw_Min','Raw_Max','Preserved_Min','Preserved_Max','Raw_Count','Preserved_Count']]

        # isolate_stdev_df is deliberately re-initialized and reused for the
        # preserved pass; its rows are appended into isolate_stdev_stats below.
        isolate_stdev_df = pd.DataFrame(columns = ['Isolate_ID','Count','Min','Value','Max','StdDev'])

        for isolate in snp_isolates:
            temp_compare = preserved_comparison_df[(preserved_comparison_df['Query_1'] == isolate) | (preserved_comparison_df['Query_2'] == isolate)].drop_duplicates(subset=['Comparison']).assign(Isolate_ID = isolate)
            temp_compare = temp_compare.groupby(by=['Isolate_ID'])['Preserved_StdDev'].agg(Count = 'count', Min = 'min', Value = 'mean', Max = 'max', StdDev = 'std').reset_index()
            isolate_stdev_df = pd.concat([isolate_stdev_df,temp_compare])

        isolate_stdev_df['Measure'] = "Preserved_Distance_StdDev"
        isolate_stdev_df['Value'] = isolate_stdev_df['Value'].astype('float')
        isolate_stdev_df['Zscore'] = isolate_stdev_df['Value'].transform(scipy.stats.zscore).astype('float').round(3)
        isolate_stdev_df['Value'] = isolate_stdev_df['Value'].astype('float').round(3)
        isolate_stdev_df['Isolate_Type'] = isolate_stdev_df['Isolate_ID'].apply(lambda x: 'Reference' if x in reference_ids else 'Query')
        isolate_stdev_df['QC'] = getWarnings(isolate_stdev_df)
        isolate_stdev_stats = pd.concat([isolate_stdev_stats,isolate_stdev_df[['Isolate_ID','Isolate_Type','Measure','Min','Value','Max','StdDev','Zscore','QC','Count']].copy().rename(columns={'Value':'Mean'})])
        with open(log_file,"a+") as log:
            log.write("- Compared results across references\n")
    else:
        with open(log_file,"a+") as log:
            log.write("- Compared results across references\n")
# Group by ref
# Build per-reference summary tables that are stacked into ref_summary_df later.

#### Isolate ####
# Assembly stats for reference isolates only.
ref_isolate_df = isolate_stats.loc[isolate_stats['Isolate_Type'] == "Reference"][['Isolate_ID','Measure','Mean','StdDev','Min','Max','Zscore','QC','Count']].rename(columns = {'Isolate_ID':'Reference_ID'})

#### StdDev ####
# Cross-reference distance spread for reference isolates only.
ref_stdev_df = isolate_stdev_stats.loc[isolate_stdev_stats['Isolate_Type'] == "Reference"][['Isolate_ID','Measure','Mean','StdDev','Min','Max','Zscore','QC','Count']].rename(columns = {'Isolate_ID':'Reference_ID'})

#### MUMmer ####
# Summarize alignment measures per reference. Frames are collected in a list
# and concatenated once (the original pd.concat in the loop was quadratic and
# seeded with an empty DataFrame, a pattern deprecated by pandas).
_ref_summary_cols = ['Reference_ID','Measure','Mean','StdDev','Min','Max','Count']
_ref_mummer_frames = []
for ref in reference_ids:
    ref_mummer = isolate_mummer[(isolate_mummer['Isolate_ID'] == ref) | (isolate_mummer['Compare_ID'] == ref)].assign(Focal_Reference = ref)
    # Dedupe on the order-independent pair key so each comparison counts once.
    ref_mummer['Comparison'] = ref_mummer.apply(lambda row: ';'.join(sorted([str(row['Isolate_ID']), str(row['Compare_ID'])])), axis=1)
    ref_mummer = ref_mummer.drop_duplicates(subset=['Comparison'])

    ref_mummer = ref_mummer.melt(id_vars=['Focal_Reference','Isolate_ID','Compare_ID'], value_vars = ['Align_Percent_Diff','Median_Alignment_Length','Kmer_Similarity','gIndels'],value_name='Value',var_name = "Measure")
    ref_mummer['Value'] = ref_mummer['Value'].astype("float")
    ref_mummer = ref_mummer.groupby(by=['Measure'])['Value'].agg(Count = "count",Min = "min",Mean = "mean",Max = "max",StdDev = 'std').reset_index().assign(Reference_ID = ref)
    _ref_mummer_frames.append(ref_mummer[_ref_summary_cols])
ref_mummer_df = pd.concat(_ref_mummer_frames) if _ref_mummer_frames else pd.DataFrame(columns = _ref_summary_cols)
# Aggregate rows carry no Z-score/QC of their own.
ref_mummer_df['QC'] = np.nan
ref_mummer_df['Zscore'] = np.nan

# Combine per-reference aggregates with the references' own per-isolate alignment stats.
ref_mummer_summary_df = pd.concat([ref_mummer_df[['Reference_ID','Measure','Mean','StdDev','Min','Max','Zscore','QC','Count']],align_stats.loc[(align_stats['Isolate_Type'] == "Reference") & (align_stats['Measure'].isin(['Self_Aligned','Compare_Aligned','Unique_Kmers','Missing_Kmers']))][['Isolate_ID','Measure','Mean','StdDev','Min','Max','Zscore','QC','Count']].rename(columns = {'Isolate_ID':'Reference_ID'})])
409 | |
#### Cocalled ####
# Summarize cocalled-SNP counts per reference: raw first, preserved appended
# when preserved data exists. Aggregate rows carry no Z-score/QC.
_cocalled_summary_order = ['Reference_ID','Measure','Mean','StdDev','Min','Max','Zscore','QC','Count']
ref_cocalled_summary_df = (
    raw_cocalled_df.groupby(by=['Reference_ID'])['SNPs_Cocalled']
    .agg(Mean = "mean",StdDev = 'std',Min = "min",Max = "max",Count = 'count')
    .reset_index()
    .assign(Measure = "Raw_SNPs_Cocalled", QC = np.nan, Zscore = np.nan)
)[_cocalled_summary_order]

if has_preserved:
    preserved_cocalled_summary = (
        preserved_cocalled_df.groupby(by=['Reference_ID'])['SNPs_Cocalled']
        .agg(Mean = "mean",StdDev = 'std',Min = "min",Max = "max",Count = 'count')
        .reset_index()
        .assign(Measure = "Preserved_SNPs_Cocalled", QC = np.nan, Zscore = np.nan)
    )
    ref_cocalled_summary_df = pd.concat([ref_cocalled_summary_df,preserved_cocalled_summary[_cocalled_summary_order]])
423 | |
#### Preserved Diff ####
if has_preserved:
    # Per-reference summary of |preserved - raw| SNP distance differences.
    ref_summary_preserved_df = (
        snp_df.groupby(by=['Reference_ID'])['Preserved_Diff']
        .agg(Mean = "mean",StdDev = 'std',Min = "min",Max = "max",Count = 'count')
        .reset_index()
        .assign(Measure = "Preserved_Diff", QC = np.nan, Zscore = np.nan)
    )
    ref_summary_preserved_df = ref_summary_preserved_df[['Reference_ID','Measure','Mean','StdDev','Min','Max','Count','Zscore','QC']].copy()

#### Compile ####
# Stack all per-reference summary tables, then normalize the numeric columns.
ref_summary_df = pd.concat([ref_mummer_summary_df,
                            ref_cocalled_summary_df,
                            ref_isolate_df,ref_stdev_df]).sort_values(by=['Measure'])

for _numeric_col in ('Mean', 'Min', 'Max', 'StdDev'):
    ref_summary_df[_numeric_col] = ref_summary_df[_numeric_col].astype("float").round(3)
441 | |
# Catch warnings and failures
all_isolate_stats = pd.concat([isolate_stats,align_stats,isolate_stdev_stats,isolate_cocalled_stats,isolate_snp_stats]).sort_values(by=['Zscore'])

# Keep only flagged rows, ordered by |Z-score| (worst first).
warn_fail_df = all_isolate_stats.loc[all_isolate_stats['QC'].isin(['Failure','Warning'])].copy()
warn_fail_df['abs_Zscore'] = warn_fail_df['Zscore'].abs()
warn_fail_df = warn_fail_df.sort_values(by='abs_Zscore',ascending=False).drop('abs_Zscore',axis=1)

# sorted() (was list(set(...))) makes the log order deterministic across runs;
# Python set iteration order varies with hash randomization.
warn_fail_isolates = sorted(set(warn_fail_df['Isolate_ID']))
if len(warn_fail_isolates) > 0:
    with open(log_file,"a+") as log:
        log.write("\n- The following samples had QC warnings or failures:\n")
        for isolate in warn_fail_isolates:
            isolate_warn_fail = warn_fail_df.loc[warn_fail_df['Isolate_ID'] == isolate]

            if isolate in reference_ids:
                log.write(f"\n{isolate} (Reference):\n")
            else:
                log.write(f"\n{isolate} (Query):\n")

            for index,row in isolate_warn_fail.iterrows():
                log.write(f"\t- {row['Measure']} - Mean: {row['Mean']}; Zscore: {row['Zscore']}; QC: {row['QC']}\n")
else:
    with open(log_file,"a+") as log:
        # Typo fix: missing space after "-" in the original message.
        log.write("- There were no QC warnings or failures\n")
466 | |
# Output data

# Mean assembly stats
isolate_mean_df.reset_index().to_csv(mean_isolate_file,sep='\t',index=False)

# Isolate assembly stats (per-isolate single values; aggregate columns dropped)
isolate_assembly_stats = isolate_stats.loc[isolate_stats['Measure'].isin(['Contig_Count','Assembly_Bases','L50','L90','N50','N90'])].drop(['Min','Max','StdDev','Count'],axis=1).rename(columns = {'Mean':'Value'})
isolate_assembly_stats.to_csv(isolate_assembly_stats_file,sep='\t',index=False)

# Isolate alignment stats: all per-isolate tables stacked and rounded uniformly
isolate_align_stats = pd.concat([align_stats,isolate_cocalled_stats,isolate_snp_stats,isolate_stdev_stats]).reset_index(drop=True)
for col in ['Min', 'Mean', 'Max', 'StdDev', 'Zscore']:
    isolate_align_stats[col] = isolate_align_stats[col].astype("float").round(3)
isolate_align_stats.to_csv(align_stats_file,sep='\t',index=False)

# Reference Assembly Stats
# NOTE(review): ref_align_summary_df is computed but never used below — apparently
# dead; confirm before removing.
ref_align_summary_df = ref_summary_df.loc[(~ref_summary_df['Measure'].isin(['Contig_Count','Assembly_Bases','L50','L90','N50','N90'])) & (~pd.isna(ref_summary_df['Zscore']))]
# Rows without a Z-score are per-reference aggregates; Zscore/QC are dropped and
# re-added as NaN so the layout matches the per-isolate rows appended below.
ref_mean_summary_df = ref_summary_df.loc[(~ref_summary_df['Measure'].isin(['Contig_Count','Assembly_Bases','L50','L90','N50','N90'])) & (pd.isna(ref_summary_df['Zscore']))].drop(['Zscore','QC'],axis =1)
ref_mean_summary_df['Zscore'] = np.nan
ref_mean_summary_df['QC'] = np.nan

# Add alignment stats
if has_preserved:
    ref_mean_summary_df = pd.concat([ref_mean_summary_df,ref_summary_preserved_df])

ref_isolate_align_stats = align_stats.loc[(align_stats['Isolate_Type'] == "Reference") & (align_stats['Measure'].isin(['Self_Aligned','Compare_Aligned']))].drop(['Isolate_Type'],axis=1).rename(columns = {'Isolate_ID':'Reference_ID'})[['Reference_ID','Measure','Mean','StdDev','Min','Max','Count','Zscore','QC']]

ref_mean_summary_stats = pd.concat([ref_mean_summary_df,ref_isolate_align_stats])
ref_mean_summary_stats.to_csv(ref_mean_summary_file,sep='\t',index=False)

end_time = time.time()

with open(log_file,"a+") as log:
    log.write(f"\n- Completed compilation in {end_time - start_time:.2f} seconds\n")
    log.write(f"\t- Saved mean isolate assembly data to {mean_isolate_file}\n")
    log.write(f"\t- Saved raw isolate assembly data to {isolate_assembly_stats_file}\n")
    log.write(f"\t- Saved isolate alignment data to {align_stats_file}\n")
    log.write(f"\t- Saved reference summary data to {ref_mean_summary_file}\n")

    # Comparisons if multiple refs (comparison_df only exists in that case)
    if len(reference_ids) > 1:
        comparison_df.to_csv(snp_comparison_file,sep="\t",index = False)
        log.write(f"\t- Saved SNP distance comparisons across references to {snp_comparison_file}\n")

    # Failures/warnings (file only written when something was flagged)
    if warn_fail_df.shape[0] > 0:
        warn_fail_df.to_csv(qc_file,sep="\t",index=False)
        log.write(f"\t- Saved QC warnings/failures to {qc_file}\n")