rliterman@0: #!/usr/bin/env python3 rliterman@0: rliterman@0: import sys rliterman@0: import os rliterman@0: import pandas as pd rliterman@0: import datetime rliterman@0: from pybedtools import BedTool,helpers rliterman@0: import concurrent.futures rliterman@0: import time rliterman@0: from Bio.Seq import Seq rliterman@0: from Bio.SeqRecord import SeqRecord rliterman@0: from Bio.Align import MultipleSeqAlignment rliterman@0: from Bio import AlignIO rliterman@0: from itertools import combinations rliterman@0: import numpy as np rliterman@0: import uuid rliterman@0: import traceback rliterman@0: import shutil rliterman@0: import argparse rliterman@0: rliterman@0: def fetchHeaders(snpdiffs_file): rliterman@0: rliterman@0: with open(snpdiffs_file, 'r') as file: rliterman@0: top_line = file.readline().strip().split('\t')[1:] rliterman@0: rliterman@0: header_cols = [item.split(':')[0] for item in top_line] rliterman@0: header_vals = [item.split(':')[1] for item in top_line] rliterman@0: rliterman@0: header_data = pd.DataFrame(columns = header_cols) rliterman@0: header_data.loc[0] = header_vals rliterman@0: header_data.loc[:, 'File_Path'] = snpdiffs_file rliterman@0: rliterman@0: return header_data rliterman@0: rliterman@0: def processBED(bed_rows,snpdiffs_orientation): rliterman@0: rliterman@0: bed_columns = ['Ref_Contig','Ref_Start','Ref_End','Ref_Length','Ref_Aligned', rliterman@0: 'Query_Contig','Query_Start','Query_End','Query_Length','Query_Aligned', rliterman@0: 'Perc_Iden'] rliterman@0: rliterman@0: reverse_columns = ['Query_Contig','Query_Start','Query_End','Query_Length','Query_Aligned', rliterman@0: 'Ref_Contig','Ref_Start','Ref_End','Ref_Length','Ref_Aligned', rliterman@0: 'Perc_Iden'] rliterman@0: rliterman@0: int_columns = ['Ref_Start', 'Ref_End', 'Ref_Length', 'Ref_Aligned', rliterman@0: 'Query_Start', 'Query_End', 'Query_Length', 'Query_Aligned'] rliterman@0: rliterman@0: float_columns = ['Perc_Iden'] rliterman@0: rliterman@0: if len(bed_rows) > 0: rliterman@0: rliterman@0: bed_df = pd.DataFrame(bed_rows, columns=bed_columns) rliterman@0: rliterman@0: # Swap columns if reversed rliterman@0: if snpdiffs_orientation == -1: rliterman@0: bed_df = bed_df[reverse_columns].copy() rliterman@0: bed_df.columns = bed_columns rliterman@0: rliterman@0: # Gather covered loci rliterman@0: covered_bed_df = bed_df[(bed_df['Ref_Start'] != ".") & (bed_df['Query_Start'] != ".")].copy() rliterman@0: if covered_bed_df.shape[0] > 0: rliterman@0: for col in int_columns: rliterman@0: covered_bed_df.loc[:, col] = covered_bed_df.loc[:, col].astype(float).astype(int) rliterman@0: for col in float_columns: rliterman@0: covered_bed_df.loc[:, col] = covered_bed_df.loc[:, col].astype(float) rliterman@0: rliterman@0: return covered_bed_df rliterman@0: else: rliterman@0: return pd.DataFrame(columns=bed_columns) rliterman@0: rliterman@0: def processSNPs(snp_rows,snpdiffs_orientation): rliterman@0: rliterman@0: snp_columns = ['Ref_Contig','Start_Ref','Ref_Pos', rliterman@0: 'Query_Contig','Start_Query','Query_Pos', rliterman@0: 'Ref_Loc','Query_Loc', rliterman@0: 'Ref_Start','Ref_End', rliterman@0: 'Query_Start','Query_End', rliterman@0: 'Ref_Base','Query_Base', rliterman@0: 'Dist_to_Ref_End','Dist_to_Query_End', rliterman@0: 'Ref_Aligned','Query_Aligned', rliterman@0: 'Query_Direction','Perc_Iden','Cat'] rliterman@0: rliterman@0: return_columns = ['Ref_Contig','Start_Ref','Ref_Pos', rliterman@0: 'Query_Contig','Start_Query','Query_Pos', rliterman@0: 'Ref_Loc','Query_Loc', rliterman@0: 'Ref_Start','Ref_End', rliterman@0: 'Query_Start','Query_End', rliterman@0: 'Ref_Base','Query_Base', rliterman@0: 'Dist_to_Ref_End','Dist_to_Query_End', rliterman@0: 'Ref_Aligned','Query_Aligned', rliterman@0: 'Perc_Iden','Cat'] rliterman@0: rliterman@0: reverse_columns = ['Query_Contig','Start_Query','Query_Pos', rliterman@0: 'Ref_Contig','Start_Ref','Ref_Pos', rliterman@0: 'Query_Loc','Ref_Loc', rliterman@0: 'Query_Start','Query_End', rliterman@0: 'Ref_Start','Ref_End', rliterman@0: 'Query_Base','Ref_Base', rliterman@0: 'Dist_to_Query_End','Dist_to_Ref_End', rliterman@0: 'Query_Aligned','Ref_Aligned', rliterman@0: 'Query_Direction','Perc_Iden','Cat'] rliterman@0: rliterman@0: reverse_complement = {'A':'T','T':'A','G':'C','C':'G', rliterman@0: 'a':'T','t':'A','c':'G','g':'C'} rliterman@0: rliterman@0: # Columns to convert to integer rliterman@0: int_columns = ['Start_Ref', 'Ref_Pos', 'Start_Query', 'Query_Pos', rliterman@0: 'Dist_to_Ref_End', 'Dist_to_Query_End', 'Ref_Aligned', 'Query_Aligned'] rliterman@0: rliterman@0: # Columns to convert to float rliterman@0: float_columns = ['Perc_Iden'] rliterman@0: rliterman@0: if len(snp_rows) > 0: rliterman@0: snp_df = pd.DataFrame(snp_rows, columns= snp_columns).copy() rliterman@0: rliterman@0: if snpdiffs_orientation == -1: rliterman@0: snp_df = snp_df[reverse_columns].copy() rliterman@0: snp_df.columns = snp_columns rliterman@0: rliterman@0: # Replace Query_Base and Reference_Base with reverse complement if Query_Direction is -1 and base is in ['A','T','G','C','a','c','t','g'] rliterman@0: snp_df.loc[snp_df['Query_Direction'] == '-1','Query_Base'] = snp_df.loc[snp_df['Query_Direction'] == '-1','Query_Base'].apply(lambda x: reverse_complement[x] if x in reverse_complement else x) rliterman@0: snp_df.loc[snp_df['Query_Direction'] == '-1','Ref_Base'] = snp_df.loc[snp_df['Query_Direction'] == '-1','Ref_Base'].apply(lambda x: reverse_complement[x] if x in reverse_complement else x) rliterman@0: rliterman@0: for col in int_columns: rliterman@0: snp_df.loc[:, col] = snp_df.loc[:, col].astype(float).astype(int) rliterman@0: for col in float_columns: rliterman@0: snp_df.loc[:, col] = snp_df.loc[:, col].astype(float) rliterman@0: rliterman@0: else: rliterman@0: snp_df = pd.DataFrame(columns = return_columns) rliterman@0: rliterman@0: return snp_df[return_columns] rliterman@0: rliterman@0: def swapHeader(header_data): rliterman@0: rliterman@0: raw_header_cols = [x for x in header_data.columns] rliterman@0: reverse_header_cols = [item.replace('Reference', 'temp').replace('Query', 'Reference').replace('temp', 'Query') for item in raw_header_cols] rliterman@0: reversed_header_data = header_data[reverse_header_cols].copy() rliterman@0: reversed_header_data.columns = raw_header_cols rliterman@0: rliterman@0: return reversed_header_data rliterman@0: rliterman@0: def parseSNPDiffs(snpdiffs_file,snpdiffs_orientation): rliterman@0: rliterman@0: bed_rows = [] rliterman@0: snp_rows = [] rliterman@0: rliterman@0: with open(snpdiffs_file, 'r') as file: rliterman@0: lines = file.readlines() rliterman@0: rliterman@0: for line in lines: rliterman@0: if line[0:2] == "#\t": rliterman@0: pass rliterman@0: elif line[0:3] == "##\t": rliterman@0: bed_rows.append(line.strip().split("\t")[1:]) rliterman@0: else: rliterman@0: snp_rows.append(line.strip().split("\t")) rliterman@0: rliterman@0: bed_df = processBED(bed_rows,snpdiffs_orientation) rliterman@0: snp_df = processSNPs(snp_rows,snpdiffs_orientation) rliterman@0: return (bed_df,snp_df) rliterman@0: rliterman@0: def calculate_total_length(bedtool): rliterman@0: return sum(len(interval) for interval in bedtool) rliterman@0: rliterman@0: def filterSNPs(raw_snp_df,bed_df,log_file, min_len, min_iden, ref_edge, query_edge, density_windows, max_snps): rliterman@0: rliterman@0: if temp_dir != "": rliterman@0: helpers.set_tempdir(temp_dir) rliterman@0: rliterman@0: # Grab raw data rliterman@0: total_snp_count = raw_snp_df.shape[0] rliterman@0: rliterman@0: # Get unique SNPs relative to the reference genome rliterman@0: unique_ref_snps = raw_snp_df['Ref_Loc'].unique() rliterman@0: unique_snp_count = len(unique_ref_snps) rliterman@0: rliterman@0: snp_tally_df = pd.DataFrame() rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\n\t- Raw SNP + indel count: {total_snp_count}\n") rliterman@0: log.write(f"\n\t- Unique SNP positions in reference genome: {unique_snp_count}\n") rliterman@0: rliterman@0: # Set all sites to SNP rliterman@0: raw_snp_df['Filter_Cat'] = "SNP" rliterman@0: rliterman@0: # Filter out SNPs based on --min_len and --min_iden rliterman@0: reject_length = raw_snp_df.loc[(raw_snp_df['Ref_Aligned'] < min_len) & (raw_snp_df['Perc_Iden'] >= min_iden)].copy() rliterman@0: if reject_length.shape[0] > 0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t\t- Purged (Alignment Length): {reject_length.shape[0]}\n") rliterman@0: reject_length['Filter_Cat'] = "Purged_Length" rliterman@0: snp_tally_df = pd.concat([snp_tally_df,reject_length]).reset_index(drop=True) rliterman@0: rliterman@0: reject_iden = raw_snp_df.loc[(raw_snp_df['Ref_Aligned'] >= min_len) & (raw_snp_df['Perc_Iden'] < min_iden)].copy() rliterman@0: if reject_iden.shape[0] > 0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t\t- Purged (Alignment Identity): {reject_iden.shape[0]}\n") rliterman@0: reject_iden['Filter_Cat'] = "Purged_Identity" rliterman@0: snp_tally_df = pd.concat([snp_tally_df,reject_iden]).reset_index(drop=True) rliterman@0: rliterman@0: reject_lenIden = raw_snp_df.loc[(raw_snp_df['Ref_Aligned'] < min_len) & (raw_snp_df['Perc_Iden'] < min_iden)].copy() rliterman@0: if reject_lenIden.shape[0] > 0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t\t- Purged (Alignment Length + Identity): {reject_lenIden.shape[0]}\n") rliterman@0: reject_lenIden['Filter_Cat'] = "Purged_LengthIdentity" rliterman@0: snp_tally_df = pd.concat([snp_tally_df,reject_lenIden]).reset_index(drop=True) rliterman@0: rliterman@0: pass_filter = raw_snp_df.loc[(raw_snp_df['Ref_Aligned'] >= min_len) & (raw_snp_df['Perc_Iden'] >= min_iden)].copy().reset_index(drop=True) rliterman@0: rliterman@0: # Invalid processing rliterman@0: reject_invalid = pass_filter[pass_filter['Cat'] == "Invalid"].copy() rliterman@0: if reject_invalid.shape[0] > 0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t\t- Purged (Invalid Base): {reject_invalid.shape[0]}\n") rliterman@0: reject_invalid['Filter_Cat'] = "Purged_Invalid" rliterman@0: snp_tally_df = pd.concat([snp_tally_df,reject_invalid]).reset_index(drop=True) rliterman@0: pass_filter = pass_filter.loc[pass_filter['Cat'] != "Invalid"].copy() rliterman@0: rliterman@0: # Indel processing rliterman@0: reject_indel = pass_filter[pass_filter['Cat'] == "Indel"].copy() rliterman@0: if reject_indel.shape[0] > 0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t\t- Purged (Indel): {reject_indel.shape[0]}\n") rliterman@0: reject_indel['Filter_Cat'] = "Purged_Indel" rliterman@0: snp_tally_df = pd.concat([snp_tally_df,reject_indel]).reset_index(drop=True) rliterman@0: pass_filter = pass_filter.loc[pass_filter['Cat'] != "Indel"].copy() rliterman@0: rliterman@0: # Check for heterozygous SNPs rliterman@0: check_heterozygous = pass_filter.groupby('Ref_Loc').filter(lambda x: x['Query_Base'].nunique() > 1) rliterman@0: if check_heterozygous.shape[0] > 0: rliterman@0: reject_heterozygous = pass_filter.loc[pass_filter['Ref_Loc'].isin(check_heterozygous['Ref_Loc'])].copy() rliterman@0: reject_heterozygous['Filter_Cat'] = "Purged_Heterozygous" rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t\t- Purged (Heterozygotes): {reject_heterozygous.shape[0]}\n") rliterman@0: snp_tally_df = pd.concat([snp_tally_df,reject_heterozygous]).reset_index(drop=True) rliterman@0: pass_filter = pass_filter.loc[~pass_filter['Ref_Loc'].isin(check_heterozygous['Ref_Loc'])].copy() rliterman@0: rliterman@0: # Check for duplicate SNPs and take the longest, best hit rliterman@0: check_duplicates = pass_filter.groupby('Ref_Loc').filter(lambda x: x.shape[0] > 1) rliterman@0: if check_duplicates.shape[0] > 0: rliterman@0: reject_duplicate = pass_filter.loc[pass_filter['Ref_Loc'].isin(check_duplicates['Ref_Loc'])].copy() rliterman@0: pass_filter = pass_filter.loc[~pass_filter['Ref_Loc'].isin(check_duplicates['Ref_Loc'])].copy() rliterman@0: rliterman@0: best_snp = reject_duplicate.groupby('Ref_Loc').apply(lambda x: x.sort_values(by=['Ref_Aligned', 'Perc_Iden'], ascending=[False, False]).head(1)) rliterman@0: pass_filter = pd.concat([pass_filter,best_snp]).reset_index(drop=True) rliterman@0: rliterman@0: dup_snps = reject_duplicate[~reject_duplicate.apply(lambda x: x in best_snp, axis=1)] rliterman@0: dup_snps['Filter_Cat'] = "Purged_Duplicate" rliterman@0: rliterman@0: snp_tally_df = pd.concat([snp_tally_df,dup_snps]).reset_index(drop=True) rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t\t- Purged (Duplicates): {dup_snps.shape[0]}\n") rliterman@0: rliterman@0: # Assert that Ref_Loc and Query_Loc are unique in pass_filter rliterman@0: helpers.cleanup(verbose=False,remove_all = False) rliterman@0: assert pass_filter['Ref_Loc'].nunique() == pass_filter.shape[0] rliterman@0: assert pass_filter['Query_Loc'].nunique() == pass_filter.shape[0] rliterman@0: rliterman@0: # Density filtering rliterman@0: density_locs = [] rliterman@0: ref_locs = pass_filter['Ref_Loc'].tolist() rliterman@0: rliterman@0: if len(density_windows) == 0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("\n\t- Density filtering disabled...\n") rliterman@0: elif len(ref_locs) > 0: rliterman@0: density_df = pd.DataFrame([item.split('/') for item in ref_locs], columns=['Ref_Contig','Ref_End']) rliterman@0: density_df['Ref_Start'] = density_df['Ref_End'].astype(float).astype(int) - 1 rliterman@0: density_bed = BedTool.from_dataframe(density_df[['Ref_Contig','Ref_Start','Ref_End']]) rliterman@0: rliterman@0: # For each density window, remove all SNPs that fall in a window with > max_snps rliterman@0: for i in range(0,len(density_windows)): rliterman@0: window_df = density_bed.window(density_bed,c=True, w=density_windows[i]).to_dataframe() rliterman@0: problematic_windows = window_df[window_df['name'] > max_snps[i]].copy() rliterman@0: if not problematic_windows.empty: rliterman@0: temp_locs = [] rliterman@0: for _, row in problematic_windows.iterrows(): rliterman@0: purge_window_df = window_df[window_df['chrom'] == row['chrom']].copy() rliterman@0: purge_window_df['Dist'] = abs(purge_window_df['end'] - row['end']) rliterman@0: window_snps = purge_window_df.sort_values(by=['Dist'],ascending=True).head(row['name']) rliterman@0: temp_locs = temp_locs + ["/".join([str(x[0]),str(x[1])]) for x in list(zip(window_snps.chrom, window_snps.end))] rliterman@0: density_locs.extend(list(set(temp_locs))) rliterman@0: rliterman@0: density_locs = list(set(density_locs)) rliterman@0: reject_density = pass_filter[pass_filter['Ref_Loc'].isin(density_locs)].copy() rliterman@0: rliterman@0: if reject_density.shape[0] > 0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t\t- Purged (Density): {reject_density.shape[0]}\n") rliterman@0: reject_density['Filter_Cat'] = "Purged_Density" rliterman@0: snp_tally_df = pd.concat([snp_tally_df,reject_density]).reset_index(drop=True) rliterman@0: pass_filter = pass_filter[~pass_filter['Ref_Loc'].isin(density_locs)].copy() rliterman@0: rliterman@0: reject_query_edge = pass_filter[(pass_filter['Dist_to_Query_End'] < query_edge) & (pass_filter['Dist_to_Ref_End'] >= ref_edge)].copy() rliterman@0: reject_ref_edge = pass_filter[(pass_filter['Dist_to_Ref_End'] < ref_edge) & (pass_filter['Dist_to_Query_End'] >= query_edge)].copy() rliterman@0: reject_both_edge = pass_filter[(pass_filter['Dist_to_Query_End'] < query_edge) & (pass_filter['Dist_to_Ref_End'] < ref_edge)].copy() rliterman@0: rliterman@0: if reject_query_edge.shape[0] > 0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t\t- Purged (Query Edge): {reject_query_edge.shape[0]}\n") rliterman@0: reject_query_edge['Filter_Cat'] = "Filtered_Query_Edge" rliterman@0: snp_tally_df = pd.concat([snp_tally_df,reject_query_edge]).reset_index(drop=True) rliterman@0: rliterman@0: if reject_ref_edge.shape[0] > 0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t\t- Purged (Ref Edge): {reject_ref_edge.shape[0]}\n") rliterman@0: reject_ref_edge['Filter_Cat'] = "Filtered_Ref_Edge" rliterman@0: snp_tally_df = pd.concat([snp_tally_df,reject_ref_edge]).reset_index(drop=True) rliterman@0: rliterman@0: if reject_both_edge.shape[0] > 0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t\t- Purged (Both Edge): {reject_both_edge.shape[0]}\n") rliterman@0: reject_both_edge['Filter_Cat'] = "Filtered_Both_Edge" rliterman@0: snp_tally_df = pd.concat([snp_tally_df,reject_both_edge]).reset_index(drop=True) rliterman@0: rliterman@0: pass_filter = pass_filter[(pass_filter['Dist_to_Query_End'] >= query_edge) & (pass_filter['Dist_to_Ref_End'] >= ref_edge)].copy() rliterman@0: rliterman@0: helpers.cleanup(verbose=False,remove_all = False) rliterman@0: rliterman@0: assert snp_tally_df.shape[0] + pass_filter.shape[0] == total_snp_count rliterman@0: return_df = pd.concat([pass_filter,snp_tally_df]).reset_index(drop=True).sort_values(by=['Ref_Loc']) rliterman@0: rliterman@0: return return_df.drop(columns=['Cat']).rename({'Filter_Cat':'Cat'}, axis=1) rliterman@0: rliterman@0: def screenSNPDiffs(snpdiffs_file,trim_name, min_cov, min_len, min_iden, ref_edge, query_edge, density_windows, max_snps,reference_id,log_directory): rliterman@0: rliterman@0: screen_start_time = time.time() rliterman@0: rliterman@0: if temp_dir != "": rliterman@0: helpers.set_tempdir(temp_dir) rliterman@0: rliterman@0: # Set CSP2 variables to NA rliterman@0: csp2_screen_snps = purged_length = purged_identity = purged_invalid = purged_indel = purged_lengthIdentity = purged_duplicate = purged_het = purged_density = filtered_ref_edge = filtered_query_edge = filtered_both_edge = "NA" rliterman@0: filtered_snp_df = pd.DataFrame() rliterman@0: good_reference_bed_df = pd.DataFrame() rliterman@0: rliterman@0: # Ensure snpdiffs file exists rliterman@0: if not os.path.exists(snpdiffs_file) or not snpdiffs_file.endswith('.snpdiffs'): rliterman@0: run_failed = True rliterman@0: sys.exit(f"Invalid snpdiffs file provided: {snpdiffs_file}") rliterman@0: rliterman@0: # Ensure header can be read in rliterman@0: try: rliterman@0: header_data = fetchHeaders(snpdiffs_file) rliterman@0: header_query = header_data['Query_ID'][0].replace(trim_name,'') rliterman@0: header_ref = header_data['Reference_ID'][0].replace(trim_name,'') rliterman@0: except: rliterman@0: run_failed = True rliterman@0: sys.exit(f"Error reading headers from snpdiffs file: {snpdiffs_file}") rliterman@0: rliterman@0: # Check snpdiffs orientation rliterman@0: if (header_ref == reference_id): rliterman@0: snpdiffs_orientation = 1 rliterman@0: query_id = header_query rliterman@0: elif (header_query == reference_id): rliterman@0: snpdiffs_orientation = -1 rliterman@0: query_id = header_ref rliterman@0: header_data = swapHeader(header_data) rliterman@0: else: rliterman@0: run_failed = True rliterman@0: sys.exit(f"Error: Reference ID not found in header of {snpdiffs_file}...") rliterman@0: rliterman@0: # Establish log file rliterman@0: log_file = f"{log_directory}/{query_id}__vs__{reference_id}.log" rliterman@0: with open(log_file,"w+") as log: rliterman@0: log.write("Reference Screening for SNP Pipeline Analysis\n") rliterman@0: log.write(f"Query Isolate: {query_id}\n") rliterman@0: log.write(f"Reference Isolate: {reference_id}\n") rliterman@0: log.write(str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))+"\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: if snpdiffs_orientation == 1: rliterman@0: log.write("\t- SNPDiffs file is in the forward orientation\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: else: rliterman@0: log.write("\t- SNPDiffs file is in the reverse orientation\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: rliterman@0: # Set variables from header data rliterman@0: raw_snps = int(header_data['SNPs'][0]) rliterman@0: raw_indels = int(header_data['Indels'][0]) rliterman@0: raw_invalid = int(header_data['Invalid'][0]) rliterman@0: rliterman@0: kmer_similarity = float(header_data['Kmer_Similarity'][0]) rliterman@0: shared_kmers = int(header_data['Shared_Kmers'][0]) rliterman@0: query_unique_kmers = int(header_data['Query_Unique_Kmers'][0]) rliterman@0: reference_unique_kmers = int(header_data['Reference_Unique_Kmers'][0]) rliterman@0: mummer_gsnps = int(header_data['gSNPs'][0]) rliterman@0: mummer_gindels = int(header_data['gIndels'][0]) rliterman@0: rliterman@0: query_bases = int(header_data['Query_Assembly_Bases'][0]) rliterman@0: reference_bases = int(header_data['Reference_Assembly_Bases'][0]) rliterman@0: rliterman@0: query_contigs = int(header_data['Query_Contig_Count'][0]) rliterman@0: reference_contigs = int(header_data['Reference_Contig_Count'][0]) rliterman@0: rliterman@0: raw_query_percent_aligned = float(header_data['Query_Percent_Aligned'][0]) rliterman@0: raw_ref_percent_aligned = float(header_data['Reference_Percent_Aligned'][0]) rliterman@0: rliterman@0: # If the reference is not covered by at least min_cov, STOP rliterman@0: if raw_ref_percent_aligned < min_cov: rliterman@0: query_percent_aligned = raw_query_percent_aligned rliterman@0: reference_percent_aligned = raw_ref_percent_aligned rliterman@0: screen_category = "Low_Coverage" rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- Reference genome coverage: {raw_ref_percent_aligned}% \n") rliterman@0: log.write(f"\t- Query covers less than --min_cov ({min_cov}%)...Screen halted...\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: elif raw_snps + raw_indels + raw_invalid > 10000: rliterman@0: query_percent_aligned = raw_query_percent_aligned rliterman@0: reference_percent_aligned = raw_ref_percent_aligned rliterman@0: screen_category = "SNP_Cutoff" rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- {raw_snps} detected...\n") rliterman@0: log.write("\t- > 10,000 SNPs, indels, or invalid sites detected by MUMmer...Screen halted...\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: else: rliterman@0: rliterman@0: ##### 02: Read in BED/SNP data ##### rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Step 1: Reading in snpdiffs BED/SNP data...") rliterman@0: try: rliterman@0: bed_df,snp_df = parseSNPDiffs(snpdiffs_file,snpdiffs_orientation) rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Done!\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: except Exception as e: rliterman@0: run_failed = True rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\nError reading BED/SNP data from file: {snpdiffs_file}\n{str(e)}") rliterman@0: sys.exit(f"Error reading BED/SNP data from file: {snpdiffs_file}\n{str(e)}") rliterman@0: rliterman@0: ##### 03: Filter genome overlaps ##### rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Step 2: Filtering for short overlaps and low percent identity...") rliterman@0: rliterman@0: good_bed_df = bed_df[(bed_df['Ref_Aligned'] >= min_len) & (bed_df['Perc_Iden'] >= min_iden)].copy() rliterman@0: rliterman@0: if good_bed_df.shape[0] == 0: rliterman@0: screen_category = "Low_Quality_Coverage" rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\n\t- After filtering based on --min_len ({min_len}) and --min_iden ({min_iden}) , no valid alignments remain...Screen halted...\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: rliterman@0: else: rliterman@0: # Create a BED file for alignments that pass basic QC rliterman@0: good_query_bed_df = good_bed_df[['Query_Contig','Query_Start','Query_End']].copy() rliterman@0: good_reference_bed_df = good_bed_df[['Ref_Contig','Ref_Start','Ref_End']].copy() rliterman@0: good_reference_bed_df.loc[:, 'Query_ID'] = query_id rliterman@0: rliterman@0: good_query_aligned = calculate_total_length(BedTool.from_dataframe(good_query_bed_df).sort().merge()) rliterman@0: good_reference_aligned = calculate_total_length(BedTool.from_dataframe(good_reference_bed_df[['Ref_Contig','Ref_Start','Ref_End']]).sort().merge()) rliterman@0: rliterman@0: query_percent_aligned = (good_query_aligned / query_bases) * 100 rliterman@0: reference_percent_aligned = (good_reference_aligned / reference_bases) * 100 rliterman@0: rliterman@0: if reference_percent_aligned < min_cov: rliterman@0: screen_category = "Low_Quality_Coverage" rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\n\t- Raw reference genome coverage was {raw_ref_percent_aligned}% \n") rliterman@0: log.write(f"\t- After filtering based on --min_len ({min_len}) and --min_iden ({min_iden}), reference genome coverage was {reference_percent_aligned:.2f}% \n") rliterman@0: log.write(f"\t- Query covers less than --min_cov ({min_cov}%) of reference after filtering...Screen halted...\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: else: rliterman@0: screen_category = "Pass" rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Done!\n") rliterman@0: log.write(f"\t- Raw reference genome coverage was {raw_ref_percent_aligned}% \n") rliterman@0: log.write(f"\t- After filtering based on --min_len ({min_len}) and --min_iden ({min_iden}), reference genome coverage was {reference_percent_aligned:.2f}% \n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: rliterman@0: # Filter SNPs rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Step 3: Filtering SNPs to get final SNP distances...") rliterman@0: rliterman@0: if raw_snps == 0: rliterman@0: csp2_screen_snps = purged_length = purged_identity = purged_lengthIdentity = purged_indel = purged_invalid = purged_duplicate = purged_het = purged_density = filtered_ref_edge = filtered_query_edge = filtered_both_edge = 0 rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Done!\n") rliterman@0: log.write("\t- No SNPs detected in MUMmer output, no filtering required\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: else: rliterman@0: filtered_snp_df = filterSNPs(snp_df,bed_df,log_file, min_len, min_iden, ref_edge, query_edge, density_windows, max_snps) rliterman@0: rliterman@0: csp2_screen_snps = filtered_snp_df[filtered_snp_df.Cat == "SNP"].shape[0] rliterman@0: purged_length = filtered_snp_df[filtered_snp_df.Cat == "Purged_Length"].shape[0] rliterman@0: purged_identity = filtered_snp_df[filtered_snp_df.Cat == "Purged_Identity"].shape[0] rliterman@0: purged_lengthIdentity = filtered_snp_df[filtered_snp_df.Cat == "Purged_LengthIdentity"].shape[0] rliterman@0: purged_invalid = filtered_snp_df[filtered_snp_df.Cat == "Purged_Invalid"].shape[0] rliterman@0: purged_indel = filtered_snp_df[filtered_snp_df.Cat == "Purged_Indel"].shape[0] rliterman@0: purged_het = filtered_snp_df[filtered_snp_df.Cat == "Purged_Heterozygous"].shape[0] rliterman@0: purged_duplicate = filtered_snp_df[filtered_snp_df.Cat == "Purged_Duplicate"].shape[0] rliterman@0: purged_density = filtered_snp_df[filtered_snp_df.Cat == "Purged_Density"].shape[0] rliterman@0: filtered_query_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Query_Edge"].shape[0] rliterman@0: filtered_ref_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Ref_Edge"].shape[0] rliterman@0: filtered_both_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Both_Edge"].shape[0] rliterman@0: rliterman@0: # Write filtered SNP data to file rliterman@0: snp_file = log_file.replace(".log","_SNPs.tsv") rliterman@0: filtered_snp_df.to_csv(snp_file, sep="\t", index=False) rliterman@0: rliterman@0: filtered_snp_df.loc[:, 'Query_ID'] = query_id rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Done!\n") rliterman@0: log.write(f"\t- {csp2_screen_snps} SNPs detected between {query_id} and {reference_id} after filtering\n") rliterman@0: log.write(f"\t- Full SNP data saved to {snp_file}\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: screen_end_time = time.time() rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"Screening Time: {screen_end_time - screen_start_time:.2f} seconds\n") rliterman@0: rliterman@0: # Clean up pybedtools temp rliterman@0: helpers.cleanup(verbose=False, remove_all=False) rliterman@0: rliterman@0: return ([str(item) for item in [query_id,reference_id,screen_category,csp2_screen_snps, rliterman@0: f"{query_percent_aligned:.2f}",f"{reference_percent_aligned:.2f}", rliterman@0: query_contigs,query_bases,reference_contigs,reference_bases, rliterman@0: raw_snps,purged_length,purged_identity,purged_lengthIdentity,purged_invalid,purged_indel,purged_duplicate,purged_het,purged_density, rliterman@0: filtered_query_edge,filtered_ref_edge,filtered_both_edge, rliterman@0: kmer_similarity,shared_kmers,query_unique_kmers,reference_unique_kmers, rliterman@0: mummer_gsnps,mummer_gindels]],good_reference_bed_df,filtered_snp_df) rliterman@0: rliterman@0: def assessCoverage(query_id,site_list): rliterman@0: rliterman@0: if temp_dir != "": rliterman@0: helpers.set_tempdir(temp_dir) rliterman@0: rliterman@0: if len(site_list) == 0: rliterman@0: return pd.DataFrame(columns=['Ref_Loc','Query_ID','Cat']) rliterman@0: else: rliterman@0: coverage_df = pass_filter_coverage_df[pass_filter_coverage_df['Query_ID'] == query_id].copy() rliterman@0: rliterman@0: if coverage_df.shape[0] == 0: rliterman@0: uncovered_loc_df = pd.DataFrame({ rliterman@0: 'Ref_Loc': site_list, rliterman@0: 'Query_ID': [query_id] * len(site_list), rliterman@0: 'Cat': ["Uncovered"] * len(site_list) rliterman@0: }) rliterman@0: return uncovered_loc_df rliterman@0: else: rliterman@0: coverage_bed = BedTool.from_dataframe(coverage_df[['Ref_Contig','Ref_Start','Ref_End']]).sort() rliterman@0: snp_bed_df = pd.DataFrame([item.split('/') for item in site_list], columns=['Ref_Contig','Ref_End']) rliterman@0: snp_bed_df['Ref_Start'] = snp_bed_df['Ref_End'].astype(float).astype(int) - 1 rliterman@0: snp_bed_df['Ref_Loc'] = site_list rliterman@0: snp_bed = BedTool.from_dataframe(snp_bed_df[['Ref_Contig','Ref_Start','Ref_End','Ref_Loc']]).sort() rliterman@0: rliterman@0: # Ref_Locs from snp_bed that intersect with coverage_bed go into covered_locs, the rest go into uncovered_locs rliterman@0: covered_locs = snp_bed.intersect(coverage_bed, wa=True) rliterman@0: uncovered_locs = snp_bed.intersect(coverage_bed, v=True, wa=True) rliterman@0: rliterman@0: covered_loc_df = pd.DataFrame({ rliterman@0: 'Ref_Loc': [snp.fields[3] for snp in covered_locs], rliterman@0: 'Query_ID': [query_id] * covered_locs.count(), rliterman@0: 'Cat': ["Ref_Base"] * covered_locs.count() rliterman@0: }) if covered_locs.count() > 0 else pd.DataFrame(columns=['Ref_Loc','Query_ID','Cat']) rliterman@0: rliterman@0: uncovered_loc_df = pd.DataFrame({ rliterman@0: 'Ref_Loc': [snp.fields[3] for snp in uncovered_locs], rliterman@0: 'Query_ID': [query_id] * uncovered_locs.count(), rliterman@0: 'Cat': ["Uncovered"] * uncovered_locs.count() rliterman@0: }) if uncovered_locs.count() > 0 else pd.DataFrame(columns=['Ref_Loc','Query_ID','Cat']) rliterman@0: rliterman@0: # Clean up pybedtools temp rliterman@0: helpers.cleanup(verbose=False, remove_all=False) rliterman@0: rliterman@0: return pd.concat([covered_loc_df.drop_duplicates(['Ref_Loc']),uncovered_loc_df]) rliterman@0: rliterman@0: def getPairwise(chunk, sequences, ids): rliterman@0: results = [] rliterman@0: rliterman@0: for i, j in chunk: rliterman@0: seq1, seq2 = sequences[i], sequences[j] rliterman@0: actg_mask1 = np.isin(seq1, list('ACTGactg')) rliterman@0: actg_mask2 = np.isin(seq2, list('ACTGactg')) rliterman@0: cocalled_mask = actg_mask1 & actg_mask2 rliterman@0: rliterman@0: snps_cocalled = np.sum(cocalled_mask) rliterman@0: snp_distance = np.sum((seq1 != seq2) & cocalled_mask) rliterman@0: rliterman@0: results.append([ids[i], ids[j], snp_distance, snps_cocalled]) rliterman@0: rliterman@0: return results rliterman@0: rliterman@0: def parallelAlignment(alignment, chunk_size=5000): rliterman@0: sequences = [np.array(list(record.seq)) for record in alignment] rliterman@0: ids = [record.id for record in alignment] rliterman@0: pairwise_combinations = list(combinations(range(len(sequences)), 2)) rliterman@0: rliterman@0: # Create chunks of pairwise combinations rliterman@0: chunks = [pairwise_combinations[i:i + chunk_size] for i in range(0, len(pairwise_combinations), chunk_size)] rliterman@0: rliterman@0: results = [] rliterman@0: with concurrent.futures.ProcessPoolExecutor() as executor: rliterman@0: future_to_chunk = {executor.submit(getPairwise, chunk, sequences, ids): chunk for chunk in chunks} rliterman@0: for future in concurrent.futures.as_completed(future_to_chunk): rliterman@0: chunk_results = future.result() rliterman@0: results.extend(chunk_results) rliterman@0: return results rliterman@0: rliterman@0: def getFinalPurge(df): rliterman@0: # Returns the 'farthest along' category for a given Ref_Loc rliterman@0: if "Purged_Density" in df['Cat'].values: rliterman@0: return "Purged_Density" rliterman@0: elif "Purged_Heterozygous" in df['Cat'].values: rliterman@0: return "Purged_Heterozygous" rliterman@0: elif "Purged_Indel" in df['Cat'].values: rliterman@0: return "Purged_Indel" rliterman@0: elif "Purged_Invalid" in df['Cat'].values: rliterman@0: return "Purged_Invalid" rliterman@0: elif "Purged_Length" in df['Cat'].values: rliterman@0: return "Purged_Length" rliterman@0: else: rliterman@0: return "Purged_Identity" rliterman@0: rliterman@0: rliterman@0: # Read in arguments rliterman@0: global run_failed rliterman@0: run_failed = False rliterman@0: rliterman@0: start_time = time.time() rliterman@0: rliterman@0: parser = argparse.ArgumentParser(description='CSP2 SNP Pipeline Analysis') rliterman@0: parser.add_argument('--reference_id', type=str, help='Reference Isolate') rliterman@0: parser.add_argument('--output_directory', type=str, help='Output Directory') rliterman@0: parser.add_argument('--log_directory', type=str, help='Log Directory') rliterman@0: parser.add_argument('--snpdiffs_file', type=str, help='Path to SNPdiffs file') rliterman@0: parser.add_argument('--min_cov', type=float, help='Minimum coverage') rliterman@0: parser.add_argument('--min_len', type=int, help='Minimum length') rliterman@0: parser.add_argument('--min_iden', type=float, help='Minimum identity') rliterman@0: parser.add_argument('--ref_edge', type=int, help='Reference edge') rliterman@0: parser.add_argument('--query_edge', type=int, help='Query edge') rliterman@0: parser.add_argument('--density_windows', type=str, help='Density windows') rliterman@0: parser.add_argument('--max_snps', type=str, help='Maximum SNPs') rliterman@0: parser.add_argument('--trim_name', type=str, help='Trim name') rliterman@0: parser.add_argument('--max_missing', type=float, help='Maximum missing') rliterman@0: parser.add_argument('--tmp_dir', type=str, help='Temporary directory') rliterman@0: parser.add_argument('--rescue', type=str, help='Rescue edge SNPs (rescue/norescue)') rliterman@0: args = parser.parse_args() rliterman@0: rliterman@0: reference_id = args.reference_id rliterman@0: output_directory = os.path.abspath(args.output_directory) rliterman@0: log_directory = os.path.abspath(args.log_directory) rliterman@0: log_file = f"{output_directory}/CSP2_SNP_Pipeline.log" rliterman@0: snpdiffs_file = args.snpdiffs_file rliterman@0: min_cov = args.min_cov rliterman@0: min_len = args.min_len rliterman@0: min_iden = args.min_iden rliterman@0: ref_edge = args.ref_edge rliterman@0: query_edge = args.query_edge rliterman@0: density_windows = [int(x) for x in args.density_windows.split(",")] rliterman@0: max_snps = [int(x) for x in args.max_snps.split(",")] rliterman@0: trim_name = args.trim_name rliterman@0: max_missing = args.max_missing rliterman@0: rliterman@0: # Establish log file rliterman@0: with open(log_file,"w+") as log: rliterman@0: log.write("CSP2 SNP Pipeline Analysis\n") rliterman@0: log.write(f"Reference Isolate: {reference_id}\n") rliterman@0: log.write(str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))+"\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: log.write("Reading in SNPDiffs files...") rliterman@0: rliterman@0: rliterman@0: # Read in all lines and ensure each file exists rliterman@0: snpdiffs_list = [line.strip() for line in open(snpdiffs_file, 'r')] rliterman@0: snpdiffs_list = [line for line in snpdiffs_list if line] rliterman@0: for snpdiffs_file in snpdiffs_list: rliterman@0: if not os.path.exists(snpdiffs_file): rliterman@0: run_failed = True rliterman@0: sys.exit("Error: File does not exist: " + snpdiffs_file) rliterman@0: rliterman@0: snpdiffs_list = list(set(snpdiffs_list)) rliterman@0: rliterman@0: if len(snpdiffs_list) == 0: rliterman@0: run_failed = True rliterman@0: sys.exit("No SNPdiffs files provided...") rliterman@0: rliterman@0: with open(log_file, "a+") as log: rliterman@0: log.write("Done!\n") rliterman@0: log.write(f"\t- Read in {len(snpdiffs_list)} SNPdiffs files\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: global temp_dir rliterman@0: if args.tmp_dir != "": rliterman@0: random_temp_id = str(uuid.uuid4()) rliterman@0: temp_dir = f"{os.path.normpath(os.path.abspath(args.tmp_dir))}/{random_temp_id}" rliterman@0: try: rliterman@0: os.mkdir(temp_dir) rliterman@0: helpers.set_tempdir(temp_dir) rliterman@0: except OSError as e: rliterman@0: run_failed = True rliterman@0: print(f"Error: Failed to create directory '{temp_dir}': {e}") rliterman@0: else: rliterman@0: temp_dir = "" rliterman@0: rliterman@0: rescue_edge = str(args.rescue) rliterman@0: if rescue_edge not in ["rescue","norescue"]: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- Unexpected rescue_edge variable ('{rescue_edge}'). Not performing SNP edge rescuing (any SNPs found within {query_edge}bp of a query contig edge will be purged)...\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rescue_edge = "norescue" rliterman@0: elif rescue_edge == "rescue": rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- Rescuing edge SNPs within {query_edge}bp of query contig edges if found more centrally in another query...\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: else: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- Not performing SNP edge rescuing (any SNPs found within {query_edge}bp of a query contig edge will be purged)...\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: try: rliterman@0: # Establish output files rliterman@0: reference_screening_file = f"{output_directory}/Reference_Screening.tsv" rliterman@0: locus_category_file = f"{output_directory}/Locus_Categories.tsv" rliterman@0: query_coverage_file = f"{output_directory}/Query_Coverage.tsv" rliterman@0: raw_loclist = f"{output_directory}/snplist.txt" rliterman@0: raw_alignment = f"{output_directory}/snpma.fasta" rliterman@0: preserved_loclist = f"{output_directory}/snplist_preserved.txt" rliterman@0: preserved_alignment_file = f"{output_directory}/snpma_preserved.fasta" rliterman@0: raw_pairwise = f"{output_directory}/snp_distance_pairwise.tsv" rliterman@0: raw_matrix = f"{output_directory}/snp_distance_matrix.tsv" rliterman@0: preserved_pairwise = f"{output_directory}/snp_distance_pairwise_preserved.tsv" rliterman@0: preserved_matrix = f"{output_directory}/snp_distance_matrix_preserved.tsv" rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Screening all queries against reference...") rliterman@0: with concurrent.futures.ProcessPoolExecutor() as executor: rliterman@0: results = [executor.submit(screenSNPDiffs,snp_diff_file,trim_name, min_cov, min_len, min_iden, ref_edge, query_edge, density_windows, max_snps,reference_id,log_directory) for snp_diff_file in snpdiffs_list] rliterman@0: rliterman@0: # Combine results into a dataframe rliterman@0: output_columns = ['Query_ID','Reference_ID','Screen_Category','CSP2_Screen_SNPs', rliterman@0: 'Query_Percent_Aligned','Reference_Percent_Aligned', rliterman@0: 'Query_Contigs','Query_Bases','Reference_Contigs','Reference_Bases', rliterman@0: 'Raw_SNPs','Purged_Length','Purged_Identity','Purged_LengthIdentity','Purged_Invalid','Purged_Indel','Purged_Duplicate','Purged_Het','Purged_Density', rliterman@0: 'Filtered_Query_Edge','Filtered_Ref_Edge','Filtered_Both_Edge', rliterman@0: 'Kmer_Similarity','Shared_Kmers','Query_Unique_Kmers','Reference_Unique_Kmers', rliterman@0: 'MUMmer_gSNPs','MUMmer_gIndels'] rliterman@0: rliterman@0: # Save reference screening rliterman@0: results_df = pd.DataFrame([item.result()[0] for item in results], columns = output_columns) rliterman@0: results_df.to_csv(reference_screening_file, sep="\t", index=False) rliterman@0: rliterman@0: # Get reference bed dfs rliterman@0: covered_df = pd.concat([item.result()[1] for item in results]) rliterman@0: rliterman@0: # Get snp dfs rliterman@0: filtered_snp_df = pd.concat([item.result()[2] for item in results]) rliterman@0: rliterman@0: # Separate isolates that pass QC rliterman@0: pass_qc_isolates = list(set(results_df[results_df['Screen_Category'] == "Pass"]['Query_ID'])) rliterman@0: fail_qc_isolates = list(set(results_df[results_df['Screen_Category'] != "Pass"]['Query_ID'])) rliterman@0: rliterman@0: if len(pass_qc_isolates) == 0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Done!\n") rliterman@0: log.write(f"\t- Reference screening data saved to {reference_screening_file}\n") rliterman@0: log.write(f"\t- Of {len(snpdiffs_list)} comparisons, no isolates passed QC. Pipeline cannot continue.\n") rliterman@0: log.write(f"\t- {len(fail_qc_isolates)} comparisons failed QC\n") rliterman@0: for isolate in fail_qc_isolates: rliterman@0: isolate_category = results_df[results_df['Query_ID'] == isolate]['Screen_Category'].values[0] rliterman@0: log.write(f"\t\t- {isolate}: {isolate_category}\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: sys.exit(0) rliterman@0: else: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Done!\n") rliterman@0: log.write(f"\t- Reference screening data saved to {reference_screening_file}\n") rliterman@0: log.write(f"\t- Of {len(snpdiffs_list)} comparisons, {len(pass_qc_isolates)} covered at least {min_cov}% of the reference genome after removing poor alignments\n") rliterman@0: if len(fail_qc_isolates) > 0: rliterman@0: log.write(f"\t- {len(fail_qc_isolates)} comparisons failed QC\n") rliterman@0: for isolate in fail_qc_isolates: rliterman@0: isolate_category = results_df[results_df['Query_ID'] == isolate]['Screen_Category'].values[0] rliterman@0: log.write(f"\t\t- {isolate}: {isolate_category}\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"Compiling SNPs across {len(pass_qc_isolates)} samples...\n") rliterman@0: rliterman@0: if filtered_snp_df.shape[0] == 0: rliterman@0: snp_count = 0 rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("\t- No SNPs detected across samples...Skipping to output...\n") rliterman@0: else: rliterman@0: rliterman@0: # Remove samples that failed QC rliterman@0: pass_filter_snps = filtered_snp_df[filtered_snp_df['Query_ID'].isin(pass_qc_isolates)].copy() rliterman@0: pass_filter_snp_list = list(set(pass_filter_snps['Ref_Loc'])) rliterman@0: pass_filter_snp_count = len(pass_filter_snp_list) rliterman@0: rliterman@0: global pass_filter_coverage_df rliterman@0: pass_filter_coverage_df = covered_df[covered_df['Query_ID'].isin(pass_qc_isolates)].copy() rliterman@0: rliterman@0: # Get SNP counts rliterman@0: snp_df = pass_filter_snps[pass_filter_snps['Cat'] == "SNP"].copy() rliterman@0: rliterman@0: if snp_df.shape[0] == 0: rliterman@0: snp_count = 0 rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- {pass_filter_snp_count} total SNPs detected across all samples...\n") rliterman@0: log.write("\t- No SNPs passed QC filtering in any sample...Skipping to output...\n") rliterman@0: rliterman@0: else: rliterman@0: snp_list = list(set(snp_df['Ref_Loc'])) rliterman@0: snp_count = len(snp_list) rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- {pass_filter_snp_count} total SNPs detected across all samples...\n") rliterman@0: log.write(f"\t- {snp_count} unique SNPs passed QC filtering in at least one sample...\n") rliterman@0: rliterman@0: # Note SNPs lost irrevocably to reference edge trimming rliterman@0: ref_edge_df = pass_filter_snps[pass_filter_snps['Cat'].isin(["Filtered_Ref_Edge",'Filtered_Both_Edge'])].copy() rliterman@0: if ref_edge_df.shape[0] > 0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- {ref_edge_df['Ref_Loc'].nunique()} unique SNPs were within {ref_edge}bp of a reference contig end and were not considered in any query...\n") rliterman@0: rliterman@0: # Create Ref_Base df rliterman@0: ref_base_df = snp_df[['Ref_Loc', 'Ref_Base']].copy().drop_duplicates().rename(columns = {'Ref_Base':'Query_Base'}) rliterman@0: ref_base_df.loc[:,'Query_ID'] = reference_id rliterman@0: ref_base_df.loc[:,'Cat'] = "Reference_Isolate" rliterman@0: ref_base_df = ref_base_df.loc[:,['Ref_Loc','Query_ID','Query_Base','Cat']] rliterman@0: rliterman@0: # Rescue SNPs that are near the edge if they are valid SNPs in other samples rliterman@0: rescued_edge_df = pass_filter_snps[(pass_filter_snps['Cat'] == "Filtered_Query_Edge") & (pass_filter_snps['Ref_Loc'].isin(snp_list))].copy() rliterman@0: rliterman@0: if rescue_edge == "norescue": rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("\t- Skipping edge resucing...\n") rliterman@0: rliterman@0: elif rescued_edge_df.shape[0] > 0: rliterman@0: rliterman@0: # Remove rescued sites from pass_filter_snps rliterman@0: rescue_merge = pass_filter_snps.merge(rescued_edge_df, indicator=True, how='outer') rliterman@0: pass_filter_snps = rescue_merge[rescue_merge['_merge'] == 'left_only'].drop(columns=['_merge']).copy() rliterman@0: rliterman@0: # Add rescued SNPs to snp_df rliterman@0: rescued_edge_df.loc[:,'Cat'] = "Rescued_SNP" rliterman@0: snp_df = pd.concat([snp_df,rescued_edge_df]).reset_index(drop=True) rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- Rescued {rescued_edge_df.shape[0]} query SNPs that fell within {query_edge}bp of the query contig edge...\n") rliterman@0: rliterman@0: else: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- No query SNPs that fell within {query_edge}bp of the query contig edge were rescued...\n") rliterman@0: rliterman@0: # Gather base data for all valid SNPs rliterman@0: snp_base_df = snp_df[['Ref_Loc','Query_ID','Query_Base','Cat']].copy() rliterman@0: rliterman@0: # Process purged sites rliterman@0: purged_snp_df = pd.DataFrame(columns=['Ref_Loc','Query_ID','Query_Base','Cat']) rliterman@0: purged_df = pass_filter_snps[pass_filter_snps['Cat'] != "SNP"].copy() rliterman@0: rliterman@0: if purged_df.shape[0] > 0: rliterman@0: rliterman@0: # Remove rows from purged_df if the Ref_Loc/Query_ID pair is already in snp_base_df rliterman@0: purged_df = purged_df[~purged_df[['Ref_Loc','Query_ID']].apply(tuple,1).isin(snp_base_df[['Ref_Loc','Query_ID']].apply(tuple,1))].copy() rliterman@0: rliterman@0: if purged_df.shape[0] > 0: rliterman@0: rliterman@0: # Get purged SNPs where no query has a valid SNP rliterman@0: non_snp_df = purged_df[~purged_df['Ref_Loc'].isin(snp_list)].copy() rliterman@0: if non_snp_df.shape[0] > 0: rliterman@0: non_snp_merge = purged_df.merge(non_snp_df, indicator=True, how='outer') rliterman@0: purged_df = non_snp_merge[non_snp_merge['_merge'] == 'left_only'].drop(columns=['_merge']).copy() rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- {len(list(set(non_snp_df['Ref_Loc'])))} unique SNPs were purged in all queries they were found in, and were not considered in the final dataset...\n") rliterman@0: rliterman@0: if purged_df.shape[0] > 0: rliterman@0: rliterman@0: purged_snp_df = purged_df[['Ref_Loc','Query_ID']].copy().drop_duplicates() rliterman@0: purged_snp_df.loc[:, 'Query_Base'] = "N" rliterman@0: final_purge_df = purged_df.groupby(['Ref_Loc','Query_ID']).apply(getFinalPurge).reset_index().rename(columns={0:'Cat'}) rliterman@0: purged_snp_df = purged_snp_df.merge(final_purge_df, on=['Ref_Loc','Query_ID'], how='inner') rliterman@0: rliterman@0: # Genomic positions that do not occur- in the SNP data are either uncovered or match the reference base rliterman@0: missing_df = pd.DataFrame(columns=['Ref_Loc','Query_ID','Query_Base','Cat']) rliterman@0: rliterman@0: covered_snps = pd.concat([snp_base_df,purged_snp_df]).copy() rliterman@0: ref_loc_sets = covered_snps.groupby('Query_ID')['Ref_Loc'].agg(set).to_dict() rliterman@0: isolates_with_missing = [isolate for isolate in pass_qc_isolates if len(set(snp_list) - ref_loc_sets.get(isolate, set())) > 0] rliterman@0: rliterman@0: uncovered_df = pd.DataFrame() rliterman@0: rliterman@0: if isolates_with_missing: rliterman@0: isolate_data = [(isolate, list(set(snp_list) - ref_loc_sets.get(isolate, set()))) for isolate in isolates_with_missing] rliterman@0: rliterman@0: with concurrent.futures.ProcessPoolExecutor() as executor: rliterman@0: results = [executor.submit(assessCoverage, query, sites) for query, sites in isolate_data] rliterman@0: coverage_dfs = [result.result() for result in concurrent.futures.as_completed(results)] rliterman@0: rliterman@0: coverage_df = pd.concat(coverage_dfs) rliterman@0: covered_df = coverage_df[coverage_df['Cat'] == 'Ref_Base'] rliterman@0: uncovered_df = coverage_df[coverage_df['Cat'] == 'Uncovered'] rliterman@0: rliterman@0: if not uncovered_df.empty: rliterman@0: uncovered_df.loc[:, 'Query_Base'] = "?" rliterman@0: missing_df = pd.concat([missing_df, uncovered_df[['Ref_Loc', 'Query_ID', 'Query_Base', 'Cat']]]) rliterman@0: rliterman@0: if not covered_df.empty: rliterman@0: ref_base_snp_df = covered_df.merge(ref_base_df[['Ref_Loc', 'Query_Base']], on='Ref_Loc', how='left') rliterman@0: missing_df = pd.concat([missing_df, ref_base_snp_df[['Ref_Loc', 'Query_ID', 'Query_Base', 'Cat']]]) rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("\t- Processed coverage information...\n") rliterman@0: rliterman@0: final_snp_df = pd.concat([snp_base_df,purged_snp_df,missing_df,ref_base_df]).sort_values(by=['Ref_Loc','Query_ID']).reset_index(drop=True) rliterman@0: snp_counts = final_snp_df.groupby('Query_ID')['Ref_Loc'].count().reset_index().rename(columns={'Ref_Loc':'SNP_Count'}) rliterman@0: rliterman@0: # Assert that all snp_counts == snp_count rliterman@0: assert snp_counts['SNP_Count'].nunique() == 1 rliterman@0: assert snp_counts['SNP_Count'].values[0] == snp_count rliterman@0: rliterman@0: # Get locus coverage stats rliterman@0: snp_coverage_df = final_snp_df[final_snp_df['Cat'].isin(['SNP','Rescued_SNP'])].groupby('Ref_Loc')['Query_ID'].count().reset_index().rename(columns={'Query_ID':'SNP_Count'}) rliterman@0: ref_base_coverage_df = final_snp_df[final_snp_df['Cat'].isin(["Ref_Base","Reference_Isolate"])].groupby('Ref_Loc')['Query_ID'].count().reset_index().rename(columns={'Query_ID':'Ref_Base_Count'}) rliterman@0: rliterman@0: if uncovered_df.shape[0] > 0: rliterman@0: uncovered_count_df = final_snp_df[final_snp_df['Cat'] == "Uncovered"].groupby('Ref_Loc')['Query_ID'].count().reset_index().rename(columns={'Query_ID':'Uncovered_Count'}).copy() rliterman@0: else: rliterman@0: uncovered_count_df = pd.DataFrame(columns=['Ref_Loc','Uncovered_Count']) rliterman@0: rliterman@0: possible_purged_cols = ['Purged_Length','Purged_Identity','Purged_Invalid','Purged_Indel','Purged_Heterozygous','Purged_Density'] rliterman@0: if purged_snp_df.shape[0] > 0: rliterman@0: purged_count_df = final_snp_df[final_snp_df['Cat'].isin(possible_purged_cols)].groupby('Ref_Loc')['Query_ID'].count().reset_index().rename(columns={'Query_ID':'Purged_Count'}).copy() rliterman@0: else: rliterman@0: purged_count_df = pd.DataFrame(columns=['Ref_Loc','Purged_Count']) rliterman@0: rliterman@0: locus_coverage_df = snp_coverage_df.merge(ref_base_coverage_df, how='outer', on='Ref_Loc').merge(uncovered_count_df, how='outer', on='Ref_Loc').merge(purged_count_df, how='outer', on='Ref_Loc').fillna(0) rliterman@0: locus_coverage_df.loc[:, ['SNP_Count','Ref_Base_Count','Uncovered_Count','Purged_Count']] = locus_coverage_df.loc[:, ['SNP_Count','Ref_Base_Count','Uncovered_Count','Purged_Count']].astype(int) rliterman@0: locus_coverage_df['Missing_Ratio'] = ((locus_coverage_df['Uncovered_Count'] + locus_coverage_df['Purged_Count']) / (1+len(pass_qc_isolates))) * 100 rliterman@0: locus_coverage_df.to_csv(locus_category_file, sep="\t", index=False) rliterman@0: rliterman@0: # Get isolate coverage stats rliterman@0: min_isolate_cols = ['Query_ID','SNP','Ref_Base','Percent_Missing','Purged','Uncovered','Rescued_SNP','Purged_Ref_Edge'] rliterman@0: isolate_coverage_df = final_snp_df.groupby('Query_ID')['Cat'].value_counts().unstack().fillna(0).astype(float).astype(int).reset_index().drop(columns=['Reference_Isolate']) rliterman@0: isolate_coverage_df.loc[isolate_coverage_df['Query_ID'] == reference_id, 'Ref_Base'] = snp_count rliterman@0: rliterman@0: if "Rescued_SNP" not in isolate_coverage_df.columns.tolist(): rliterman@0: isolate_coverage_df.loc[:,'Rescued_SNP'] = 0 rliterman@0: isolate_coverage_df['SNP'] = isolate_coverage_df['SNP'] + isolate_coverage_df['Rescued_SNP'] rliterman@0: rliterman@0: for col in ['Uncovered'] + possible_purged_cols: rliterman@0: if col not in isolate_coverage_df.columns.tolist(): rliterman@0: isolate_coverage_df.loc[:,col] = 0 rliterman@0: rliterman@0: isolate_coverage_df['Purged'] = isolate_coverage_df[possible_purged_cols].sum(axis=1) rliterman@0: rliterman@0: isolate_coverage_df['Percent_Missing'] = (isolate_coverage_df['Uncovered'] + isolate_coverage_df['Purged'])/(isolate_coverage_df['Uncovered'] + isolate_coverage_df['Purged'] + isolate_coverage_df['Ref_Base'] + isolate_coverage_df['SNP']) * 100 rliterman@0: rliterman@0: isolate_coverage_df.loc[:,'Purged_Ref_Edge'] = 0 rliterman@0: if ref_edge_df.shape[0] > 0: rliterman@0: isolate_coverage_df.loc[isolate_coverage_df['Query_ID'] == reference_id, 'Purged_Ref_Edge'] = ref_edge_df['Ref_Loc'].nunique() rliterman@0: rliterman@0: isolate_coverage_df = isolate_coverage_df[min_isolate_cols + possible_purged_cols].sort_values(by = 'Percent_Missing',ascending = False).reset_index(drop=True) rliterman@0: isolate_coverage_df.to_csv(query_coverage_file, sep="\t", index=False) rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- SNP coverage information: {locus_category_file}\n") rliterman@0: log.write(f"\t- Query coverage information: {query_coverage_file}\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Processing alignment data...") rliterman@0: rliterman@0: alignment_df = final_snp_df[['Query_ID','Ref_Loc','Query_Base']].copy().rename(columns={'Query_Base':'Base'}).pivot(index='Query_ID', columns='Ref_Loc', values='Base') rliterman@0: csp2_ordered = alignment_df.columns rliterman@0: rliterman@0: with open(raw_loclist,"w+") as loclist: rliterman@0: loclist.write("\n".join(csp2_ordered)+"\n") rliterman@0: rliterman@0: seq_records = [SeqRecord(Seq(''.join(row)), id=query,description='') for query,row in alignment_df.iterrows()] rliterman@0: rliterman@0: alignment = MultipleSeqAlignment(seq_records) rliterman@0: AlignIO.write(alignment,raw_alignment,"fasta") rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Done!\n") rliterman@0: log.write(f"\t- Saved alignment of {snp_count} SNPs to {raw_alignment}\n") rliterman@0: log.write(f"\t- Saved ordered loc list to {raw_loclist}\n") rliterman@0: rliterman@0: if max_missing == float(100): rliterman@0: locs_pass_missing = csp2_ordered rliterman@0: preserved_alignment = alignment rliterman@0: rliterman@0: AlignIO.write(preserved_alignment,preserved_alignment_file,"fasta") rliterman@0: with open(preserved_loclist,"w+") as loclist: rliterman@0: loclist.write("\n".join(csp2_ordered)+"\n") rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Skipping SNP preservation step...\n") rliterman@0: log.write(f"\t- Saved duplicate alignment to {preserved_alignment_file}\n") rliterman@0: log.write(f"\t- Saved duplicate ordered loc list to {preserved_loclist}\n") rliterman@0: else: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- Preserving SNPs with at most {max_missing}% missing data...\n") rliterman@0: rliterman@0: # Parse missing data rliterman@0: locs_pass_missing = list(set(locus_coverage_df[locus_coverage_df['Missing_Ratio'] <= max_missing]['Ref_Loc'])) rliterman@0: rliterman@0: if len(locs_pass_missing) == 0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- Of {snp_count} SNPs, no SNPs pass the {max_missing}% missing data threshold...\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: else: rliterman@0: preserved_alignment_df = alignment_df[locs_pass_missing].copy() rliterman@0: rliterman@0: preserved_ordered = preserved_alignment_df.columns rliterman@0: with open(preserved_loclist,"w+") as loclist: rliterman@0: loclist.write("\n".join(preserved_ordered)+"\n") rliterman@0: rliterman@0: seq_records = [SeqRecord(Seq(''.join(row)), id=query,description='') for query,row in preserved_alignment_df.iterrows()] rliterman@0: preserved_alignment = MultipleSeqAlignment(seq_records) rliterman@0: AlignIO.write(preserved_alignment,preserved_alignment_file,"fasta") rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write(f"\t- Of {snp_count} SNPs, {len(locs_pass_missing)} SNPs pass the {max_missing}% missing data threshold...\n") rliterman@0: log.write(f"\t- Saved preserved alignment to {preserved_alignment_file}\n") rliterman@0: log.write(f"\t- Saved preserved ordered loc list to {preserved_loclist}\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Processing pairwise comparisons files...") rliterman@0: rliterman@0: # Get pairwise comparisons between all pass_qc_isolates and reference_id rliterman@0: pairwise_combinations = [sorted(x) for x in list(combinations([reference_id] + pass_qc_isolates, 2))] rliterman@0: rliterman@0: if snp_count == 0: rliterman@0: pairwise_df = pd.DataFrame([(pairwise[0], pairwise[1], 0,np.nan) for pairwise in pairwise_combinations],columns = ['Query_1','Query_2','SNP_Distance','SNPs_Cocalled']) rliterman@0: preserved_pairwise_df = pairwise_df.copy() rliterman@0: rliterman@0: pairwise_df.to_csv(raw_pairwise, sep="\t", index=False) rliterman@0: preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False) rliterman@0: rliterman@0: # Create matrix rliterman@0: idx = sorted(set(pairwise_df['Query_1']).union(pairwise_df['Query_2'])) rliterman@0: mirrored_distance_df = pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f')) rliterman@0: mirrored_distance_df.index.name = '' rliterman@0: mirrored_distance_df.to_csv(raw_matrix,sep="\t") rliterman@0: mirrored_distance_df.to_csv(preserved_matrix,sep="\t") rliterman@0: rliterman@0: else: rliterman@0: raw_distance_results = parallelAlignment(alignment) rliterman@0: raw_pairwise_df = pd.DataFrame(raw_distance_results, columns=['Query_1', 'Query_2', 'SNP_Distance', 'SNPs_Cocalled']) rliterman@0: raw_pairwise_df.to_csv(raw_pairwise, sep="\t", index=False) rliterman@0: rliterman@0: if len(locs_pass_missing) == snp_count: rliterman@0: preserved_pairwise_df = raw_pairwise_df.copy() rliterman@0: preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False) rliterman@0: elif len(locs_pass_missing) == 0: rliterman@0: preserved_pairwise_df = pd.DataFrame([(pairwise[0], pairwise[1], 0,np.nan) for pairwise in pairwise_combinations],columns = ['Query_1','Query_2','SNP_Distance','SNPs_Cocalled']) rliterman@0: preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False) rliterman@0: else: rliterman@0: preserved_distance_results = parallelAlignment(preserved_alignment) rliterman@0: preserved_pairwise_df = pd.DataFrame(preserved_distance_results, columns=['Query_1', 'Query_2', 'SNP_Distance', 'SNPs_Cocalled']) rliterman@0: preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False) rliterman@0: rliterman@0: # Create matrix rliterman@0: idx = sorted(set(raw_pairwise_df['Query_1']).union(raw_pairwise_df['Query_2'])) rliterman@0: mirrored_distance_df = raw_pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f')) rliterman@0: mirrored_distance_df.index.name = '' rliterman@0: mirrored_distance_df.to_csv(raw_matrix,sep="\t") rliterman@0: rliterman@0: idx = sorted(set(preserved_pairwise_df['Query_1']).union(preserved_pairwise_df['Query_2'])) rliterman@0: mirrored_distance_df = preserved_pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f')) rliterman@0: mirrored_distance_df.index.name = '' rliterman@0: mirrored_distance_df.to_csv(preserved_matrix,sep="\t") rliterman@0: rliterman@0: # Clean up pybedtools temp rliterman@0: helpers.cleanup(verbose=False,remove_all = False) rliterman@0: rliterman@0: end_time = time.time() rliterman@0: with open(log_file,"a+") as log: rliterman@0: log.write("Done!\n") rliterman@0: if snp_count == 0: rliterman@0: log.write(f"\t- No SNPs detected, zeroed pairwise distance files saved to {raw_pairwise}/{preserved_pairwise}/{raw_matrix}/{preserved_matrix}\n") rliterman@0: else: rliterman@0: log.write(f"\t- Saved raw pairwise distances to {raw_pairwise}\n") rliterman@0: log.write(f"\t- Saved raw pairwise matrix to {raw_matrix}\n") rliterman@0: rliterman@0: if max_missing == float(100): rliterman@0: log.write("Skipped SNP preservation step...\n") rliterman@0: log.write(f"\t- Saved duplicated preserved pairwise distances to {preserved_pairwise}\n") rliterman@0: log.write(f"\t- Saved duplicated preserved pairwise matrix to {preserved_matrix}\n") rliterman@0: elif len(locs_pass_missing) == 0: rliterman@0: log.write(f"\t- No SNPs passed the {max_missing}% missing data threshold, zeroed pairwise distance files saved to {preserved_pairwise}/{preserved_matrix}\n") rliterman@0: else: rliterman@0: log.write(f"\t- Saved preserved pairwise distances to {preserved_pairwise}\n") rliterman@0: log.write(f"\t- Saved preserved pairwise matrix to {preserved_matrix}\n") rliterman@0: log.write(f"Total Time: {end_time - start_time:.2f} seconds\n") rliterman@0: log.write("-------------------------------------------------------\n\n") rliterman@0: rliterman@0: except: rliterman@0: run_failed = True rliterman@0: print("Exception occurred:\n", traceback.format_exc()) rliterman@0: finally: rliterman@0: helpers.cleanup(verbose=False, remove_all=False) rliterman@0: if temp_dir != "": rliterman@0: shutil.rmtree(temp_dir) rliterman@0: if run_failed: rliterman@0: sys.exit(1)