comparison CSP2/bin/runSNPPipeline.py @ 39:93393808f415

"planemo upload"
author rliterman
date Thu, 12 Dec 2024 13:53:15 -0500
parents 893a6993efe3
children

--- CSP2/bin/runSNPPipeline.py@38:ee512a230a1e
+++ CSP2/bin/runSNPPipeline.py@39:93393808f415
@@ -505,11 +505,12 @@
 filtered_ref_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Ref_Edge"].shape[0]
 filtered_both_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Both_Edge"].shape[0]
 
 # Write filtered SNP data to file
 snp_file = log_file.replace(".log","_SNPs.tsv")
-filtered_snp_df.to_csv(snp_file, sep="\t", index=False)
+with open(snp_file,"w") as f:
+    filtered_snp_df.to_csv(f, sep="\t", index=False)
 
 filtered_snp_df.loc[:, 'Query_ID'] = query_id
 
 with open(log_file,"a+") as log:
     log.write("Done!\n")
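
Every hunk in this changeset makes the same substitution: DataFrame.to_csv calls that previously took an output path now receive an explicitly opened file handle, which pandas accepts just as well (to_csv writes to any object with a write() method). A minimal before/after sketch of the pattern, using a hypothetical DataFrame and output path rather than the pipeline's own objects:

# Sketch only: `df` and `out_tsv` are illustrative, not taken from runSNPPipeline.py.
import pandas as pd

df = pd.DataFrame({"Ref_Loc": ["chr1/100"], "SNP_Count": [1]})
out_tsv = "example_SNPs.tsv"

# Before this changeset: pandas opens and closes the path itself.
df.to_csv(out_tsv, sep="\t", index=False)

# After this changeset: the caller opens the file and passes the handle,
# so the file's lifetime is controlled by the with-block.
with open(out_tsv, "w") as f:
    df.to_csv(f, sep="\t", index=False)
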
@@ -748,11 +749,12 @@
 'Kmer_Similarity','Shared_Kmers','Query_Unique_Kmers','Reference_Unique_Kmers',
 'MUMmer_gSNPs','MUMmer_gIndels']
 
 # Save reference screening
 results_df = pd.DataFrame([item.result()[0] for item in results], columns = output_columns)
-results_df.to_csv(reference_screening_file, sep="\t", index=False)
+with open(reference_screening_file,"w") as f:
+    results_df.to_csv(f, sep="\t", index=False)
 
 # Get reference bed dfs
 covered_df = pd.concat([item.result()[1] for item in results])
 
 # Get snp dfs
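
The `[item.result()[0] for item in results]` and `[item.result()[1] for item in results]` comprehensions in the hunk above suggest that `results` holds concurrent.futures futures whose workers return a (stats row, coverage table) pair. A sketch under that assumption; `screen_isolate`, its return values, and the column names are hypothetical stand-ins:

import pandas as pd
from concurrent.futures import ProcessPoolExecutor

def screen_isolate(query_id):
    # Hypothetical worker: element 0 is a screening-stats row, element 1 a coverage table.
    stats_row = [query_id, 99.1]
    coverage_df = pd.DataFrame({"Query_ID": [query_id], "Covered_Bases": [1000]})
    return stats_row, coverage_df

output_columns = ["Query_ID", "Percent_Identity"]  # illustrative, not the pipeline's header list

if __name__ == "__main__":
    with ProcessPoolExecutor() as executor:
        results = [executor.submit(screen_isolate, q) for q in ["q1", "q2"]]
    # Same collection pattern as in the hunk: element 0 feeds the stats frame,
    # element 1 feeds the concatenated coverage frame.
    results_df = pd.DataFrame([item.result()[0] for item in results], columns=output_columns)
    covered_df = pd.concat([item.result()[1] for item in results])
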
@@ -938,11 +940,12 @@
 purged_count_df = pd.DataFrame(columns=['Ref_Loc','Purged_Count'])
 
 locus_coverage_df = snp_coverage_df.merge(ref_base_coverage_df, how='outer', on='Ref_Loc').merge(uncovered_count_df, how='outer', on='Ref_Loc').merge(purged_count_df, how='outer', on='Ref_Loc').fillna(0)
 locus_coverage_df.loc[:, ['SNP_Count','Ref_Base_Count','Uncovered_Count','Purged_Count']] = locus_coverage_df.loc[:, ['SNP_Count','Ref_Base_Count','Uncovered_Count','Purged_Count']].astype(int)
 locus_coverage_df['Missing_Ratio'] = ((locus_coverage_df['Uncovered_Count'] + locus_coverage_df['Purged_Count']) / (1+len(pass_qc_isolates))) * 100
-locus_coverage_df.to_csv(locus_category_file, sep="\t", index=False)
+with open(locus_category_file,"w") as f:
+    locus_coverage_df.to_csv(f, sep="\t", index=False)
 
 # Get isolate coverage stats
 min_isolate_cols = ['Query_ID','SNP','Ref_Base','Percent_Missing','Purged','Uncovered','Rescued_SNP','Purged_Ref_Edge']
 isolate_coverage_df = final_snp_df.groupby('Query_ID')['Cat'].value_counts().unstack().fillna(0).astype(float).astype(int).reset_index().drop(columns=['Reference_Isolate'])
 isolate_coverage_df.loc[isolate_coverage_df['Query_ID'] == reference_id, 'Ref_Base'] = snp_count
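
The chained outer merges in this hunk combine per-locus tallies that may each cover a different subset of loci; fillna(0) then zeroes whatever a given tally never reported. A small worked example with made-up loci and counts, and a hypothetical isolate count in place of len(pass_qc_isolates):

import pandas as pd

# Made-up per-locus tallies; each frame knows about a different subset of loci.
snp_coverage_df = pd.DataFrame({"Ref_Loc": ["chr1/100", "chr1/250"], "SNP_Count": [4, 1]})
ref_base_coverage_df = pd.DataFrame({"Ref_Loc": ["chr1/100"], "Ref_Base_Count": [6]})
uncovered_count_df = pd.DataFrame({"Ref_Loc": ["chr1/250"], "Uncovered_Count": [2]})
purged_count_df = pd.DataFrame(columns=["Ref_Loc", "Purged_Count"])  # empty fallback, as in the hunk

locus_coverage_df = (
    snp_coverage_df
    .merge(ref_base_coverage_df, how="outer", on="Ref_Loc")
    .merge(uncovered_count_df, how="outer", on="Ref_Loc")
    .merge(purged_count_df, how="outer", on="Ref_Loc")
    .fillna(0)
)
count_cols = ["SNP_Count", "Ref_Base_Count", "Uncovered_Count", "Purged_Count"]
locus_coverage_df[count_cols] = locus_coverage_df[count_cols].astype(int)

# Hypothetical isolate count standing in for len(pass_qc_isolates).
n_isolates = 3
locus_coverage_df["Missing_Ratio"] = (
    (locus_coverage_df["Uncovered_Count"] + locus_coverage_df["Purged_Count"]) / (1 + n_isolates)
) * 100
print(locus_coverage_df)
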
@@ -962,11 +965,12 @@
 isolate_coverage_df.loc[:,'Purged_Ref_Edge'] = 0
 if ref_edge_df.shape[0] > 0:
     isolate_coverage_df.loc[isolate_coverage_df['Query_ID'] == reference_id, 'Purged_Ref_Edge'] = ref_edge_df['Ref_Loc'].nunique()
 
 isolate_coverage_df = isolate_coverage_df[min_isolate_cols + possible_purged_cols].sort_values(by = 'Percent_Missing',ascending = False).reset_index(drop=True)
-isolate_coverage_df.to_csv(query_coverage_file, sep="\t", index=False)
+with open(query_coverage_file,'w') as f:
+    isolate_coverage_df.to_csv(f, sep="\t", index=False)
 
 with open(log_file,"a+") as log:
     log.write(f"\t- SNP coverage information: {locus_category_file}\n")
     log.write(f"\t- Query coverage information: {query_coverage_file}\n")
     log.write("-------------------------------------------------------\n\n")
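
The isolate coverage summary finished in the two hunks above starts from a groupby + value_counts + unstack reshaping of the per-SNP table: one row per Query_ID, one integer column per Cat value. A toy run of that reshaping step, with a made-up final_snp_df:

import pandas as pd

# Made-up per-SNP call table; Cat labels mimic the pipeline's categories.
final_snp_df = pd.DataFrame({
    "Query_ID": ["q1", "q1", "q1", "q2", "q2"],
    "Cat": ["SNP", "SNP", "Ref_Base", "SNP", "Uncovered"],
})

isolate_coverage_df = (
    final_snp_df.groupby("Query_ID")["Cat"]
    .value_counts()
    .unstack()      # Cat values become columns
    .fillna(0)      # categories a query never produced become 0
    .astype(int)
    .reset_index()
)
# One row per Query_ID; one count column per Cat value.
print(isolate_coverage_df)
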
@@ -1037,46 +1041,56 @@
 
 if snp_count == 0:
     pairwise_df = pd.DataFrame([(pairwise[0], pairwise[1], 0,np.nan) for pairwise in pairwise_combinations],columns = ['Query_1','Query_2','SNP_Distance','SNPs_Cocalled'])
     preserved_pairwise_df = pairwise_df.copy()
 
-    pairwise_df.to_csv(raw_pairwise, sep="\t", index=False)
-    preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False)
+    with open(raw_pairwise,"w") as f:
+        pairwise_df.to_csv(f, sep="\t", index=False)
+    with open(preserved_pairwise,"w") as f:
+        preserved_pairwise_df.to_csv(f, sep="\t", index=False)
 
     # Create matrix
     idx = sorted(set(pairwise_df['Query_1']).union(pairwise_df['Query_2']))
     mirrored_distance_df = pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f'))
     mirrored_distance_df.index.name = ''
-    mirrored_distance_df.to_csv(raw_matrix,sep="\t")
-    mirrored_distance_df.to_csv(preserved_matrix,sep="\t")
+    with open(raw_matrix,"w") as f:
+        mirrored_distance_df.to_csv(f,sep="\t")
+    with open(preserved_matrix,"w") as f:
+        mirrored_distance_df.to_csv(f,sep="\t")
 
 else:
     raw_distance_results = parallelAlignment(alignment)
     raw_pairwise_df = pd.DataFrame(raw_distance_results, columns=['Query_1', 'Query_2', 'SNP_Distance', 'SNPs_Cocalled'])
-    raw_pairwise_df.to_csv(raw_pairwise, sep="\t", index=False)
+    with open(raw_pairwise,"w") as f:
+        raw_pairwise_df.to_csv(f, sep="\t", index=False)
 
     if len(locs_pass_missing) == snp_count:
        preserved_pairwise_df = raw_pairwise_df.copy()
-       preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False)
+       with open(preserved_pairwise,"w") as f:
+           preserved_pairwise_df.to_csv(f, sep="\t", index=False)
    elif len(locs_pass_missing) == 0:
        preserved_pairwise_df = pd.DataFrame([(pairwise[0], pairwise[1], 0,np.nan) for pairwise in pairwise_combinations],columns = ['Query_1','Query_2','SNP_Distance','SNPs_Cocalled'])
-       preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False)
+       with open(preserved_pairwise,"w") as f:
+           preserved_pairwise_df.to_csv(f, sep="\t", index=False)
    else:
        preserved_distance_results = parallelAlignment(preserved_alignment)
        preserved_pairwise_df = pd.DataFrame(preserved_distance_results, columns=['Query_1', 'Query_2', 'SNP_Distance', 'SNPs_Cocalled'])
-       preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False)
+       with open(preserved_pairwise,"w") as f:
+           preserved_pairwise_df.to_csv(f, sep="\t", index=False)
 
     # Create matrix
     idx = sorted(set(raw_pairwise_df['Query_1']).union(raw_pairwise_df['Query_2']))
     mirrored_distance_df = raw_pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f'))
     mirrored_distance_df.index.name = ''
-    mirrored_distance_df.to_csv(raw_matrix,sep="\t")
+    with open(raw_matrix,"w") as f:
+        mirrored_distance_df.to_csv(f,sep="\t")
 
     idx = sorted(set(preserved_pairwise_df['Query_1']).union(preserved_pairwise_df['Query_2']))
     mirrored_distance_df = preserved_pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f'))
     mirrored_distance_df.index.name = ''
-    mirrored_distance_df.to_csv(preserved_matrix,sep="\t")
+    with open(preserved_matrix,"w") as f:
+        mirrored_distance_df.to_csv(f,sep="\t")
 
 # Clean up pybedtools temp
 helpers.cleanup(verbose=False,remove_all = False)
 
 end_time = time.time()
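
The matrix blocks in the hunk above turn the long pairwise table into a square, symmetric distance matrix: pivot gives one triangle, reindex pads it out to every query, and adding the positional transpose mirrors it. A toy run of that recipe with three hypothetical queries; the hunk additionally formats each cell with applymap(lambda x: format(x, '.0f')) so values render as whole numbers, which the sketch omits:

import pandas as pd

# Hypothetical pairwise distances (each unordered pair appears exactly once).
pairwise_df = pd.DataFrame(
    [("A", "B", 3), ("A", "C", 5), ("B", "C", 2)],
    columns=["Query_1", "Query_2", "SNP_Distance"],
)

idx = sorted(set(pairwise_df["Query_1"]).union(pairwise_df["Query_2"]))
mirrored_distance_df = (
    pairwise_df.pivot(index="Query_1", columns="Query_2", values="SNP_Distance")
    .reindex(index=idx, columns=idx)   # pad to a square frame over every query
    .fillna(0)
    .pipe(lambda x: x + x.values.T)    # mirror the triangle; the diagonal stays 0
)
# distance(A, B) now equals distance(B, A) for every pair.
print(mirrored_distance_df)
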