Mercurial > repos > rliterman > csp2
comparison CSP2/bin/runSNPPipeline.py @ 39:93393808f415
"planemo upload"
author | rliterman |
---|---|
date | Thu, 12 Dec 2024 13:53:15 -0500 |
parents | 893a6993efe3 |
children |
comparison
equal
deleted
inserted
replaced
38:ee512a230a1e | 39:93393808f415 |
---|---|
505 filtered_ref_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Ref_Edge"].shape[0] | 505 filtered_ref_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Ref_Edge"].shape[0] |
506 filtered_both_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Both_Edge"].shape[0] | 506 filtered_both_edge = filtered_snp_df[filtered_snp_df.Cat == "Filtered_Both_Edge"].shape[0] |
507 | 507 |
508 # Write filtered SNP data to file | 508 # Write filtered SNP data to file |
509 snp_file = log_file.replace(".log","_SNPs.tsv") | 509 snp_file = log_file.replace(".log","_SNPs.tsv") |
510 filtered_snp_df.to_csv(snp_file, sep="\t", index=False) | 510 with open(snp_file,"w") as f: |
511 filtered_snp_df.to_csv(f, sep="\t", index=False) | |
511 | 512 |
512 filtered_snp_df.loc[:, 'Query_ID'] = query_id | 513 filtered_snp_df.loc[:, 'Query_ID'] = query_id |
513 | 514 |
514 with open(log_file,"a+") as log: | 515 with open(log_file,"a+") as log: |
515 log.write("Done!\n") | 516 log.write("Done!\n") |
748 'Kmer_Similarity','Shared_Kmers','Query_Unique_Kmers','Reference_Unique_Kmers', | 749 'Kmer_Similarity','Shared_Kmers','Query_Unique_Kmers','Reference_Unique_Kmers', |
749 'MUMmer_gSNPs','MUMmer_gIndels'] | 750 'MUMmer_gSNPs','MUMmer_gIndels'] |
750 | 751 |
751 # Save reference screening | 752 # Save reference screening |
752 results_df = pd.DataFrame([item.result()[0] for item in results], columns = output_columns) | 753 results_df = pd.DataFrame([item.result()[0] for item in results], columns = output_columns) |
753 results_df.to_csv(reference_screening_file, sep="\t", index=False) | 754 with open(reference_screening_file,"w") as f: |
755 results_df.to_csv(f, sep="\t", index=False) | |
754 | 756 |
755 # Get reference bed dfs | 757 # Get reference bed dfs |
756 covered_df = pd.concat([item.result()[1] for item in results]) | 758 covered_df = pd.concat([item.result()[1] for item in results]) |
757 | 759 |
758 # Get snp dfs | 760 # Get snp dfs |
938 purged_count_df = pd.DataFrame(columns=['Ref_Loc','Purged_Count']) | 940 purged_count_df = pd.DataFrame(columns=['Ref_Loc','Purged_Count']) |
939 | 941 |
940 locus_coverage_df = snp_coverage_df.merge(ref_base_coverage_df, how='outer', on='Ref_Loc').merge(uncovered_count_df, how='outer', on='Ref_Loc').merge(purged_count_df, how='outer', on='Ref_Loc').fillna(0) | 942 locus_coverage_df = snp_coverage_df.merge(ref_base_coverage_df, how='outer', on='Ref_Loc').merge(uncovered_count_df, how='outer', on='Ref_Loc').merge(purged_count_df, how='outer', on='Ref_Loc').fillna(0) |
941 locus_coverage_df.loc[:, ['SNP_Count','Ref_Base_Count','Uncovered_Count','Purged_Count']] = locus_coverage_df.loc[:, ['SNP_Count','Ref_Base_Count','Uncovered_Count','Purged_Count']].astype(int) | 943 locus_coverage_df.loc[:, ['SNP_Count','Ref_Base_Count','Uncovered_Count','Purged_Count']] = locus_coverage_df.loc[:, ['SNP_Count','Ref_Base_Count','Uncovered_Count','Purged_Count']].astype(int) |
942 locus_coverage_df['Missing_Ratio'] = ((locus_coverage_df['Uncovered_Count'] + locus_coverage_df['Purged_Count']) / (1+len(pass_qc_isolates))) * 100 | 944 locus_coverage_df['Missing_Ratio'] = ((locus_coverage_df['Uncovered_Count'] + locus_coverage_df['Purged_Count']) / (1+len(pass_qc_isolates))) * 100 |
943 locus_coverage_df.to_csv(locus_category_file, sep="\t", index=False) | 945 with open(locus_category_file,"w") as f: |
946 locus_coverage_df.to_csv(f, sep="\t", index=False) | |
944 | 947 |
945 # Get isolate coverage stats | 948 # Get isolate coverage stats |
946 min_isolate_cols = ['Query_ID','SNP','Ref_Base','Percent_Missing','Purged','Uncovered','Rescued_SNP','Purged_Ref_Edge'] | 949 min_isolate_cols = ['Query_ID','SNP','Ref_Base','Percent_Missing','Purged','Uncovered','Rescued_SNP','Purged_Ref_Edge'] |
947 isolate_coverage_df = final_snp_df.groupby('Query_ID')['Cat'].value_counts().unstack().fillna(0).astype(float).astype(int).reset_index().drop(columns=['Reference_Isolate']) | 950 isolate_coverage_df = final_snp_df.groupby('Query_ID')['Cat'].value_counts().unstack().fillna(0).astype(float).astype(int).reset_index().drop(columns=['Reference_Isolate']) |
948 isolate_coverage_df.loc[isolate_coverage_df['Query_ID'] == reference_id, 'Ref_Base'] = snp_count | 951 isolate_coverage_df.loc[isolate_coverage_df['Query_ID'] == reference_id, 'Ref_Base'] = snp_count |
962 isolate_coverage_df.loc[:,'Purged_Ref_Edge'] = 0 | 965 isolate_coverage_df.loc[:,'Purged_Ref_Edge'] = 0 |
963 if ref_edge_df.shape[0] > 0: | 966 if ref_edge_df.shape[0] > 0: |
964 isolate_coverage_df.loc[isolate_coverage_df['Query_ID'] == reference_id, 'Purged_Ref_Edge'] = ref_edge_df['Ref_Loc'].nunique() | 967 isolate_coverage_df.loc[isolate_coverage_df['Query_ID'] == reference_id, 'Purged_Ref_Edge'] = ref_edge_df['Ref_Loc'].nunique() |
965 | 968 |
966 isolate_coverage_df = isolate_coverage_df[min_isolate_cols + possible_purged_cols].sort_values(by = 'Percent_Missing',ascending = False).reset_index(drop=True) | 969 isolate_coverage_df = isolate_coverage_df[min_isolate_cols + possible_purged_cols].sort_values(by = 'Percent_Missing',ascending = False).reset_index(drop=True) |
967 isolate_coverage_df.to_csv(query_coverage_file, sep="\t", index=False) | 970 with open(query_coverage_file,'w') as f: |
971 isolate_coverage_df.to_csv(f, sep="\t", index=False) | |
968 | 972 |
969 with open(log_file,"a+") as log: | 973 with open(log_file,"a+") as log: |
970 log.write(f"\t- SNP coverage information: {locus_category_file}\n") | 974 log.write(f"\t- SNP coverage information: {locus_category_file}\n") |
971 log.write(f"\t- Query coverage information: {query_coverage_file}\n") | 975 log.write(f"\t- Query coverage information: {query_coverage_file}\n") |
972 log.write("-------------------------------------------------------\n\n") | 976 log.write("-------------------------------------------------------\n\n") |
1037 | 1041 |
1038 if snp_count == 0: | 1042 if snp_count == 0: |
1039 pairwise_df = pd.DataFrame([(pairwise[0], pairwise[1], 0,np.nan) for pairwise in pairwise_combinations],columns = ['Query_1','Query_2','SNP_Distance','SNPs_Cocalled']) | 1043 pairwise_df = pd.DataFrame([(pairwise[0], pairwise[1], 0,np.nan) for pairwise in pairwise_combinations],columns = ['Query_1','Query_2','SNP_Distance','SNPs_Cocalled']) |
1040 preserved_pairwise_df = pairwise_df.copy() | 1044 preserved_pairwise_df = pairwise_df.copy() |
1041 | 1045 |
1042 pairwise_df.to_csv(raw_pairwise, sep="\t", index=False) | 1046 with open(raw_pairwise,"w") as f: |
1043 preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False) | 1047 pairwise_df.to_csv(f, sep="\t", index=False) |
1048 with open(preserved_pairwise,"w") as f: | |
1049 preserved_pairwise_df.to_csv(f, sep="\t", index=False) | |
1044 | 1050 |
1045 # Create matrix | 1051 # Create matrix |
1046 idx = sorted(set(pairwise_df['Query_1']).union(pairwise_df['Query_2'])) | 1052 idx = sorted(set(pairwise_df['Query_1']).union(pairwise_df['Query_2'])) |
1047 mirrored_distance_df = pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f')) | 1053 mirrored_distance_df = pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f')) |
1048 mirrored_distance_df.index.name = '' | 1054 mirrored_distance_df.index.name = '' |
1049 mirrored_distance_df.to_csv(raw_matrix,sep="\t") | 1055 with open(raw_matrix,"w") as f: |
1050 mirrored_distance_df.to_csv(preserved_matrix,sep="\t") | 1056 mirrored_distance_df.to_csv(f,sep="\t") |
1057 with open(preserved_matrix,"w") as f: | |
1058 mirrored_distance_df.to_csv(f,sep="\t") | |
1051 | 1059 |
1052 else: | 1060 else: |
1053 raw_distance_results = parallelAlignment(alignment) | 1061 raw_distance_results = parallelAlignment(alignment) |
1054 raw_pairwise_df = pd.DataFrame(raw_distance_results, columns=['Query_1', 'Query_2', 'SNP_Distance', 'SNPs_Cocalled']) | 1062 raw_pairwise_df = pd.DataFrame(raw_distance_results, columns=['Query_1', 'Query_2', 'SNP_Distance', 'SNPs_Cocalled']) |
1055 raw_pairwise_df.to_csv(raw_pairwise, sep="\t", index=False) | 1063 with open(raw_pairwise,"w") as f: |
1064 raw_pairwise_df.to_csv(f, sep="\t", index=False) | |
1056 | 1065 |
1057 if len(locs_pass_missing) == snp_count: | 1066 if len(locs_pass_missing) == snp_count: |
1058 preserved_pairwise_df = raw_pairwise_df.copy() | 1067 preserved_pairwise_df = raw_pairwise_df.copy() |
1059 preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False) | 1068 with open(preserved_pairwise,"w") as f: |
1069 preserved_pairwise_df.to_csv(f, sep="\t", index=False) | |
1060 elif len(locs_pass_missing) == 0: | 1070 elif len(locs_pass_missing) == 0: |
1061 preserved_pairwise_df = pd.DataFrame([(pairwise[0], pairwise[1], 0,np.nan) for pairwise in pairwise_combinations],columns = ['Query_1','Query_2','SNP_Distance','SNPs_Cocalled']) | 1071 preserved_pairwise_df = pd.DataFrame([(pairwise[0], pairwise[1], 0,np.nan) for pairwise in pairwise_combinations],columns = ['Query_1','Query_2','SNP_Distance','SNPs_Cocalled']) |
1062 preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False) | 1072 with open(preserved_pairwise,"w") as f: |
1073 preserved_pairwise_df.to_csv(f, sep="\t", index=False) | |
1063 else: | 1074 else: |
1064 preserved_distance_results = parallelAlignment(preserved_alignment) | 1075 preserved_distance_results = parallelAlignment(preserved_alignment) |
1065 preserved_pairwise_df = pd.DataFrame(preserved_distance_results, columns=['Query_1', 'Query_2', 'SNP_Distance', 'SNPs_Cocalled']) | 1076 preserved_pairwise_df = pd.DataFrame(preserved_distance_results, columns=['Query_1', 'Query_2', 'SNP_Distance', 'SNPs_Cocalled']) |
1066 preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False) | 1077 with open(preserved_pairwise,"w") as f: |
1067 | 1078 preserved_pairwise_df.to_csv(f, sep="\t", index=False) |
1079 | |
1068 # Create matrix | 1080 # Create matrix |
1069 idx = sorted(set(raw_pairwise_df['Query_1']).union(raw_pairwise_df['Query_2'])) | 1081 idx = sorted(set(raw_pairwise_df['Query_1']).union(raw_pairwise_df['Query_2'])) |
1070 mirrored_distance_df = raw_pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f')) | 1082 mirrored_distance_df = raw_pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f')) |
1071 mirrored_distance_df.index.name = '' | 1083 mirrored_distance_df.index.name = '' |
1072 mirrored_distance_df.to_csv(raw_matrix,sep="\t") | 1084 with open(raw_matrix,"w") as f: |
1085 mirrored_distance_df.to_csv(f,sep="\t") | |
1073 | 1086 |
1074 idx = sorted(set(preserved_pairwise_df['Query_1']).union(preserved_pairwise_df['Query_2'])) | 1087 idx = sorted(set(preserved_pairwise_df['Query_1']).union(preserved_pairwise_df['Query_2'])) |
1075 mirrored_distance_df = preserved_pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f')) | 1088 mirrored_distance_df = preserved_pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f')) |
1076 mirrored_distance_df.index.name = '' | 1089 mirrored_distance_df.index.name = '' |
1077 mirrored_distance_df.to_csv(preserved_matrix,sep="\t") | 1090 with open(preserved_matrix,"w") as f: |
1091 mirrored_distance_df.to_csv(f,sep="\t") | |
1078 | 1092 |
1079 # Clean up pybedtools temp | 1093 # Clean up pybedtools temp |
1080 helpers.cleanup(verbose=False,remove_all = False) | 1094 helpers.cleanup(verbose=False,remove_all = False) |
1081 | 1095 |
1082 end_time = time.time() | 1096 end_time = time.time() |