diff CSP2/bin/runSNPPipeline.py @ 39:93393808f415
"planemo upload"
| author | rliterman |
|---|---|
| date | Thu, 12 Dec 2024 13:53:15 -0500 |
| parents | 893a6993efe3 |
| children | |
--- a/CSP2/bin/runSNPPipeline.py	Wed Dec 11 12:04:20 2024 -0500
+++ b/CSP2/bin/runSNPPipeline.py	Thu Dec 12 13:53:15 2024 -0500
@@ -507,7 +507,8 @@
 
     # Write filtered SNP data to file
     snp_file = log_file.replace(".log","_SNPs.tsv")
-    filtered_snp_df.to_csv(snp_file, sep="\t", index=False)
+    with open(snp_file,"w") as f:
+        filtered_snp_df.to_csv(f, sep="\t", index=False)
 
     filtered_snp_df.loc[:, 'Query_ID'] = query_id
 
@@ -750,7 +751,8 @@
 
     # Save reference screening
     results_df = pd.DataFrame([item.result()[0] for item in results], columns = output_columns)
-    results_df.to_csv(reference_screening_file, sep="\t", index=False)
+    with open(reference_screening_file,"w") as f:
+        results_df.to_csv(f, sep="\t", index=False)
 
     # Get reference bed dfs
     covered_df = pd.concat([item.result()[1] for item in results])
@@ -940,7 +942,8 @@
     locus_coverage_df = snp_coverage_df.merge(ref_base_coverage_df, how='outer', on='Ref_Loc').merge(uncovered_count_df, how='outer', on='Ref_Loc').merge(purged_count_df, how='outer', on='Ref_Loc').fillna(0)
     locus_coverage_df.loc[:, ['SNP_Count','Ref_Base_Count','Uncovered_Count','Purged_Count']] = locus_coverage_df.loc[:, ['SNP_Count','Ref_Base_Count','Uncovered_Count','Purged_Count']].astype(int)
     locus_coverage_df['Missing_Ratio'] = ((locus_coverage_df['Uncovered_Count'] + locus_coverage_df['Purged_Count']) / (1+len(pass_qc_isolates))) * 100
-    locus_coverage_df.to_csv(locus_category_file, sep="\t", index=False)
+    with open(locus_category_file,"w") as f:
+        locus_coverage_df.to_csv(f, sep="\t", index=False)
 
     # Get isolate coverage stats
     min_isolate_cols = ['Query_ID','SNP','Ref_Base','Percent_Missing','Purged','Uncovered','Rescued_SNP','Purged_Ref_Edge']
@@ -964,7 +967,8 @@
     isolate_coverage_df.loc[isolate_coverage_df['Query_ID'] == reference_id, 'Purged_Ref_Edge'] = ref_edge_df['Ref_Loc'].nunique()
 
     isolate_coverage_df = isolate_coverage_df[min_isolate_cols + possible_purged_cols].sort_values(by = 'Percent_Missing',ascending = False).reset_index(drop=True)
-    isolate_coverage_df.to_csv(query_coverage_file, sep="\t", index=False)
+    with open(query_coverage_file,'w') as f:
+        isolate_coverage_df.to_csv(f, sep="\t", index=False)
 
     with open(log_file,"a+") as log:
         log.write(f"\t- SNP coverage information: {locus_category_file}\n")
@@ -1039,42 +1043,52 @@
 
         pairwise_df = pd.DataFrame([(pairwise[0], pairwise[1], 0,np.nan) for pairwise in pairwise_combinations],columns = ['Query_1','Query_2','SNP_Distance','SNPs_Cocalled'])
         preserved_pairwise_df = pairwise_df.copy()
-        pairwise_df.to_csv(raw_pairwise, sep="\t", index=False)
-        preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False)
+        with open(raw_pairwise,"w") as f:
+            pairwise_df.to_csv(f, sep="\t", index=False)
+        with open(preserved_pairwise,"w") as f:
+            preserved_pairwise_df.to_csv(f, sep="\t", index=False)
 
         # Create matrix
         idx = sorted(set(pairwise_df['Query_1']).union(pairwise_df['Query_2']))
         mirrored_distance_df = pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f'))
         mirrored_distance_df.index.name = ''
-        mirrored_distance_df.to_csv(raw_matrix,sep="\t")
-        mirrored_distance_df.to_csv(preserved_matrix,sep="\t")
+        with open(raw_matrix,"w") as f:
+            mirrored_distance_df.to_csv(f,sep="\t")
+        with open(preserved_matrix,"w") as f:
+            mirrored_distance_df.to_csv(f,sep="\t")
     else:
         raw_distance_results = parallelAlignment(alignment)
         raw_pairwise_df = pd.DataFrame(raw_distance_results, 
                                        columns=['Query_1', 'Query_2', 'SNP_Distance', 'SNPs_Cocalled'])
-        raw_pairwise_df.to_csv(raw_pairwise, sep="\t", index=False)
+        with open(raw_pairwise,"w") as f:
+            raw_pairwise_df.to_csv(f, sep="\t", index=False)
 
         if len(locs_pass_missing) == snp_count:
             preserved_pairwise_df = raw_pairwise_df.copy()
-            preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False)
+            with open(preserved_pairwise,"w") as f:
+                preserved_pairwise_df.to_csv(f, sep="\t", index=False)
         elif len(locs_pass_missing) == 0:
             preserved_pairwise_df = pd.DataFrame([(pairwise[0], pairwise[1], 0,np.nan) for pairwise in pairwise_combinations],columns = ['Query_1','Query_2','SNP_Distance','SNPs_Cocalled'])
-            preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False)
+            with open(preserved_pairwise,"w") as f:
+                preserved_pairwise_df.to_csv(f, sep="\t", index=False)
        else:
            preserved_distance_results = parallelAlignment(preserved_alignment)
            preserved_pairwise_df = pd.DataFrame(preserved_distance_results, columns=['Query_1', 'Query_2', 'SNP_Distance', 'SNPs_Cocalled'])
-            preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False)
-
+            with open(preserved_pairwise,"w") as f:
+                preserved_pairwise_df.to_csv(f, sep="\t", index=False)
+
         # Create matrix
         idx = sorted(set(raw_pairwise_df['Query_1']).union(raw_pairwise_df['Query_2']))
         mirrored_distance_df = raw_pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f'))
         mirrored_distance_df.index.name = ''
-        mirrored_distance_df.to_csv(raw_matrix,sep="\t")
+        with open(raw_matrix,"w") as f:
+            mirrored_distance_df.to_csv(f,sep="\t")
 
         idx = sorted(set(preserved_pairwise_df['Query_1']).union(preserved_pairwise_df['Query_2']))
         mirrored_distance_df = preserved_pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f'))
         mirrored_distance_df.index.name = ''
-        mirrored_distance_df.to_csv(preserved_matrix,sep="\t")
+        with open(preserved_matrix,"w") as f:
+            mirrored_distance_df.to_csv(f,sep="\t")
 
     # Clean up pybedtools temp
     helpers.cleanup(verbose=False,remove_all = False)
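
The change is the same in every hunk: instead of passing a path to `DataFrame.to_csv()`, the script now opens the output file itself and hands the open handle to pandas. A minimal sketch of that pattern, using a hypothetical DataFrame and filename rather than the pipeline's own data:

```python
import pandas as pd

# Hypothetical data standing in for the pipeline's DataFrames
df = pd.DataFrame({"Query_ID": ["A", "B"], "SNP_Distance": [3, 5]})
out_file = "example_SNPs.tsv"  # placeholder path

# Before: pandas opens and closes the file internally
df.to_csv(out_file, sep="\t", index=False)

# After: the caller owns the file handle; the with-block guarantees the
# handle is flushed and closed as soon as the write finishes
with open(out_file, "w") as f:
    df.to_csv(f, sep="\t", index=False)
```

Both forms produce the same TSV; the difference is only in who opens and closes the file.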
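For context, the unchanged `pivot(...).reindex(...).pipe(...)` one-liners in the last hunk build a symmetric distance matrix from one-directional pairwise rows. A small, self-contained sketch of that idea with made-up isolate IDs (not the pipeline's exact code, which also downcasts and formats the values):

```python
import pandas as pd

# One row per unordered pair of isolates (hypothetical distances)
pairwise_df = pd.DataFrame(
    [("A", "B", 3), ("A", "C", 5), ("B", "C", 2)],
    columns=["Query_1", "Query_2", "SNP_Distance"],
)

# Every isolate ID, so the matrix is square and consistently ordered
idx = sorted(set(pairwise_df["Query_1"]).union(pairwise_df["Query_2"]))

# Pivot to an upper-triangular table, fill gaps with 0, then add the
# transpose to mirror distances across the diagonal
matrix = (
    pairwise_df.pivot(index="Query_1", columns="Query_2", values="SNP_Distance")
    .reindex(index=idx, columns=idx)
    .fillna(0)
    .pipe(lambda x: x + x.values.T)
)
print(matrix)
```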