comparison CSP2/bin/compileSNPResults.py @ 39:93393808f415

"planemo upload"
author rliterman
date Thu, 12 Dec 2024 13:53:15 -0500
parents 01431fa12065
children
comparison
equal deleted inserted replaced
38:ee512a230a1e 39:93393808f415
465 log.write("-There were no QC warnings or failures\n") 465 log.write("-There were no QC warnings or failures\n")
466 466
467 # Output data 467 # Output data
468 468
469 # Mean assembly stats 469 # Mean assembly stats
470 isolate_mean_df.reset_index().to_csv(mean_isolate_file,sep='\t',index=False) 470 with open(mean_isolate_file, 'w') as f:
471 isolate_mean_df.reset_index().to_csv(f,sep='\t',index=False)
471 472
472 # Isolate assembly stats 473 # Isolate assembly stats
473 isolate_assembly_stats = isolate_stats.loc[isolate_stats['Measure'].isin(['Contig_Count','Assembly_Bases','L50','L90','N50','N90'])].drop(['Min','Max','StdDev','Count'],axis=1).rename(columns = {'Mean':'Value'}) 474 isolate_assembly_stats = isolate_stats.loc[isolate_stats['Measure'].isin(['Contig_Count','Assembly_Bases','L50','L90','N50','N90'])].drop(['Min','Max','StdDev','Count'],axis=1).rename(columns = {'Mean':'Value'})
474 isolate_assembly_stats.to_csv(isolate_assembly_stats_file,sep='\t',index=False) 475 with open(isolate_assembly_stats_file,'w') as f:
476 isolate_assembly_stats.to_csv(f,sep='\t',index=False)
475 477
476 # Isolate alignment stats 478 # Isolate alignment stats
477 isolate_align_stats = pd.concat([align_stats,isolate_cocalled_stats,isolate_snp_stats,isolate_stdev_stats]).reset_index(drop=True) 479 isolate_align_stats = pd.concat([align_stats,isolate_cocalled_stats,isolate_snp_stats,isolate_stdev_stats]).reset_index(drop=True)
478 for col in ['Min', 'Mean', 'Max', 'StdDev', 'Zscore']: 480 for col in ['Min', 'Mean', 'Max', 'StdDev', 'Zscore']:
479 isolate_align_stats[col] = isolate_align_stats[col].astype("float").round(3) 481 isolate_align_stats[col] = isolate_align_stats[col].astype("float").round(3)
480 isolate_align_stats.to_csv(align_stats_file,sep='\t',index=False) 482 with open(align_stats_file,'w') as f:
483 isolate_align_stats.to_csv(f,sep='\t',index=False)
481 484
482 # Reference Assembly Stats 485 # Reference Assembly Stats
483 ref_align_summary_df = ref_summary_df.loc[(~ref_summary_df['Measure'].isin(['Contig_Count','Assembly_Bases','L50','L90','N50','N90'])) & (~pd.isna(ref_summary_df['Zscore']))] 486 ref_align_summary_df = ref_summary_df.loc[(~ref_summary_df['Measure'].isin(['Contig_Count','Assembly_Bases','L50','L90','N50','N90'])) & (~pd.isna(ref_summary_df['Zscore']))]
484 ref_mean_summary_df = ref_summary_df.loc[(~ref_summary_df['Measure'].isin(['Contig_Count','Assembly_Bases','L50','L90','N50','N90'])) & (pd.isna(ref_summary_df['Zscore']))].drop(['Zscore','QC'],axis =1) 487 ref_mean_summary_df = ref_summary_df.loc[(~ref_summary_df['Measure'].isin(['Contig_Count','Assembly_Bases','L50','L90','N50','N90'])) & (pd.isna(ref_summary_df['Zscore']))].drop(['Zscore','QC'],axis =1)
485 ref_mean_summary_df['Zscore'] = np.nan 488 ref_mean_summary_df['Zscore'] = np.nan
490 ref_mean_summary_df = pd.concat([ref_mean_summary_df,ref_summary_preserved_df]) 493 ref_mean_summary_df = pd.concat([ref_mean_summary_df,ref_summary_preserved_df])
491 494
492 ref_isolate_align_stats = align_stats.loc[(align_stats['Isolate_Type'] == "Reference") & (align_stats['Measure'].isin(['Self_Aligned','Compare_Aligned']))].drop(['Isolate_Type'],axis=1).rename(columns = {'Isolate_ID':'Reference_ID'})[['Reference_ID','Measure','Mean','StdDev','Min','Max','Count','Zscore','QC']] 495 ref_isolate_align_stats = align_stats.loc[(align_stats['Isolate_Type'] == "Reference") & (align_stats['Measure'].isin(['Self_Aligned','Compare_Aligned']))].drop(['Isolate_Type'],axis=1).rename(columns = {'Isolate_ID':'Reference_ID'})[['Reference_ID','Measure','Mean','StdDev','Min','Max','Count','Zscore','QC']]
493 496
494 ref_mean_summary_stats = pd.concat([ref_mean_summary_df,ref_isolate_align_stats]) 497 ref_mean_summary_stats = pd.concat([ref_mean_summary_df,ref_isolate_align_stats])
495 ref_mean_summary_stats.to_csv(ref_mean_summary_file,sep='\t',index=False) 498
499 with open(ref_mean_summary_file,'w') as f:
500 ref_mean_summary_stats.to_csv(f,sep='\t',index=False)
496 501
497 end_time = time.time() 502 end_time = time.time()
498 503
499 with open(log_file,"a+") as log: 504 with open(log_file,"a+") as log:
500 log.write(f"\n- Completed compilation in {end_time - start_time:.2f} seconds\n") 505 log.write(f"\n- Completed compilation in {end_time - start_time:.2f} seconds\n")
503 log.write(f"\t- Saved isolate alignment data to {align_stats_file}\n") 508 log.write(f"\t- Saved isolate alignment data to {align_stats_file}\n")
504 log.write(f"\t- Saved reference summary data to {ref_mean_summary_file}\n") 509 log.write(f"\t- Saved reference summary data to {ref_mean_summary_file}\n")
505 510
506 # Comparisons if multiple refs 511 # Comparisons if multiple refs
507 if len(reference_ids) > 1: 512 if len(reference_ids) > 1:
508 comparison_df.to_csv(snp_comparison_file,sep="\t",index = False) 513 with open(snp_comparison_file,"w") as f:
514 comparison_df.to_csv(f,sep="\t",index = False)
509 log.write(f"\t- Saved SNP distance comparisons across references to {snp_comparison_file}\n") 515 log.write(f"\t- Saved SNP distance comparisons across references to {snp_comparison_file}\n")
510 516
511 # Failures/warnings 517 # Failures/warnings
512 if warn_fail_df.shape[0] > 0: 518 if warn_fail_df.shape[0] > 0:
513 warn_fail_df.to_csv(qc_file,sep="\t",index=False) 519 with open(qc_file,"w") as f:
520 warn_fail_df.to_csv(f,sep="\t",index=False)
514 log.write(f"\t- Saved QC warnings/failures to {qc_file}\n") 521 log.write(f"\t- Saved QC warnings/failures to {qc_file}\n")