#!/usr/bin/env python3
"""Summarize a list of snpdiffs files.

Reads each snpdiffs file's '#'-prefixed header line, builds a per-file
summary table, and derives a per-isolate table (reference + query rows)
tagged with Isolate_Type. Both tables are written as tab-separated files.
"""

import os
import sys
import pandas as pd
import numpy as np
import argparse


def processHeader(header_row, file_path, trim_name):
    """Parse one snpdiffs header row into a one-row DataFrame.

    Args:
        header_row: list of "Key:Value" strings from the '#' header line.
        file_path: path to the snpdiffs file the header came from.
        trim_name: literal substring stripped from Reference_ID/Query_ID.

    Returns:
        One-row DataFrame with SNPDiffs_File (absolute path) as the first
        column, followed by the header fields.
    """
    # Split on the FIRST colon only so values that themselves contain
    # colons (timestamps, paths, checksums) are kept intact.
    header_cols = [item.split(':', 1)[0] for item in header_row]
    header_vals = [item.split(':', 1)[1] for item in header_row]

    header_data = pd.DataFrame(columns=header_cols)
    header_data.loc[0] = header_vals

    # Strip trim_name from the IDs (literal match, not a regex).
    header_data['Reference_ID'] = header_data['Reference_ID'].str.replace(trim_name, '', regex=False)
    header_data['Query_ID'] = header_data['Query_ID'].str.replace(trim_name, '', regex=False)

    # Record the source file and move that column to the front.
    header_data['SNPDiffs_File'] = os.path.abspath(file_path)
    cols = header_data.columns.tolist()
    header_data = header_data[cols[-1:] + cols[:-1]]

    return header_data


def main():
    """CLI entry point: build the summary and isolate tables."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--snpdiffs_file", help="Path to the SNP diffs list file")
    parser.add_argument("--summary_file", help="Path to the summary file")
    parser.add_argument("--isolate_file", help="Path to the isolate data file")
    parser.add_argument('--trim_name', type=str, default="", help='trim name')
    parser.add_argument("--ref_id_file", help="Path to the reference IDs file")
    args = parser.parse_args()

    trim_name = args.trim_name

    # Reference IDs file may legitimately be empty.
    if os.stat(args.ref_id_file).st_size == 0:
        ref_ids = []
    else:
        with open(args.ref_id_file, 'r') as fh:
            ref_ids = [line.strip() for line in fh]

    # Read the list of snpdiffs paths; fail fast if any file is missing.
    with open(args.snpdiffs_file, 'r') as fh:
        snpdiffs_list = [line.strip() for line in fh]
    snpdiffs_list = [line for line in snpdiffs_list if line]
    for snpdiffs_file in snpdiffs_list:
        if not os.path.exists(snpdiffs_file):
            sys.exit("Error: File does not exist: " + snpdiffs_file)

    # Only the first ('#'-prefixed, tab-delimited) line of each file is used.
    header_rows = []
    for snpdiffs_file in snpdiffs_list:
        with open(snpdiffs_file, 'r') as fh:
            top_line = fh.readline().lstrip('#').strip().split('\t')
        header_rows.append(processHeader(top_line, snpdiffs_file, trim_name))

    output_data = pd.concat(header_rows, ignore_index=True)
    output_data.to_csv(args.summary_file, sep='\t', index=False)

    ref_header = ['Reference_ID', 'Reference_Assembly', 'Reference_Contig_Count', 'Reference_Assembly_Bases',
                  'Reference_N50', 'Reference_N90', 'Reference_L50', 'Reference_L90', 'Reference_SHA256']
    query_header = ['Query_ID', 'Query_Assembly', 'Query_Contig_Count', 'Query_Assembly_Bases',
                    'Query_N50', 'Query_N90', 'Query_L50', 'Query_L90', 'Query_SHA256']
    isolate_header = ["Isolate_ID", "Assembly_Path", "Contig_Count", "Assembly_Bases",
                      "N50", "N90", "L50", "L90", "SHA256"]

    # .copy() so the column relabelling below cannot trigger
    # chained-assignment / SettingWithCopyWarning behavior.
    ref_df = output_data[ref_header].copy()
    query_df = output_data[query_header].copy()
    ref_df.columns = isolate_header
    query_df.columns = isolate_header

    combined_df = pd.concat([ref_df, query_df])

    # Tag each isolate as Reference when its ID was listed in ref_id_file.
    combined_df['Isolate_Type'] = np.where(combined_df['Isolate_ID'].isin(ref_ids), 'Reference', 'Query')
    combined_df = combined_df.drop_duplicates()

    # Column order: Isolate_ID, Isolate_Type, then the remaining stats.
    cols = combined_df.columns.tolist()
    combined_df = combined_df[cols[:1] + cols[-1:] + cols[1:-1]]
    combined_df.to_csv(args.isolate_file, sep='\t', index=False)


if __name__ == "__main__":
    main()