rliterman@0
|
1 #!/usr/bin/env python3
|
rliterman@0
|
2
|
rliterman@0
|
3 import os
|
rliterman@0
|
4 import sys
|
rliterman@0
|
5 import pandas as pd
|
rliterman@0
|
6 import numpy as np
|
rliterman@0
|
7 import argparse
|
rliterman@0
|
8
|
rliterman@0
|
def processHeader(header_row, file_path, trim_name):
    """Build a one-row DataFrame from a snpdiffs '#' header line.

    Parameters
    ----------
    header_row : list of str
        "Key:Value" fields from the first (header) line of a snpdiffs file.
    file_path : str
        Path to the snpdiffs file; stored as an absolute path in the
        SNPDiffs_File column.
    trim_name : str
        Literal substring removed from Reference_ID and Query_ID
        (may be "" for a no-op).

    Returns
    -------
    pandas.DataFrame
        Single-row frame with SNPDiffs_File as the first column.
    """
    # Split each field on the FIRST ':' only, so values that themselves
    # contain colons (e.g. timestamps, paths) are not truncated.
    header_cols = [item.split(':', 1)[0] for item in header_row]
    header_vals = [item.split(':', 1)[1] for item in header_row]

    header_data = pd.DataFrame(columns=header_cols)
    header_data.loc[0] = header_vals

    # Remove trim_name from Reference_ID and Query_ID (literal, not regex)
    header_data['Reference_ID'] = header_data['Reference_ID'].str.replace(trim_name, '', regex=False)
    header_data['Query_ID'] = header_data['Query_ID'].str.replace(trim_name, '', regex=False)

    # Record the source file and rotate that column to the front
    header_data['SNPDiffs_File'] = os.path.abspath(file_path)
    cols = header_data.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    header_data = header_data[cols]

    return header_data
|
# Command-line interface: input list, output paths, and an optional name trimmer.
parser = argparse.ArgumentParser()
_arg_specs = [
    ("--snpdiffs_file", {"help": "Path to the SNP diffs list file"}),
    ("--summary_file", {"help": "Path to the summary file"}),
    ("--isolate_file", {"help": "Path to the isolate data file"}),
    ("--trim_name", {"nargs": "?", "const": "", "default": "", "type": str, "help": "Trim name"}),
    ("--ref_id_file", {"help": "Path to the reference IDs file"}),
]
for _flag, _options in _arg_specs:
    parser.add_argument(_flag, **_options)
args = parser.parse_args()

snpdiffs_list_file = args.snpdiffs_file
summary_file = args.summary_file
isolate_data_file = args.isolate_file
trim_name = args.trim_name
|
# Load reference IDs (one per line); an empty file yields an empty list.
if os.stat(args.ref_id_file).st_size == 0:
    ref_ids = []
else:
    # Context manager ensures the handle is closed (original leaked it).
    with open(args.ref_id_file, 'r') as ref_handle:
        ref_ids = [line.strip() for line in ref_handle]
|
# Read in all lines and ensure each file exists
# Context manager ensures the list file is closed (original leaked the handle).
with open(snpdiffs_list_file, 'r') as list_handle:
    snpdiffs_list = [line.strip() for line in list_handle]
snpdiffs_list = [line for line in snpdiffs_list if line]  # drop blank lines
for snpdiffs_file in snpdiffs_list:
    if not os.path.exists(snpdiffs_file):
        sys.exit("Error: File does not exist: " + snpdiffs_file)
|
# Parse the '#'-prefixed header line of each snpdiffs file into a
# one-row frame, then stack them into the summary table.
header_rows = []
for snpdiffs_file in snpdiffs_list:
    with open(snpdiffs_file, 'r') as handle:
        first_line = handle.readline()
    fields = first_line.lstrip('#').strip().split('\t')
    header_rows.append(processHeader(fields, snpdiffs_file, trim_name))

output_data = pd.concat(header_rows, ignore_index=True)
output_data.to_csv(summary_file, sep='\t', index=False)
|
# If ref_ids is empty, save isolate data
# Column names for the reference and query halves of the summary table,
# and the unified per-isolate schema they are mapped onto.
ref_header = ['Reference_ID','Reference_Assembly','Reference_Contig_Count','Reference_Assembly_Bases','Reference_N50','Reference_N90','Reference_L50','Reference_L90','Reference_SHA256']
query_header = ['Query_ID','Query_Assembly','Query_Contig_Count','Query_Assembly_Bases','Query_N50','Query_N90','Query_L50','Query_L90','Query_SHA256']
isolate_header = ["Isolate_ID","Assembly_Path","Contig_Count","Assembly_Bases","N50","N90","L50","L90","SHA256"]

# .copy() so the column renames below act on independent frames rather than
# on selections of output_data (avoids pandas SettingWithCopy hazards).
ref_df = output_data[ref_header].copy()
query_df = output_data[query_header].copy()

ref_df.columns = isolate_header
query_df.columns = isolate_header

combined_df = pd.concat([ref_df, query_df])

# Set combined_df[Isolate_Type] to Reference if Isolate_ID is in ref_ids
combined_df['Isolate_Type'] = np.where(combined_df['Isolate_ID'].isin(ref_ids), 'Reference', 'Query')
combined_df = combined_df.drop_duplicates()

# Reorder so Isolate_ID and Isolate_Type lead the remaining columns
cols = combined_df.columns.tolist()
cols = cols[:1] + cols[-1:] + cols[1:-1]
combined_df = combined_df[cols]
combined_df.to_csv(isolate_data_file, sep='\t', index=False)