CSP2/bin/saveSNPDiffs.py @ 0:01431fa12065 ("planemo upload")

| author   | rliterman                       |
|----------|---------------------------------|
| date     | Mon, 02 Dec 2024 10:40:55 -0500 |
| parents  |                                 |
| children | 792274118b2e                    |
#!/usr/bin/env python3

import os
import sys
import pandas as pd
import numpy as np
import argparse

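# saveSNPDiffs.py collects the '#'-prefixed header line from each .snpdiffs file
# listed in --snpdiffs_file, writes the combined headers to --summary_file, and
# writes a deduplicated per-isolate table (Reference vs. Query) to --isolate_file.

# The header line is expected to be tab-delimited 'Key:Value' pairs; processHeader()
# turns one such line into a single-row DataFrame, strips trim_name from the
# Reference_ID and Query_ID values, and records the absolute path of the source file.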
def processHeader(header_row,file_path,trim_name):
    header_cols = [item.split(':')[0] for item in header_row]
    header_vals = [item.split(':')[1] for item in header_row]

    header_data = pd.DataFrame(columns = header_cols)
    header_data.loc[0] = header_vals

    # Strip trim_name from Reference_ID and Query_ID
    header_data['Reference_ID'] = header_data['Reference_ID'].str.replace(trim_name,'',regex=False)
    header_data['Query_ID'] = header_data['Query_ID'].str.replace(trim_name,'',regex=False)

    # Add the absolute path of the source .snpdiffs file
    header_data['SNPDiffs_File'] = os.path.abspath(file_path)
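    # Move SNPDiffs_File to the first column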
    cols = header_data.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    header_data = header_data[cols]

    return header_data


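# Parse command-line arguments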
parser = argparse.ArgumentParser()
parser.add_argument("--snpdiffs_file", help="Path to the SNP diffs list file")
parser.add_argument("--summary_file", help="Path to the summary file")
parser.add_argument("--isolate_file", help="Path to the isolate data file")
parser.add_argument("--trim_name", help="Substring to trim from Reference_ID and Query_ID")
parser.add_argument("--ref_id_file", help="Path to the reference IDs file")
args = parser.parse_args()
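
# Example invocation (argument values are illustrative):
#   saveSNPDiffs.py --snpdiffs_file snpdiffs_list.txt --summary_file summary.tsv \
#       --isolate_file isolates.tsv --trim_name "_contigs" --ref_id_file ref_ids.txt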

snpdiffs_list_file = args.snpdiffs_file
summary_file = args.summary_file
isolate_data_file = args.isolate_file
trim_name = args.trim_name

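# Read reference IDs; an empty --ref_id_file means no isolates are tagged as Reference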
if os.stat(args.ref_id_file).st_size == 0:
    ref_ids = []
else:
    ref_ids = [line.strip() for line in open(args.ref_id_file, 'r')]

# Read in all lines and ensure each file exists
snpdiffs_list = [line.strip() for line in open(snpdiffs_list_file, 'r')]
snpdiffs_list = [line for line in snpdiffs_list if line]
for snpdiffs_file in snpdiffs_list:
    if not os.path.exists(snpdiffs_file):
        sys.exit("Error: File does not exist: " + snpdiffs_file)

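# Parse the header line of each snpdiffs file into a one-row DataFrame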
header_rows = []
for snpdiffs_file in snpdiffs_list:
    with open(snpdiffs_file, 'r') as file:
        top_line = file.readline().lstrip('#').strip().split('\t')
        header_rows.append(processHeader(top_line,snpdiffs_file,trim_name))

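# Combine all header rows and write the summary table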
output_data = pd.concat(header_rows, ignore_index=True)
output_data.to_csv(summary_file, sep='\t', index=False)

# Build per-isolate assembly metadata from the reference and query columns of each header
ref_header = ['Reference_ID','Reference_Assembly','Reference_Contig_Count','Reference_Assembly_Bases','Reference_N50','Reference_N90','Reference_L50','Reference_L90','Reference_SHA256']
query_header = ['Query_ID','Query_Assembly','Query_Contig_Count','Query_Assembly_Bases','Query_N50','Query_N90','Query_L50','Query_L90','Query_SHA256']
isolate_header = ["Isolate_ID","Assembly_Path","Contig_Count","Assembly_Bases","N50","N90","L50","L90","SHA256"]

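# Give the reference and query columns a shared schema so they can be stacked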
ref_df = output_data[ref_header].copy()
query_df = output_data[query_header].copy()

ref_df.columns = isolate_header
query_df.columns = isolate_header

combined_df = pd.concat([ref_df,query_df])

# Set Isolate_Type to 'Reference' if Isolate_ID is in ref_ids, otherwise 'Query'
combined_df['Isolate_Type'] = np.where(combined_df['Isolate_ID'].isin(ref_ids), 'Reference', 'Query')
combined_df = combined_df.drop_duplicates()
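# Move Isolate_Type to be the second column, after Isolate_ID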
cols = combined_df.columns.tolist()
cols = cols[:1] + cols[-1:] + cols[1:-1]
combined_df = combined_df[cols]
combined_df.to_csv(isolate_data_file, sep='\t', index=False)