rliterman@0
|
1 #!/usr/bin/env python3
|
rliterman@0
|
2
|
rliterman@0
|
3 import os
|
rliterman@0
|
4 import sys
|
rliterman@0
|
5 import pandas as pd
|
rliterman@0
|
6 import hashlib
|
rliterman@0
|
7 import argparse
|
rliterman@0
|
8
|
rliterman@0
|
def checkLineExists(file_path, sha256):
    """Return True when ``file_path`` exists and its SHA-256 matches ``sha256``.

    The file is hashed in fixed-size binary chunks so large files are never
    loaded into memory at once.

    Args:
        file_path: Path of the file to verify.
        sha256: Expected SHA-256 digest as a hex string. Surrounding
            whitespace and letter case are ignored.

    Returns:
        bool: True if the file could be read and its digest equals the
        expected value; False if the path does not exist, cannot be read,
        or the digest differs.
    """
    if not os.path.exists(file_path):
        return False

    # hexdigest() is always lowercase, so normalise the expected value
    # rather than rejecting an otherwise-correct uppercase digest.
    expected = str(sha256).strip().lower()

    try:
        with open(file_path, 'rb') as file:
            file_hash = hashlib.sha256()
            chunk_size = 8192  # Read in 8KB chunks
            while chunk := file.read(chunk_size):
                file_hash.update(chunk)
    except OSError as e:
        # Report on stderr: this script prints its result rows to stdout.
        print(f"Error reading file: {file_path}: {str(e)}", file=sys.stderr)
        return False

    return file_hash.hexdigest() == expected
|
rliterman@0
|
22
|
rliterman@0
|
def processHeader(header_row,snpdiffs_path):
    """Turn a list of ``key:value`` header fields into a one-row DataFrame.

    Args:
        header_row: Iterable of strings, each of the form ``key:value``.
        snpdiffs_path: Path stored in the added ``SNPDiffs_File`` column.

    Returns:
        pandas.DataFrame: A single row whose columns are the keys (in input
        order) followed by ``SNPDiffs_File``.

    Raises:
        IndexError: If a field contains no ``:`` separator.
    """
    # Split on the first ':' only, so values that themselves contain a colon
    # (e.g. "C:/data/a.fasta" or "12:30:05") are kept whole.
    pairs = [item.split(':', 1) for item in header_row]
    header_cols = [pair[0] for pair in pairs]
    header_vals = [pair[1] for pair in pairs]  # IndexError when ':' is absent

    header_data = pd.DataFrame([header_vals], columns=header_cols)
    header_data['SNPDiffs_File'] = snpdiffs_path
    return header_data
|
rliterman@0
|
31
|
rliterman@0
|
def main():
    """Validate a list of SNPDiffs files and print one CSV row per file.

    Steps:
      1. Read the list file given by ``--snpdiffs_file`` (one path per line;
         blank lines are ignored).
      2. Parse the first line of every listed file as a tab-separated
         ``key:value`` header via ``processHeader``.
      3. Optionally strip the ``--trim_name`` substring from ``Query_ID`` and
         ``Reference_ID``.
      4. Abort if one ID maps to more than one SHA256 value, or if the same
         SNPDiffs file is listed more than once.
      5. Print one comma-joined row per file. An assembly path is printed as
         an absolute path when ``checkLineExists`` confirms its SHA256,
         otherwise as the literal string ``null``.

    Exits through ``sys.exit`` with a message on any failure.
    """
    parser = argparse.ArgumentParser()
    # Required: without it there is no list file to read, and the error
    # message below would itself fail on ``str + None``.
    parser.add_argument("--snpdiffs_file", required=True, help="Path to the SNP diffs list file")
    parser.add_argument("--trim_name", help="Trim name")
    args = parser.parse_args()

    snpdiffs_list_file = args.snpdiffs_file
    trim_name = args.trim_name
    header_rows = []

    # Read in all files, and if they exist, read in the header
    try:
        with open(snpdiffs_list_file, 'r') as list_handle:
            # Skip blank lines; an empty entry would resolve to the current
            # directory and abort the run with an empty file name.
            snpdiffs_list = [line.strip() for line in list_handle if line.strip()]
    except (OSError, ValueError):
        sys.exit("Error: Unable to read file: " + snpdiffs_list_file)

    for snpdiffs_file in snpdiffs_list:
        try:
            snpdiffs_path = os.path.abspath(snpdiffs_file)
            with open(snpdiffs_path, 'r') as file:
                top_line = file.readline().lstrip('#').strip().split('\t')
            header_rows.append(processHeader(top_line,snpdiffs_path))
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts.
            sys.exit("Error: Unable to read file: " + snpdiffs_file)

    # Create header df
    try:
        all_snpdiffs_data = pd.concat(header_rows,ignore_index=True)
        # Only trim when a trim string was supplied; str.replace rejects None.
        if trim_name:
            all_snpdiffs_data['Reference_ID'] = all_snpdiffs_data['Reference_ID'].str.replace(trim_name,'',regex=False)
            all_snpdiffs_data['Query_ID'] = all_snpdiffs_data['Query_ID'].str.replace(trim_name,'',regex=False)
    except Exception:
        sys.exit("Error: Unable to create header dataframe")

    query_sha_counts = all_snpdiffs_data.groupby('Query_ID')['Query_SHA256'].nunique()
    reference_sha_counts = all_snpdiffs_data.groupby('Reference_ID')['Reference_SHA256'].nunique()
    file_counts = all_snpdiffs_data['SNPDiffs_File'].value_counts()

    # An ID that maps to several hashes means different assemblies share a name.
    if ((query_sha_counts > 1).any() or (reference_sha_counts > 1).any()):
        print(all_snpdiffs_data[all_snpdiffs_data['Query_ID'].isin(query_sha_counts[query_sha_counts > 1].index)])
        print(all_snpdiffs_data[all_snpdiffs_data['Reference_ID'].isin(reference_sha_counts[reference_sha_counts > 1].index)])
        sys.exit("Multiple SHA256 values found for the same Query_ID/Reference_ID")

    if (file_counts > 1).any():
        print(all_snpdiffs_data[all_snpdiffs_data['SNPDiffs_File'].isin(file_counts[file_counts > 1].index)])
        sys.exit("The same SNPDiffs file is listed multiple times")

    # Build every row before printing so output is all-or-nothing.
    results = []
    for _, row in all_snpdiffs_data.iterrows():

        query_assembly = os.path.abspath(row['Query_Assembly']) if checkLineExists(row['Query_Assembly'], row['Query_SHA256']) else "null"
        reference_assembly = os.path.abspath(row['Reference_Assembly']) if checkLineExists(row['Reference_Assembly'], row['Reference_SHA256']) else "null"

        result = ",".join([row['SNPDiffs_File'],
            row['Query_ID'], query_assembly,str(row['Query_Contig_Count']),str(row['Query_Assembly_Bases']),
            str(row['Query_N50']),str(row['Query_N90']),str(row['Query_L50']),str(row['Query_L90']),row['Query_SHA256'],
            row['Reference_ID'],reference_assembly,str(row['Reference_Contig_Count']),str(row['Reference_Assembly_Bases']),
            str(row['Reference_N50']),str(row['Reference_N90']),str(row['Reference_L50']),str(row['Reference_L90']),row['Reference_SHA256']])
        results.append(result)

    for result in results:
        print(result)


if __name__ == "__main__":
    main()
|