#!/usr/bin/env python3
"""Summarise the headers of a list of SNPDiffs files as CSV rows on stdout.

The script reads a text file containing one SNPDiffs path per line, parses the
tab-separated ``key:value`` header on the first line of each SNPDiffs file,
checks that query/reference IDs map to a single SHA256 each and that no file is
listed twice, and then prints one comma-separated row per SNPDiffs file.
Assembly paths are emitted only when the file exists and its SHA256 matches the
header; otherwise the literal string ``null`` is emitted.
"""

import os
import sys
import pandas as pd
import hashlib
import argparse
from typing import Optional, Sequence

# Per-assembly statistics emitted (in this order) between the assembly path
# and the SHA256 for both the query and the reference.
_STAT_FIELDS = ("Contig_Count", "Assembly_Bases", "N50", "N90", "L50", "L90")


def checkLineExists(file_path, sha256):
    """Return True if *file_path* exists and its SHA256 hex digest equals *sha256*.

    The file is hashed in 8 KB chunks so large assemblies are not loaded into
    memory. Read failures are reported on stderr (stdout is reserved for the
    CSV rows this script produces) and treated as "no match".
    """
    if not os.path.exists(file_path):
        return False
    try:
        with open(file_path, 'rb') as file:
            file_hash = hashlib.sha256()
            chunk_size = 8192  # Read in 8KB chunks
            while chunk := file.read(chunk_size):
                file_hash.update(chunk)
            return file_hash.hexdigest() == sha256
    except (OSError, ValueError) as e:
        print(f"Error reading file: {file_path}: {str(e)}", file=sys.stderr)
        return False


def processHeader(header_row, snpdiffs_path):
    """Convert a list of ``key:value`` header fields into a one-row DataFrame.

    Args:
        header_row: Header fields, each of the form ``key:value``.
        snpdiffs_path: Path recorded in the added ``SNPDiffs_File`` column.

    Returns:
        A DataFrame with one column per header key (in input order) plus
        ``SNPDiffs_File``, holding a single row at index 0.

    Raises:
        ValueError: If a field contains no ``:`` separator.
    """
    header_cols = []
    header_vals = []
    for item in header_row:
        # Split on the FIRST colon only so values that themselves contain
        # colons (drive letters, URLs, timestamps) are kept intact.
        key, separator, value = item.partition(':')
        if not separator:
            raise ValueError(f"Malformed header field (expected 'key:value'): {item!r}")
        header_cols.append(key)
        header_vals.append(value)

    header_data = pd.DataFrame(columns=header_cols)
    header_data.loc[0] = header_vals
    header_data['SNPDiffs_File'] = snpdiffs_path
    return header_data


def _assembly_fields(row, prefix):
    """Return the output fields for one side (``Query`` or ``Reference``) of a row."""
    assembly = row[f'{prefix}_Assembly']
    sha256 = row[f'{prefix}_SHA256']
    # Only report the assembly path when the file is present and unmodified.
    resolved = os.path.abspath(assembly) if checkLineExists(assembly, sha256) else "null"
    stats = [str(row[f'{prefix}_{field}']) for field in _STAT_FIELDS]
    return [row[f'{prefix}_ID'], resolved, *stats, sha256]


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Parse arguments, validate the SNPDiffs headers and print the CSV rows.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--snpdiffs_file", required=True, help="Path to the SNP diffs list file")
    parser.add_argument('--trim_name', nargs='?', const="", default="", type=str, help='Trim name')
    args = parser.parse_args(argv)

    snpdiffs_list_file = args.snpdiffs_file
    trim_name = args.trim_name
    header_rows = []

    # Read in all files, and if they exist, read in the header
    try:
        with open(snpdiffs_list_file, 'r') as list_handle:
            # Skip blank lines: an empty entry would resolve to the working
            # directory and abort the run with a misleading message.
            snpdiffs_list = [line.strip() for line in list_handle if line.strip()]
    except (OSError, UnicodeError):
        sys.exit("Error: Unable to read file: " + snpdiffs_list_file)

    for snpdiffs_file in snpdiffs_list:
        try:
            snpdiffs_path = os.path.abspath(snpdiffs_file)
            with open(snpdiffs_path, 'r') as file:
                top_line = file.readline().lstrip('#').strip().split('\t')
            header_rows.append(processHeader(top_line, snpdiffs_path))
        except Exception:
            sys.exit("Error: Unable to read file: " + snpdiffs_file)

    # Create header df
    try:
        all_snpdiffs_data = pd.concat(header_rows, ignore_index=True)
        all_snpdiffs_data['Reference_ID'] = all_snpdiffs_data['Reference_ID'].str.replace(trim_name, '', regex=False)
        all_snpdiffs_data['Query_ID'] = all_snpdiffs_data['Query_ID'].str.replace(trim_name, '', regex=False)
    except Exception:
        sys.exit("Error: Unable to create header dataframe")

    query_sha_counts = all_snpdiffs_data.groupby('Query_ID')['Query_SHA256'].nunique()
    reference_sha_counts = all_snpdiffs_data.groupby('Reference_ID')['Reference_SHA256'].nunique()
    file_counts = all_snpdiffs_data['SNPDiffs_File'].value_counts()

    if (query_sha_counts > 1).any() or (reference_sha_counts > 1).any():
        # An ID must always refer to the same assembly content.
        print(all_snpdiffs_data[all_snpdiffs_data['Query_ID'].isin(query_sha_counts[query_sha_counts > 1].index)])
        print(all_snpdiffs_data[all_snpdiffs_data['Reference_ID'].isin(reference_sha_counts[reference_sha_counts > 1].index)])
        sys.exit("Multiple SHA256 values found for the same Query_ID/Reference_ID")

    if (file_counts > 1).any():
        print(all_snpdiffs_data[all_snpdiffs_data['SNPDiffs_File'].isin(file_counts[file_counts > 1].index)])
        sys.exit("The same SNPDiffs file is listed multiple times")

    # Build every row before printing so a failure part-way through does not
    # leave a truncated CSV on stdout.
    results = []
    for _, row in all_snpdiffs_data.iterrows():
        query_fields = _assembly_fields(row, 'Query')
        reference_fields = _assembly_fields(row, 'Reference')
        results.append(",".join([row['SNPDiffs_File'], *query_fields, *reference_fields]))

    for result in results:
        print(result)


if __name__ == "__main__":
    main()