diff CSP2/bin/userSNPDiffs.py @ 0:01431fa12065

"planemo upload"
author rliterman
date Mon, 02 Dec 2024 10:40:55 -0500
parents
children 792274118b2e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/bin/userSNPDiffs.py	Mon Dec 02 10:40:55 2024 -0500
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import pandas as pd
+import hashlib
+import argparse
+
+def checkLineExists(file_path, sha256):
+    if not os.path.exists(file_path):
+        return False
+    try:
+        with open(file_path, 'rb') as file:
+            file_hash = hashlib.sha256()
+            chunk_size = 8192  # Read in 8KB chunks
+            while chunk := file.read(chunk_size):
+                file_hash.update(chunk)
+        return file_hash.hexdigest() == sha256
+    except Exception as e:
+        print(f"Error reading file: {file_path}: {str(e)}")
+        return False
+
+def processHeader(header_row,snpdiffs_path):
+    header_cols = [item.split(':')[0] for item in header_row]
+    header_vals = [item.split(':')[1] for item in header_row]
+
+    header_data = pd.DataFrame(columns = header_cols)
+    header_data.loc[0] = header_vals
+    header_data['SNPDiffs_File'] = snpdiffs_path
+    return header_data
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--snpdiffs_file", help="Path to the SNP diffs list file")
+parser.add_argument("--trim_name", help="Trim name")
+args = parser.parse_args()
+
+snpdiffs_list_file = args.snpdiffs_file
+trim_name = args.trim_name
+header_rows = []
+
+# Read in all files, and if they exist, read in the header
+try:
+    snpdiffs_list = [line.strip() for line in open(snpdiffs_list_file, 'r')]
+except:
+    sys.exit("Error: Unable to read file: " + snpdiffs_list_file)
+
+for snpdiffs_file in snpdiffs_list:
+    try:
+        snpdiffs_path = os.path.abspath(snpdiffs_file)    
+        with open(snpdiffs_path, 'r') as file:
+            top_line = file.readline().lstrip('#').strip().split('\t')
+            header_rows.append(processHeader(top_line,snpdiffs_path))
+    except:
+        sys.exit("Error: Unable to read file: " + snpdiffs_file)
+        
+# Create header df
+try:
+    all_snpdiffs_data = pd.concat(header_rows,ignore_index=True)
+    all_snpdiffs_data['Reference_ID'] = all_snpdiffs_data['Reference_ID'].str.replace(trim_name,'',regex=False)
+    all_snpdiffs_data['Query_ID'] = all_snpdiffs_data['Query_ID'].str.replace(trim_name,'',regex=False)
+except:
+    sys.exit("Error: Unable to create header dataframe")
+
+query_sha_counts = all_snpdiffs_data.groupby('Query_ID')['Query_SHA256'].nunique()
+reference_sha_counts = all_snpdiffs_data.groupby('Reference_ID')['Reference_SHA256'].nunique()
+file_counts = all_snpdiffs_data['SNPDiffs_File'].value_counts()
+
+if ((query_sha_counts > 1).any() or (reference_sha_counts > 1).any()):
+    print(all_snpdiffs_data[all_snpdiffs_data['Query_ID'].isin(query_sha_counts[query_sha_counts > 1].index)])
+    print(all_snpdiffs_data[all_snpdiffs_data['Reference_ID'].isin(reference_sha_counts[reference_sha_counts > 1].index)])
+    sys.exit("Multiple SHA256 values found for the same Query_ID/Reference_ID")
+elif (file_counts > 1).any():
+    print(all_snpdiffs_data[all_snpdiffs_data['SNPDiffs_File'].isin(file_counts[file_counts > 1].index)])
+    sys.exit("The same SNPDiffs file is listed multiple times")
+else:
+    results = []
+    for index, row in all_snpdiffs_data.iterrows():
+
+        query_assembly = os.path.abspath(row['Query_Assembly']) if checkLineExists(row['Query_Assembly'], row['Query_SHA256']) else "null"
+        reference_assembly = os.path.abspath(row['Reference_Assembly']) if checkLineExists(row['Reference_Assembly'], row['Reference_SHA256']) else "null"
+        
+        result = ",".join([row['SNPDiffs_File'],
+                       row['Query_ID'], query_assembly,str(row['Query_Contig_Count']),str(row['Query_Assembly_Bases']),
+                       str(row['Query_N50']),str(row['Query_N90']),str(row['Query_L50']),str(row['Query_L90']),row['Query_SHA256'],
+                       row['Reference_ID'],reference_assembly,str(row['Reference_Contig_Count']),str(row['Reference_Assembly_Bases']),
+                       str(row['Reference_N50']),str(row['Reference_N90']),str(row['Reference_L50']),str(row['Reference_L90']),row['Reference_SHA256']])
+        results.append(result)
+    for result in results:
+        print(result)