Mercurial > repos > rliterman > csp2
comparison CSP2/bin/userSNPDiffs.py @ 0:01431fa12065
"planemo upload"
author | rliterman |
---|---|
date | Mon, 02 Dec 2024 10:40:55 -0500 |
parents | |
children | 792274118b2e |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:01431fa12065 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import os | |
4 import sys | |
5 import pandas as pd | |
6 import hashlib | |
7 import argparse | |
8 | |
9 def checkLineExists(file_path, sha256): | |
10 if not os.path.exists(file_path): | |
11 return False | |
12 try: | |
13 with open(file_path, 'rb') as file: | |
14 file_hash = hashlib.sha256() | |
15 chunk_size = 8192 # Read in 8KB chunks | |
16 while chunk := file.read(chunk_size): | |
17 file_hash.update(chunk) | |
18 return file_hash.hexdigest() == sha256 | |
19 except Exception as e: | |
20 print(f"Error reading file: {file_path}: {str(e)}") | |
21 return False | |
22 | |
23 def processHeader(header_row,snpdiffs_path): | |
24 header_cols = [item.split(':')[0] for item in header_row] | |
25 header_vals = [item.split(':')[1] for item in header_row] | |
26 | |
27 header_data = pd.DataFrame(columns = header_cols) | |
28 header_data.loc[0] = header_vals | |
29 header_data['SNPDiffs_File'] = snpdiffs_path | |
30 return header_data | |
31 | |
32 parser = argparse.ArgumentParser() | |
33 parser.add_argument("--snpdiffs_file", help="Path to the SNP diffs list file") | |
34 parser.add_argument("--trim_name", help="Trim name") | |
35 args = parser.parse_args() | |
36 | |
37 snpdiffs_list_file = args.snpdiffs_file | |
38 trim_name = args.trim_name | |
39 header_rows = [] | |
40 | |
41 # Read in all files, and if they exist, read in the header | |
42 try: | |
43 snpdiffs_list = [line.strip() for line in open(snpdiffs_list_file, 'r')] | |
44 except: | |
45 sys.exit("Error: Unable to read file: " + snpdiffs_list_file) | |
46 | |
47 for snpdiffs_file in snpdiffs_list: | |
48 try: | |
49 snpdiffs_path = os.path.abspath(snpdiffs_file) | |
50 with open(snpdiffs_path, 'r') as file: | |
51 top_line = file.readline().lstrip('#').strip().split('\t') | |
52 header_rows.append(processHeader(top_line,snpdiffs_path)) | |
53 except: | |
54 sys.exit("Error: Unable to read file: " + snpdiffs_file) | |
55 | |
56 # Create header df | |
57 try: | |
58 all_snpdiffs_data = pd.concat(header_rows,ignore_index=True) | |
59 all_snpdiffs_data['Reference_ID'] = all_snpdiffs_data['Reference_ID'].str.replace(trim_name,'',regex=False) | |
60 all_snpdiffs_data['Query_ID'] = all_snpdiffs_data['Query_ID'].str.replace(trim_name,'',regex=False) | |
61 except: | |
62 sys.exit("Error: Unable to create header dataframe") | |
63 | |
64 query_sha_counts = all_snpdiffs_data.groupby('Query_ID')['Query_SHA256'].nunique() | |
65 reference_sha_counts = all_snpdiffs_data.groupby('Reference_ID')['Reference_SHA256'].nunique() | |
66 file_counts = all_snpdiffs_data['SNPDiffs_File'].value_counts() | |
67 | |
68 if ((query_sha_counts > 1).any() or (reference_sha_counts > 1).any()): | |
69 print(all_snpdiffs_data[all_snpdiffs_data['Query_ID'].isin(query_sha_counts[query_sha_counts > 1].index)]) | |
70 print(all_snpdiffs_data[all_snpdiffs_data['Reference_ID'].isin(reference_sha_counts[reference_sha_counts > 1].index)]) | |
71 sys.exit("Multiple SHA256 values found for the same Query_ID/Reference_ID") | |
72 elif (file_counts > 1).any(): | |
73 print(all_snpdiffs_data[all_snpdiffs_data['SNPDiffs_File'].isin(file_counts[file_counts > 1].index)]) | |
74 sys.exit("The same SNPDiffs file is listed multiple times") | |
75 else: | |
76 results = [] | |
77 for index, row in all_snpdiffs_data.iterrows(): | |
78 | |
79 query_assembly = os.path.abspath(row['Query_Assembly']) if checkLineExists(row['Query_Assembly'], row['Query_SHA256']) else "null" | |
80 reference_assembly = os.path.abspath(row['Reference_Assembly']) if checkLineExists(row['Reference_Assembly'], row['Reference_SHA256']) else "null" | |
81 | |
82 result = ",".join([row['SNPDiffs_File'], | |
83 row['Query_ID'], query_assembly,str(row['Query_Contig_Count']),str(row['Query_Assembly_Bases']), | |
84 str(row['Query_N50']),str(row['Query_N90']),str(row['Query_L50']),str(row['Query_L90']),row['Query_SHA256'], | |
85 row['Reference_ID'],reference_assembly,str(row['Reference_Contig_Count']),str(row['Reference_Assembly_Bases']), | |
86 str(row['Reference_N50']),str(row['Reference_N90']),str(row['Reference_L50']),str(row['Reference_L90']),row['Reference_SHA256']]) | |
87 results.append(result) | |
88 for result in results: | |
89 print(result) |