diff CSP2/bin/processFasta.py @ 0:01431fa12065

"planemo upload"
author rliterman
date Mon, 02 Dec 2024 10:40:55 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/bin/processFasta.py	Mon Dec 02 10:40:55 2024 -0500
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+from Bio import SeqIO
+import hashlib
+import sys
+import os
+import pandas as pd
+
+def fasta_info(sample_name,file_path):
+    
+    if not os.path.exists(file_path):
+        sys.exit(f"File {file_path} does not exist.")
+    elif not file_path.lower().endswith(('.fa', '.fasta', '.fna')):
+        sys.exit(f"File {file_path} is not a .fa, .fasta, or .fna file.")
+    
+    records = list(SeqIO.parse(file_path, 'fasta'))
+    contig_count = int(len(records))
+    lengths = sorted([len(record) for record in records], reverse=True)
+    assembly_bases = sum(lengths)
+
+    with open(file_path, 'rb') as file:
+        sha256 = hashlib.sha256(file.read()).hexdigest()
+
+    cumulative_length = 0
+    n50 = None
+    n90 = None
+    l50 = None
+    l90 = None
+    
+    for i, length in enumerate(lengths, start=1):
+        cumulative_length += length
+        if cumulative_length >= assembly_bases * 0.5 and n50 is None:
+            n50 = length
+            l50 = i
+        if cumulative_length >= assembly_bases * 0.9 and n90 is None:
+            n90 = length
+            l90 = i
+        if n50 is not None and n90 is not None:
+            break
+
+    print(f"{sample_name},{file_path},{contig_count},{assembly_bases},{n50},{n90},{l50},{l90},{sha256}")
+
+fasta_tsv = pd.read_csv(sys.argv[1], sep='\t', header=None, names=['Sample_ID','Fasta_Path'])
+[fasta_info(sample_name, file_path) for sample_name, file_path in fasta_tsv.values]
\ No newline at end of file