comparison CSP2/bin/processFasta.py @ 0:01431fa12065

"planemo upload"
author rliterman
date Mon, 02 Dec 2024 10:40:55 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:01431fa12065
1 #!/usr/bin/env python3
2
3 from Bio import SeqIO
4 import hashlib
5 import sys
6 import os
7 import pandas as pd
8
9 def fasta_info(sample_name,file_path):
10
11 if not os.path.exists(file_path):
12 sys.exit(f"File {file_path} does not exist.")
13 elif not file_path.lower().endswith(('.fa', '.fasta', '.fna')):
14 sys.exit(f"File {file_path} is not a .fa, .fasta, or .fna file.")
15
16 records = list(SeqIO.parse(file_path, 'fasta'))
17 contig_count = int(len(records))
18 lengths = sorted([len(record) for record in records], reverse=True)
19 assembly_bases = sum(lengths)
20
21 with open(file_path, 'rb') as file:
22 sha256 = hashlib.sha256(file.read()).hexdigest()
23
24 cumulative_length = 0
25 n50 = None
26 n90 = None
27 l50 = None
28 l90 = None
29
30 for i, length in enumerate(lengths, start=1):
31 cumulative_length += length
32 if cumulative_length >= assembly_bases * 0.5 and n50 is None:
33 n50 = length
34 l50 = i
35 if cumulative_length >= assembly_bases * 0.9 and n90 is None:
36 n90 = length
37 l90 = i
38 if n50 is not None and n90 is not None:
39 break
40
41 print(f"{sample_name},{file_path},{contig_count},{assembly_bases},{n50},{n90},{l50},{l90},{sha256}")
42
43 fasta_tsv = pd.read_csv(sys.argv[1], sep='\t', header=None, names=['Sample_ID','Fasta_Path'])
44 [fasta_info(sample_name, file_path) for sample_name, file_path in fasta_tsv.values]