rliterman@0
|
1 #!/usr/bin/env python3
|
rliterman@0
|
2
|
rliterman@0
|
3 from Bio import SeqIO
|
rliterman@0
|
4 import hashlib
|
rliterman@0
|
5 import sys
|
rliterman@0
|
6 import os
|
rliterman@0
|
7 import pandas as pd
|
rliterman@0
|
8
|
rliterman@0
|
9 def fasta_info(sample_name,file_path):
|
rliterman@0
|
10
|
rliterman@0
|
11 if not os.path.exists(file_path):
|
rliterman@0
|
12 sys.exit(f"File {file_path} does not exist.")
|
rliterman@0
|
13 elif not file_path.lower().endswith(('.fa', '.fasta', '.fna')):
|
rliterman@0
|
14 sys.exit(f"File {file_path} is not a .fa, .fasta, or .fna file.")
|
rliterman@0
|
15
|
rliterman@0
|
16 records = list(SeqIO.parse(file_path, 'fasta'))
|
rliterman@0
|
17 contig_count = int(len(records))
|
rliterman@0
|
18 lengths = sorted([len(record) for record in records], reverse=True)
|
rliterman@0
|
19 assembly_bases = sum(lengths)
|
rliterman@0
|
20
|
rliterman@0
|
21 with open(file_path, 'rb') as file:
|
rliterman@0
|
22 sha256 = hashlib.sha256(file.read()).hexdigest()
|
rliterman@0
|
23
|
rliterman@0
|
24 cumulative_length = 0
|
rliterman@0
|
25 n50 = None
|
rliterman@0
|
26 n90 = None
|
rliterman@0
|
27 l50 = None
|
rliterman@0
|
28 l90 = None
|
rliterman@0
|
29
|
rliterman@0
|
30 for i, length in enumerate(lengths, start=1):
|
rliterman@0
|
31 cumulative_length += length
|
rliterman@0
|
32 if cumulative_length >= assembly_bases * 0.5 and n50 is None:
|
rliterman@0
|
33 n50 = length
|
rliterman@0
|
34 l50 = i
|
rliterman@0
|
35 if cumulative_length >= assembly_bases * 0.9 and n90 is None:
|
rliterman@0
|
36 n90 = length
|
rliterman@0
|
37 l90 = i
|
rliterman@0
|
38 if n50 is not None and n90 is not None:
|
rliterman@0
|
39 break
|
rliterman@0
|
40
|
rliterman@0
|
41 print(f"{sample_name},{file_path},{contig_count},{assembly_bases},{n50},{n90},{l50},{l90},{sha256}")
|
rliterman@0
|
42
|
rliterman@0
|
43 fasta_tsv = pd.read_csv(sys.argv[1], sep='\t', header=None, names=['Sample_ID','Fasta_Path'])
|
rliterman@0
|
44 [fasta_info(sample_name, file_path) for sample_name, file_path in fasta_tsv.values] |