Mercurial > repos > rliterman > csp2
comparison CSP2/bin/processFasta.py @ 0:01431fa12065
"planemo upload"
author | rliterman |
---|---|
date | Mon, 02 Dec 2024 10:40:55 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:01431fa12065 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 from Bio import SeqIO | |
4 import hashlib | |
5 import sys | |
6 import os | |
7 import pandas as pd | |
8 | |
9 def fasta_info(sample_name,file_path): | |
10 | |
11 if not os.path.exists(file_path): | |
12 sys.exit(f"File {file_path} does not exist.") | |
13 elif not file_path.lower().endswith(('.fa', '.fasta', '.fna')): | |
14 sys.exit(f"File {file_path} is not a .fa, .fasta, or .fna file.") | |
15 | |
16 records = list(SeqIO.parse(file_path, 'fasta')) | |
17 contig_count = int(len(records)) | |
18 lengths = sorted([len(record) for record in records], reverse=True) | |
19 assembly_bases = sum(lengths) | |
20 | |
21 with open(file_path, 'rb') as file: | |
22 sha256 = hashlib.sha256(file.read()).hexdigest() | |
23 | |
24 cumulative_length = 0 | |
25 n50 = None | |
26 n90 = None | |
27 l50 = None | |
28 l90 = None | |
29 | |
30 for i, length in enumerate(lengths, start=1): | |
31 cumulative_length += length | |
32 if cumulative_length >= assembly_bases * 0.5 and n50 is None: | |
33 n50 = length | |
34 l50 = i | |
35 if cumulative_length >= assembly_bases * 0.9 and n90 is None: | |
36 n90 = length | |
37 l90 = i | |
38 if n50 is not None and n90 is not None: | |
39 break | |
40 | |
41 print(f"{sample_name},{file_path},{contig_count},{assembly_bases},{n50},{n90},{l50},{l90},{sha256}") | |
42 | |
43 fasta_tsv = pd.read_csv(sys.argv[1], sep='\t', header=None, names=['Sample_ID','Fasta_Path']) | |
44 [fasta_info(sample_name, file_path) for sample_name, file_path in fasta_tsv.values] |