Mercurial > repos > rliterman > csp2
annotate CSP2/bin/processFasta.py @ 0:01431fa12065
"planemo upload"
author | rliterman |
---|---|
date | Mon, 02 Dec 2024 10:40:55 -0500 |
parents | |
children |
rev | line source |
---|---|
rliterman@0 | 1 #!/usr/bin/env python3 |
rliterman@0 | 2 |
rliterman@0 | 3 from Bio import SeqIO |
rliterman@0 | 4 import hashlib |
rliterman@0 | 5 import sys |
rliterman@0 | 6 import os |
rliterman@0 | 7 import pandas as pd |
rliterman@0 | 8 |
rliterman@0 | 9 def fasta_info(sample_name,file_path): |
rliterman@0 | 10 |
rliterman@0 | 11 if not os.path.exists(file_path): |
rliterman@0 | 12 sys.exit(f"File {file_path} does not exist.") |
rliterman@0 | 13 elif not file_path.lower().endswith(('.fa', '.fasta', '.fna')): |
rliterman@0 | 14 sys.exit(f"File {file_path} is not a .fa, .fasta, or .fna file.") |
rliterman@0 | 15 |
rliterman@0 | 16 records = list(SeqIO.parse(file_path, 'fasta')) |
rliterman@0 | 17 contig_count = int(len(records)) |
rliterman@0 | 18 lengths = sorted([len(record) for record in records], reverse=True) |
rliterman@0 | 19 assembly_bases = sum(lengths) |
rliterman@0 | 20 |
rliterman@0 | 21 with open(file_path, 'rb') as file: |
rliterman@0 | 22 sha256 = hashlib.sha256(file.read()).hexdigest() |
rliterman@0 | 23 |
rliterman@0 | 24 cumulative_length = 0 |
rliterman@0 | 25 n50 = None |
rliterman@0 | 26 n90 = None |
rliterman@0 | 27 l50 = None |
rliterman@0 | 28 l90 = None |
rliterman@0 | 29 |
rliterman@0 | 30 for i, length in enumerate(lengths, start=1): |
rliterman@0 | 31 cumulative_length += length |
rliterman@0 | 32 if cumulative_length >= assembly_bases * 0.5 and n50 is None: |
rliterman@0 | 33 n50 = length |
rliterman@0 | 34 l50 = i |
rliterman@0 | 35 if cumulative_length >= assembly_bases * 0.9 and n90 is None: |
rliterman@0 | 36 n90 = length |
rliterman@0 | 37 l90 = i |
rliterman@0 | 38 if n50 is not None and n90 is not None: |
rliterman@0 | 39 break |
rliterman@0 | 40 |
rliterman@0 | 41 print(f"{sample_name},{file_path},{contig_count},{assembly_bases},{n50},{n90},{l50},{l90},{sha256}") |
rliterman@0 | 42 |
rliterman@0 | 43 fasta_tsv = pd.read_csv(sys.argv[1], sep='\t', header=None, names=['Sample_ID','Fasta_Path']) |
rliterman@0 | 44 [fasta_info(sample_name, file_path) for sample_name, file_path in fasta_tsv.values] |