annotate CSP2/bin/processFasta.py @ 38:ee512a230a1e

"planemo upload"
author rliterman
date Wed, 11 Dec 2024 12:04:20 -0500
parents 01431fa12065
children
rev   line source
rliterman@0 1 #!/usr/bin/env python3
rliterman@0 2
rliterman@0 3 from Bio import SeqIO
rliterman@0 4 import hashlib
rliterman@0 5 import sys
rliterman@0 6 import os
rliterman@0 7 import pandas as pd
rliterman@0 8
rliterman@0 9 def fasta_info(sample_name,file_path):
rliterman@0 10
rliterman@0 11 if not os.path.exists(file_path):
rliterman@0 12 sys.exit(f"File {file_path} does not exist.")
rliterman@0 13 elif not file_path.lower().endswith(('.fa', '.fasta', '.fna')):
rliterman@0 14 sys.exit(f"File {file_path} is not a .fa, .fasta, or .fna file.")
rliterman@0 15
rliterman@0 16 records = list(SeqIO.parse(file_path, 'fasta'))
rliterman@0 17 contig_count = int(len(records))
rliterman@0 18 lengths = sorted([len(record) for record in records], reverse=True)
rliterman@0 19 assembly_bases = sum(lengths)
rliterman@0 20
rliterman@0 21 with open(file_path, 'rb') as file:
rliterman@0 22 sha256 = hashlib.sha256(file.read()).hexdigest()
rliterman@0 23
rliterman@0 24 cumulative_length = 0
rliterman@0 25 n50 = None
rliterman@0 26 n90 = None
rliterman@0 27 l50 = None
rliterman@0 28 l90 = None
rliterman@0 29
rliterman@0 30 for i, length in enumerate(lengths, start=1):
rliterman@0 31 cumulative_length += length
rliterman@0 32 if cumulative_length >= assembly_bases * 0.5 and n50 is None:
rliterman@0 33 n50 = length
rliterman@0 34 l50 = i
rliterman@0 35 if cumulative_length >= assembly_bases * 0.9 and n90 is None:
rliterman@0 36 n90 = length
rliterman@0 37 l90 = i
rliterman@0 38 if n50 is not None and n90 is not None:
rliterman@0 39 break
rliterman@0 40
rliterman@0 41 print(f"{sample_name},{file_path},{contig_count},{assembly_bases},{n50},{n90},{l50},{l90},{sha256}")
rliterman@0 42
rliterman@0 43 fasta_tsv = pd.read_csv(sys.argv[1], sep='\t', header=None, names=['Sample_ID','Fasta_Path'])
rliterman@0 44 [fasta_info(sample_name, file_path) for sample_name, file_path in fasta_tsv.values]