#!/usr/bin/env python3
"""Print summary statistics for each FASTA assembly listed in a TSV file.

Usage: <script> <fasta_tsv>

``fasta_tsv`` is a headerless, tab-separated file with two columns: the
sample ID and the path to its FASTA file. For every row, one comma-separated
line is written to stdout:

    sample,path,contig_count,assembly_bases,N50,N90,L50,L90,sha256
"""

import hashlib
import os
import sys

import pandas as pd
from Bio import SeqIO

# Accepted FASTA file extensions (compared case-insensitively).
FASTA_EXTENSIONS = ('.fa', '.fasta', '.fna')

# Read size used while hashing, so large assemblies are never held in memory.
_HASH_CHUNK_BYTES = 1 << 20


def _file_sha256(file_path):
    """Return the hex SHA-256 digest of the file's raw bytes."""
    digest = hashlib.sha256()
    with open(file_path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(_HASH_CHUNK_BYTES), b''):
            digest.update(chunk)
    return digest.hexdigest()


def _assembly_stats(lengths):
    """Return ``(n50, n90, l50, l90)`` for contig lengths sorted descending.

    N50/N90 are the length of the contig at which the running total first
    reaches 50% / 90% of all bases; L50/L90 are the 1-based rank of that
    contig. All four values are ``None`` when ``lengths`` is empty.
    """
    total = sum(lengths)
    n50 = n90 = l50 = l90 = None
    cumulative = 0
    for rank, length in enumerate(lengths, start=1):
        cumulative += length
        # Integer cross-multiplication keeps the threshold tests exact.
        if n50 is None and 2 * cumulative >= total:
            n50, l50 = length, rank
        if 10 * cumulative >= 9 * total:
            # The 90% mark implies the 50% mark was already reached (the
            # check above runs first), so nothing is left to compute.
            n90, l90 = length, rank
            break
    return n50, n90, l50, l90


def fasta_info(sample_name, file_path):
    """Print one CSV line of assembly statistics for a FASTA file.

    Args:
        sample_name: Sample identifier, echoed as the first output field.
        file_path: Path to a ``.fa``, ``.fasta`` or ``.fna`` file.

    Exits the program (``sys.exit`` with a message) if the path does not
    exist or does not have one of the accepted extensions.
    """
    if not os.path.exists(file_path):
        sys.exit(f"File {file_path} does not exist.")
    if not file_path.lower().endswith(FASTA_EXTENSIONS):
        sys.exit(f"File {file_path} is not a .fa, .fasta, or .fna file.")

    # Only the lengths are needed, so records are consumed as they are
    # parsed instead of keeping every sequence in memory.
    lengths = sorted(
        (len(record) for record in SeqIO.parse(file_path, 'fasta')),
        reverse=True,
    )
    contig_count = len(lengths)
    assembly_bases = sum(lengths)
    n50, n90, l50, l90 = _assembly_stats(lengths)
    sha256 = _file_sha256(file_path)

    print(f"{sample_name},{file_path},{contig_count},{assembly_bases},{n50},{n90},{l50},{l90},{sha256}")


def main(argv=None):
    """Read the sample/FASTA table named on the command line and report on each row."""
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit(f"Usage: {os.path.basename(sys.argv[0])} <fasta_tsv>")

    # dtype=str and keep_default_na=False keep IDs and paths exactly as
    # written: no "007" -> 7 coercion and no "NA"/blank -> NaN conversion.
    fasta_tsv = pd.read_csv(
        args[0],
        sep='\t',
        header=None,
        names=['Sample_ID', 'Fasta_Path'],
        dtype=str,
        keep_default_na=False,
    )
    for sample_name, file_path in fasta_tsv.itertuples(index=False, name=None):
        fasta_info(sample_name, file_path)


if __name__ == "__main__":
    main()