kkonganti@17: #!/usr/bin/env python3 kkonganti@17: kkonganti@17: # Kranti Konganti kkonganti@17: kkonganti@17: import argparse kkonganti@17: import glob kkonganti@17: import gzip kkonganti@17: import inspect kkonganti@17: import logging kkonganti@17: import os kkonganti@17: import pprint kkonganti@17: import re kkonganti@17: kkonganti@17: # Set logging. kkonganti@17: logging.basicConfig( kkonganti@17: format="\n" kkonganti@17: + "=" * 55 kkonganti@17: + "\n%(asctime)s - %(levelname)s\n" kkonganti@17: + "=" * 55 kkonganti@17: + "\n%(message)s\n\n", kkonganti@17: level=logging.DEBUG, kkonganti@17: ) kkonganti@17: kkonganti@17: # Debug print. kkonganti@17: ppp = pprint.PrettyPrinter(width=50, indent=4) kkonganti@17: kkonganti@17: kkonganti@17: # Multiple inheritence for pretty printing of help text. kkonganti@17: class MultiArgFormatClasses( kkonganti@17: argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter kkonganti@17: ): kkonganti@17: pass kkonganti@17: kkonganti@17: kkonganti@17: def main() -> None: kkonganti@17: """ kkonganti@17: This script works only in the context of `bettercallsal` Nextflow workflow. kkonganti@17: It takes: kkonganti@17: 1. A text file containing accessions or FASTA IDs, one per line and kkonganti@17: then, kkonganti@17: 2. Searches for a genome FASTA file in gzipped format in specified kkonganti@17: search path, where the prefix of the filename is the accession or kkonganti@17: FASTA ID from 1. and then, kkonganti@17: creates a new concatenated gzipped genome FASTA file with all the genomes kkonganti@17: in the text file from 1. kkonganti@17: """ kkonganti@17: kkonganti@17: prog_name = os.path.basename(inspect.stack()[0].filename) kkonganti@17: kkonganti@17: parser = argparse.ArgumentParser( kkonganti@17: prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses kkonganti@17: ) kkonganti@17: kkonganti@17: required = parser.add_argument_group("required arguments") kkonganti@17: kkonganti@17: required.add_argument( kkonganti@17: "-txt", kkonganti@17: dest="accs_txt", kkonganti@17: default=False, kkonganti@17: required=True, kkonganti@17: help="Absolute UNIX path to .txt file containing accessions\n" kkonganti@17: + "FASTA IDs, one per line.", kkonganti@17: ) kkonganti@17: required.add_argument( kkonganti@17: "-gd", kkonganti@17: dest="genomes_dir", kkonganti@17: default=False, kkonganti@17: required=True, kkonganti@17: help="Absolute UNIX path to a directory containing\n" kkonganti@17: + "gzipped genome FASTA files.\n" kkonganti@17: + "Required if -m is on.", kkonganti@17: ) kkonganti@17: parser.add_argument( kkonganti@17: "-gds", kkonganti@17: dest="genomes_dir_suffix", kkonganti@17: default="_scaffolded_genomic.fna.gz", kkonganti@17: required=False, kkonganti@17: help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.", kkonganti@17: ) kkonganti@17: parser.add_argument( kkonganti@17: "-op", kkonganti@17: dest="out_prefix", kkonganti@17: default="CATTED_GENOMES", kkonganti@17: help="Set the output file prefix for .fna.gz and .txt\n" + "files.", kkonganti@17: ) kkonganti@17: parser.add_argument( kkonganti@17: "-txts", kkonganti@17: dest="accs_suffix", kkonganti@17: default="_template_hits.txt", kkonganti@17: required=False, kkonganti@17: help="The suffix of the file supplied with -txt option. It is assumed that the\n" kkonganti@17: + "sample name is present in the file supplied with -txt option and the suffix\n" kkonganti@17: + "will be stripped and stored in a file that logs samples which have no hits.", kkonganti@17: ) kkonganti@17: parser.add_argument( kkonganti@17: "-frag_delim", kkonganti@17: dest="frag_delim", kkonganti@17: default="\t", kkonganti@17: required=False, kkonganti@17: help="The delimitor by which the fields are separated in *_frag.gz file.", kkonganti@17: ) kkonganti@17: kkonganti@17: args = parser.parse_args() kkonganti@17: accs_txt = args.accs_txt kkonganti@17: genomes_dir = args.genomes_dir kkonganti@17: genomes_dir_suffix = args.genomes_dir_suffix kkonganti@17: out_prefix = args.out_prefix kkonganti@17: accs_suffix = args.accs_suffix kkonganti@17: frag_delim = args.frag_delim kkonganti@17: accs_seen = dict() kkonganti@17: cat_genomes_gz = os.path.join(os.getcwd(), out_prefix + "_" + genomes_dir_suffix) kkonganti@17: cat_genomes_gz = re.sub("__", "_", str(cat_genomes_gz)) kkonganti@17: frags_gz = os.path.join(os.getcwd(), out_prefix + ".frag.gz") kkonganti@17: cat_reads_gz = os.path.join(os.getcwd(), out_prefix + "_aln_reads.fna.gz") kkonganti@17: cat_reads_gz = re.sub("__", "_", cat_reads_gz) kkonganti@17: kkonganti@17: if ( kkonganti@17: accs_txt kkonganti@17: and os.path.exists(cat_genomes_gz) kkonganti@17: and os.path.getsize(cat_genomes_gz) > 0 kkonganti@17: ): kkonganti@17: logging.error( kkonganti@17: "A concatenated genome FASTA file,\n" kkonganti@17: + f"{os.path.basename(cat_genomes_gz)} already exists in:\n" kkonganti@17: + f"{os.getcwd()}\n" kkonganti@17: + "Please remove or move it as we will not " kkonganti@17: + "overwrite it." kkonganti@17: ) kkonganti@17: exit(1) kkonganti@17: kkonganti@17: if accs_txt and (not os.path.exists(accs_txt) or not os.path.getsize(accs_txt) > 0): kkonganti@17: logging.error("File,\n" + f"{accs_txt}\ndoes not exist " + "or is empty!") kkonganti@17: failed_sample_name = re.sub(accs_suffix, "", os.path.basename(accs_txt)) kkonganti@17: with open( kkonganti@17: os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w" kkonganti@17: ) as failed_sample_fh: kkonganti@17: failed_sample_fh.write(f"{failed_sample_name}\n") kkonganti@17: failed_sample_fh.close() kkonganti@17: exit(0) kkonganti@17: kkonganti@17: if genomes_dir: kkonganti@17: if not os.path.isdir(genomes_dir): kkonganti@17: logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!") kkonganti@17: exit(1) kkonganti@17: if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0: kkonganti@17: logging.error( kkonganti@17: "Genomes directory" kkonganti@17: + f"{genomes_dir}" kkonganti@17: + "\ndoes not seem to have any\n" kkonganti@17: + f"files ending with suffix: {genomes_dir_suffix}" kkonganti@17: ) kkonganti@17: exit(1) kkonganti@17: kkonganti@17: # ppp.pprint(mash_hits) kkonganti@17: empty_lines = 0 kkonganti@17: empty_lines_msg = "" kkonganti@17: with open(cat_genomes_gz, "wb") as genomes_out_gz: kkonganti@17: with open(accs_txt, "r") as accs_txt_fh: kkonganti@17: for line in accs_txt_fh: kkonganti@17: if line in ["\n", "\n\r"]: kkonganti@17: empty_lines += 1 kkonganti@17: continue kkonganti@17: else: kkonganti@17: line = line.strip() kkonganti@17: kkonganti@17: if line in accs_seen.keys(): kkonganti@17: continue kkonganti@17: else: kkonganti@17: accs_seen[line] = 1 kkonganti@17: kkonganti@17: genome_file = os.path.join(genomes_dir, line + genomes_dir_suffix) kkonganti@17: kkonganti@17: if ( kkonganti@17: not os.path.exists(genome_file) kkonganti@17: or os.path.getsize(genome_file) <= 0 kkonganti@17: ): kkonganti@17: logging.error( kkonganti@17: f"Genome file {os.path.basename(genome_file)} does not\n" kkonganti@17: + "exits or is empty!" kkonganti@17: ) kkonganti@17: exit(1) kkonganti@17: else: kkonganti@17: with open(genome_file, "rb") as genome_file_h: kkonganti@17: genomes_out_gz.writelines(genome_file_h.readlines()) kkonganti@17: genome_file_h.close() kkonganti@17: accs_txt_fh.close() kkonganti@17: genomes_out_gz.close() kkonganti@17: kkonganti@17: if ( kkonganti@17: len(accs_seen.keys()) > 0 kkonganti@17: and os.path.exists(frags_gz) kkonganti@17: and os.path.getsize(frags_gz) > 0 kkonganti@17: ): kkonganti@17: with gzip.open( kkonganti@17: cat_reads_gz, "wt", encoding="utf-8", compresslevel=6 kkonganti@17: ) as cat_reads_gz_fh: kkonganti@17: with gzip.open(frags_gz, "rb", compresslevel=6) as fragz_gz_fh: kkonganti@17: for frag_line in fragz_gz_fh: kkonganti@17: frag_lines = frag_line.decode("utf-8").strip().split(frag_delim) kkonganti@17: # Per KMA specification, 6=template, 7=query, 1=read kkonganti@17: cat_reads_gz_fh.write(f">{frag_lines[6]}\n{frag_lines[0]}\n") kkonganti@17: fragz_gz_fh.close() kkonganti@17: cat_reads_gz_fh.close() kkonganti@17: kkonganti@17: if empty_lines > 0: kkonganti@17: empty_lines_msg = f"Skipped {empty_lines} empty line(s).\n" kkonganti@17: kkonganti@17: logging.info( kkonganti@17: empty_lines_msg kkonganti@17: + f"File {os.path.basename(cat_genomes_gz)}\n" kkonganti@17: + f"written in:\n{os.getcwd()}\nDone! Bye!" kkonganti@17: ) kkonganti@17: exit(0) kkonganti@17: kkonganti@17: kkonganti@17: if __name__ == "__main__": kkonganti@17: main()