kkonganti@0: #!/usr/bin/env python3 kkonganti@0: kkonganti@0: # Kranti Konganti kkonganti@0: kkonganti@0: import argparse kkonganti@0: import glob kkonganti@0: import gzip kkonganti@0: import inspect kkonganti@0: import logging kkonganti@0: import os kkonganti@0: import pprint kkonganti@0: import re kkonganti@0: kkonganti@0: # Set logging. kkonganti@0: logging.basicConfig( kkonganti@0: format="\n" kkonganti@0: + "=" * 55 kkonganti@0: + "\n%(asctime)s - %(levelname)s\n" kkonganti@0: + "=" * 55 kkonganti@0: + "\n%(message)s\n\n", kkonganti@0: level=logging.DEBUG, kkonganti@0: ) kkonganti@0: kkonganti@0: # Debug print. kkonganti@0: ppp = pprint.PrettyPrinter(width=50, indent=4) kkonganti@0: kkonganti@0: kkonganti@0: # Multiple inheritence for pretty printing of help text. kkonganti@0: class MultiArgFormatClasses( kkonganti@0: argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter kkonganti@0: ): kkonganti@0: pass kkonganti@0: kkonganti@0: kkonganti@0: def main() -> None: kkonganti@0: """ kkonganti@0: This script works only in the context of a Nextflow workflow. kkonganti@0: It takes: kkonganti@0: 1. A text file containing accessions or FASTA IDs, one per line and kkonganti@0: then, kkonganti@0: 2. Optionally, searches for a genome FASTA file in gzipped format in specified kkonganti@0: search path, where the prefix of the filename is the accession or kkonganti@0: FASTA ID from 1. and then, creates a new concatenated gzipped genome FASTA kkonganti@0: file with all the genomes in the text file from 1. kkonganti@0: 3. Creates a new FASTQ file with reads aligned to the accessions in the text kkonganti@0: file from 1. kkonganti@0: """ kkonganti@0: kkonganti@0: prog_name = os.path.basename(inspect.stack()[0].filename) kkonganti@0: kkonganti@0: parser = argparse.ArgumentParser( kkonganti@0: prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses kkonganti@0: ) kkonganti@0: kkonganti@0: required = parser.add_argument_group("required arguments") kkonganti@0: kkonganti@0: required.add_argument( kkonganti@0: "-txt", kkonganti@0: dest="accs_txt", kkonganti@0: default=False, kkonganti@0: required=True, kkonganti@0: help="Absolute UNIX path to .txt file containing accessions\n" kkonganti@0: + "FASTA IDs, one per line.", kkonganti@0: ) kkonganti@0: required.add_argument( kkonganti@0: "-op", kkonganti@0: dest="out_prefix", kkonganti@0: default="CATTED_GENOMES", kkonganti@0: required=True, kkonganti@0: help="Set the output file prefix for .fna.gz and .txt\n" + "files.", kkonganti@0: ) kkonganti@0: parser.add_argument( kkonganti@0: "-gd", kkonganti@0: dest="genomes_dir", kkonganti@0: default=False, kkonganti@0: required=False, kkonganti@0: help="Absolute UNIX path to a directory containing\n" kkonganti@0: + "gzipped genome FASTA files or a file.\n", kkonganti@0: ) kkonganti@0: parser.add_argument( kkonganti@0: "-gds", kkonganti@0: dest="genomes_dir_suffix", kkonganti@0: default="_scaffolded_genomic.fna.gz", kkonganti@0: required=False, kkonganti@0: help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.", kkonganti@0: ) kkonganti@0: parser.add_argument( kkonganti@0: "-query", kkonganti@0: dest="id_is_query", kkonganti@0: default=False, kkonganti@0: action="store_true", kkonganti@0: required=False, kkonganti@0: help="In the produced FASTQ file, should the FASTA ID should be of KMA query ID\n" kkonganti@0: + "or template ID.", kkonganti@0: ) kkonganti@0: parser.add_argument( kkonganti@0: "-txts", kkonganti@0: dest="accs_suffix", kkonganti@0: default="_template_hits.txt", kkonganti@0: required=False, kkonganti@0: help="The suffix of the file supplied with -txt option. It is assumed that the\n" kkonganti@0: + "sample name is present in the file supplied with -txt option and the suffix\n" kkonganti@0: + "will be stripped and stored in a file that logs samples which have no hits.", kkonganti@0: ) kkonganti@0: parser.add_argument( kkonganti@0: "-frag_delim", kkonganti@0: dest="frag_delim", kkonganti@0: default="\t", kkonganti@0: required=False, kkonganti@0: help="The delimitor by which the fields are separated in *_frag.gz file.", kkonganti@0: ) kkonganti@0: kkonganti@0: args = parser.parse_args() kkonganti@0: accs_txt = args.accs_txt kkonganti@0: genomes_dir = args.genomes_dir kkonganti@0: genomes_dir_suffix = args.genomes_dir_suffix kkonganti@0: id_is_query = args.id_is_query kkonganti@0: out_prefix = args.out_prefix kkonganti@0: accs_suffix = args.accs_suffix kkonganti@0: frag_delim = args.frag_delim kkonganti@0: accs_seen = dict() kkonganti@0: cat_genomes_gz = os.path.join(os.getcwd(), out_prefix + "_" + genomes_dir_suffix) kkonganti@0: cat_genomes_gz = re.sub("__", "_", str(cat_genomes_gz)) kkonganti@0: frags_gz = os.path.join(os.getcwd(), out_prefix + ".frag.gz") kkonganti@0: cat_reads_gz = os.path.join(os.getcwd(), out_prefix + "_aln_reads.fna.gz") kkonganti@0: cat_reads_gz = re.sub("__", "_", cat_reads_gz) kkonganti@0: kkonganti@0: if ( kkonganti@0: accs_txt kkonganti@0: and os.path.exists(cat_genomes_gz) kkonganti@0: and os.path.getsize(cat_genomes_gz) > 0 kkonganti@0: ): kkonganti@0: logging.error( kkonganti@0: "A concatenated genome FASTA file,\n" kkonganti@0: + f"{os.path.basename(cat_genomes_gz)} already exists in:\n" kkonganti@0: + f"{os.getcwd()}\n" kkonganti@0: + "Please remove or move it as we will not " kkonganti@0: + "overwrite it." kkonganti@0: ) kkonganti@0: exit(1) kkonganti@0: kkonganti@0: if accs_txt and (not os.path.exists(accs_txt) or not os.path.getsize(accs_txt) > 0): kkonganti@0: logging.error("File,\n" + f"{accs_txt}\ndoes not exist " + "or is empty!") kkonganti@0: failed_sample_name = re.sub(accs_suffix, "", os.path.basename(accs_txt)) kkonganti@0: with open( kkonganti@0: os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w" kkonganti@0: ) as failed_sample_fh: kkonganti@0: failed_sample_fh.write(f"{failed_sample_name}\n") kkonganti@0: failed_sample_fh.close() kkonganti@0: exit(0) kkonganti@0: kkonganti@0: # ppp.pprint(mash_hits) kkonganti@0: empty_lines = 0 kkonganti@0: empty_lines_msg = "" kkonganti@0: kkonganti@0: with open(accs_txt, "r") as accs_txt_fh: kkonganti@0: for line in accs_txt_fh: kkonganti@0: if line in ["\n", "\n\r"]: kkonganti@0: empty_lines += 1 kkonganti@0: continue kkonganti@0: else: kkonganti@0: line = line.strip() kkonganti@0: kkonganti@0: if line in accs_seen.keys(): kkonganti@0: continue kkonganti@0: else: kkonganti@0: accs_seen[line] = 1 kkonganti@0: accs_txt_fh.close() kkonganti@0: kkonganti@0: if genomes_dir: kkonganti@0: if not os.path.isdir(genomes_dir): kkonganti@0: logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!") kkonganti@0: exit(1) kkonganti@0: if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0: kkonganti@0: logging.error( kkonganti@0: "Genomes directory" kkonganti@0: + f"{genomes_dir}" kkonganti@0: + "\ndoes not seem to have any\n" kkonganti@0: + f"files ending with suffix: {genomes_dir_suffix}" kkonganti@0: ) kkonganti@0: exit(1) kkonganti@0: kkonganti@0: with open(cat_genomes_gz, "wb") as genomes_out_gz: kkonganti@0: for line in accs_seen.keys(): kkonganti@0: genome_file = os.path.join(genomes_dir, line + genomes_dir_suffix) kkonganti@0: kkonganti@0: if not os.path.exists(genome_file) or os.path.getsize(genome_file) <= 0: kkonganti@0: logging.error( kkonganti@0: f"Genome file {os.path.basename(genome_file)} does not\n" kkonganti@0: + "exits or is empty!" kkonganti@0: ) kkonganti@0: exit(1) kkonganti@0: else: kkonganti@0: with open(genome_file, "rb") as genome_file_h: kkonganti@0: genomes_out_gz.writelines(genome_file_h.readlines()) kkonganti@0: genome_file_h.close() kkonganti@0: genomes_out_gz.close() kkonganti@0: kkonganti@0: if ( kkonganti@0: len(accs_seen.keys()) > 0 kkonganti@0: and os.path.exists(frags_gz) kkonganti@0: and os.path.getsize(frags_gz) > 0 kkonganti@0: ): kkonganti@0: with gzip.open( kkonganti@0: cat_reads_gz, "wt", encoding="utf-8", compresslevel=6 kkonganti@0: ) as cat_reads_gz_fh: kkonganti@0: with gzip.open(frags_gz, "rb", compresslevel=6) as fragz_gz_fh: kkonganti@0: fasta_id = 7 if id_is_query else 6 kkonganti@0: for frag_line in fragz_gz_fh: kkonganti@0: frag_lines = frag_line.decode("utf-8").strip().split(frag_delim) kkonganti@0: # Per KMA specification, 6=template, 7=query, 1=read kkonganti@0: cat_reads_gz_fh.write(f">{frag_lines[fasta_id]}\n{frag_lines[0]}\n") kkonganti@0: fragz_gz_fh.close() kkonganti@0: cat_reads_gz_fh.close() kkonganti@0: kkonganti@0: if empty_lines > 0: kkonganti@0: empty_lines_msg = f"Skipped {empty_lines} empty line(s).\n" kkonganti@0: kkonganti@0: logging.info( kkonganti@0: empty_lines_msg kkonganti@0: + f"File {os.path.basename(cat_genomes_gz)}\n" kkonganti@0: + f"written in:\n{os.getcwd()}\nDone! Bye!" kkonganti@0: ) kkonganti@0: kkonganti@0: kkonganti@0: if __name__ == "__main__": kkonganti@0: main()