annotate 0.6.1/bin/gen_otf_genome.py @ 15:1972677994a6

"planemo upload"
author kkonganti
date Thu, 07 Sep 2023 10:27:12 -0400
parents 749faef1caa9
children
rev   line source
kkonganti@11 1 #!/usr/bin/env python3
kkonganti@11 2
kkonganti@11 3 # Kranti Konganti
kkonganti@11 4
kkonganti@11 5 import argparse
kkonganti@11 6 import glob
kkonganti@11 7 import gzip
kkonganti@11 8 import inspect
kkonganti@11 9 import logging
kkonganti@11 10 import os
kkonganti@11 11 import pprint
kkonganti@11 12 import re
kkonganti@11 13
kkonganti@11 14 # Set logging.
kkonganti@11 15 logging.basicConfig(
kkonganti@11 16 format="\n" + "=" * 55 + "\n%(asctime)s - %(levelname)s\n" + "=" * 55 + "\n%(message)s\n\n",
kkonganti@11 17 level=logging.DEBUG,
kkonganti@11 18 )
kkonganti@11 19
kkonganti@11 20 # Debug print.
kkonganti@11 21 ppp = pprint.PrettyPrinter(width=50, indent=4)
kkonganti@11 22
kkonganti@11 23 # Multiple inheritence for pretty printing of help text.
kkonganti@11 24 class MultiArgFormatClasses(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
kkonganti@11 25 pass
kkonganti@11 26
kkonganti@11 27
kkonganti@11 28 def main() -> None:
kkonganti@11 29 """
kkonganti@11 30 This script works only in the context of `bettercallsal` Nextflow workflow.
kkonganti@11 31 It takes:
kkonganti@11 32 1. A text file containing accessions or FASTA IDs, one per line and
kkonganti@11 33 then,
kkonganti@11 34 2. Searches for a genome FASTA file in gzipped format in specified
kkonganti@11 35 search path, where the prefix of the filename is the accession or
kkonganti@11 36 FASTA ID from 1. and then,
kkonganti@11 37 creates a new concatenated gzipped genome FASTA file with all the genomes
kkonganti@11 38 in the text file from 1.
kkonganti@11 39 """
kkonganti@11 40
kkonganti@11 41 prog_name = os.path.basename(inspect.stack()[0].filename)
kkonganti@11 42
kkonganti@11 43 parser = argparse.ArgumentParser(
kkonganti@11 44 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses
kkonganti@11 45 )
kkonganti@11 46
kkonganti@11 47 required = parser.add_argument_group("required arguments")
kkonganti@11 48
kkonganti@11 49 required.add_argument(
kkonganti@11 50 "-txt",
kkonganti@11 51 dest="accs_txt",
kkonganti@11 52 default=False,
kkonganti@11 53 required=True,
kkonganti@11 54 help="Absolute UNIX path to .txt file containing accessions\n" + "FASTA IDs, one per line.",
kkonganti@11 55 )
kkonganti@11 56 required.add_argument(
kkonganti@11 57 "-gd",
kkonganti@11 58 dest="genomes_dir",
kkonganti@11 59 default=False,
kkonganti@11 60 required=True,
kkonganti@11 61 help="Absolute UNIX path to a directory containing\n"
kkonganti@11 62 + "gzipped genome FASTA files.\n"
kkonganti@11 63 + "Required if -m is on.",
kkonganti@11 64 )
kkonganti@11 65 parser.add_argument(
kkonganti@11 66 "-gds",
kkonganti@11 67 dest="genomes_dir_suffix",
kkonganti@11 68 default="_scaffolded_genomic.fna.gz",
kkonganti@11 69 required=False,
kkonganti@11 70 help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.",
kkonganti@11 71 )
kkonganti@11 72 parser.add_argument(
kkonganti@11 73 "-op",
kkonganti@11 74 dest="out_prefix",
kkonganti@11 75 default="CATTED_GENOMES",
kkonganti@11 76 help="Set the output file prefix for .fna.gz and .txt\n" + "files.",
kkonganti@11 77 )
kkonganti@11 78 parser.add_argument(
kkonganti@11 79 "-txts",
kkonganti@11 80 dest="accs_suffix",
kkonganti@11 81 default="_template_hits.txt",
kkonganti@11 82 required=False,
kkonganti@11 83 help="The suffix of the file supplied with -txt option. It is assumed that the\n"
kkonganti@11 84 + "sample name is present in the file supplied with -txt option and the suffix\n"
kkonganti@11 85 + "will be stripped and stored in a file that logs samples which have no hits.",
kkonganti@11 86 )
kkonganti@11 87 parser.add_argument(
kkonganti@11 88 "-frag_delim",
kkonganti@11 89 dest="frag_delim",
kkonganti@11 90 default="\t",
kkonganti@11 91 required=False,
kkonganti@11 92 help="The delimitor by which the fields are separated in *_frag.gz file.",
kkonganti@11 93 )
kkonganti@11 94
kkonganti@11 95 args = parser.parse_args()
kkonganti@11 96 accs_txt = args.accs_txt
kkonganti@11 97 genomes_dir = args.genomes_dir
kkonganti@11 98 genomes_dir_suffix = args.genomes_dir_suffix
kkonganti@11 99 out_prefix = args.out_prefix
kkonganti@11 100 accs_suffix = args.accs_suffix
kkonganti@11 101 frag_delim = args.frag_delim
kkonganti@11 102 accs_seen = dict()
kkonganti@11 103 cat_genomes_gz = os.path.join(os.getcwd(), out_prefix + "_" + genomes_dir_suffix)
kkonganti@11 104 cat_genomes_gz = re.sub("__", "_", str(cat_genomes_gz))
kkonganti@11 105 frags_gz = os.path.join(os.getcwd(), out_prefix + ".frag.gz")
kkonganti@11 106 cat_reads_gz = os.path.join(os.getcwd(), out_prefix + "_aln_reads.fna.gz")
kkonganti@11 107 cat_reads_gz = re.sub("__", "_", cat_reads_gz)
kkonganti@11 108
kkonganti@11 109 if accs_txt and os.path.exists(cat_genomes_gz) and os.path.getsize(cat_genomes_gz) > 0:
kkonganti@11 110 logging.error(
kkonganti@11 111 "A concatenated genome FASTA file,\n"
kkonganti@11 112 + f"{os.path.basename(cat_genomes_gz)} already exists in:\n"
kkonganti@11 113 + f"{os.getcwd()}\n"
kkonganti@11 114 + "Please remove or move it as we will not "
kkonganti@11 115 + "overwrite it."
kkonganti@11 116 )
kkonganti@11 117 exit(1)
kkonganti@11 118
kkonganti@11 119 if accs_txt and (not os.path.exists(accs_txt) or not os.path.getsize(accs_txt) > 0):
kkonganti@11 120 logging.error("File,\n" + f"{accs_txt}\ndoes not exist " + "or is empty!")
kkonganti@11 121 failed_sample_name = re.sub(accs_suffix, "", os.path.basename(accs_txt))
kkonganti@11 122 with open(
kkonganti@11 123 os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w"
kkonganti@11 124 ) as failed_sample_fh:
kkonganti@11 125 failed_sample_fh.write(f"{failed_sample_name}\n")
kkonganti@11 126 failed_sample_fh.close()
kkonganti@11 127 exit(0)
kkonganti@11 128
kkonganti@11 129 if genomes_dir:
kkonganti@11 130 if not os.path.isdir(genomes_dir):
kkonganti@11 131 logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!")
kkonganti@11 132 exit(1)
kkonganti@11 133 if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0:
kkonganti@11 134 logging.error(
kkonganti@11 135 "Genomes directory"
kkonganti@11 136 + f"{genomes_dir}"
kkonganti@11 137 + "\ndoes not seem to have any\n"
kkonganti@11 138 + f"files ending with suffix: {genomes_dir_suffix}"
kkonganti@11 139 )
kkonganti@11 140 exit(1)
kkonganti@11 141
kkonganti@11 142 # ppp.pprint(mash_hits)
kkonganti@11 143 empty_lines = 0
kkonganti@11 144 empty_lines_msg = ""
kkonganti@11 145 with open(cat_genomes_gz, "wb") as genomes_out_gz:
kkonganti@11 146 with open(accs_txt, "r") as accs_txt_fh:
kkonganti@11 147 for line in accs_txt_fh:
kkonganti@11 148 if line in ["\n", "\n\r"]:
kkonganti@11 149 empty_lines += 1
kkonganti@11 150 continue
kkonganti@11 151 else:
kkonganti@11 152 line = line.strip()
kkonganti@11 153
kkonganti@11 154 if line in accs_seen.keys():
kkonganti@11 155 continue
kkonganti@11 156 else:
kkonganti@11 157 accs_seen[line] = 1
kkonganti@11 158
kkonganti@11 159 genome_file = os.path.join(genomes_dir, line + genomes_dir_suffix)
kkonganti@11 160
kkonganti@11 161 if not os.path.exists(genome_file) or os.path.getsize(genome_file) <= 0:
kkonganti@11 162 logging.error(
kkonganti@11 163 f"Genome file {os.path.basename(genome_file)} does not\n"
kkonganti@11 164 + "exits or is empty!"
kkonganti@11 165 )
kkonganti@11 166 exit(1)
kkonganti@11 167 else:
kkonganti@11 168 with open(genome_file, "rb") as genome_file_h:
kkonganti@11 169 genomes_out_gz.writelines(genome_file_h.readlines())
kkonganti@11 170 genome_file_h.close()
kkonganti@11 171 accs_txt_fh.close()
kkonganti@11 172 genomes_out_gz.close()
kkonganti@11 173
kkonganti@11 174 if len(accs_seen.keys()) > 0 and os.path.exists(frags_gz) and os.path.getsize(frags_gz) > 0:
kkonganti@11 175 with gzip.open(cat_reads_gz, "wt", encoding="utf-8") as cat_reads_gz_fh:
kkonganti@11 176 with gzip.open(frags_gz, "rb") as fragz_gz_fh:
kkonganti@11 177 for frag_line in fragz_gz_fh.readlines():
kkonganti@11 178 frag_lines = frag_line.decode("utf-8").strip().split(frag_delim)
kkonganti@11 179 # Per KMA specification, 6=template, 7=query, 1=read
kkonganti@11 180 cat_reads_gz_fh.write(f">{frag_lines[6]}\n{frag_lines[0]}\n")
kkonganti@11 181 fragz_gz_fh.close()
kkonganti@11 182 cat_reads_gz_fh.close()
kkonganti@11 183
kkonganti@11 184 if empty_lines > 0:
kkonganti@11 185 empty_lines_msg = f"Skipped {empty_lines} empty line(s).\n"
kkonganti@11 186
kkonganti@11 187 logging.info(
kkonganti@11 188 empty_lines_msg
kkonganti@11 189 + f"File {os.path.basename(cat_genomes_gz)}\n"
kkonganti@11 190 + f"written in:\n{os.getcwd()}\nDone! Bye!"
kkonganti@11 191 )
kkonganti@11 192 exit(0)
kkonganti@11 193
kkonganti@11 194
kkonganti@11 195 if __name__ == "__main__":
kkonganti@11 196 main()