Mercurial > repos > galaxytrakr > hfp_nowayout_awsbatch
comparison 0.5.0/bin/gen_otf_genome.py @ 0:3c767f9cfd88 draft default tip
planemo upload
| author | galaxytrakr |
|---|---|
| date | Fri, 29 May 2026 13:37:56 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:3c767f9cfd88 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 # Kranti Konganti | |
| 4 | |
| 5 import argparse | |
| 6 import glob | |
| 7 import gzip | |
| 8 import inspect | |
| 9 import logging | |
| 10 import os | |
| 11 import pprint | |
| 12 import re | |
| 13 | |
| 14 # Set logging. | |
| 15 logging.basicConfig( | |
| 16 format="\n" | |
| 17 + "=" * 55 | |
| 18 + "\n%(asctime)s - %(levelname)s\n" | |
| 19 + "=" * 55 | |
| 20 + "\n%(message)s\n\n", | |
| 21 level=logging.DEBUG, | |
| 22 ) | |
| 23 | |
| 24 # Debug print. | |
| 25 ppp = pprint.PrettyPrinter(width=50, indent=4) | |
| 26 | |
| 27 | |
| 28 # Multiple inheritence for pretty printing of help text. | |
| 29 class MultiArgFormatClasses( | |
| 30 argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter | |
| 31 ): | |
| 32 pass | |
| 33 | |
| 34 | |
| 35 def main() -> None: | |
| 36 """ | |
| 37 This script works only in the context of a Nextflow workflow. | |
| 38 It takes: | |
| 39 1. A text file containing accessions or FASTA IDs, one per line and | |
| 40 then, | |
| 41 2. Optionally, searches for a genome FASTA file in gzipped format in specified | |
| 42 search path, where the prefix of the filename is the accession or | |
| 43 FASTA ID from 1. and then, creates a new concatenated gzipped genome FASTA | |
| 44 file with all the genomes in the text file from 1. | |
| 45 3. Creates a new FASTQ file with reads aligned to the accessions in the text | |
| 46 file from 1. | |
| 47 """ | |
| 48 | |
| 49 prog_name = os.path.basename(inspect.stack()[0].filename) | |
| 50 | |
| 51 parser = argparse.ArgumentParser( | |
| 52 prog=prog_name, description=main.__doc__, formatter_class=MultiArgFormatClasses | |
| 53 ) | |
| 54 | |
| 55 required = parser.add_argument_group("required arguments") | |
| 56 | |
| 57 required.add_argument( | |
| 58 "-txt", | |
| 59 dest="accs_txt", | |
| 60 default=False, | |
| 61 required=True, | |
| 62 help="Absolute UNIX path to .txt file containing accessions\n" | |
| 63 + "FASTA IDs, one per line.", | |
| 64 ) | |
| 65 required.add_argument( | |
| 66 "-op", | |
| 67 dest="out_prefix", | |
| 68 default="CATTED_GENOMES", | |
| 69 required=True, | |
| 70 help="Set the output file prefix for .fna.gz and .txt\n" + "files.", | |
| 71 ) | |
| 72 parser.add_argument( | |
| 73 "-gd", | |
| 74 dest="genomes_dir", | |
| 75 default=False, | |
| 76 required=False, | |
| 77 help="Absolute UNIX path to a directory containing\n" | |
| 78 + "gzipped genome FASTA files or a file.\n", | |
| 79 ) | |
| 80 parser.add_argument( | |
| 81 "-gds", | |
| 82 dest="genomes_dir_suffix", | |
| 83 default="_scaffolded_genomic.fna.gz", | |
| 84 required=False, | |
| 85 help="Genome FASTA file suffix to search for\nin the directory mentioned using\n-gd.", | |
| 86 ) | |
| 87 parser.add_argument( | |
| 88 "-query", | |
| 89 dest="id_is_query", | |
| 90 default=False, | |
| 91 action="store_true", | |
| 92 required=False, | |
| 93 help="In the produced FASTQ file, should the FASTA ID should be of KMA query ID\n" | |
| 94 + "or template ID.", | |
| 95 ) | |
| 96 parser.add_argument( | |
| 97 "-txts", | |
| 98 dest="accs_suffix", | |
| 99 default="_template_hits.txt", | |
| 100 required=False, | |
| 101 help="The suffix of the file supplied with -txt option. It is assumed that the\n" | |
| 102 + "sample name is present in the file supplied with -txt option and the suffix\n" | |
| 103 + "will be stripped and stored in a file that logs samples which have no hits.", | |
| 104 ) | |
| 105 parser.add_argument( | |
| 106 "-frag_delim", | |
| 107 dest="frag_delim", | |
| 108 default="\t", | |
| 109 required=False, | |
| 110 help="The delimitor by which the fields are separated in *_frag.gz file.", | |
| 111 ) | |
| 112 | |
| 113 args = parser.parse_args() | |
| 114 accs_txt = args.accs_txt | |
| 115 genomes_dir = args.genomes_dir | |
| 116 genomes_dir_suffix = args.genomes_dir_suffix | |
| 117 id_is_query = args.id_is_query | |
| 118 out_prefix = args.out_prefix | |
| 119 accs_suffix = args.accs_suffix | |
| 120 frag_delim = args.frag_delim | |
| 121 accs_seen = dict() | |
| 122 cat_genomes_gz = os.path.join(os.getcwd(), out_prefix + "_" + genomes_dir_suffix) | |
| 123 cat_genomes_gz = re.sub("__", "_", str(cat_genomes_gz)) | |
| 124 frags_gz = os.path.join(os.getcwd(), out_prefix + ".frag.gz") | |
| 125 cat_reads_gz = os.path.join(os.getcwd(), out_prefix + "_aln_reads.fna.gz") | |
| 126 cat_reads_gz = re.sub("__", "_", cat_reads_gz) | |
| 127 | |
| 128 if ( | |
| 129 accs_txt | |
| 130 and os.path.exists(cat_genomes_gz) | |
| 131 and os.path.getsize(cat_genomes_gz) > 0 | |
| 132 ): | |
| 133 logging.error( | |
| 134 "A concatenated genome FASTA file,\n" | |
| 135 + f"{os.path.basename(cat_genomes_gz)} already exists in:\n" | |
| 136 + f"{os.getcwd()}\n" | |
| 137 + "Please remove or move it as we will not " | |
| 138 + "overwrite it." | |
| 139 ) | |
| 140 exit(1) | |
| 141 | |
| 142 if accs_txt and (not os.path.exists(accs_txt) or not os.path.getsize(accs_txt) > 0): | |
| 143 logging.error("File,\n" + f"{accs_txt}\ndoes not exist " + "or is empty!") | |
| 144 failed_sample_name = re.sub(accs_suffix, "", os.path.basename(accs_txt)) | |
| 145 with open( | |
| 146 os.path.join(os.getcwd(), "_".join([out_prefix, "FAILED.txt"])), "w" | |
| 147 ) as failed_sample_fh: | |
| 148 failed_sample_fh.write(f"{failed_sample_name}\n") | |
| 149 failed_sample_fh.close() | |
| 150 exit(0) | |
| 151 | |
| 152 # ppp.pprint(mash_hits) | |
| 153 empty_lines = 0 | |
| 154 empty_lines_msg = "" | |
| 155 | |
| 156 with open(accs_txt, "r") as accs_txt_fh: | |
| 157 for line in accs_txt_fh: | |
| 158 if line in ["\n", "\n\r"]: | |
| 159 empty_lines += 1 | |
| 160 continue | |
| 161 else: | |
| 162 line = line.strip() | |
| 163 | |
| 164 if line in accs_seen.keys(): | |
| 165 continue | |
| 166 else: | |
| 167 accs_seen[line] = 1 | |
| 168 accs_txt_fh.close() | |
| 169 | |
| 170 if genomes_dir: | |
| 171 if not os.path.isdir(genomes_dir): | |
| 172 logging.error("UNIX path\n" + f"{genomes_dir}\n" + "does not exist!") | |
| 173 exit(1) | |
| 174 if len(glob.glob(os.path.join(genomes_dir, "*" + genomes_dir_suffix))) <= 0: | |
| 175 logging.error( | |
| 176 "Genomes directory" | |
| 177 + f"{genomes_dir}" | |
| 178 + "\ndoes not seem to have any\n" | |
| 179 + f"files ending with suffix: {genomes_dir_suffix}" | |
| 180 ) | |
| 181 exit(1) | |
| 182 | |
| 183 with open(cat_genomes_gz, "wb") as genomes_out_gz: | |
| 184 for line in accs_seen.keys(): | |
| 185 genome_file = os.path.join(genomes_dir, line + genomes_dir_suffix) | |
| 186 | |
| 187 if not os.path.exists(genome_file) or os.path.getsize(genome_file) <= 0: | |
| 188 logging.error( | |
| 189 f"Genome file {os.path.basename(genome_file)} does not\n" | |
| 190 + "exits or is empty!" | |
| 191 ) | |
| 192 exit(1) | |
| 193 else: | |
| 194 with open(genome_file, "rb") as genome_file_h: | |
| 195 genomes_out_gz.writelines(genome_file_h.readlines()) | |
| 196 genome_file_h.close() | |
| 197 genomes_out_gz.close() | |
| 198 | |
| 199 if ( | |
| 200 len(accs_seen.keys()) > 0 | |
| 201 and os.path.exists(frags_gz) | |
| 202 and os.path.getsize(frags_gz) > 0 | |
| 203 ): | |
| 204 with gzip.open( | |
| 205 cat_reads_gz, "wt", encoding="utf-8", compresslevel=6 | |
| 206 ) as cat_reads_gz_fh: | |
| 207 with gzip.open(frags_gz, "rb", compresslevel=6) as fragz_gz_fh: | |
| 208 fasta_id = 7 if id_is_query else 6 | |
| 209 for frag_line in fragz_gz_fh: | |
| 210 frag_lines = frag_line.decode("utf-8").strip().split(frag_delim) | |
| 211 # Per KMA specification, 6=template, 7=query, 1=read | |
| 212 cat_reads_gz_fh.write(f">{frag_lines[fasta_id]}\n{frag_lines[0]}\n") | |
| 213 fragz_gz_fh.close() | |
| 214 cat_reads_gz_fh.close() | |
| 215 | |
| 216 if empty_lines > 0: | |
| 217 empty_lines_msg = f"Skipped {empty_lines} empty line(s).\n" | |
| 218 | |
| 219 logging.info( | |
| 220 empty_lines_msg | |
| 221 + f"File {os.path.basename(cat_genomes_gz)}\n" | |
| 222 + f"written in:\n{os.getcwd()}\nDone! Bye!" | |
| 223 ) | |
| 224 | |
| 225 | |
| 226 if __name__ == "__main__": | |
| 227 main() |
