annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/comparesketch.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 #!/bin/bash
jpayne@69 2
jpayne@69 3 usage(){
jpayne@69 4 echo "
jpayne@69 5 Written by Brian Bushnell
jpayne@69 6 Last modified Jan 7, 2020
jpayne@69 7
jpayne@69 8 Description: Compares query sketches to others, and prints their kmer identity.
jpayne@69 9 The input can be sketches made by sketch.sh, or fasta/fastq files.
jpayne@69 10 It's recommended to first sketch references with sketch.sh for large files,
jpayne@69 11 or when taxonomic information is desired.
jpayne@69 12
jpayne@69 13 Please read bbmap/docs/guides/BBSketchGuide.txt for more information.
jpayne@69 14
jpayne@69 15 Usage: comparesketch.sh in=<file,file,file...> ref=<file,file,file...>
jpayne@69 16 Alternative: comparesketch.sh in=<file,file,file...> file file file
jpayne@69 17 Alternative: comparesketch.sh in=<file,file,file...> *.sketch
jpayne@69 18 Alternative: comparesketch.sh alltoall *.sketch.gz
jpayne@69 19
jpayne@69 20 File parameters:
jpayne@69 21 in=<file,file...> Sketches or fasta files to compare.
jpayne@69 22 out=stdout Comparison output. Can be set to a file instead.
jpayne@69 23 outsketch=<file> Optionally write sketch files generated from the input.
jpayne@69 24 ref=<file,file...> List of sketches to compare against. Files given without
jpayne@69 25 a prefix (ref=) will be treated as references,
jpayne@69 26 so you can use *.sketch without ref=.
jpayne@69 27 You can also do ref=nt#.sketch to load all numbered files
jpayne@69 28 fitting that pattern.
jpayne@69 29 On NERSC, you can use these abbreviations (e.g. ref=nt):
jpayne@69 30 nt: nt sketches
jpayne@69 31 refseq: Refseq sketches
jpayne@69 32 silva: Silva sketches
jpayne@69 33 img: IMG sketches
jpayne@69 34 mito: RefSeq mitochondrial sketches
jpayne@69 35 fungi: RefSeq fungi sketches
jpayne@69 36 protein: RefSeq prokaryotic amino acid sketches
jpayne@69 37 Using an abbreviation automatically sets the blacklist,
jpayne@69 38 and k. If the reference is in amino space, the query
jpayne@69 39 also be in amino acid space with the flag amino added.
jpayne@69 40 If the query is in nucleotide space, use the flag
jpayne@69 41 'translate', but this will only work for prokaryotes.
jpayne@69 42
jpayne@69 43 Blacklist and Whitelist parameters:
jpayne@69 44 blacklist=<file> Ignore keys in this sketch file. Additionally, there are
jpayne@69 45 built-in blacklists that can be specified:
jpayne@69 46 nt: Blacklist for nt
jpayne@69 47 refseq: Blacklist for Refseq
jpayne@69 48 silva: Blacklist for Silva
jpayne@69 49 img: Blacklist for IMG
jpayne@69 50 whitelist=f Ignore keys that are not in the index. Requires index=t.
jpayne@69 51
jpayne@69 52 Sketch-making parameters:
jpayne@69 53 mode=perfile Possible modes, for sequence input:
jpayne@69 54 single: Generate one sketch.
jpayne@69 55 sequence: Generate one sketch per sequence.
jpayne@69 56 perfile: Generate one sketch per file.
jpayne@69 57 sketchonly=f Don't run comparisons, just write the output sketch file.
jpayne@69 58 k=31 Kmer length, 1-32. To maximize sensitivity and
jpayne@69 59 specificity, dual kmer lengths may be used: k=31,24
jpayne@69 60 Dual kmers are fastest if the shorter is a multiple
jpayne@69 61 of 4. Query and reference k must match.
jpayne@69 62 samplerate=1 Set to a lower value to sample a fraction of input reads.
jpayne@69 63 For raw reads (rather than an assembly), 1-3x coverage
jpayne@69 64 gives best results, by reducing error kmers. Somewhat
jpayne@69 65 higher is better for high-error-rate data like PacBio.
jpayne@69 66 minkeycount=1 Ignore kmers that occur fewer times than this. Values
jpayne@69 67 over 1 can be used with raw reads to avoid error kmers.
jpayne@69 68 minprob=0.0001 Ignore kmers below this probability of correctness.
jpayne@69 69 minqual=0 Ignore kmers spanning bases below this quality.
jpayne@69 70 entropy=0.66 Ignore sequence with entropy below this value.
jpayne@69 71 merge=f Merge paired reads prior to sketching.
jpayne@69 72 amino=f Use amino acid mode. Input should be amino acids.
jpayne@69 73 translate=f Call genes and translate to proteins. Input should be
jpayne@69 74 nucleotides. Designed for prokaryotes.
jpayne@69 75 sixframes=f Translate all 6 frames instead of predicting genes.
jpayne@69 76 ssu=t Scan for and retain full-length SSU sequence.
jpayne@69 77 printssusequence=f Print the query SSU sequence (JSON mode only).
jpayne@69 78
jpayne@69 79 Size parameters:
jpayne@69 80 size=10000 Desired size of sketches (if not using autosize).
jpayne@69 81 mgf=0.01 (maxfraction) Max fraction of genomic kmers to use.
jpayne@69 82 minsize=100 Do not generate sketches for genomes smaller than this.
jpayne@69 83 autosize=t Use flexible sizing instead of fixed-length. This is
jpayne@69 84 nonlinear; a human sketch is only ~6x a bacterial sketch.
jpayne@69 85 sizemult=1 Multiply the autosized size of sketches by this factor.
jpayne@69 86 Normally a bacterial-size genome will get a sketch size
jpayne@69 87 of around 10000; if autosizefactor=2, it would be ~20000.
jpayne@69 88 density= If this flag is set (to a number between 0 and 1),
jpayne@69 89 autosize and sizemult are ignored, and this fraction of
jpayne@69 90 genomic kmers are used. For example, at density=0.001,
jpayne@69 91 a 4.5Mbp bacteria will get a 4500-kmer sketch.
jpayne@69 92 sketchheapfactor=4 If minkeycount>1, temporarily track this many kmers until
jpayne@69 93 counts are known and low-count kmers are discarded.
jpayne@69 94
jpayne@69 95 Sketch comparing parameters:
jpayne@69 96 threads=auto Use this many threads for comparison.
jpayne@69 97 index=auto Index the sketches for much faster searching.
jpayne@69 98 Requires more memory and adds startup time.
jpayne@69 99 Recommended true for many query sketches, false for few.
jpayne@69 100 prealloc=f Preallocate the index for greater efficiency.
jpayne@69 101 Can be set to a number between 0 and 1 to determine how
jpayne@69 102 much of total memory should be used.
jpayne@69 103 alltoall (ata) Compare all refs to all. Must be sketches.
jpayne@69 104 compareself=f In all-to-all mode, compare a sketch to itself.
jpayne@69 105
jpayne@69 106 Taxonomy-related parameters:
jpayne@69 107 tree=<file> Specify a TaxTree file. On Genepool, use tree=auto.
jpayne@69 108 Only necessary for use with printtaxa and level.
jpayne@69 109 Assumes comparisons are done against reference sketches
jpayne@69 110 with known taxonomy information.
jpayne@69 111 level=2 Only report the best record per taxa at this level.
jpayne@69 112 Either level names or numbers may be used.
jpayne@69 113 0: disabled
jpayne@69 114 1: subspecies
jpayne@69 115 2: species
jpayne@69 116 3: genus
jpayne@69 117 ...etc
jpayne@69 118 include= Restrict output to organisms in these clades.
jpayne@69 119 May be a comma-delimited list of names or NCBI TaxIDs.
jpayne@69 120 includelevel=0 Promote the include list to this taxonomic level.
jpayne@69 121 For example, include=h.sapiens includelevel=phylum
jpayne@69 122 would only include organisms in the same phylum as human.
jpayne@69 123 includestring= Only report records whose name contains this string.
jpayne@69 124 exclude= Ignore organisms in these clades.
jpayne@69 125 May be a comma-delimited list of names or NCBI TaxIDs.
jpayne@69 126 excludelevel=0 Promote the exclude list to this taxonomic level.
jpayne@69 127 For example, exclude=h.sapiens excludelevel=phylum
jpayne@69 128 would exclude all organisms in the same phylum as human.
jpayne@69 129 excludestring= Do not records whose name contains this string.
jpayne@69 130 minlevel= Use this to restrict comparisons to distantly-related
jpayne@69 131 organisms. Intended for finding misclassified organisms
jpayne@69 132 using all-to-all comparisons. minlevel=order would only
jpayne@69 133 report hits between organisms related at the order level
jpayne@69 134 or higher, not between same species or genus.
jpayne@69 135 banunclassified=f Ignore organisms descending from nodes like
jpayne@69 136 'unclassified Bacteria'
jpayne@69 137 banvirus=f Ignore viruses.
jpayne@69 138 requiressu=f Ignore records without SSUs.
jpayne@69 139 minrefsize=0 Ignore ref sketches smaller than this (unique kmers).
jpayne@69 140 minrefsizebases=0 Ignore ref sketches smaller than this (total base pairs).
jpayne@69 141
jpayne@69 142 Output format:
jpayne@69 143 format=2 2: Default format with, per query, one query header line;
jpayne@69 144 one column header line; and one reference line per hit.
jpayne@69 145 3: One line per hit, with columns query, reference, ANI,
jpayne@69 146 and sizeRatio. Useful for all-to-all comparisons.
jpayne@69 147 4: JSON (format=json also works).
jpayne@69 148 5: Constellation (format=constellation also works).
jpayne@69 149 usetaxidname=f For format 3, print the taxID in the name column.
jpayne@69 150 usetaxname for format 3, print the taxonomic name in the name column.
jpayne@69 151 useimgname For format 3, print the img ID in the name column.
jpayne@69 152
jpayne@69 153 Output columns (for format=2):
jpayne@69 154 printall=f Enable all output columns.
jpayne@69 155 printani=t (ani) Print average nucleotide identity estimate.
jpayne@69 156 completeness=t Genome completeness estimate.
jpayne@69 157 score=f Score (used for sorting the output).
jpayne@69 158 printmatches=t Number of kmer matches to reference.
jpayne@69 159 printlength=f Number of kmers compared.
jpayne@69 160 printtaxid=t NCBI taxID.
jpayne@69 161 printimg=f IMG identifier (only for IMG data).
jpayne@69 162 printgbases=f Number of genomic bases.
jpayne@69 163 printgkmers=f Number of genomic kmers.
jpayne@69 164 printgsize=t Estimated number of unique genomic kmers.
jpayne@69 165 printgseqs=t Number of sequences (scaffolds/reads).
jpayne@69 166 printtaxname=t Name associated with this taxID.
jpayne@69 167 printname0=f (pn0) Original seqeuence name.
jpayne@69 168 printfname=t Query filename.
jpayne@69 169 printtaxa=f Full taxonomy of each record.
jpayne@69 170 printcontam=t Print contamination estimate, and factor contaminant kmers
jpayne@69 171 into calculations. Kmers are considered contaminant if
jpayne@69 172 present in some ref sketch but not the current one.
jpayne@69 173 printunique=t Number of matches unique to this reference.
jpayne@69 174 printunique2=f Number of matches unique to this reference's taxa.
jpayne@69 175 printunique3=f Number of query kmers unique to this reference's taxa,
jpayne@69 176 regardless of whether they are in this reference sketch.
jpayne@69 177 printnohit=f Number of kmers that don't hit anything.
jpayne@69 178 printrefhits=f Average number of ref sketches hit by shared kmers.
jpayne@69 179 printgc=f GC content.
jpayne@69 180 printucontam=f Contam hits that hit exactly one reference sketch.
jpayne@69 181 printcontam2=f Print contamination estimate using only kmer hits
jpayne@69 182 to unrelated taxa.
jpayne@69 183 contamlevel=species Taxonomic level to use for contam2/unique2/unique3.
jpayne@69 184 NOTE: unique2/unique3/contam2/refhits require an index.
jpayne@69 185
jpayne@69 186 printdepth=f (depth) Print average depth of sketch kmers; intended
jpayne@69 187 for shotgun read input.
jpayne@69 188 printdepth2=f (depth2) Print depth compensating for genomic repeats.
jpayne@69 189 Requires reference sketches to be generated with depth.
jpayne@69 190 actualdepth=t If this is false, the raw average count is printed.
jpayne@69 191 If true, the raw average (observed depth) is converted
jpayne@69 192 to estimated actual depth (including uncovered areas).
jpayne@69 193 printvolume=f (volume) Product of average depth and matches.
jpayne@69 194 printca=f Print common ancestor, if query taxID is known.
jpayne@69 195 printcal=f Print common ancestor tax level, if query taxID is known.
jpayne@69 196 recordsperlevel=0 If query TaxID is known, and this is positive, print this
jpayne@69 197 many records per common ancestor level.
jpayne@69 198
jpayne@69 199 Sorting:
jpayne@69 200 sortbyscore=t Default sort order is by score, a composite metric.
jpayne@69 201 sortbydepth=f Include depth as a factor in sort order.
jpayne@69 202 sortbydepth2=f Include depth2 as a factor in sort order.
jpayne@69 203 sortbyvolume=f Include volume as a factor in sort order.
jpayne@69 204 sortbykid=f Sort strictly by KID.
jpayne@69 205 sortbyani=f Sort strictly by ANI/AAI/WKID.
jpayne@69 206 sortbyhits=f Sort strictly by the number of kmer hits.
jpayne@69 207
jpayne@69 208 Other output parameters:
jpayne@69 209 minhits=3 (hits) Only report records with at least this many hits.
jpayne@69 210 minani=0 (ani) Only report records with at least this ANI (0-1).
jpayne@69 211 minwkid=0.0001 (wkid) Only report records with at least this WKID (0-1).
jpayne@69 212 anifromwkid=t Calculate ani from wkid. If false, use kid.
jpayne@69 213 minbases=0 Ignore ref sketches of sequences shortert than this.
jpayne@69 214 minsizeratio=0 Don't compare sketches if the smaller genome is less than
jpayne@69 215 this fraction of the size of the larger.
jpayne@69 216 records=20 Report at most this many best-matching records.
jpayne@69 217 color=family Color records at the family level. color=f will disable.
jpayne@69 218 Colors work in most terminals but may cause odd characters
jpayne@69 219 to appear in text editors. So, color defaults to f if
jpayne@69 220 writing to a file. Requires the taxtree to be loaded.
jpayne@69 221 intersect=f Print sketch intersections. delta=f is suggested.
jpayne@69 222
jpayne@69 223 Metadata flags (optional, for the query sketch header):
jpayne@69 224 taxid=-1 Set the NCBI taxid.
jpayne@69 225 imgid=-1 Set the IMG id.
jpayne@69 226 spid=-1 Set the JGI sequencing project id.
jpayne@69 227 name= Set the name (taxname).
jpayne@69 228 name0= Set name0 (normally the first sequence header).
jpayne@69 229 fname= Set fname (normally the file name).
jpayne@69 230 meta_= Set an arbitrary metadata field.
jpayne@69 231 For example, meta_Month=March.
jpayne@69 232
jpayne@69 233 Other parameters:
jpayne@69 234 requiredmeta= (rmeta) Required optional metadata values. For example:
jpayne@69 235 rmeta=subunit:ssu,source:silva
jpayne@69 236 bannedmeta= (bmeta) Forbidden optional metadata values.
jpayne@69 237
jpayne@69 238
jpayne@69 239 Java Parameters:
jpayne@69 240 -Xmx This will set Java's memory usage, overriding autodetection.
jpayne@69 241 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
jpayne@69 242 The max is typically 85% of physical memory.
jpayne@69 243 -eoom This flag will cause the process to exit if an
jpayne@69 244 out-of-memory exception occurs. Requires Java 8u92+.
jpayne@69 245 -da Disable assertions.
jpayne@69 246
jpayne@69 247 For more detailed information, please read /bbmap/docs/guides/BBSketchGuide.txt.
jpayne@69 248 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
jpayne@69 249 "
jpayne@69 250 }
jpayne@69 251
jpayne@69 252 #This block allows symlinked shellscripts to correctly set classpath.
jpayne@69 253 pushd . > /dev/null
jpayne@69 254 DIR="${BASH_SOURCE[0]}"
jpayne@69 255 while [ -h "$DIR" ]; do
jpayne@69 256 cd "$(dirname "$DIR")"
jpayne@69 257 DIR="$(readlink "$(basename "$DIR")")"
jpayne@69 258 done
jpayne@69 259 cd "$(dirname "$DIR")"
jpayne@69 260 DIR="$(pwd)/"
jpayne@69 261 popd > /dev/null
jpayne@69 262
jpayne@69 263 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
jpayne@69 264 CP="$DIR""current/"
jpayne@69 265
jpayne@69 266 z="-Xmx4g"
jpayne@69 267 z2="-Xms4g"
jpayne@69 268 set=0
jpayne@69 269
jpayne@69 270 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
jpayne@69 271 usage
jpayne@69 272 exit
jpayne@69 273 fi
jpayne@69 274
jpayne@69 275 calcXmx () {
jpayne@69 276 source "$DIR""/calcmem.sh"
jpayne@69 277 setEnvironment
jpayne@69 278 parseXmx "$@"
jpayne@69 279 if [[ $set == 1 ]]; then
jpayne@69 280 return
jpayne@69 281 fi
jpayne@69 282 freeRam 3200m 84
jpayne@69 283 z="-Xmx${RAM}m"
jpayne@69 284 z2="-Xms${RAM}m"
jpayne@69 285 }
jpayne@69 286 calcXmx "$@"
jpayne@69 287
jpayne@69 288 comparesketch() {
jpayne@69 289 local CMD="java $EA $EOOM $z $z2 -cp $CP sketch.CompareSketch $@"
jpayne@69 290 # echo $CMD >&2
jpayne@69 291 eval $CMD
jpayne@69 292 }
jpayne@69 293
jpayne@69 294 comparesketch "$@"