annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/sendsketch.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 #!/bin/bash
jpayne@69 2
jpayne@69 3 usage(){
jpayne@69 4 echo "
jpayne@69 5 Written by Brian Bushnell
jpayne@69 6 Last modified December 19, 2019
jpayne@69 7
jpayne@69 8 Description: Compares query sketches to reference sketches hosted on a
jpayne@69 9 remote server via the Internet. The input can be sketches made by sketch.sh,
jpayne@69 10 or fasta/fastq files from which SendSketch will generate sketches.
jpayne@69 11 Only sketches will sent, not sequences.
jpayne@69 12
jpayne@69 13 Please read bbmap/docs/guides/BBSketchGuide.txt for more information.
jpayne@69 14
jpayne@69 15 Usage:
jpayne@69 16 sendsketch.sh in=file
jpayne@69 17
jpayne@69 18 To change nucleotide servers, add the server name, e.g.:
jpayne@69 19 sendsketch.sh in=file nt
jpayne@69 20
jpayne@69 21 For the protein server with nucleotide input:
jpayne@69 22 sendsketch.sh in=file protein
jpayne@69 23
jpayne@69 24 for the protein server with amino input:
jpayne@69 25 sendsketch.sh in=file amino protein
jpayne@69 26
jpayne@69 27
jpayne@69 28 Standard parameters:
jpayne@69 29 in=<file> Sketch or fasta file to compare.
jpayne@69 30 out=stdout Comparison output. Can be set to a file instead.
jpayne@69 31 outsketch= Optional, to write the sketch to a file.
jpayne@69 32 local=f For local files, have the server load the sketches.
jpayne@69 33 Allows use of whitelists; recommended for Silva.
jpayne@69 34 Local can only be used when the client and server access
jpayne@69 35 the same filesystem - e.g., Genepool and Cori.
jpayne@69 36 address= Address of remote server. Default address:
jpayne@69 37 https://refseq-sketch.jgi-psf.org/sketch
jpayne@69 38 You can also specify these abbreviations:
jpayne@69 39 nt: nt server
jpayne@69 40 refseq: Refseq server
jpayne@69 41 silva: Silva server
jpayne@69 42 protein: RefSeq prokaryotic amino acid sketches
jpayne@69 43 img: IMG server (Not Yet Available)
jpayne@69 44 mito: RefSeq mitochondrial server (NYA)
jpayne@69 45 fungi: RefSeq fungi sketches (NYA)
jpayne@69 46 Using an abbreviation automatically sets the address,
jpayne@69 47 the blacklist, and k.
jpayne@69 48 aws=f Set aws=t to use the aws servers instead of NERSC.
jpayne@69 49 When, for example, NERSC (or the whole SF Bay area) is down.
jpayne@69 50
jpayne@69 51 Sketch-making parameters:
jpayne@69 52 mode=single Possible modes, for fasta input:
jpayne@69 53 single: Generate one sketch per file.
jpayne@69 54 sequence: Generate one sketch per sequence.
jpayne@69 55 k=31 Kmer length, 1-32. This is automatic and does not need to
jpayne@69 56 be set for JGI servers, only for locally-hosted servers.
jpayne@69 57 samplerate=1 Set to a lower value to sample a fraction of input reads.
jpayne@69 58 For raw reads (rather than an assembly), 1-3x coverage
jpayne@69 59 gives best results, by reducing error kmers. Somewhat
jpayne@69 60 higher is better for high-error-rate data like PacBio.
jpayne@69 61 minkeycount=1 Ignore kmers that occur fewer times than this. Values
jpayne@69 62 over 1 can be used with raw reads to avoid error kmers.
jpayne@69 63 minprob=0.0001 Ignore kmers below this probability of correctness.
jpayne@69 64 minqual=0 Ignore kmers spanning bases below this quality.
jpayne@69 65 entropy=0.66 Ignore sequence with entropy below this value.
jpayne@69 66 merge=f Merge paired reads prior to sketching.
jpayne@69 67 amino=f Use amino acid mode. Input should be amino acids.
jpayne@69 68 translate=f Call genes and translate to proteins. Input should be
jpayne@69 69 nucleotides. Designed for prokaryotes.
jpayne@69 70 sixframes=f Translate all 6 frames instead of predicting genes.
jpayne@69 71 ssu=t Scan for and retain full-length SSU sequence.
jpayne@69 72 printssusequence=f Print the query SSU sequence (JSON mode only).
jpayne@69 73 refid= Instead of a query file, specify a reference sketch by name
jpayne@69 74 or taxid; e.g. refid=h.sapiens or refid=9606.
jpayne@69 75
jpayne@69 76 Size parameters:
jpayne@69 77 size=10000 Desired size of sketches (if not using autosize).
jpayne@69 78 mgf=0.01 (maxfraction) Max fraction of genomic kmers to use.
jpayne@69 79 minsize=100 Do not generate sketches for genomes smaller than this.
jpayne@69 80 autosize=t Use flexible sizing instead of fixed-length. This is
jpayne@69 81 nonlinear; a human sketch is only ~6x a bacterial sketch.
jpayne@69 82 sizemult=1 Multiply the autosized size of sketches by this factor.
jpayne@69 83 Normally a bacterial-size genome will get a sketch size
jpayne@69 84 of around 10000; if autosizefactor=2, it would be ~20000.
jpayne@69 85 density= If this flag is set (to a number between 0 and 1),
jpayne@69 86 autosize and sizemult are ignored, and this fraction of
jpayne@69 87 genomic kmers are used. For example, at density=0.001,
jpayne@69 88 a 4.5Mbp bacteria will get a 4500-kmer sketch.
jpayne@69 89 sketchheapfactor=4 If minkeycount>1, temporarily track this many kmers until
jpayne@69 90 counts are known and low-count kmers are discarded.
jpayne@69 91
jpayne@69 92 Taxonomy and filtering parameters:
jpayne@69 93 level=2 Only report the best record per taxa at this level.
jpayne@69 94 Either level names or numbers may be used.
jpayne@69 95 0: disabled
jpayne@69 96 1: subspecies
jpayne@69 97 2: species
jpayne@69 98 3: genus
jpayne@69 99 ...etc
jpayne@69 100 include= Restrict output to organisms in these clades.
jpayne@69 101 May be a comma-delimited list of names or NCBI TaxIDs.
jpayne@69 102 includelevel=0 Promote the include list to this taxonomic level.
jpayne@69 103 For example, include=h.sapiens includelevel=phylum
jpayne@69 104 would only include organisms in the same phylum as human.
jpayne@69 105 includestring= Only report records whose name contains this string.
jpayne@69 106 exclude= Ignore organisms in these clades.
jpayne@69 107 May be a comma-delimited list of names or NCBI TaxIDs.
jpayne@69 108 excludelevel=0 Promote the exclude list to this taxonomic level.
jpayne@69 109 For example, exclude=h.sapiens excludelevel=phylum
jpayne@69 110 would exclude all organisms in the same phylum as human.
jpayne@69 111 excludestring= Do not records whose name contains this string.
jpayne@69 112 banunclassified=f Ignore organisms descending from nodes like
jpayne@69 113 'unclassified Bacteria'
jpayne@69 114 banvirus=f Ignore viruses.
jpayne@69 115 requiressu=f Ignore records without SSUs.
jpayne@69 116 minrefsize=0 Ignore ref sketches smaller than this (unique kmers).
jpayne@69 117 minrefsizebases=0 Ignore ref sketches smaller than this (total base pairs).
jpayne@69 118
jpayne@69 119 Output format:
jpayne@69 120 format=2 2: Default format with, per query, one query header line;
jpayne@69 121 one column header line; and one reference line per hit.
jpayne@69 122 3: One line per hit, with columns query, reference, ANI,
jpayne@69 123 and sizeRatio.
jpayne@69 124 4: JSON (format=json also works).
jpayne@69 125 5: Constellation (format=constellation also works).
jpayne@69 126 usetaxidname=f For format 3, print the taxID in the name column.
jpayne@69 127 usetaxname for format 3, print the taxonomic name in the name column.
jpayne@69 128 useimgname For format 3, print the img ID in the name column.
jpayne@69 129 d3=f Output in JSON format, with a tree for visualization.
jpayne@69 130
jpayne@69 131 Output columns (for format=2):
jpayne@69 132 printall=f Enable all output columns.
jpayne@69 133 printani=t (ani) Print average nucleotide identity estimate.
jpayne@69 134 completeness=t Genome completeness estimate.
jpayne@69 135 score=f Score (used for sorting the output).
jpayne@69 136 printmatches=t Number of kmer matches to reference.
jpayne@69 137 printlength=f Number of kmers compared.
jpayne@69 138 printtaxid=t NCBI taxID.
jpayne@69 139 printimg=f IMG identifier (only for IMG data).
jpayne@69 140 printgbases=f Number of genomic bases.
jpayne@69 141 printgkmers=f Number of genomic kmers.
jpayne@69 142 printgsize=t Estimated number of unique genomic kmers.
jpayne@69 143 printgseqs=t Number of sequences (scaffolds/reads).
jpayne@69 144 printtaxname=t Name associated with this taxID.
jpayne@69 145 printname0=f (pn0) Original seqeuence name.
jpayne@69 146 printqfname=t Query filename.
jpayne@69 147 printrfname=f Reference filename.
jpayne@69 148 printtaxa=f Full taxonomy of each record.
jpayne@69 149 printcontam=t Print contamination estimate, and factor contaminant kmers
jpayne@69 150 into calculations. Kmers are considered contaminant if
jpayne@69 151 present in some ref sketch but not the current one.
jpayne@69 152 printunique=t Number of matches unique to this reference.
jpayne@69 153 printunique2=f Number of matches unique to this reference's taxa.
jpayne@69 154 printunique3=f Number of query kmers unique to this reference's taxa,
jpayne@69 155 regardless of whether they are in this reference sketch.
jpayne@69 156 printnohit=f Number of kmers that don't hit anything.
jpayne@69 157 printrefhits=f Average number of ref sketches hit by shared kmers.
jpayne@69 158 printgc=f GC content.
jpayne@69 159 printucontam=f Contam hits that hit exactly one reference sketch.
jpayne@69 160 printcontam2=f Print contamination estimate using only kmer hits
jpayne@69 161 to unrelated taxa.
jpayne@69 162 contamlevel=species Taxonomic level to use for contam2/unique2/unique3.
jpayne@69 163 NOTE: unique2/unique3/contam2/refhits require an index.
jpayne@69 164
jpayne@69 165 printdepth=f (depth) Print average depth of sketch kmers; intended
jpayne@69 166 for shotgun read input.
jpayne@69 167 printdepth2=f (depth2) Print depth compensating for genomic repeats.
jpayne@69 168 Requires reference sketches to be generated with depth.
jpayne@69 169 actualdepth=t If this is false, the raw average count is printed.
jpayne@69 170 If true, the raw average (observed depth) is converted
jpayne@69 171 to estimated actual depth (including uncovered areas).
jpayne@69 172 printvolume=f (volume) Product of average depth and matches.
jpayne@69 173 printca=f Print common ancestor, if query taxID is known.
jpayne@69 174 printcal=f Print common ancestor tax level, if query taxID is known.
jpayne@69 175 recordsperlevel=0 If query TaxID is known, and this is positive, print at
jpayne@69 176 most this many records per common ancestor level.
jpayne@69 177
jpayne@69 178 Sorting:
jpayne@69 179 sortbyscore=t Default sort order is by score.
jpayne@69 180 sortbydepth=f Include depth as a factor in sort order.
jpayne@69 181 sortbydepth2=f Include depth2 as a factor in sort order.
jpayne@69 182 sortbyvolume=f Include volume as a factor in sort order.
jpayne@69 183 sortbykid=f Sort strictly by KID.
jpayne@69 184 sortbyani=f Sort strictly by ANI/AAI/WKID.
jpayne@69 185 sortbyhits=f Sort strictly by the number of kmer hits.
jpayne@69 186
jpayne@69 187 Other output parameters:
jpayne@69 188 minhits=3 (hits) Only report records with at least this many hits.
jpayne@69 189 minani=0 (ani) Only report records with at least this ANI (0-1).
jpayne@69 190 minwkid=0.0001 (wkid) Only report records with at least this WKID (0-1).
jpayne@69 191 anifromwkid=t Calculate ani from wkid. If false, use kid.
jpayne@69 192 minbases=0 Ignore ref sketches of sequences shortert than this.
jpayne@69 193 minsizeratio=0 Don't compare sketches if the smaller genome is less than
jpayne@69 194 this fraction of the size of the larger.
jpayne@69 195 records=20 Report at most this many best-matching records.
jpayne@69 196 color=family Color records at the family level. color=f will disable.
jpayne@69 197 Colors work in most terminals but may cause odd characters
jpayne@69 198 to appear in text editors. So, color defaults to f if
jpayne@69 199 writing to a file.
jpayne@69 200 intersect=f Print sketch intersections. delta=f is suggested.
jpayne@69 201
jpayne@69 202 Metadata parameters (optional, for the query sketch header):
jpayne@69 203 taxid=-1 Set the NCBI taxid.
jpayne@69 204 imgid=-1 Set the IMG id.
jpayne@69 205 spid=-1 Set the sequencing project id (JGI-specific).
jpayne@69 206 name= Set the name (taxname).
jpayne@69 207 name0= Set name0 (normally the first sequence header).
jpayne@69 208 fname= Set fname (normally the file name).
jpayne@69 209 meta_= Set an arbitrary metadata field.
jpayne@69 210 For example, meta_Month=March.
jpayne@69 211
jpayne@69 212 Other parameters:
jpayne@69 213 requiredmeta= (rmeta) Required optional metadata values. For example:
jpayne@69 214 rmeta=subunit:ssu,source:silva
jpayne@69 215 bannedmeta= (bmeta) Forbidden optional metadata values.
jpayne@69 216
jpayne@69 217 Java Parameters:
jpayne@69 218 -Xmx This will set Java's memory usage, overriding autodetection.
jpayne@69 219 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
jpayne@69 220 The max is typically 85% of physical memory.
jpayne@69 221 -eoom This flag will cause the process to exit if an out-of-memory
jpayne@69 222 exception occurs. Requires Java 8u92+.
jpayne@69 223 -da Disable assertions.
jpayne@69 224
jpayne@69 225 For more detailed information, please read /bbmap/docs/guides/BBSketchGuide.txt.
jpayne@69 226 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
jpayne@69 227 "
jpayne@69 228 }
jpayne@69 229
jpayne@69 230 #This block allows symlinked shellscripts to correctly set classpath.
jpayne@69 231 pushd . > /dev/null
jpayne@69 232 DIR="${BASH_SOURCE[0]}"
jpayne@69 233 while [ -h "$DIR" ]; do
jpayne@69 234 cd "$(dirname "$DIR")"
jpayne@69 235 DIR="$(readlink "$(basename "$DIR")")"
jpayne@69 236 done
jpayne@69 237 cd "$(dirname "$DIR")"
jpayne@69 238 DIR="$(pwd)/"
jpayne@69 239 popd > /dev/null
jpayne@69 240
jpayne@69 241 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
jpayne@69 242 CP="$DIR""current/"
jpayne@69 243
jpayne@69 244 z="-Xmx4g"
jpayne@69 245 z2="-Xms4g"
jpayne@69 246 set=0
jpayne@69 247
jpayne@69 248 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
jpayne@69 249 usage
jpayne@69 250 exit
jpayne@69 251 fi
jpayne@69 252
jpayne@69 253 calcXmx () {
jpayne@69 254 source "$DIR""/calcmem.sh"
jpayne@69 255 setEnvironment
jpayne@69 256 parseXmx "$@"
jpayne@69 257 if [[ $set == 1 ]]; then
jpayne@69 258 return
jpayne@69 259 fi
jpayne@69 260 freeRam 3200m 84
jpayne@69 261 z="-Xmx${RAM}m"
jpayne@69 262 z2="-Xms${RAM}m"
jpayne@69 263 }
jpayne@69 264 calcXmx "$@"
jpayne@69 265
jpayne@69 266 sendsketch() {
jpayne@69 267 local CMD="java $EA $EOOM $z -cp $CP sketch.SendSketch $@"
jpayne@69 268 # echo $CMD >&2
jpayne@69 269 eval $CMD
jpayne@69 270 }
jpayne@69 271
jpayne@69 272 sendsketch "$@"