annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/sketch.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 #!/bin/bash
jpayne@69 2
jpayne@69 3 usage(){
jpayne@69 4 echo "
jpayne@69 5 Written by Brian Bushnell
jpayne@69 6 Last modified January 28, 2020
jpayne@69 7
jpayne@69 8 Description: Creates one or more sketches from a fasta file,
jpayne@69 9 optionally annotated with taxonomic information.
jpayne@69 10
jpayne@69 11 Please read bbmap/docs/guides/BBSketchGuide.txt for more information.
jpayne@69 12
jpayne@69 13 Usage: sketch.sh in=<fasta file> out=<sketch file>
jpayne@69 14
jpayne@69 15 Standard parameters:
jpayne@69 16 in=<file> A fasta file containing one or more sequences.
jpayne@69 17 out=<file> Output filename. If multiple files are desired it must
jpayne@69 18 contain the # symbol.
jpayne@69 19 blacklist=<file> Ignore keys in this sketch file. Additionaly, there are
jpayne@69 20 built-in blacklists that can be specified:
jpayne@69 21 nt: Blacklist for nt
jpayne@69 22 refseq: Blacklist for Refseq
jpayne@69 23 silva: Blacklist for Silva
jpayne@69 24 img: Blacklist for IMG
jpayne@69 25 files=1 Number of output sketch files to produce, for parallel
jpayne@69 26 loading. Independent of the number of sketches produced;
jpayne@69 27 sketches will be randomly distributed between files.
jpayne@69 28 k=32,24 Kmer length, 1-32. To maximize sensitivity and
jpayne@69 29 specificity, dual kmer lengths may be used, e.g. k=32,24
jpayne@69 30 Query and reference k must match.
jpayne@69 31 rcomp=t Look at reverse-complement kmers also.
jpayne@69 32 amino=f Use amino acid mode. Input should be amino acids.
jpayne@69 33 translate=f Call genes and translate to proteins. Input should be
jpayne@69 34 nucleotides. Designed for prokaryotes.
jpayne@69 35 mode=single Possible modes:
jpayne@69 36 single: Write one sketch.
jpayne@69 37 sequence: Write one sketch per sequence.
jpayne@69 38 taxa: Write one sketch per taxonomic unit.
jpayne@69 39 Requires more memory, and taxonomic annotation.
jpayne@69 40 img: Write one sketch per IMG id.
jpayne@69 41 delta=t Delta-compress sketches.
jpayne@69 42 a48=t Encode sketches as ASCII-48 rather than hex.
jpayne@69 43 depth=f Track the number of times kmers appear.
jpayne@69 44 Required for the depth2 field in comparisons.
jpayne@69 45 entropy=0.66 Ignore sequence with entropy below this value.
jpayne@69 46 ssu=t Scan for and retain full-length SSU sequence.
jpayne@69 47
jpayne@69 48 Size parameters:
jpayne@69 49 size=10000 Desired size of sketches (if not using autosize).
jpayne@69 50 maxfraction=0.01 (mgf) Max fraction of genomic kmers to use.
jpayne@69 51 minsize=100 Do not generate sketches for genomes smaller than this.
jpayne@69 52 autosize=t Use flexible sizing instead of fixed-length. This is
jpayne@69 53 nonlinear; a human sketch is only ~6x a bacterial sketch.
jpayne@69 54 sizemult=1 Multiply the autosized size of sketches by this factor.
jpayne@69 55 Normally a bacterial-size genome will get a sketch size
jpayne@69 56 of around 10000; if autosizefactor=2, it would be ~20000.
jpayne@69 57 density= If this flag is set (to a number between 0 and 1),
jpayne@69 58 autosize and sizemult are ignored, and this fraction of
jpayne@69 59 genomic kmers are used. For example, at density=0.001,
jpayne@69 60 a 4.5Mbp bacteria will get a 4500-kmer sketch.
jpayne@69 61
jpayne@69 62 Metadata flags (optional; intended for single-sketch mode):
jpayne@69 63 taxid=-1 Set the NCBI taxid.
jpayne@69 64 imgid=-1 Set the IMG id.
jpayne@69 65 spid=-1 Set the JGI sequencing project id.
jpayne@69 66 name= Set the name (taxname).
jpayne@69 67 name0= Set name0 (normally the first sequence header).
jpayne@69 68 fname= Set fname (normally the file name).
jpayne@69 69 meta_= Set an arbitrary metadata field.
jpayne@69 70 For example, meta_Month=March.
jpayne@69 71
jpayne@69 72 Taxonomy-specific flags:
jpayne@69 73 tree= Specify a taxtree file. On Genepool, use 'auto'.
jpayne@69 74 gi= Specify a gitable file. On Genepool, use 'auto'.
jpayne@69 75 accession= Specify one or more comma-delimited NCBI accession to
jpayne@69 76 taxid files. On Genepool, use 'auto'.
jpayne@69 77 imgdump= Specify an IMG dump file containing NCBI taxIDs,
jpayne@69 78 for IMG mode.
jpayne@69 79 taxlevel=subspecies Taxa hits below this rank will be promoted and merged
jpayne@69 80 with others.
jpayne@69 81 prefilter=f For huge datasets full of junk like nt, this flag
jpayne@69 82 will save memory by ignoring taxa smaller than minsize.
jpayne@69 83 Requires taxonomic information (tree and gi).
jpayne@69 84 tossjunk=t For taxa mode, discard taxonomically uninformative
jpayne@69 85 sequences. This includes sequences with no taxid,
jpayne@69 86 with a tax level NO_RANK, of parent taxid of LIFE.
jpayne@69 87 silva=f Parse headers using Silva or semicolon-delimited syntax.
jpayne@69 88
jpayne@69 89 Ribosomal flags, which allow SSU sequences to be attached to sketches:
jpayne@69 90 processSSU=t Run gene-calling to detect ribosomal SSU sequences.
jpayne@69 91 16Sfile=<file> Optional file of 16S sequences, annotated with TaxIDs.
jpayne@69 92 18Sfile=<file> Optional file of 18S sequences, annotated with TaxIDs.
jpayne@69 93 preferSSUMap=f Prefer file SSUs over called SSUs.
jpayne@69 94 preferSSUMapEuks=t Prefer file SSUs over called SSUs for Eukaryotes.
jpayne@69 95 SSUMapOnly=f Only use file SSUs.
jpayne@69 96 SSUMapOnlyEuks=f Only use file SSUs for Eukaryotes. This prevents
jpayne@69 97 associating an organism with its mitochondrial or
jpayne@69 98 chloroplast 16S/18S, which is otherwise a problem.
jpayne@69 99
jpayne@69 100
jpayne@69 101 Java Parameters:
jpayne@69 102 -Xmx This will set Java's memory usage, overriding autodetection.
jpayne@69 103 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
jpayne@69 104 The max is typically 85% of physical memory.
jpayne@69 105 -eoom This flag will cause the process to exit if an
jpayne@69 106 out-of-memory exception occurs. Requires Java 8u92+.
jpayne@69 107 -da Disable assertions.
jpayne@69 108
jpayne@69 109 For more detailed information, please read /bbmap/docs/guides/BBSketchGuide.txt.
jpayne@69 110 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
jpayne@69 111 "
jpayne@69 112 }
jpayne@69 113
jpayne@69 114 #This block allows symlinked shellscripts to correctly set classpath.
jpayne@69 115 pushd . > /dev/null
jpayne@69 116 DIR="${BASH_SOURCE[0]}"
jpayne@69 117 while [ -h "$DIR" ]; do
jpayne@69 118 cd "$(dirname "$DIR")"
jpayne@69 119 DIR="$(readlink "$(basename "$DIR")")"
jpayne@69 120 done
jpayne@69 121 cd "$(dirname "$DIR")"
jpayne@69 122 DIR="$(pwd)/"
jpayne@69 123 popd > /dev/null
jpayne@69 124
jpayne@69 125 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
jpayne@69 126 CP="$DIR""current/"
jpayne@69 127
jpayne@69 128 z="-Xmx4g"
jpayne@69 129 z2="-Xms4g"
jpayne@69 130 set=0
jpayne@69 131
jpayne@69 132 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
jpayne@69 133 usage
jpayne@69 134 exit
jpayne@69 135 fi
jpayne@69 136
jpayne@69 137 calcXmx () {
jpayne@69 138 source "$DIR""/calcmem.sh"
jpayne@69 139 setEnvironment
jpayne@69 140 parseXmx "$@"
jpayne@69 141 if [[ $set == 1 ]]; then
jpayne@69 142 return
jpayne@69 143 fi
jpayne@69 144 freeRam 4000m 84
jpayne@69 145 z="-Xmx${RAM}m"
jpayne@69 146 z2="-Xms${RAM}m"
jpayne@69 147 }
jpayne@69 148 calcXmx "$@"
jpayne@69 149
jpayne@69 150 sketch() {
jpayne@69 151 local CMD="java $EA $EOOM $z $z2 -cp $CP sketch.SketchMaker $@"
jpayne@69 152 echo $CMD >&2
jpayne@69 153 eval $CMD
jpayne@69 154 }
jpayne@69 155
jpayne@69 156 sketch "$@"