jpayne@69: #!/bin/bash
jpayne@69:
jpayne@69: usage(){
jpayne@69: echo "
jpayne@69: Written by Brian Bushnell
jpayne@69: Last modified July 28, 2022
jpayne@69:
jpayne@69: Description: Compares reads to the kmers in a reference dataset, optionally
jpayne@69: allowing an edit distance. Splits the reads into two outputs - those that
jpayne@69: match the reference, and those that don't. Can also trim (remove) the matching
jpayne@69: parts of the reads rather than binning the reads.
jpayne@69: Please read bbmap/docs/guides/BBDukGuide.txt for more information.
jpayne@69:
jpayne@69: Usage: bbduk.sh in= out= ref=
jpayne@69:
jpayne@69: Input may be stdin or a fasta or fastq file, compressed or uncompressed.
jpayne@69: If you pipe via stdin/stdout, please include the file type; e.g. for gzipped
jpayne@69: fasta input, set in=stdin.fa.gz
jpayne@69:
jpayne@69: Input parameters:
jpayne@69: in= Main input. in=stdin.fq will pipe from stdin.
jpayne@69: in2= Input for 2nd read of pairs in a different file.
jpayne@69: ref= Comma-delimited list of reference files.
jpayne@69: In addition to filenames, you may also use the keywords:
jpayne@69: adapters, artifacts, phix, lambda, pjet, mtst, kapa
jpayne@69: literal= Comma-delimited list of literal reference sequences.
jpayne@69: touppercase=f (tuc) Change all bases upper-case.
jpayne@69: interleaved=auto (int) t/f overrides interleaved autodetection.
jpayne@69: qin=auto Input quality offset: 33 (Sanger), 64, or auto.
jpayne@69: reads=-1 If positive, quit after processing X reads or pairs.
jpayne@69: copyundefined=f (cu) Process non-AGCT IUPAC reference bases by making all
jpayne@69: possible unambiguous copies. Intended for short motifs
jpayne@69: or adapter barcodes, as time/memory use is exponential.
jpayne@69: samplerate=1 Set lower to only process a fraction of input reads.
jpayne@69: samref= Optional reference fasta for processing sam files.
jpayne@69:
jpayne@69: Output parameters:
jpayne@69: out= (outnonmatch) Write reads here that do not contain
jpayne@69: kmers matching the database. 'out=stdout.fq' will pipe
jpayne@69: to standard out.
jpayne@69: out2= (outnonmatch2) Use this to write 2nd read of pairs to a
jpayne@69: different file.
jpayne@69: outm= (outmatch) Write reads here that fail filters. In default
jpayne@69: kfilter mode, this means any read with a matching kmer.
jpayne@69: In any mode, it also includes reads that fail filters such
jpayne@69: as minlength, mingc, maxgc, entropy, etc. In other words,
jpayne@69: it includes all reads that do not go to 'out'.
jpayne@69: outm2= (outmatch2) Use this to write 2nd read of pairs to a
jpayne@69: different file.
jpayne@69: outs= (outsingle) Use this to write singleton reads whose mate
jpayne@69: was trimmed shorter than minlen.
jpayne@69: stats= Write statistics about which contamininants were detected.
jpayne@69: refstats= Write statistics on a per-reference-file basis.
jpayne@69: rpkm= Write RPKM for each reference sequence (for RNA-seq).
jpayne@69: dump= Dump kmer tables to a file, in fasta format.
jpayne@69: duk= Write statistics in duk's format. *DEPRECATED*
jpayne@69: nzo=t Only write statistics about ref sequences with nonzero hits.
jpayne@69: overwrite=t (ow) Grant permission to overwrite files.
jpayne@69: showspeed=t (ss) 'f' suppresses display of processing speed.
jpayne@69: ziplevel=2 (zl) Compression level; 1 (min) through 9 (max).
jpayne@69: fastawrap=70 Length of lines in fasta output.
jpayne@69: qout=auto Output quality offset: 33 (Sanger), 64, or auto.
jpayne@69: statscolumns=3 (cols) Number of columns for stats output, 3 or 5.
jpayne@69: 5 includes base counts.
jpayne@69: rename=f Rename reads to indicate which sequences they matched.
jpayne@69: refnames=f Use names of reference files rather than scaffold IDs.
jpayne@69: trd=f Truncate read and ref names at the first whitespace.
jpayne@69: ordered=f Set to true to output reads in same order as input.
jpayne@69: maxbasesout=-1 If positive, quit after writing approximately this many
jpayne@69: bases to out (outu/outnonmatch).
jpayne@69: maxbasesoutm=-1 If positive, quit after writing approximately this many
jpayne@69: bases to outm (outmatch).
jpayne@69: json=f Print to screen in json format.
jpayne@69:
jpayne@69: Histogram output parameters:
jpayne@69: bhist= Base composition histogram by position.
jpayne@69: qhist= Quality histogram by position.
jpayne@69: qchist= Count of bases with each quality value.
jpayne@69: aqhist= Histogram of average read quality.
jpayne@69: bqhist= Quality histogram designed for box plots.
jpayne@69: lhist= Read length histogram.
jpayne@69: phist= Polymer length histogram.
jpayne@69: gchist= Read GC content histogram.
jpayne@69: enthist= Read entropy histogram.
jpayne@69: ihist= Insert size histogram, for paired reads in mapped sam.
jpayne@69: gcbins=100 Number gchist bins. Set to 'auto' to use read length.
jpayne@69: maxhistlen=6000 Set an upper bound for histogram lengths; higher uses
jpayne@69: more memory. The default is 6000 for some histograms
jpayne@69: and 80000 for others.
jpayne@69:
jpayne@69: Histograms for mapped sam/bam files only:
jpayne@69: histbefore=t Calculate histograms from reads before processing.
jpayne@69: ehist= Errors-per-read histogram.
jpayne@69: qahist= Quality accuracy histogram of error rates versus quality
jpayne@69: score.
jpayne@69: indelhist= Indel length histogram.
jpayne@69: mhist= Histogram of match, sub, del, and ins rates by position.
jpayne@69: idhist= Histogram of read count versus percent identity.
jpayne@69: idbins=100 Number idhist bins. Set to 'auto' to use read length.
jpayne@69: varfile= Ignore substitution errors listed in this file when
jpayne@69: calculating error rates. Can be generated with
jpayne@69: CallVariants.
jpayne@69: vcf= Ignore substitution errors listed in this VCF file
jpayne@69: when calculating error rates.
jpayne@69: ignorevcfindels=t Also ignore indels listed in the VCF.
jpayne@69:
jpayne@69: Processing parameters:
jpayne@69: k=27 Kmer length used for finding contaminants. Contaminants
jpayne@69: shorter than k will not be found. k must be at least 1.
jpayne@69: rcomp=t Look for reverse-complements of kmers in addition to
jpayne@69: forward kmers.
jpayne@69: maskmiddle=t (mm) Treat the middle base of a kmer as a wildcard, to
jpayne@69: increase sensitivity in the presence of errors.
jpayne@69: minkmerhits=1 (mkh) Reads need at least this many matching kmers
jpayne@69: to be considered as matching the reference.
jpayne@69: minkmerfraction=0.0 (mkf) A reads needs at least this fraction of its total
jpayne@69: kmers to hit a ref, in order to be considered a match.
jpayne@69: If this and minkmerhits are set, the greater is used.
jpayne@69: mincovfraction=0.0 (mcf) A reads needs at least this fraction of its total
jpayne@69: bases to be covered by ref kmers to be considered a match.
jpayne@69: If specified, mcf overrides mkh and mkf.
jpayne@69: hammingdistance=0 (hdist) Maximum Hamming distance for ref kmers (subs only).
jpayne@69: Memory use is proportional to (3*K)^hdist.
jpayne@69: qhdist=0 Hamming distance for query kmers; impacts speed, not memory.
jpayne@69: editdistance=0 (edist) Maximum edit distance from ref kmers (subs
jpayne@69: and indels). Memory use is proportional to (8*K)^edist.
jpayne@69: hammingdistance2=0 (hdist2) Sets hdist for short kmers, when using mink.
jpayne@69: qhdist2=0 Sets qhdist for short kmers, when using mink.
jpayne@69: editdistance2=0 (edist2) Sets edist for short kmers, when using mink.
jpayne@69: forbidn=f (fn) Forbids matching of read kmers containing N.
jpayne@69: By default, these will match a reference 'A' if
jpayne@69: hdist>0 or edist>0, to increase sensitivity.
jpayne@69: removeifeitherbad=t (rieb) Paired reads get sent to 'outmatch' if either is
jpayne@69: match (or either is trimmed shorter than minlen).
jpayne@69: Set to false to require both.
jpayne@69: trimfailures=f Instead of discarding failed reads, trim them to 1bp.
jpayne@69: This makes the statistics a bit odd.
jpayne@69: findbestmatch=f (fbm) If multiple matches, associate read with sequence
jpayne@69: sharing most kmers. Reduces speed.
jpayne@69: skipr1=f Don't do kmer-based operations on read 1.
jpayne@69: skipr2=f Don't do kmer-based operations on read 2.
jpayne@69: ecco=f For overlapping paired reads only. Performs error-
jpayne@69: correction with BBMerge prior to kmer operations.
jpayne@69: recalibrate=f (recal) Recalibrate quality scores. Requires calibration
jpayne@69: matrices generated by CalcTrueQuality.
jpayne@69: sam= If recalibration is desired, and matrices have not already
jpayne@69: been generated, BBDuk will create them from the sam file.
jpayne@69: amino=f Run in amino acid mode. Some features have not been
jpayne@69: tested, but kmer-matching works fine. Maximum k is 12.
jpayne@69:
jpayne@69: Speed and Memory parameters:
jpayne@69: threads=auto (t) Set number of threads to use; default is number of
jpayne@69: logical processors.
jpayne@69: prealloc=f Preallocate memory in table. Allows faster table loading
jpayne@69: and more efficient memory usage, for a large reference.
jpayne@69: monitor=f Kill this process if it crashes. monitor=600,0.01 would
jpayne@69: kill after 600 seconds under 1% usage.
jpayne@69: minrskip=1 (mns) Force minimal skip interval when indexing reference
jpayne@69: kmers. 1 means use all, 2 means use every other kmer, etc.
jpayne@69: maxrskip=1 (mxs) Restrict maximal skip interval when indexing
jpayne@69: reference kmers. Normally all are used for scaffolds<100kb,
jpayne@69: but with longer scaffolds, up to maxrskip-1 are skipped.
jpayne@69: rskip= Set both minrskip and maxrskip to the same value.
jpayne@69: If not set, rskip will vary based on sequence length.
jpayne@69: qskip=1 Skip query kmers to increase speed. 1 means use all.
jpayne@69: speed=0 Ignore this fraction of kmer space (0-15 out of 16) in both
jpayne@69: reads and reference. Increases speed and reduces memory.
jpayne@69: Note: Do not use more than one of 'speed', 'qskip', and 'rskip'.
jpayne@69:
jpayne@69: Trimming/Filtering/Masking parameters:
jpayne@69: Note - if ktrim, kmask, and ksplit are unset, the default behavior is kfilter.
jpayne@69: All kmer processing modes are mutually exclusive.
jpayne@69: Reads only get sent to 'outm' purely based on kmer matches in kfilter mode.
jpayne@69:
jpayne@69: ktrim=f Trim reads to remove bases matching reference kmers, plus
jpayne@69: all bases to the left or right.
jpayne@69: Values:
jpayne@69: f (don't trim),
jpayne@69: r (trim to the right),
jpayne@69: l (trim to the left)
jpayne@69: ktrimtips=0 Set this to a positive number to perform ktrim on both
jpayne@69: ends, examining only the outermost X bases.
jpayne@69: kmask= Replace bases matching ref kmers with another symbol.
jpayne@69: Allows any non-whitespace character, and processes short
jpayne@69: kmers on both ends if mink is set. 'kmask=lc' will
jpayne@69: convert masked bases to lowercase.
jpayne@69: maskfullycovered=f (mfc) Only mask bases that are fully covered by kmers.
jpayne@69: ksplit=f For single-ended reads only. Reads will be split into
jpayne@69: pairs around the kmer. If the kmer is at the end of the
jpayne@69: read, it will be trimmed instead. Singletons will go to
jpayne@69: out, and pairs will go to outm. Do not use ksplit with
jpayne@69: other operations such as quality-trimming or filtering.
jpayne@69: mink=0 Look for shorter kmers at read tips down to this length,
jpayne@69: when k-trimming or masking. 0 means disabled. Enabling
jpayne@69: this will disable maskmiddle.
jpayne@69: qtrim=f Trim read ends to remove bases with quality below trimq.
jpayne@69: Performed AFTER looking for kmers. Values:
jpayne@69: rl (trim both ends),
jpayne@69: f (neither end),
jpayne@69: r (right end only),
jpayne@69: l (left end only),
jpayne@69: w (sliding window).
jpayne@69: trimq=6 Regions with average quality BELOW this will be trimmed,
jpayne@69: if qtrim is set to something other than f. Can be a
jpayne@69: floating-point number like 7.3.
jpayne@69: trimclip=f Trim soft-clipped bases from sam files.
jpayne@69: minlength=10 (ml) Reads shorter than this after trimming will be
jpayne@69: discarded. Pairs will be discarded if both are shorter.
jpayne@69: mlf=0 (minlengthfraction) Reads shorter than this fraction of
jpayne@69: original length after trimming will be discarded.
jpayne@69: maxlength= Reads longer than this after trimming will be discarded.
jpayne@69: minavgquality=0 (maq) Reads with average quality (after trimming) below
jpayne@69: this will be discarded.
jpayne@69: maqb=0 If positive, calculate maq from this many initial bases.
jpayne@69: minbasequality=0 (mbq) Reads with any base below this quality (after
jpayne@69: trimming) will be discarded.
jpayne@69: maxns=-1 If non-negative, reads with more Ns than this
jpayne@69: (after trimming) will be discarded.
jpayne@69: mcb=0 (minconsecutivebases) Discard reads without at least
jpayne@69: this many consecutive called bases.
jpayne@69: ottm=f (outputtrimmedtomatch) Output reads trimmed to shorter
jpayne@69: than minlength to outm rather than discarding.
jpayne@69: tp=0 (trimpad) Trim this much extra around matching kmers.
jpayne@69: tbo=f (trimbyoverlap) Trim adapters based on where paired
jpayne@69: reads overlap.
jpayne@69: strictoverlap=t Adjust sensitivity for trimbyoverlap mode.
jpayne@69: minoverlap=14 Require this many bases of overlap for detection.
jpayne@69: mininsert=40 Require insert size of at least this for overlap.
jpayne@69: Should be reduced to 16 for small RNA sequencing.
jpayne@69: tpe=f (trimpairsevenly) When kmer right-trimming, trim both
jpayne@69: reads to the minimum length of either.
jpayne@69: forcetrimleft=0 (ftl) If positive, trim bases to the left of this position
jpayne@69: (exclusive, 0-based).
jpayne@69: forcetrimright=0 (ftr) If positive, trim bases to the right of this position
jpayne@69: (exclusive, 0-based).
jpayne@69: forcetrimright2=0 (ftr2) If positive, trim this many bases on the right end.
jpayne@69: forcetrimmod=0 (ftm) If positive, right-trim length to be equal to zero,
jpayne@69: modulo this number.
jpayne@69: restrictleft=0 If positive, only look for kmer matches in the
jpayne@69: leftmost X bases.
jpayne@69: restrictright=0 If positive, only look for kmer matches in the
jpayne@69: rightmost X bases.
jpayne@69: NOTE: restrictleft and restrictright are mutually exclusive. If trimming
jpayne@69: both ends is desired, use ktrimtips.
jpayne@69: mingc=0 Discard reads with GC content below this.
jpayne@69: maxgc=1 Discard reads with GC content above this.
jpayne@69: gcpairs=t Use average GC of paired reads.
jpayne@69: Also affects gchist.
jpayne@69: tossjunk=f Discard reads with invalid characters as bases.
jpayne@69: swift=f Trim Swift sequences: Trailing C/T/N R1, leading G/A/N R2.
jpayne@69:
jpayne@69: Header-parsing parameters - these require Illumina headers:
jpayne@69: chastityfilter=f (cf) Discard reads with id containing ' 1:Y:' or ' 2:Y:'.
jpayne@69: barcodefilter=f Remove reads with unexpected barcodes if barcodes is set,
jpayne@69: or barcodes containing 'N' otherwise. A barcode must be
jpayne@69: the last part of the read header. Values:
jpayne@69: t: Remove reads with bad barcodes.
jpayne@69: f: Ignore barcodes.
jpayne@69: crash: Crash upon encountering bad barcodes.
jpayne@69: barcodes= Comma-delimited list of barcodes or files of barcodes.
jpayne@69: xmin=-1 If positive, discard reads with a lesser X coordinate.
jpayne@69: ymin=-1 If positive, discard reads with a lesser Y coordinate.
jpayne@69: xmax=-1 If positive, discard reads with a greater X coordinate.
jpayne@69: ymax=-1 If positive, discard reads with a greater Y coordinate.
jpayne@69:
jpayne@69: Polymer trimming:
jpayne@69: trimpolya=0 If greater than 0, trim poly-A or poly-T tails of
jpayne@69: at least this length on either end of reads.
jpayne@69: trimpolygleft=0 If greater than 0, trim poly-G prefixes of at least this
jpayne@69: length on the left end of reads. Does not trim poly-C.
jpayne@69: trimpolygright=0 If greater than 0, trim poly-G tails of at least this
jpayne@69: length on the right end of reads. Does not trim poly-C.
jpayne@69: trimpolyg=0 This sets both left and right at once.
jpayne@69: filterpolyg=0 If greater than 0, remove reads with a poly-G prefix of
jpayne@69: at least this length (on the left).
jpayne@69: Note: there are also equivalent poly-C flags.
jpayne@69:
jpayne@69: Polymer tracking:
jpayne@69: pratio=base,base 'pratio=G,C' will print the ratio of G to C polymers.
jpayne@69: plen=20 Length of homopolymers to count.
jpayne@69:
jpayne@69: Entropy/Complexity parameters:
jpayne@69: entropy=-1 Set between 0 and 1 to filter reads with entropy below
jpayne@69: that value. Higher is more stringent.
jpayne@69: entropywindow=50 Calculate entropy using a sliding window of this length.
jpayne@69: entropyk=5 Calculate entropy using kmers of this length.
jpayne@69: minbasefrequency=0 Discard reads with a minimum base frequency below this.
jpayne@69: entropytrim=f Values:
jpayne@69: f: (false) Do not entropy-trim.
jpayne@69: r: (right) Trim low entropy on the right end only.
jpayne@69: l: (left) Trim low entropy on the left end only.
jpayne@69: rl: (both) Trim low entropy on both ends.
jpayne@69: entropymask=f Values:
jpayne@69: f: (filter) Discard low-entropy sequences.
jpayne@69: t: (true) Mask low-entropy parts of sequences with N.
jpayne@69: lc: Change low-entropy parts of sequences to lowercase.
jpayne@69: entropymark=f Mark each base with its entropy value. This is on a scale
jpayne@69: of 0-41 and is reported as quality scores, so the output
jpayne@69: should be fastq or fasta+qual.
jpayne@69: NOTE: If set, entropytrim overrides entropymask.
jpayne@69:
jpayne@69: Cardinality estimation:
jpayne@69: cardinality=f (loglog) Count unique kmers using the LogLog algorithm.
jpayne@69: cardinalityout=f (loglogout) Count unique kmers in output reads.
jpayne@69: loglogk=31 Use this kmer length for counting.
jpayne@69: loglogbuckets=2048 Use this many buckets for counting.
jpayne@69: khist= Kmer frequency histogram; plots number of kmers versus
jpayne@69: kmer depth. This is approximate.
jpayne@69: khistout= Kmer frequency histogram for output reads.
jpayne@69:
jpayne@69: Java Parameters:
jpayne@69:
jpayne@69: -Xmx This will set Java's memory usage, overriding autodetection.
jpayne@69: -Xmx20g will
jpayne@69: specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
jpayne@69: The max is typically 85% of physical memory.
jpayne@69: -eoom This flag will cause the process to exit if an
jpayne@69: out-of-memory exception occurs. Requires Java 8u92+.
jpayne@69: -da Disable assertions.
jpayne@69:
jpayne@69: Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
jpayne@69: "
jpayne@69: }
jpayne@69:
jpayne@69: #This block allows symlinked shellscripts to correctly set classpath.
jpayne@69: pushd . > /dev/null
jpayne@69: DIR="${BASH_SOURCE[0]}"
jpayne@69: while [ -h "$DIR" ]; do
jpayne@69: cd "$(dirname "$DIR")"
jpayne@69: DIR="$(readlink "$(basename "$DIR")")"
jpayne@69: done
jpayne@69: cd "$(dirname "$DIR")"
jpayne@69: DIR="$(pwd)/"
jpayne@69: popd > /dev/null
jpayne@69:
jpayne@69: #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
jpayne@69: CP="$DIR""current/"
jpayne@69: JNI="-Djava.library.path=""$DIR""jni/"
jpayne@69: JNI=""
jpayne@69:
jpayne@69: z="-Xmx1g"
jpayne@69: z2="-Xms1g"
jpayne@69: set=0
jpayne@69: silent=0
jpayne@69: json=0
jpayne@69:
jpayne@69: if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
jpayne@69: usage
jpayne@69: exit
jpayne@69: fi
jpayne@69:
jpayne@69: calcXmx () {
jpayne@69: source "$DIR""/calcmem.sh"
jpayne@69: setEnvironment
jpayne@69: parseXmx "$@"
jpayne@69: if [[ $set == 1 ]]; then
jpayne@69: return
jpayne@69: fi
jpayne@69: freeRam 1400m 42
jpayne@69: z="-Xmx${RAM}m"
jpayne@69: z2="-Xms${RAM}m"
jpayne@69: }
jpayne@69: calcXmx "$@"
jpayne@69:
jpayne@69: bbduk() {
jpayne@69: local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP jgi.BBDuk $@"
jpayne@69: if [[ $silent == 0 ]] && [[ $json == 0 ]]; then
jpayne@69: echo $CMD >&2
jpayne@69: fi
jpayne@69: eval $CMD
jpayne@69: }
jpayne@69:
jpayne@69: bbduk "$@"