comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/bbduk.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 #!/bin/bash
2
3 usage(){
4 echo "
5 Written by Brian Bushnell
6 Last modified July 28, 2022
7
8 Description: Compares reads to the kmers in a reference dataset, optionally
9 allowing an edit distance. Splits the reads into two outputs - those that
10 match the reference, and those that don't. Can also trim (remove) the matching
11 parts of the reads rather than binning the reads.
12 Please read bbmap/docs/guides/BBDukGuide.txt for more information.
13
14 Usage: bbduk.sh in=<input file> out=<output file> ref=<contaminant files>
15
16 Input may be stdin or a fasta or fastq file, compressed or uncompressed.
17 If you pipe via stdin/stdout, please include the file type; e.g. for gzipped
18 fasta input, set in=stdin.fa.gz
19
20 Input parameters:
21 in=<file> Main input. in=stdin.fq will pipe from stdin.
22 in2=<file> Input for 2nd read of pairs in a different file.
23 ref=<file,file> Comma-delimited list of reference files.
24 In addition to filenames, you may also use the keywords:
25 adapters, artifacts, phix, lambda, pjet, mtst, kapa
26 literal=<seq,seq> Comma-delimited list of literal reference sequences.
27 touppercase=f (tuc) Change all bases upper-case.
28 interleaved=auto (int) t/f overrides interleaved autodetection.
29 qin=auto Input quality offset: 33 (Sanger), 64, or auto.
30 reads=-1 If positive, quit after processing X reads or pairs.
31 copyundefined=f (cu) Process non-AGCT IUPAC reference bases by making all
32 possible unambiguous copies. Intended for short motifs
33 or adapter barcodes, as time/memory use is exponential.
34 samplerate=1 Set lower to only process a fraction of input reads.
35 samref=<file> Optional reference fasta for processing sam files.
36
37 Output parameters:
38 out=<file> (outnonmatch) Write reads here that do not contain
39 kmers matching the database. 'out=stdout.fq' will pipe
40 to standard out.
41 out2=<file> (outnonmatch2) Use this to write 2nd read of pairs to a
42 different file.
43 outm=<file> (outmatch) Write reads here that fail filters. In default
44 kfilter mode, this means any read with a matching kmer.
45 In any mode, it also includes reads that fail filters such
46 as minlength, mingc, maxgc, entropy, etc. In other words,
47 it includes all reads that do not go to 'out'.
48 outm2=<file> (outmatch2) Use this to write 2nd read of pairs to a
49 different file.
50 outs=<file> (outsingle) Use this to write singleton reads whose mate
51 was trimmed shorter than minlen.
52 stats=<file> Write statistics about which contamininants were detected.
53 refstats=<file> Write statistics on a per-reference-file basis.
54 rpkm=<file> Write RPKM for each reference sequence (for RNA-seq).
55 dump=<file> Dump kmer tables to a file, in fasta format.
56 duk=<file> Write statistics in duk's format. *DEPRECATED*
57 nzo=t Only write statistics about ref sequences with nonzero hits.
58 overwrite=t (ow) Grant permission to overwrite files.
59 showspeed=t (ss) 'f' suppresses display of processing speed.
60 ziplevel=2 (zl) Compression level; 1 (min) through 9 (max).
61 fastawrap=70 Length of lines in fasta output.
62 qout=auto Output quality offset: 33 (Sanger), 64, or auto.
63 statscolumns=3 (cols) Number of columns for stats output, 3 or 5.
64 5 includes base counts.
65 rename=f Rename reads to indicate which sequences they matched.
66 refnames=f Use names of reference files rather than scaffold IDs.
67 trd=f Truncate read and ref names at the first whitespace.
68 ordered=f Set to true to output reads in same order as input.
69 maxbasesout=-1 If positive, quit after writing approximately this many
70 bases to out (outu/outnonmatch).
71 maxbasesoutm=-1 If positive, quit after writing approximately this many
72 bases to outm (outmatch).
73 json=f Print to screen in json format.
74
75 Histogram output parameters:
76 bhist=<file> Base composition histogram by position.
77 qhist=<file> Quality histogram by position.
78 qchist=<file> Count of bases with each quality value.
79 aqhist=<file> Histogram of average read quality.
80 bqhist=<file> Quality histogram designed for box plots.
81 lhist=<file> Read length histogram.
82 phist=<file> Polymer length histogram.
83 gchist=<file> Read GC content histogram.
84 enthist=<file> Read entropy histogram.
85 ihist=<file> Insert size histogram, for paired reads in mapped sam.
86 gcbins=100 Number gchist bins. Set to 'auto' to use read length.
87 maxhistlen=6000 Set an upper bound for histogram lengths; higher uses
88 more memory. The default is 6000 for some histograms
89 and 80000 for others.
90
91 Histograms for mapped sam/bam files only:
92 histbefore=t Calculate histograms from reads before processing.
93 ehist=<file> Errors-per-read histogram.
94 qahist=<file> Quality accuracy histogram of error rates versus quality
95 score.
96 indelhist=<file> Indel length histogram.
97 mhist=<file> Histogram of match, sub, del, and ins rates by position.
98 idhist=<file> Histogram of read count versus percent identity.
99 idbins=100 Number idhist bins. Set to 'auto' to use read length.
100 varfile=<file> Ignore substitution errors listed in this file when
101 calculating error rates. Can be generated with
102 CallVariants.
103 vcf=<file> Ignore substitution errors listed in this VCF file
104 when calculating error rates.
105 ignorevcfindels=t Also ignore indels listed in the VCF.
106
107 Processing parameters:
108 k=27 Kmer length used for finding contaminants. Contaminants
109 shorter than k will not be found. k must be at least 1.
110 rcomp=t Look for reverse-complements of kmers in addition to
111 forward kmers.
112 maskmiddle=t (mm) Treat the middle base of a kmer as a wildcard, to
113 increase sensitivity in the presence of errors.
114 minkmerhits=1 (mkh) Reads need at least this many matching kmers
115 to be considered as matching the reference.
116 minkmerfraction=0.0 (mkf) A reads needs at least this fraction of its total
117 kmers to hit a ref, in order to be considered a match.
118 If this and minkmerhits are set, the greater is used.
119 mincovfraction=0.0 (mcf) A reads needs at least this fraction of its total
120 bases to be covered by ref kmers to be considered a match.
121 If specified, mcf overrides mkh and mkf.
122 hammingdistance=0 (hdist) Maximum Hamming distance for ref kmers (subs only).
123 Memory use is proportional to (3*K)^hdist.
124 qhdist=0 Hamming distance for query kmers; impacts speed, not memory.
125 editdistance=0 (edist) Maximum edit distance from ref kmers (subs
126 and indels). Memory use is proportional to (8*K)^edist.
127 hammingdistance2=0 (hdist2) Sets hdist for short kmers, when using mink.
128 qhdist2=0 Sets qhdist for short kmers, when using mink.
129 editdistance2=0 (edist2) Sets edist for short kmers, when using mink.
130 forbidn=f (fn) Forbids matching of read kmers containing N.
131 By default, these will match a reference 'A' if
132 hdist>0 or edist>0, to increase sensitivity.
133 removeifeitherbad=t (rieb) Paired reads get sent to 'outmatch' if either is
134 match (or either is trimmed shorter than minlen).
135 Set to false to require both.
136 trimfailures=f Instead of discarding failed reads, trim them to 1bp.
137 This makes the statistics a bit odd.
138 findbestmatch=f (fbm) If multiple matches, associate read with sequence
139 sharing most kmers. Reduces speed.
140 skipr1=f Don't do kmer-based operations on read 1.
141 skipr2=f Don't do kmer-based operations on read 2.
142 ecco=f For overlapping paired reads only. Performs error-
143 correction with BBMerge prior to kmer operations.
144 recalibrate=f (recal) Recalibrate quality scores. Requires calibration
145 matrices generated by CalcTrueQuality.
146 sam=<file,file> If recalibration is desired, and matrices have not already
147 been generated, BBDuk will create them from the sam file.
148 amino=f Run in amino acid mode. Some features have not been
149 tested, but kmer-matching works fine. Maximum k is 12.
150
151 Speed and Memory parameters:
152 threads=auto (t) Set number of threads to use; default is number of
153 logical processors.
154 prealloc=f Preallocate memory in table. Allows faster table loading
155 and more efficient memory usage, for a large reference.
156 monitor=f Kill this process if it crashes. monitor=600,0.01 would
157 kill after 600 seconds under 1% usage.
158 minrskip=1 (mns) Force minimal skip interval when indexing reference
159 kmers. 1 means use all, 2 means use every other kmer, etc.
160 maxrskip=1 (mxs) Restrict maximal skip interval when indexing
161 reference kmers. Normally all are used for scaffolds<100kb,
162 but with longer scaffolds, up to maxrskip-1 are skipped.
163 rskip= Set both minrskip and maxrskip to the same value.
164 If not set, rskip will vary based on sequence length.
165 qskip=1 Skip query kmers to increase speed. 1 means use all.
166 speed=0 Ignore this fraction of kmer space (0-15 out of 16) in both
167 reads and reference. Increases speed and reduces memory.
168 Note: Do not use more than one of 'speed', 'qskip', and 'rskip'.
169
170 Trimming/Filtering/Masking parameters:
171 Note - if ktrim, kmask, and ksplit are unset, the default behavior is kfilter.
172 All kmer processing modes are mutually exclusive.
173 Reads only get sent to 'outm' purely based on kmer matches in kfilter mode.
174
175 ktrim=f Trim reads to remove bases matching reference kmers, plus
176 all bases to the left or right.
177 Values:
178 f (don't trim),
179 r (trim to the right),
180 l (trim to the left)
181 ktrimtips=0 Set this to a positive number to perform ktrim on both
182 ends, examining only the outermost X bases.
183 kmask= Replace bases matching ref kmers with another symbol.
184 Allows any non-whitespace character, and processes short
185 kmers on both ends if mink is set. 'kmask=lc' will
186 convert masked bases to lowercase.
187 maskfullycovered=f (mfc) Only mask bases that are fully covered by kmers.
188 ksplit=f For single-ended reads only. Reads will be split into
189 pairs around the kmer. If the kmer is at the end of the
190 read, it will be trimmed instead. Singletons will go to
191 out, and pairs will go to outm. Do not use ksplit with
192 other operations such as quality-trimming or filtering.
193 mink=0 Look for shorter kmers at read tips down to this length,
194 when k-trimming or masking. 0 means disabled. Enabling
195 this will disable maskmiddle.
196 qtrim=f Trim read ends to remove bases with quality below trimq.
197 Performed AFTER looking for kmers. Values:
198 rl (trim both ends),
199 f (neither end),
200 r (right end only),
201 l (left end only),
202 w (sliding window).
203 trimq=6 Regions with average quality BELOW this will be trimmed,
204 if qtrim is set to something other than f. Can be a
205 floating-point number like 7.3.
206 trimclip=f Trim soft-clipped bases from sam files.
207 minlength=10 (ml) Reads shorter than this after trimming will be
208 discarded. Pairs will be discarded if both are shorter.
209 mlf=0 (minlengthfraction) Reads shorter than this fraction of
210 original length after trimming will be discarded.
211 maxlength= Reads longer than this after trimming will be discarded.
212 minavgquality=0 (maq) Reads with average quality (after trimming) below
213 this will be discarded.
214 maqb=0 If positive, calculate maq from this many initial bases.
215 minbasequality=0 (mbq) Reads with any base below this quality (after
216 trimming) will be discarded.
217 maxns=-1 If non-negative, reads with more Ns than this
218 (after trimming) will be discarded.
219 mcb=0 (minconsecutivebases) Discard reads without at least
220 this many consecutive called bases.
221 ottm=f (outputtrimmedtomatch) Output reads trimmed to shorter
222 than minlength to outm rather than discarding.
223 tp=0 (trimpad) Trim this much extra around matching kmers.
224 tbo=f (trimbyoverlap) Trim adapters based on where paired
225 reads overlap.
226 strictoverlap=t Adjust sensitivity for trimbyoverlap mode.
227 minoverlap=14 Require this many bases of overlap for detection.
228 mininsert=40 Require insert size of at least this for overlap.
229 Should be reduced to 16 for small RNA sequencing.
230 tpe=f (trimpairsevenly) When kmer right-trimming, trim both
231 reads to the minimum length of either.
232 forcetrimleft=0 (ftl) If positive, trim bases to the left of this position
233 (exclusive, 0-based).
234 forcetrimright=0 (ftr) If positive, trim bases to the right of this position
235 (exclusive, 0-based).
236 forcetrimright2=0 (ftr2) If positive, trim this many bases on the right end.
237 forcetrimmod=0 (ftm) If positive, right-trim length to be equal to zero,
238 modulo this number.
239 restrictleft=0 If positive, only look for kmer matches in the
240 leftmost X bases.
241 restrictright=0 If positive, only look for kmer matches in the
242 rightmost X bases.
243 NOTE: restrictleft and restrictright are mutually exclusive. If trimming
244 both ends is desired, use ktrimtips.
245 mingc=0 Discard reads with GC content below this.
246 maxgc=1 Discard reads with GC content above this.
247 gcpairs=t Use average GC of paired reads.
248 Also affects gchist.
249 tossjunk=f Discard reads with invalid characters as bases.
250 swift=f Trim Swift sequences: Trailing C/T/N R1, leading G/A/N R2.
251
252 Header-parsing parameters - these require Illumina headers:
253 chastityfilter=f (cf) Discard reads with id containing ' 1:Y:' or ' 2:Y:'.
254 barcodefilter=f Remove reads with unexpected barcodes if barcodes is set,
255 or barcodes containing 'N' otherwise. A barcode must be
256 the last part of the read header. Values:
257 t: Remove reads with bad barcodes.
258 f: Ignore barcodes.
259 crash: Crash upon encountering bad barcodes.
260 barcodes= Comma-delimited list of barcodes or files of barcodes.
261 xmin=-1 If positive, discard reads with a lesser X coordinate.
262 ymin=-1 If positive, discard reads with a lesser Y coordinate.
263 xmax=-1 If positive, discard reads with a greater X coordinate.
264 ymax=-1 If positive, discard reads with a greater Y coordinate.
265
266 Polymer trimming:
267 trimpolya=0 If greater than 0, trim poly-A or poly-T tails of
268 at least this length on either end of reads.
269 trimpolygleft=0 If greater than 0, trim poly-G prefixes of at least this
270 length on the left end of reads. Does not trim poly-C.
271 trimpolygright=0 If greater than 0, trim poly-G tails of at least this
272 length on the right end of reads. Does not trim poly-C.
273 trimpolyg=0 This sets both left and right at once.
274 filterpolyg=0 If greater than 0, remove reads with a poly-G prefix of
275 at least this length (on the left).
276 Note: there are also equivalent poly-C flags.
277
278 Polymer tracking:
279 pratio=base,base 'pratio=G,C' will print the ratio of G to C polymers.
280 plen=20 Length of homopolymers to count.
281
282 Entropy/Complexity parameters:
283 entropy=-1 Set between 0 and 1 to filter reads with entropy below
284 that value. Higher is more stringent.
285 entropywindow=50 Calculate entropy using a sliding window of this length.
286 entropyk=5 Calculate entropy using kmers of this length.
287 minbasefrequency=0 Discard reads with a minimum base frequency below this.
288 entropytrim=f Values:
289 f: (false) Do not entropy-trim.
290 r: (right) Trim low entropy on the right end only.
291 l: (left) Trim low entropy on the left end only.
292 rl: (both) Trim low entropy on both ends.
293 entropymask=f Values:
294 f: (filter) Discard low-entropy sequences.
295 t: (true) Mask low-entropy parts of sequences with N.
296 lc: Change low-entropy parts of sequences to lowercase.
297 entropymark=f Mark each base with its entropy value. This is on a scale
298 of 0-41 and is reported as quality scores, so the output
299 should be fastq or fasta+qual.
300 NOTE: If set, entropytrim overrides entropymask.
301
302 Cardinality estimation:
303 cardinality=f (loglog) Count unique kmers using the LogLog algorithm.
304 cardinalityout=f (loglogout) Count unique kmers in output reads.
305 loglogk=31 Use this kmer length for counting.
306 loglogbuckets=2048 Use this many buckets for counting.
307 khist=<file> Kmer frequency histogram; plots number of kmers versus
308 kmer depth. This is approximate.
309 khistout=<file> Kmer frequency histogram for output reads.
310
311 Java Parameters:
312
313 -Xmx This will set Java's memory usage, overriding autodetection.
314 -Xmx20g will
315 specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
316 The max is typically 85% of physical memory.
317 -eoom This flag will cause the process to exit if an
318 out-of-memory exception occurs. Requires Java 8u92+.
319 -da Disable assertions.
320
321 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
322 "
323 }
324
325 #This block allows symlinked shellscripts to correctly set classpath.
326 pushd . > /dev/null
327 DIR="${BASH_SOURCE[0]}"
328 while [ -h "$DIR" ]; do
329 cd "$(dirname "$DIR")"
330 DIR="$(readlink "$(basename "$DIR")")"
331 done
332 cd "$(dirname "$DIR")"
333 DIR="$(pwd)/"
334 popd > /dev/null
335
336 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
337 CP="$DIR""current/"
338 JNI="-Djava.library.path=""$DIR""jni/"
339 JNI=""
340
341 z="-Xmx1g"
342 z2="-Xms1g"
343 set=0
344 silent=0
345 json=0
346
347 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
348 usage
349 exit
350 fi
351
352 calcXmx () {
353 source "$DIR""/calcmem.sh"
354 setEnvironment
355 parseXmx "$@"
356 if [[ $set == 1 ]]; then
357 return
358 fi
359 freeRam 1400m 42
360 z="-Xmx${RAM}m"
361 z2="-Xms${RAM}m"
362 }
363 calcXmx "$@"
364
365 bbduk() {
366 local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP jgi.BBDuk $@"
367 if [[ $silent == 0 ]] && [[ $json == 0 ]]; then
368 echo $CMD >&2
369 fi
370 eval $CMD
371 }
372
373 bbduk "$@"