Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/bbduk.sh @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 #!/bin/bash | |
2 | |
3 usage(){ | |
4 echo " | |
5 Written by Brian Bushnell | |
6 Last modified July 28, 2022 | |
7 | |
8 Description: Compares reads to the kmers in a reference dataset, optionally | |
9 allowing an edit distance. Splits the reads into two outputs - those that | |
10 match the reference, and those that don't. Can also trim (remove) the matching | |
11 parts of the reads rather than binning the reads. | |
12 Please read bbmap/docs/guides/BBDukGuide.txt for more information. | |
13 | |
14 Usage: bbduk.sh in=<input file> out=<output file> ref=<contaminant files> | |
15 | |
16 Input may be stdin or a fasta or fastq file, compressed or uncompressed. | |
17 If you pipe via stdin/stdout, please include the file type; e.g. for gzipped | |
18 fasta input, set in=stdin.fa.gz | |
19 | |
20 Input parameters: | |
21 in=<file> Main input. in=stdin.fq will pipe from stdin. | |
22 in2=<file> Input for 2nd read of pairs in a different file. | |
23 ref=<file,file> Comma-delimited list of reference files. | |
24 In addition to filenames, you may also use the keywords: | |
25 adapters, artifacts, phix, lambda, pjet, mtst, kapa | |
26 literal=<seq,seq> Comma-delimited list of literal reference sequences. | |
27 touppercase=f (tuc) Change all bases upper-case. | |
28 interleaved=auto (int) t/f overrides interleaved autodetection. | |
29 qin=auto Input quality offset: 33 (Sanger), 64, or auto. | |
30 reads=-1 If positive, quit after processing X reads or pairs. | |
31 copyundefined=f (cu) Process non-AGCT IUPAC reference bases by making all | |
32 possible unambiguous copies. Intended for short motifs | |
33 or adapter barcodes, as time/memory use is exponential. | |
34 samplerate=1 Set lower to only process a fraction of input reads. | |
35 samref=<file> Optional reference fasta for processing sam files. | |
36 | |
37 Output parameters: | |
38 out=<file> (outnonmatch) Write reads here that do not contain | |
39 kmers matching the database. 'out=stdout.fq' will pipe | |
40 to standard out. | |
41 out2=<file> (outnonmatch2) Use this to write 2nd read of pairs to a | |
42 different file. | |
43 outm=<file> (outmatch) Write reads here that fail filters. In default | |
44 kfilter mode, this means any read with a matching kmer. | |
45 In any mode, it also includes reads that fail filters such | |
46 as minlength, mingc, maxgc, entropy, etc. In other words, | |
47 it includes all reads that do not go to 'out'. | |
48 outm2=<file> (outmatch2) Use this to write 2nd read of pairs to a | |
49 different file. | |
50 outs=<file> (outsingle) Use this to write singleton reads whose mate | |
51 was trimmed shorter than minlen. | |
52 stats=<file> Write statistics about which contamininants were detected. | |
53 refstats=<file> Write statistics on a per-reference-file basis. | |
54 rpkm=<file> Write RPKM for each reference sequence (for RNA-seq). | |
55 dump=<file> Dump kmer tables to a file, in fasta format. | |
56 duk=<file> Write statistics in duk's format. *DEPRECATED* | |
57 nzo=t Only write statistics about ref sequences with nonzero hits. | |
58 overwrite=t (ow) Grant permission to overwrite files. | |
59 showspeed=t (ss) 'f' suppresses display of processing speed. | |
60 ziplevel=2 (zl) Compression level; 1 (min) through 9 (max). | |
61 fastawrap=70 Length of lines in fasta output. | |
62 qout=auto Output quality offset: 33 (Sanger), 64, or auto. | |
63 statscolumns=3 (cols) Number of columns for stats output, 3 or 5. | |
64 5 includes base counts. | |
65 rename=f Rename reads to indicate which sequences they matched. | |
66 refnames=f Use names of reference files rather than scaffold IDs. | |
67 trd=f Truncate read and ref names at the first whitespace. | |
68 ordered=f Set to true to output reads in same order as input. | |
69 maxbasesout=-1 If positive, quit after writing approximately this many | |
70 bases to out (outu/outnonmatch). | |
71 maxbasesoutm=-1 If positive, quit after writing approximately this many | |
72 bases to outm (outmatch). | |
73 json=f Print to screen in json format. | |
74 | |
75 Histogram output parameters: | |
76 bhist=<file> Base composition histogram by position. | |
77 qhist=<file> Quality histogram by position. | |
78 qchist=<file> Count of bases with each quality value. | |
79 aqhist=<file> Histogram of average read quality. | |
80 bqhist=<file> Quality histogram designed for box plots. | |
81 lhist=<file> Read length histogram. | |
82 phist=<file> Polymer length histogram. | |
83 gchist=<file> Read GC content histogram. | |
84 enthist=<file> Read entropy histogram. | |
85 ihist=<file> Insert size histogram, for paired reads in mapped sam. | |
86 gcbins=100 Number gchist bins. Set to 'auto' to use read length. | |
87 maxhistlen=6000 Set an upper bound for histogram lengths; higher uses | |
88 more memory. The default is 6000 for some histograms | |
89 and 80000 for others. | |
90 | |
91 Histograms for mapped sam/bam files only: | |
92 histbefore=t Calculate histograms from reads before processing. | |
93 ehist=<file> Errors-per-read histogram. | |
94 qahist=<file> Quality accuracy histogram of error rates versus quality | |
95 score. | |
96 indelhist=<file> Indel length histogram. | |
97 mhist=<file> Histogram of match, sub, del, and ins rates by position. | |
98 idhist=<file> Histogram of read count versus percent identity. | |
99 idbins=100 Number idhist bins. Set to 'auto' to use read length. | |
100 varfile=<file> Ignore substitution errors listed in this file when | |
101 calculating error rates. Can be generated with | |
102 CallVariants. | |
103 vcf=<file> Ignore substitution errors listed in this VCF file | |
104 when calculating error rates. | |
105 ignorevcfindels=t Also ignore indels listed in the VCF. | |
106 | |
107 Processing parameters: | |
108 k=27 Kmer length used for finding contaminants. Contaminants | |
109 shorter than k will not be found. k must be at least 1. | |
110 rcomp=t Look for reverse-complements of kmers in addition to | |
111 forward kmers. | |
112 maskmiddle=t (mm) Treat the middle base of a kmer as a wildcard, to | |
113 increase sensitivity in the presence of errors. | |
114 minkmerhits=1 (mkh) Reads need at least this many matching kmers | |
115 to be considered as matching the reference. | |
116 minkmerfraction=0.0 (mkf) A reads needs at least this fraction of its total | |
117 kmers to hit a ref, in order to be considered a match. | |
118 If this and minkmerhits are set, the greater is used. | |
119 mincovfraction=0.0 (mcf) A reads needs at least this fraction of its total | |
120 bases to be covered by ref kmers to be considered a match. | |
121 If specified, mcf overrides mkh and mkf. | |
122 hammingdistance=0 (hdist) Maximum Hamming distance for ref kmers (subs only). | |
123 Memory use is proportional to (3*K)^hdist. | |
124 qhdist=0 Hamming distance for query kmers; impacts speed, not memory. | |
125 editdistance=0 (edist) Maximum edit distance from ref kmers (subs | |
126 and indels). Memory use is proportional to (8*K)^edist. | |
127 hammingdistance2=0 (hdist2) Sets hdist for short kmers, when using mink. | |
128 qhdist2=0 Sets qhdist for short kmers, when using mink. | |
129 editdistance2=0 (edist2) Sets edist for short kmers, when using mink. | |
130 forbidn=f (fn) Forbids matching of read kmers containing N. | |
131 By default, these will match a reference 'A' if | |
132 hdist>0 or edist>0, to increase sensitivity. | |
133 removeifeitherbad=t (rieb) Paired reads get sent to 'outmatch' if either is | |
134 match (or either is trimmed shorter than minlen). | |
135 Set to false to require both. | |
136 trimfailures=f Instead of discarding failed reads, trim them to 1bp. | |
137 This makes the statistics a bit odd. | |
138 findbestmatch=f (fbm) If multiple matches, associate read with sequence | |
139 sharing most kmers. Reduces speed. | |
140 skipr1=f Don't do kmer-based operations on read 1. | |
141 skipr2=f Don't do kmer-based operations on read 2. | |
142 ecco=f For overlapping paired reads only. Performs error- | |
143 correction with BBMerge prior to kmer operations. | |
144 recalibrate=f (recal) Recalibrate quality scores. Requires calibration | |
145 matrices generated by CalcTrueQuality. | |
146 sam=<file,file> If recalibration is desired, and matrices have not already | |
147 been generated, BBDuk will create them from the sam file. | |
148 amino=f Run in amino acid mode. Some features have not been | |
149 tested, but kmer-matching works fine. Maximum k is 12. | |
150 | |
151 Speed and Memory parameters: | |
152 threads=auto (t) Set number of threads to use; default is number of | |
153 logical processors. | |
154 prealloc=f Preallocate memory in table. Allows faster table loading | |
155 and more efficient memory usage, for a large reference. | |
156 monitor=f Kill this process if it crashes. monitor=600,0.01 would | |
157 kill after 600 seconds under 1% usage. | |
158 minrskip=1 (mns) Force minimal skip interval when indexing reference | |
159 kmers. 1 means use all, 2 means use every other kmer, etc. | |
160 maxrskip=1 (mxs) Restrict maximal skip interval when indexing | |
161 reference kmers. Normally all are used for scaffolds<100kb, | |
162 but with longer scaffolds, up to maxrskip-1 are skipped. | |
163 rskip= Set both minrskip and maxrskip to the same value. | |
164 If not set, rskip will vary based on sequence length. | |
165 qskip=1 Skip query kmers to increase speed. 1 means use all. | |
166 speed=0 Ignore this fraction of kmer space (0-15 out of 16) in both | |
167 reads and reference. Increases speed and reduces memory. | |
168 Note: Do not use more than one of 'speed', 'qskip', and 'rskip'. | |
169 | |
170 Trimming/Filtering/Masking parameters: | |
171 Note - if ktrim, kmask, and ksplit are unset, the default behavior is kfilter. | |
172 All kmer processing modes are mutually exclusive. | |
173 Reads only get sent to 'outm' purely based on kmer matches in kfilter mode. | |
174 | |
175 ktrim=f Trim reads to remove bases matching reference kmers, plus | |
176 all bases to the left or right. | |
177 Values: | |
178 f (don't trim), | |
179 r (trim to the right), | |
180 l (trim to the left) | |
181 ktrimtips=0 Set this to a positive number to perform ktrim on both | |
182 ends, examining only the outermost X bases. | |
183 kmask= Replace bases matching ref kmers with another symbol. | |
184 Allows any non-whitespace character, and processes short | |
185 kmers on both ends if mink is set. 'kmask=lc' will | |
186 convert masked bases to lowercase. | |
187 maskfullycovered=f (mfc) Only mask bases that are fully covered by kmers. | |
188 ksplit=f For single-ended reads only. Reads will be split into | |
189 pairs around the kmer. If the kmer is at the end of the | |
190 read, it will be trimmed instead. Singletons will go to | |
191 out, and pairs will go to outm. Do not use ksplit with | |
192 other operations such as quality-trimming or filtering. | |
193 mink=0 Look for shorter kmers at read tips down to this length, | |
194 when k-trimming or masking. 0 means disabled. Enabling | |
195 this will disable maskmiddle. | |
196 qtrim=f Trim read ends to remove bases with quality below trimq. | |
197 Performed AFTER looking for kmers. Values: | |
198 rl (trim both ends), | |
199 f (neither end), | |
200 r (right end only), | |
201 l (left end only), | |
202 w (sliding window). | |
203 trimq=6 Regions with average quality BELOW this will be trimmed, | |
204 if qtrim is set to something other than f. Can be a | |
205 floating-point number like 7.3. | |
206 trimclip=f Trim soft-clipped bases from sam files. | |
207 minlength=10 (ml) Reads shorter than this after trimming will be | |
208 discarded. Pairs will be discarded if both are shorter. | |
209 mlf=0 (minlengthfraction) Reads shorter than this fraction of | |
210 original length after trimming will be discarded. | |
211 maxlength= Reads longer than this after trimming will be discarded. | |
212 minavgquality=0 (maq) Reads with average quality (after trimming) below | |
213 this will be discarded. | |
214 maqb=0 If positive, calculate maq from this many initial bases. | |
215 minbasequality=0 (mbq) Reads with any base below this quality (after | |
216 trimming) will be discarded. | |
217 maxns=-1 If non-negative, reads with more Ns than this | |
218 (after trimming) will be discarded. | |
219 mcb=0 (minconsecutivebases) Discard reads without at least | |
220 this many consecutive called bases. | |
221 ottm=f (outputtrimmedtomatch) Output reads trimmed to shorter | |
222 than minlength to outm rather than discarding. | |
223 tp=0 (trimpad) Trim this much extra around matching kmers. | |
224 tbo=f (trimbyoverlap) Trim adapters based on where paired | |
225 reads overlap. | |
226 strictoverlap=t Adjust sensitivity for trimbyoverlap mode. | |
227 minoverlap=14 Require this many bases of overlap for detection. | |
228 mininsert=40 Require insert size of at least this for overlap. | |
229 Should be reduced to 16 for small RNA sequencing. | |
230 tpe=f (trimpairsevenly) When kmer right-trimming, trim both | |
231 reads to the minimum length of either. | |
232 forcetrimleft=0 (ftl) If positive, trim bases to the left of this position | |
233 (exclusive, 0-based). | |
234 forcetrimright=0 (ftr) If positive, trim bases to the right of this position | |
235 (exclusive, 0-based). | |
236 forcetrimright2=0 (ftr2) If positive, trim this many bases on the right end. | |
237 forcetrimmod=0 (ftm) If positive, right-trim length to be equal to zero, | |
238 modulo this number. | |
239 restrictleft=0 If positive, only look for kmer matches in the | |
240 leftmost X bases. | |
241 restrictright=0 If positive, only look for kmer matches in the | |
242 rightmost X bases. | |
243 NOTE: restrictleft and restrictright are mutually exclusive. If trimming | |
244 both ends is desired, use ktrimtips. | |
245 mingc=0 Discard reads with GC content below this. | |
246 maxgc=1 Discard reads with GC content above this. | |
247 gcpairs=t Use average GC of paired reads. | |
248 Also affects gchist. | |
249 tossjunk=f Discard reads with invalid characters as bases. | |
250 swift=f Trim Swift sequences: Trailing C/T/N R1, leading G/A/N R2. | |
251 | |
252 Header-parsing parameters - these require Illumina headers: | |
253 chastityfilter=f (cf) Discard reads with id containing ' 1:Y:' or ' 2:Y:'. | |
254 barcodefilter=f Remove reads with unexpected barcodes if barcodes is set, | |
255 or barcodes containing 'N' otherwise. A barcode must be | |
256 the last part of the read header. Values: | |
257 t: Remove reads with bad barcodes. | |
258 f: Ignore barcodes. | |
259 crash: Crash upon encountering bad barcodes. | |
260 barcodes= Comma-delimited list of barcodes or files of barcodes. | |
261 xmin=-1 If positive, discard reads with a lesser X coordinate. | |
262 ymin=-1 If positive, discard reads with a lesser Y coordinate. | |
263 xmax=-1 If positive, discard reads with a greater X coordinate. | |
264 ymax=-1 If positive, discard reads with a greater Y coordinate. | |
265 | |
266 Polymer trimming: | |
267 trimpolya=0 If greater than 0, trim poly-A or poly-T tails of | |
268 at least this length on either end of reads. | |
269 trimpolygleft=0 If greater than 0, trim poly-G prefixes of at least this | |
270 length on the left end of reads. Does not trim poly-C. | |
271 trimpolygright=0 If greater than 0, trim poly-G tails of at least this | |
272 length on the right end of reads. Does not trim poly-C. | |
273 trimpolyg=0 This sets both left and right at once. | |
274 filterpolyg=0 If greater than 0, remove reads with a poly-G prefix of | |
275 at least this length (on the left). | |
276 Note: there are also equivalent poly-C flags. | |
277 | |
278 Polymer tracking: | |
279 pratio=base,base 'pratio=G,C' will print the ratio of G to C polymers. | |
280 plen=20 Length of homopolymers to count. | |
281 | |
282 Entropy/Complexity parameters: | |
283 entropy=-1 Set between 0 and 1 to filter reads with entropy below | |
284 that value. Higher is more stringent. | |
285 entropywindow=50 Calculate entropy using a sliding window of this length. | |
286 entropyk=5 Calculate entropy using kmers of this length. | |
287 minbasefrequency=0 Discard reads with a minimum base frequency below this. | |
288 entropytrim=f Values: | |
289 f: (false) Do not entropy-trim. | |
290 r: (right) Trim low entropy on the right end only. | |
291 l: (left) Trim low entropy on the left end only. | |
292 rl: (both) Trim low entropy on both ends. | |
293 entropymask=f Values: | |
294 f: (filter) Discard low-entropy sequences. | |
295 t: (true) Mask low-entropy parts of sequences with N. | |
296 lc: Change low-entropy parts of sequences to lowercase. | |
297 entropymark=f Mark each base with its entropy value. This is on a scale | |
298 of 0-41 and is reported as quality scores, so the output | |
299 should be fastq or fasta+qual. | |
300 NOTE: If set, entropytrim overrides entropymask. | |
301 | |
302 Cardinality estimation: | |
303 cardinality=f (loglog) Count unique kmers using the LogLog algorithm. | |
304 cardinalityout=f (loglogout) Count unique kmers in output reads. | |
305 loglogk=31 Use this kmer length for counting. | |
306 loglogbuckets=2048 Use this many buckets for counting. | |
307 khist=<file> Kmer frequency histogram; plots number of kmers versus | |
308 kmer depth. This is approximate. | |
309 khistout=<file> Kmer frequency histogram for output reads. | |
310 | |
311 Java Parameters: | |
312 | |
313 -Xmx This will set Java's memory usage, overriding autodetection. | |
314 -Xmx20g will | |
315 specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. | |
316 The max is typically 85% of physical memory. | |
317 -eoom This flag will cause the process to exit if an | |
318 out-of-memory exception occurs. Requires Java 8u92+. | |
319 -da Disable assertions. | |
320 | |
321 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems. | |
322 " | |
323 } | |
324 | |
325 #This block allows symlinked shellscripts to correctly set classpath. | |
326 pushd . > /dev/null | |
327 DIR="${BASH_SOURCE[0]}" | |
328 while [ -h "$DIR" ]; do | |
329 cd "$(dirname "$DIR")" | |
330 DIR="$(readlink "$(basename "$DIR")")" | |
331 done | |
332 cd "$(dirname "$DIR")" | |
333 DIR="$(pwd)/" | |
334 popd > /dev/null | |
335 | |
336 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" | |
337 CP="$DIR""current/" | |
338 JNI="-Djava.library.path=""$DIR""jni/" | |
339 JNI="" | |
340 | |
341 z="-Xmx1g" | |
342 z2="-Xms1g" | |
343 set=0 | |
344 silent=0 | |
345 json=0 | |
346 | |
347 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then | |
348 usage | |
349 exit | |
350 fi | |
351 | |
352 calcXmx () { | |
353 source "$DIR""/calcmem.sh" | |
354 setEnvironment | |
355 parseXmx "$@" | |
356 if [[ $set == 1 ]]; then | |
357 return | |
358 fi | |
359 freeRam 1400m 42 | |
360 z="-Xmx${RAM}m" | |
361 z2="-Xms${RAM}m" | |
362 } | |
363 calcXmx "$@" | |
364 | |
365 bbduk() { | |
366 local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP jgi.BBDuk $@" | |
367 if [[ $silent == 0 ]] && [[ $json == 0 ]]; then | |
368 echo $CMD >&2 | |
369 fi | |
370 eval $CMD | |
371 } | |
372 | |
373 bbduk "$@" |