comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/bbmap.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 #!/bin/bash
2
3 usage(){
4 echo "
5 BBMap
6 Written by Brian Bushnell, from Dec. 2010 - present
7 Last modified September 15, 2022
8
9 Description: Fast and accurate splice-aware read aligner.
10 Please read bbmap/docs/guides/BBMapGuide.txt for more information.
11
12 To index: bbmap.sh ref=<reference fasta>
13 To map: bbmap.sh in=<reads> out=<output sam>
14 To map without writing an index:
15 bbmap.sh ref=<reference fasta> in=<reads> out=<output sam> nodisk
16
17 in=stdin will accept reads from standard in, and out=stdout will write to
18 standard out, but file extensions are still needed to specify the format of the
19 input and output files e.g. in=stdin.fa.gz will read gzipped fasta from
20 standard in; out=stdout.sam.gz will write gzipped sam.
21
22 Indexing Parameters (required when building the index):
23 nodisk=f Set to true to build index in memory and write nothing
24 to disk except output.
25 ref=<file> Specify the reference sequence. Only do this ONCE,
26 when building the index (unless using 'nodisk').
27 build=1 If multiple references are indexed in the same directory,
28 each needs a unique numeric ID (unless using 'nodisk').
29 k=13 Kmer length, range 8-15. Longer is faster but uses
30 more memory. Shorter is more sensitive.
31 If indexing and mapping are done in two steps, K should
32 be specified each time.
33 path=<.> Specify the location to write the index, if you don't
34 want it in the current working directory.
35 usemodulo=f Throw away ~80% of kmers based on remainder modulo a
36 number (reduces RAM by 50% and sensitivity slightly).
37 Should be enabled both when building the index AND
38 when mapping.
39 rebuild=f Force a rebuild of the index (ref= should be set).
40
41 Input Parameters:
42 build=1 Designate index to use. Corresponds to the number
43 specified when building the index.
44 in=<file> Primary reads input; required parameter.
45 in2=<file> For paired reads in two files.
46 interleaved=auto True forces paired/interleaved input; false forces
47 single-ended mapping. If not specified, interleaved
48 status will be autodetected from read names.
49 fastareadlen=500 Break up FASTA reads longer than this. Max is 500 for
50 BBMap and 6000 for BBMapPacBio. Only works for FASTA
51 input (use 'maxlen' for FASTQ input). The default for
52 bbmap.sh is 500, and for mapPacBio.sh is 6000.
53 unpigz=f Spawn a pigz (parallel gzip) process for faster
54 decompression than using Java.
55 Requires pigz to be installed.
56 touppercase=t (tuc) Convert lowercase letters in reads to upper case
57 (otherwise they will not match the reference).
58
59 Sampling Parameters:
60
61 reads=-1 Set to a positive number N to only process the first N
62 reads (or pairs), then quit. -1 means use all reads.
63 samplerate=1 Set to a number from 0 to 1 to randomly select that
64 fraction of reads for mapping. 1 uses all reads.
65 skipreads=0 Set to a number N to skip the first N reads (or pairs),
66 then map the rest.
67
68 Mapping Parameters:
69 fast=f This flag is a macro which sets other paramters to run
70 faster, at reduced sensitivity. Bad for RNA-seq.
71 slow=f This flag is a macro which sets other paramters to run
72 slower, at greater sensitivity. 'vslow' is even slower.
73 maxindel=16000 Don't look for indels longer than this. Lower is faster.
74 Set to >=100k for RNAseq with long introns like mammals.
75 strictmaxindel=f When enabled, do not allow indels longer than 'maxindel'.
76 By default these are not sought, but may be found anyway.
77 tipsearch=100 Look this far for read-end deletions with anchors
78 shorter than K, using brute force.
79 minid=0.76 Approximate minimum alignment identity to look for.
80 Higher is faster and less sensitive.
81 minhits=1 Minimum number of seed hits required for candidate sites.
82 Higher is faster.
83 local=f Set to true to use local, rather than global, alignments.
84 This will soft-clip ugly ends of poor alignments.
85 perfectmode=f Allow only perfect mappings when set to true (very fast).
86 semiperfectmode=f Allow only perfect and semiperfect (perfect except for
87 N's in the reference) mappings.
88 threads=auto (t) Set to number of threads desired. By default, uses
89 all cores available.
90 ambiguous=best (ambig) Set behavior on ambiguously-mapped reads (with
91 multiple top-scoring mapping locations).
92 best (use the first best site)
93 toss (consider unmapped)
94 random (select one top-scoring site randomly)
95 all (retain all top-scoring sites)
96 samestrandpairs=f (ssp) Specify whether paired reads should map to the
97 same strand or opposite strands.
98 requirecorrectstrand=t (rcs) Forbid pairing of reads without correct strand
99 orientation. Set to false for long-mate-pair libraries.
100 killbadpairs=f (kbp) If a read pair is mapped with an inappropriate
101 insert size or orientation, the read with the lower
102 mapping quality is marked unmapped.
103 pairedonly=f (po) Treat unpaired reads as unmapped. Thus they will
104 be sent to 'outu' but not 'outm'.
105 rcomp=f Reverse complement both reads prior to mapping (for LMP
106 outward-facing libraries).
107 rcompmate=f Reverse complement read2 prior to mapping.
108 pairlen=32000 Set max allowed distance between paired reads.
109 (insert size)=(pairlen)+(read1 length)+(read2 length)
110 rescuedist=1200 Don't try to rescue paired reads if avg. insert size
111 greater than this. Lower is faster.
112 rescuemismatches=32 Maximum mismatches allowed in a rescued read. Lower
113 is faster.
114 averagepairdist=100 (apd) Initial average distance between paired reads.
115 Varies dynamically; does not need to be specified.
116 deterministic=f Run in deterministic mode. In this case it is good
117 to set averagepairdist. BBMap is deterministic
118 without this flag if using single-ended reads,
119 or run singlethreaded.
120 bandwidthratio=0 (bwr) If above zero, restrict alignment band to this
121 fraction of read length. Faster but less accurate.
122 bandwidth=0 (bw) Set the bandwidth directly.
123 fraction of read length. Faster but less accurate.
124 usejni=f (jni) Do alignments faster, in C code. Requires
125 compiling the C code; details are in /jni/README.txt.
126 maxsites2=800 Don't analyze (or print) more than this many alignments
127 per read.
128 ignorefrequentkmers=t (ifk) Discard low-information kmers that occur often.
129 excludefraction=0.03 (ef) Fraction of kmers to ignore. For example, 0.03
130 will ignore the most common 3% of kmers.
131 greedy=t Use a greedy algorithm to discard the least-useful
132 kmers on a per-read basis.
133 kfilter=0 If positive, potential mapping sites must have at
134 least this many consecutive exact matches.
135
136
137 Quality and Trimming Parameters:
138 qin=auto Set to 33 or 64 to specify input quality value ASCII
139 offset. 33 is Sanger, 64 is old Solexa.
140 qout=auto Set to 33 or 64 to specify output quality value ASCII
141 offset (only if output format is fastq).
142 qtrim=f Quality-trim ends before mapping. Options are:
143 'f' (false), 'l' (left), 'r' (right), and 'lr' (both).
144 untrim=f Undo trimming after mapping. Untrimmed bases will be
145 soft-clipped in cigar strings.
146 trimq=6 Trim regions with average quality below this
147 (phred algorithm).
148 mintrimlength=60 (mintl) Don't trim reads to be shorter than this.
149 fakefastaquality=-1 (ffq) Set to a positive number 1-50 to generate fake
150 quality strings for fasta input reads.
151 ignorebadquality=f (ibq) Keep going, rather than crashing, if a read has
152 out-of-range quality values.
153 usequality=t Use quality scores when determining which read kmers
154 to use as seeds.
155 minaveragequality=0 (maq) Do not map reads with average quality below this.
156 maqb=0 If positive, calculate maq from this many initial bases.
157
158 Output Parameters:
159 out=<file> Write all reads to this file.
160 outu=<file> Write only unmapped reads to this file. Does not
161 include unmapped paired reads with a mapped mate.
162 outm=<file> Write only mapped reads to this file. Includes
163 unmapped paired reads with a mapped mate.
164 mappedonly=f If true, treats 'out' like 'outm'.
165 bamscript=<file> (bs) Write a shell script to <file> that will turn
166 the sam output into a sorted, indexed bam file.
167 ordered=f Set to true to output reads in same order as input.
168 Slower and uses more memory.
169 overwrite=f (ow) Allow process to overwrite existing files.
170 secondary=f Print secondary alignments.
171 sssr=0.95 (secondarysitescoreratio) Print only secondary alignments
172 with score of at least this fraction of primary.
173 ssao=f (secondarysiteasambiguousonly) Only print secondary
174 alignments for ambiguously-mapped reads.
175 maxsites=5 Maximum number of total alignments to print per read.
176 Only relevant when secondary=t.
177 quickmatch=f Generate cigar strings more quickly.
178 trimreaddescriptions=f (trd) Truncate read and ref names at the first whitespace,
179 assuming that the remainder is a comment or description.
180 ziplevel=2 (zl) Compression level for zip or gzip output.
181 pigz=f Spawn a pigz (parallel gzip) process for faster
182 compression than Java. Requires pigz to be installed.
183 machineout=f Set to true to output statistics in machine-friendly
184 'key=value' format.
185 printunmappedcount=f Print the total number of unmapped reads and bases.
186 If input is paired, the number will be of pairs
187 for which both reads are unmapped.
188 showprogress=0 If positive, print a '.' every X reads.
189 showprogress2=0 If positive, print the number of seconds since the
190 last progress update (instead of a '.').
191 renamebyinsert=f Renames reads based on their mapped insert size.
192
193 Bloom-Filtering Parameters (bloomfilter.sh is the standalone version).
194 bloom=f Use a Bloom filter to ignore reads not sharing kmers
195 with the reference. This uses more memory, but speeds
196 mapping when most reads don't match the reference.
197 bloomhashes=2 Number of hash functions.
198 bloomminhits=3 Number of consecutive hits to be considered matched.
199 bloomk=31 Bloom filter kmer length.
200 bloomserial=t Use the serialized Bloom filter for greater loading
201 speed, if available. If not, generate and write one.
202
203 Post-Filtering Parameters:
204 idfilter=0 Independant of minid; sets exact minimum identity
205 allowed for alignments to be printed. Range 0 to 1.
206 subfilter=-1 Ban alignments with more than this many substitutions.
207 insfilter=-1 Ban alignments with more than this many insertions.
208 delfilter=-1 Ban alignments with more than this many deletions.
209 indelfilter=-1 Ban alignments with more than this many indels.
210 editfilter=-1 Ban alignments with more than this many edits.
211 inslenfilter=-1 Ban alignments with an insertion longer than this.
212 dellenfilter=-1 Ban alignments with a deletion longer than this.
213 nfilter=-1 Ban alignments with more than this many ns. This
214 includes nocall, noref, and off scaffold ends.
215
216 Sam flags and settings:
217 noheader=f Disable generation of header lines.
218 sam=1.4 Set to 1.4 to write Sam version 1.4 cigar strings,
219 with = and X, or 1.3 to use M.
220 saa=t (secondaryalignmentasterisks) Use asterisks instead of
221 bases for sam secondary alignments.
222 cigar=t Set to 'f' to skip generation of cigar strings (faster).
223 keepnames=f Keep original names of paired reads, rather than
224 ensuring both reads have the same name.
225 intronlen=999999999 Set to a lower number like 10 to change 'D' to 'N' in
226 cigar strings for deletions of at least that length.
227 rgid= Set readgroup ID. All other readgroup fields
228 can be set similarly, with the flag rgXX=
229 If you set a readgroup flag to the word 'filename',
230 e.g. rgid=filename, the input file name will be used.
231 mdtag=f Write MD tags.
232 nhtag=f Write NH tags.
233 xmtag=f Write XM tags (may only work correctly with ambig=all).
234 amtag=f Write AM tags.
235 nmtag=f Write NM tags.
236 xstag=f Set to 'xs=fs', 'xs=ss', or 'xs=us' to write XS tags
237 for RNAseq using firststrand, secondstrand, or
238 unstranded libraries. Needed by Cufflinks.
239 JGI mainly uses 'firststrand'.
240 stoptag=f Write a tag indicating read stop location, prefixed by YS:i:
241 lengthtag=f Write a tag indicating (query,ref) alignment lengths,
242 prefixed by YL:Z:
243 idtag=f Write a tag indicating percent identity, prefixed by YI:f:
244 inserttag=f Write a tag indicating insert size, prefixed by X8:Z:
245 scoretag=f Write a tag indicating BBMap's raw score, prefixed by YR:i:
246 timetag=f Write a tag indicating this read's mapping time, prefixed by X0:i:
247 boundstag=f Write a tag indicating whether either read in the pair
248 goes off the end of the reference, prefixed by XB:Z:
249 notags=f Turn off all optional tags.
250
251 Histogram and statistics output parameters:
252 scafstats=<file> Statistics on how many reads mapped to which scaffold.
253 refstats=<file> Statistics on how many reads mapped to which reference
254 file; only for BBSplit.
255 sortscafs=t Sort scaffolds or references by read count.
256 bhist=<file> Base composition histogram by position.
257 qhist=<file> Quality histogram by position.
258 aqhist=<file> Histogram of average read quality.
259 bqhist=<file> Quality histogram designed for box plots.
260 lhist=<file> Read length histogram.
261 ihist=<file> Write histogram of insert sizes (for paired reads).
262 ehist=<file> Errors-per-read histogram.
263 qahist=<file> Quality accuracy histogram of error rates versus
264 quality score.
265 indelhist=<file> Indel length histogram.
266 mhist=<file> Histogram of match, sub, del, and ins rates by
267 read location.
268 gchist=<file> Read GC content histogram.
269 gcbins=100 Number gchist bins. Set to 'auto' to use read length.
270 gcpairs=t Use average GC of paired reads.
271 idhist=<file> Histogram of read count versus percent identity.
272 idbins=100 Number idhist bins. Set to 'auto' to use read length.
273 statsfile=stderr Mapping statistics are printed here.
274
275 Coverage output parameters (these may reduce speed and use more RAM):
276 covstats=<file> Per-scaffold coverage info.
277 rpkm=<file> Per-scaffold RPKM/FPKM counts.
278 covhist=<file> Histogram of # occurrences of each depth level.
279 basecov=<file> Coverage per base location.
280 bincov=<file> Print binned coverage per location (one line per X bases).
281 covbinsize=1000 Set the binsize for binned coverage output.
282 nzo=t Only print scaffolds with nonzero coverage.
283 twocolumn=f Change to true to print only ID and Avg_fold instead of
284 all 6 columns to the 'out=' file.
285 32bit=f Set to true if you need per-base coverage over 64k.
286 strandedcov=f Track coverage for plus and minus strand independently.
287 startcov=f Only track start positions of reads.
288 secondarycov=t Include coverage of secondary alignments.
289 physcov=f Calculate physical coverage for paired reads.
290 This includes the unsequenced bases.
291 delcoverage=t (delcov) Count bases covered by deletions as covered.
292 True is faster than false.
293 covk=0 If positive, calculate kmer coverage statistics.
294
295 Java Parameters:
296 -Xmx This will set Java's memory usage,
297 overriding autodetection.
298 -Xmx20g will specify 20 gigs of RAM, and -Xmx800m
299 will specify 800 megs. The max is typically 85% of
300 physical memory. The human genome requires around 24g,
301 or 12g with the 'usemodulo' flag. The index uses
302 roughly 6 bytes per reference base.
303 -eoom This flag will cause the process to exit if an
304 out-of-memory exception occurs. Requires Java 8u92+.
305 -da Disable assertions.
306
307 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter
308 any problems, or post at: http://seqanswers.com/forums/showthread.php?t=41057
309 "
310 }
311
312 #This block allows symlinked shellscripts to correctly set classpath.
313 pushd . > /dev/null
314 DIR="${BASH_SOURCE[0]}"
315 while [ -h "$DIR" ]; do
316 cd "$(dirname "$DIR")"
317 DIR="$(readlink "$(basename "$DIR")")"
318 done
319 cd "$(dirname "$DIR")"
320 DIR="$(pwd)/"
321 popd > /dev/null
322
323 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
324 CP="$DIR""current/"
325 JNI="-Djava.library.path=""$DIR""jni/"
326 JNI=""
327
328 z="-Xmx1g"
329 z2="-Xms1g"
330 set=0
331
332 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
333 usage
334 exit
335 fi
336
337 calcXmx () {
338 source "$DIR""/calcmem.sh"
339 setEnvironment
340 parseXmx "$@"
341 if [[ $set == 1 ]]; then
342 return
343 fi
344 freeRam 3200m 84
345 z="-Xmx${RAM}m"
346 z2="-Xms${RAM}m"
347 }
348 calcXmx "$@"
349
350
351 bbmap() {
352 local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap build=1 overwrite=true fastareadlen=500 $@"
353 echo $CMD >&2
354 eval $CMD
355 }
356
357 bbmap "$@"