annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/bbmap.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 #!/bin/bash
jpayne@69 2
jpayne@69 3 usage(){
jpayne@69 4 echo "
jpayne@69 5 BBMap
jpayne@69 6 Written by Brian Bushnell, from Dec. 2010 - present
jpayne@69 7 Last modified September 15, 2022
jpayne@69 8
jpayne@69 9 Description: Fast and accurate splice-aware read aligner.
jpayne@69 10 Please read bbmap/docs/guides/BBMapGuide.txt for more information.
jpayne@69 11
jpayne@69 12 To index: bbmap.sh ref=<reference fasta>
jpayne@69 13 To map: bbmap.sh in=<reads> out=<output sam>
jpayne@69 14 To map without writing an index:
jpayne@69 15 bbmap.sh ref=<reference fasta> in=<reads> out=<output sam> nodisk
jpayne@69 16
jpayne@69 17 in=stdin will accept reads from standard in, and out=stdout will write to
jpayne@69 18 standard out, but file extensions are still needed to specify the format of the
jpayne@69 19 input and output files e.g. in=stdin.fa.gz will read gzipped fasta from
jpayne@69 20 standard in; out=stdout.sam.gz will write gzipped sam.
jpayne@69 21
jpayne@69 22 Indexing Parameters (required when building the index):
jpayne@69 23 nodisk=f Set to true to build index in memory and write nothing
jpayne@69 24 to disk except output.
jpayne@69 25 ref=<file> Specify the reference sequence. Only do this ONCE,
jpayne@69 26 when building the index (unless using 'nodisk').
jpayne@69 27 build=1 If multiple references are indexed in the same directory,
jpayne@69 28 each needs a unique numeric ID (unless using 'nodisk').
jpayne@69 29 k=13 Kmer length, range 8-15. Longer is faster but uses
jpayne@69 30 more memory. Shorter is more sensitive.
jpayne@69 31 If indexing and mapping are done in two steps, K should
jpayne@69 32 be specified each time.
jpayne@69 33 path=<.> Specify the location to write the index, if you don't
jpayne@69 34 want it in the current working directory.
jpayne@69 35 usemodulo=f Throw away ~80% of kmers based on remainder modulo a
jpayne@69 36 number (reduces RAM by 50% and sensitivity slightly).
jpayne@69 37 Should be enabled both when building the index AND
jpayne@69 38 when mapping.
jpayne@69 39 rebuild=f Force a rebuild of the index (ref= should be set).
jpayne@69 40
jpayne@69 41 Input Parameters:
jpayne@69 42 build=1 Designate index to use. Corresponds to the number
jpayne@69 43 specified when building the index.
jpayne@69 44 in=<file> Primary reads input; required parameter.
jpayne@69 45 in2=<file> For paired reads in two files.
jpayne@69 46 interleaved=auto True forces paired/interleaved input; false forces
jpayne@69 47 single-ended mapping. If not specified, interleaved
jpayne@69 48 status will be autodetected from read names.
jpayne@69 49 fastareadlen=500 Break up FASTA reads longer than this. Max is 500 for
jpayne@69 50 BBMap and 6000 for BBMapPacBio. Only works for FASTA
jpayne@69 51 input (use 'maxlen' for FASTQ input). The default for
jpayne@69 52 bbmap.sh is 500, and for mapPacBio.sh is 6000.
jpayne@69 53 unpigz=f Spawn a pigz (parallel gzip) process for faster
jpayne@69 54 decompression than using Java.
jpayne@69 55 Requires pigz to be installed.
jpayne@69 56 touppercase=t (tuc) Convert lowercase letters in reads to upper case
jpayne@69 57 (otherwise they will not match the reference).
jpayne@69 58
jpayne@69 59 Sampling Parameters:
jpayne@69 60
jpayne@69 61 reads=-1 Set to a positive number N to only process the first N
jpayne@69 62 reads (or pairs), then quit. -1 means use all reads.
jpayne@69 63 samplerate=1 Set to a number from 0 to 1 to randomly select that
jpayne@69 64 fraction of reads for mapping. 1 uses all reads.
jpayne@69 65 skipreads=0 Set to a number N to skip the first N reads (or pairs),
jpayne@69 66 then map the rest.
jpayne@69 67
jpayne@69 68 Mapping Parameters:
jpayne@69 69 fast=f This flag is a macro which sets other paramters to run
jpayne@69 70 faster, at reduced sensitivity. Bad for RNA-seq.
jpayne@69 71 slow=f This flag is a macro which sets other paramters to run
jpayne@69 72 slower, at greater sensitivity. 'vslow' is even slower.
jpayne@69 73 maxindel=16000 Don't look for indels longer than this. Lower is faster.
jpayne@69 74 Set to >=100k for RNAseq with long introns like mammals.
jpayne@69 75 strictmaxindel=f When enabled, do not allow indels longer than 'maxindel'.
jpayne@69 76 By default these are not sought, but may be found anyway.
jpayne@69 77 tipsearch=100 Look this far for read-end deletions with anchors
jpayne@69 78 shorter than K, using brute force.
jpayne@69 79 minid=0.76 Approximate minimum alignment identity to look for.
jpayne@69 80 Higher is faster and less sensitive.
jpayne@69 81 minhits=1 Minimum number of seed hits required for candidate sites.
jpayne@69 82 Higher is faster.
jpayne@69 83 local=f Set to true to use local, rather than global, alignments.
jpayne@69 84 This will soft-clip ugly ends of poor alignments.
jpayne@69 85 perfectmode=f Allow only perfect mappings when set to true (very fast).
jpayne@69 86 semiperfectmode=f Allow only perfect and semiperfect (perfect except for
jpayne@69 87 N's in the reference) mappings.
jpayne@69 88 threads=auto (t) Set to number of threads desired. By default, uses
jpayne@69 89 all cores available.
jpayne@69 90 ambiguous=best (ambig) Set behavior on ambiguously-mapped reads (with
jpayne@69 91 multiple top-scoring mapping locations).
jpayne@69 92 best (use the first best site)
jpayne@69 93 toss (consider unmapped)
jpayne@69 94 random (select one top-scoring site randomly)
jpayne@69 95 all (retain all top-scoring sites)
jpayne@69 96 samestrandpairs=f (ssp) Specify whether paired reads should map to the
jpayne@69 97 same strand or opposite strands.
jpayne@69 98 requirecorrectstrand=t (rcs) Forbid pairing of reads without correct strand
jpayne@69 99 orientation. Set to false for long-mate-pair libraries.
jpayne@69 100 killbadpairs=f (kbp) If a read pair is mapped with an inappropriate
jpayne@69 101 insert size or orientation, the read with the lower
jpayne@69 102 mapping quality is marked unmapped.
jpayne@69 103 pairedonly=f (po) Treat unpaired reads as unmapped. Thus they will
jpayne@69 104 be sent to 'outu' but not 'outm'.
jpayne@69 105 rcomp=f Reverse complement both reads prior to mapping (for LMP
jpayne@69 106 outward-facing libraries).
jpayne@69 107 rcompmate=f Reverse complement read2 prior to mapping.
jpayne@69 108 pairlen=32000 Set max allowed distance between paired reads.
jpayne@69 109 (insert size)=(pairlen)+(read1 length)+(read2 length)
jpayne@69 110 rescuedist=1200 Don't try to rescue paired reads if avg. insert size
jpayne@69 111 greater than this. Lower is faster.
jpayne@69 112 rescuemismatches=32 Maximum mismatches allowed in a rescued read. Lower
jpayne@69 113 is faster.
jpayne@69 114 averagepairdist=100 (apd) Initial average distance between paired reads.
jpayne@69 115 Varies dynamically; does not need to be specified.
jpayne@69 116 deterministic=f Run in deterministic mode. In this case it is good
jpayne@69 117 to set averagepairdist. BBMap is deterministic
jpayne@69 118 without this flag if using single-ended reads,
jpayne@69 119 or run singlethreaded.
jpayne@69 120 bandwidthratio=0 (bwr) If above zero, restrict alignment band to this
jpayne@69 121 fraction of read length. Faster but less accurate.
jpayne@69 122 bandwidth=0 (bw) Set the bandwidth directly.
jpayne@69 123 fraction of read length. Faster but less accurate.
jpayne@69 124 usejni=f (jni) Do alignments faster, in C code. Requires
jpayne@69 125 compiling the C code; details are in /jni/README.txt.
jpayne@69 126 maxsites2=800 Don't analyze (or print) more than this many alignments
jpayne@69 127 per read.
jpayne@69 128 ignorefrequentkmers=t (ifk) Discard low-information kmers that occur often.
jpayne@69 129 excludefraction=0.03 (ef) Fraction of kmers to ignore. For example, 0.03
jpayne@69 130 will ignore the most common 3% of kmers.
jpayne@69 131 greedy=t Use a greedy algorithm to discard the least-useful
jpayne@69 132 kmers on a per-read basis.
jpayne@69 133 kfilter=0 If positive, potential mapping sites must have at
jpayne@69 134 least this many consecutive exact matches.
jpayne@69 135
jpayne@69 136
jpayne@69 137 Quality and Trimming Parameters:
jpayne@69 138 qin=auto Set to 33 or 64 to specify input quality value ASCII
jpayne@69 139 offset. 33 is Sanger, 64 is old Solexa.
jpayne@69 140 qout=auto Set to 33 or 64 to specify output quality value ASCII
jpayne@69 141 offset (only if output format is fastq).
jpayne@69 142 qtrim=f Quality-trim ends before mapping. Options are:
jpayne@69 143 'f' (false), 'l' (left), 'r' (right), and 'lr' (both).
jpayne@69 144 untrim=f Undo trimming after mapping. Untrimmed bases will be
jpayne@69 145 soft-clipped in cigar strings.
jpayne@69 146 trimq=6 Trim regions with average quality below this
jpayne@69 147 (phred algorithm).
jpayne@69 148 mintrimlength=60 (mintl) Don't trim reads to be shorter than this.
jpayne@69 149 fakefastaquality=-1 (ffq) Set to a positive number 1-50 to generate fake
jpayne@69 150 quality strings for fasta input reads.
jpayne@69 151 ignorebadquality=f (ibq) Keep going, rather than crashing, if a read has
jpayne@69 152 out-of-range quality values.
jpayne@69 153 usequality=t Use quality scores when determining which read kmers
jpayne@69 154 to use as seeds.
jpayne@69 155 minaveragequality=0 (maq) Do not map reads with average quality below this.
jpayne@69 156 maqb=0 If positive, calculate maq from this many initial bases.
jpayne@69 157
jpayne@69 158 Output Parameters:
jpayne@69 159 out=<file> Write all reads to this file.
jpayne@69 160 outu=<file> Write only unmapped reads to this file. Does not
jpayne@69 161 include unmapped paired reads with a mapped mate.
jpayne@69 162 outm=<file> Write only mapped reads to this file. Includes
jpayne@69 163 unmapped paired reads with a mapped mate.
jpayne@69 164 mappedonly=f If true, treats 'out' like 'outm'.
jpayne@69 165 bamscript=<file> (bs) Write a shell script to <file> that will turn
jpayne@69 166 the sam output into a sorted, indexed bam file.
jpayne@69 167 ordered=f Set to true to output reads in same order as input.
jpayne@69 168 Slower and uses more memory.
jpayne@69 169 overwrite=f (ow) Allow process to overwrite existing files.
jpayne@69 170 secondary=f Print secondary alignments.
jpayne@69 171 sssr=0.95 (secondarysitescoreratio) Print only secondary alignments
jpayne@69 172 with score of at least this fraction of primary.
jpayne@69 173 ssao=f (secondarysiteasambiguousonly) Only print secondary
jpayne@69 174 alignments for ambiguously-mapped reads.
jpayne@69 175 maxsites=5 Maximum number of total alignments to print per read.
jpayne@69 176 Only relevant when secondary=t.
jpayne@69 177 quickmatch=f Generate cigar strings more quickly.
jpayne@69 178 trimreaddescriptions=f (trd) Truncate read and ref names at the first whitespace,
jpayne@69 179 assuming that the remainder is a comment or description.
jpayne@69 180 ziplevel=2 (zl) Compression level for zip or gzip output.
jpayne@69 181 pigz=f Spawn a pigz (parallel gzip) process for faster
jpayne@69 182 compression than Java. Requires pigz to be installed.
jpayne@69 183 machineout=f Set to true to output statistics in machine-friendly
jpayne@69 184 'key=value' format.
jpayne@69 185 printunmappedcount=f Print the total number of unmapped reads and bases.
jpayne@69 186 If input is paired, the number will be of pairs
jpayne@69 187 for which both reads are unmapped.
jpayne@69 188 showprogress=0 If positive, print a '.' every X reads.
jpayne@69 189 showprogress2=0 If positive, print the number of seconds since the
jpayne@69 190 last progress update (instead of a '.').
jpayne@69 191 renamebyinsert=f Renames reads based on their mapped insert size.
jpayne@69 192
jpayne@69 193 Bloom-Filtering Parameters (bloomfilter.sh is the standalone version).
jpayne@69 194 bloom=f Use a Bloom filter to ignore reads not sharing kmers
jpayne@69 195 with the reference. This uses more memory, but speeds
jpayne@69 196 mapping when most reads don't match the reference.
jpayne@69 197 bloomhashes=2 Number of hash functions.
jpayne@69 198 bloomminhits=3 Number of consecutive hits to be considered matched.
jpayne@69 199 bloomk=31 Bloom filter kmer length.
jpayne@69 200 bloomserial=t Use the serialized Bloom filter for greater loading
jpayne@69 201 speed, if available. If not, generate and write one.
jpayne@69 202
jpayne@69 203 Post-Filtering Parameters:
jpayne@69 204 idfilter=0 Independant of minid; sets exact minimum identity
jpayne@69 205 allowed for alignments to be printed. Range 0 to 1.
jpayne@69 206 subfilter=-1 Ban alignments with more than this many substitutions.
jpayne@69 207 insfilter=-1 Ban alignments with more than this many insertions.
jpayne@69 208 delfilter=-1 Ban alignments with more than this many deletions.
jpayne@69 209 indelfilter=-1 Ban alignments with more than this many indels.
jpayne@69 210 editfilter=-1 Ban alignments with more than this many edits.
jpayne@69 211 inslenfilter=-1 Ban alignments with an insertion longer than this.
jpayne@69 212 dellenfilter=-1 Ban alignments with a deletion longer than this.
jpayne@69 213 nfilter=-1 Ban alignments with more than this many ns. This
jpayne@69 214 includes nocall, noref, and off scaffold ends.
jpayne@69 215
jpayne@69 216 Sam flags and settings:
jpayne@69 217 noheader=f Disable generation of header lines.
jpayne@69 218 sam=1.4 Set to 1.4 to write Sam version 1.4 cigar strings,
jpayne@69 219 with = and X, or 1.3 to use M.
jpayne@69 220 saa=t (secondaryalignmentasterisks) Use asterisks instead of
jpayne@69 221 bases for sam secondary alignments.
jpayne@69 222 cigar=t Set to 'f' to skip generation of cigar strings (faster).
jpayne@69 223 keepnames=f Keep original names of paired reads, rather than
jpayne@69 224 ensuring both reads have the same name.
jpayne@69 225 intronlen=999999999 Set to a lower number like 10 to change 'D' to 'N' in
jpayne@69 226 cigar strings for deletions of at least that length.
jpayne@69 227 rgid= Set readgroup ID. All other readgroup fields
jpayne@69 228 can be set similarly, with the flag rgXX=
jpayne@69 229 If you set a readgroup flag to the word 'filename',
jpayne@69 230 e.g. rgid=filename, the input file name will be used.
jpayne@69 231 mdtag=f Write MD tags.
jpayne@69 232 nhtag=f Write NH tags.
jpayne@69 233 xmtag=f Write XM tags (may only work correctly with ambig=all).
jpayne@69 234 amtag=f Write AM tags.
jpayne@69 235 nmtag=f Write NM tags.
jpayne@69 236 xstag=f Set to 'xs=fs', 'xs=ss', or 'xs=us' to write XS tags
jpayne@69 237 for RNAseq using firststrand, secondstrand, or
jpayne@69 238 unstranded libraries. Needed by Cufflinks.
jpayne@69 239 JGI mainly uses 'firststrand'.
jpayne@69 240 stoptag=f Write a tag indicating read stop location, prefixed by YS:i:
jpayne@69 241 lengthtag=f Write a tag indicating (query,ref) alignment lengths,
jpayne@69 242 prefixed by YL:Z:
jpayne@69 243 idtag=f Write a tag indicating percent identity, prefixed by YI:f:
jpayne@69 244 inserttag=f Write a tag indicating insert size, prefixed by X8:Z:
jpayne@69 245 scoretag=f Write a tag indicating BBMap's raw score, prefixed by YR:i:
jpayne@69 246 timetag=f Write a tag indicating this read's mapping time, prefixed by X0:i:
jpayne@69 247 boundstag=f Write a tag indicating whether either read in the pair
jpayne@69 248 goes off the end of the reference, prefixed by XB:Z:
jpayne@69 249 notags=f Turn off all optional tags.
jpayne@69 250
jpayne@69 251 Histogram and statistics output parameters:
jpayne@69 252 scafstats=<file> Statistics on how many reads mapped to which scaffold.
jpayne@69 253 refstats=<file> Statistics on how many reads mapped to which reference
jpayne@69 254 file; only for BBSplit.
jpayne@69 255 sortscafs=t Sort scaffolds or references by read count.
jpayne@69 256 bhist=<file> Base composition histogram by position.
jpayne@69 257 qhist=<file> Quality histogram by position.
jpayne@69 258 aqhist=<file> Histogram of average read quality.
jpayne@69 259 bqhist=<file> Quality histogram designed for box plots.
jpayne@69 260 lhist=<file> Read length histogram.
jpayne@69 261 ihist=<file> Write histogram of insert sizes (for paired reads).
jpayne@69 262 ehist=<file> Errors-per-read histogram.
jpayne@69 263 qahist=<file> Quality accuracy histogram of error rates versus
jpayne@69 264 quality score.
jpayne@69 265 indelhist=<file> Indel length histogram.
jpayne@69 266 mhist=<file> Histogram of match, sub, del, and ins rates by
jpayne@69 267 read location.
jpayne@69 268 gchist=<file> Read GC content histogram.
jpayne@69 269 gcbins=100 Number gchist bins. Set to 'auto' to use read length.
jpayne@69 270 gcpairs=t Use average GC of paired reads.
jpayne@69 271 idhist=<file> Histogram of read count versus percent identity.
jpayne@69 272 idbins=100 Number idhist bins. Set to 'auto' to use read length.
jpayne@69 273 statsfile=stderr Mapping statistics are printed here.
jpayne@69 274
jpayne@69 275 Coverage output parameters (these may reduce speed and use more RAM):
jpayne@69 276 covstats=<file> Per-scaffold coverage info.
jpayne@69 277 rpkm=<file> Per-scaffold RPKM/FPKM counts.
jpayne@69 278 covhist=<file> Histogram of # occurrences of each depth level.
jpayne@69 279 basecov=<file> Coverage per base location.
jpayne@69 280 bincov=<file> Print binned coverage per location (one line per X bases).
jpayne@69 281 covbinsize=1000 Set the binsize for binned coverage output.
jpayne@69 282 nzo=t Only print scaffolds with nonzero coverage.
jpayne@69 283 twocolumn=f Change to true to print only ID and Avg_fold instead of
jpayne@69 284 all 6 columns to the 'out=' file.
jpayne@69 285 32bit=f Set to true if you need per-base coverage over 64k.
jpayne@69 286 strandedcov=f Track coverage for plus and minus strand independently.
jpayne@69 287 startcov=f Only track start positions of reads.
jpayne@69 288 secondarycov=t Include coverage of secondary alignments.
jpayne@69 289 physcov=f Calculate physical coverage for paired reads.
jpayne@69 290 This includes the unsequenced bases.
jpayne@69 291 delcoverage=t (delcov) Count bases covered by deletions as covered.
jpayne@69 292 True is faster than false.
jpayne@69 293 covk=0 If positive, calculate kmer coverage statistics.
jpayne@69 294
jpayne@69 295 Java Parameters:
jpayne@69 296 -Xmx This will set Java's memory usage,
jpayne@69 297 overriding autodetection.
jpayne@69 298 -Xmx20g will specify 20 gigs of RAM, and -Xmx800m
jpayne@69 299 will specify 800 megs. The max is typically 85% of
jpayne@69 300 physical memory. The human genome requires around 24g,
jpayne@69 301 or 12g with the 'usemodulo' flag. The index uses
jpayne@69 302 roughly 6 bytes per reference base.
jpayne@69 303 -eoom This flag will cause the process to exit if an
jpayne@69 304 out-of-memory exception occurs. Requires Java 8u92+.
jpayne@69 305 -da Disable assertions.
jpayne@69 306
jpayne@69 307 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter
jpayne@69 308 any problems, or post at: http://seqanswers.com/forums/showthread.php?t=41057
jpayne@69 309 "
jpayne@69 310 }
jpayne@69 311
jpayne@69 312 #This block allows symlinked shellscripts to correctly set classpath.
jpayne@69 313 pushd . > /dev/null
jpayne@69 314 DIR="${BASH_SOURCE[0]}"
jpayne@69 315 while [ -h "$DIR" ]; do
jpayne@69 316 cd "$(dirname "$DIR")"
jpayne@69 317 DIR="$(readlink "$(basename "$DIR")")"
jpayne@69 318 done
jpayne@69 319 cd "$(dirname "$DIR")"
jpayne@69 320 DIR="$(pwd)/"
jpayne@69 321 popd > /dev/null
jpayne@69 322
jpayne@69 323 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
jpayne@69 324 CP="$DIR""current/"
jpayne@69 325 JNI="-Djava.library.path=""$DIR""jni/"
jpayne@69 326 JNI=""
jpayne@69 327
jpayne@69 328 z="-Xmx1g"
jpayne@69 329 z2="-Xms1g"
jpayne@69 330 set=0
jpayne@69 331
jpayne@69 332 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
jpayne@69 333 usage
jpayne@69 334 exit
jpayne@69 335 fi
jpayne@69 336
jpayne@69 337 calcXmx () {
jpayne@69 338 source "$DIR""/calcmem.sh"
jpayne@69 339 setEnvironment
jpayne@69 340 parseXmx "$@"
jpayne@69 341 if [[ $set == 1 ]]; then
jpayne@69 342 return
jpayne@69 343 fi
jpayne@69 344 freeRam 3200m 84
jpayne@69 345 z="-Xmx${RAM}m"
jpayne@69 346 z2="-Xms${RAM}m"
jpayne@69 347 }
jpayne@69 348 calcXmx "$@"
jpayne@69 349
jpayne@69 350
jpayne@69 351 bbmap() {
jpayne@69 352 local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap build=1 overwrite=true fastareadlen=500 $@"
jpayne@69 353 echo $CMD >&2
jpayne@69 354 eval $CMD
jpayne@69 355 }
jpayne@69 356
jpayne@69 357 bbmap "$@"