jpayne@69: #!/bin/bash
jpayne@69: 
jpayne@69: usage(){
jpayne@69: echo "
jpayne@69: BBMap
jpayne@69: Written by Brian Bushnell, from Dec. 2010 - present
jpayne@69: Last modified September 15, 2022
jpayne@69: 
jpayne@69: Description:  Fast and accurate splice-aware read aligner.
jpayne@69: Please read bbmap/docs/guides/BBMapGuide.txt for more information.
jpayne@69: 
jpayne@69: To index:     bbmap.sh ref=<reference fasta>
jpayne@69: To map:       bbmap.sh in=<reads> out=<output sam>
jpayne@69: To map without writing an index:
jpayne@69:     bbmap.sh ref=<reference fasta> in=<reads> out=<output sam> nodisk
jpayne@69: 
jpayne@69: in=stdin will accept reads from standard in, and out=stdout will write to 
jpayne@69: standard out, but file extensions are still needed to specify the format of the 
jpayne@69: input and output files e.g. in=stdin.fa.gz will read gzipped fasta from 
jpayne@69: standard in; out=stdout.sam.gz will write gzipped sam.
jpayne@69: 
jpayne@69: Indexing Parameters (required when building the index):
jpayne@69: nodisk=f                Set to true to build index in memory and write nothing 
jpayne@69:                         to disk except output.
jpayne@69: ref=<file>              Specify the reference sequence.  Only do this ONCE, 
jpayne@69:                         when building the index (unless using 'nodisk').
jpayne@69: build=1                 If multiple references are indexed in the same directory,
jpayne@69:                         each needs a unique numeric ID (unless using 'nodisk').
jpayne@69: k=13                    Kmer length, range 8-15.  Longer is faster but uses 
jpayne@69:                         more memory.  Shorter is more sensitive.
jpayne@69:                         If indexing and mapping are done in two steps, K should
jpayne@69:                         be specified each time.
jpayne@69: path=<.>                Specify the location to write the index, if you don't 
jpayne@69:                         want it in the current working directory.
jpayne@69: usemodulo=f             Throw away ~80% of kmers based on remainder modulo a 
jpayne@69:                         number (reduces RAM by 50% and sensitivity slightly).
jpayne@69:                         Should be enabled both when building the index AND 
jpayne@69:                         when mapping.
jpayne@69: rebuild=f               Force a rebuild of the index (ref= should be set).
jpayne@69: 
jpayne@69: Input Parameters:
jpayne@69: build=1                 Designate index to use.  Corresponds to the number 
jpayne@69:                         specified when building the index.
jpayne@69: in=<file>               Primary reads input; required parameter.
jpayne@69: in2=<file>              For paired reads in two files.
jpayne@69: interleaved=auto        True forces paired/interleaved input; false forces 
jpayne@69:                         single-ended mapping. If not specified, interleaved 
jpayne@69:                         status will be autodetected from read names.
jpayne@69: fastareadlen=500        Break up FASTA reads longer than this.  Max is 500 for
jpayne@69:                         BBMap and 6000 for BBMapPacBio.  Only works for FASTA
jpayne@69:                         input (use 'maxlen' for FASTQ input).  The default for
jpayne@69:                         bbmap.sh is 500, and for mapPacBio.sh is 6000.
jpayne@69: unpigz=f                Spawn a pigz (parallel gzip) process for faster 
jpayne@69:                         decompression than using Java.  
jpayne@69:                         Requires pigz to be installed.
jpayne@69: touppercase=t           (tuc) Convert lowercase letters in reads to upper case 
jpayne@69:                         (otherwise they will not match the reference).
jpayne@69: 
jpayne@69: Sampling Parameters:
jpayne@69: 
jpayne@69: reads=-1                Set to a positive number N to only process the first N
jpayne@69:                         reads (or pairs), then quit.  -1 means use all reads.
jpayne@69: samplerate=1            Set to a number from 0 to 1 to randomly select that
jpayne@69:                         fraction of reads for mapping. 1 uses all reads.
jpayne@69: skipreads=0             Set to a number N to skip the first N reads (or pairs), 
jpayne@69:                         then map the rest.
jpayne@69: 
jpayne@69: Mapping Parameters:
jpayne@69: fast=f                  This flag is a macro which sets other paramters to run 
jpayne@69:                         faster, at reduced sensitivity.  Bad for RNA-seq.
jpayne@69: slow=f                  This flag is a macro which sets other paramters to run 
jpayne@69:                         slower, at greater sensitivity.  'vslow' is even slower.
jpayne@69: maxindel=16000          Don't look for indels longer than this. Lower is faster.
jpayne@69:                         Set to >=100k for RNAseq with long introns like mammals.
jpayne@69: strictmaxindel=f        When enabled, do not allow indels longer than 'maxindel'.
jpayne@69:                         By default these are not sought, but may be found anyway.
jpayne@69: tipsearch=100           Look this far for read-end deletions with anchors
jpayne@69:                         shorter than K, using brute force.
jpayne@69: minid=0.76              Approximate minimum alignment identity to look for. 
jpayne@69:                         Higher is faster and less sensitive.
jpayne@69: minhits=1               Minimum number of seed hits required for candidate sites.
jpayne@69:                         Higher is faster.
jpayne@69: local=f                 Set to true to use local, rather than global, alignments.
jpayne@69:                         This will soft-clip ugly ends of poor alignments.
jpayne@69: perfectmode=f           Allow only perfect mappings when set to true (very fast).
jpayne@69: semiperfectmode=f       Allow only perfect and semiperfect (perfect except for 
jpayne@69:                         N's in the reference) mappings.
jpayne@69: threads=auto            (t) Set to number of threads desired.  By default, uses 
jpayne@69:                         all cores available.
jpayne@69: ambiguous=best          (ambig) Set behavior on ambiguously-mapped reads (with 
jpayne@69:                         multiple top-scoring mapping locations).
jpayne@69:                             best    (use the first best site)
jpayne@69:                             toss    (consider unmapped)
jpayne@69:                             random  (select one top-scoring site randomly)
jpayne@69:                             all     (retain all top-scoring sites)
jpayne@69: samestrandpairs=f       (ssp) Specify whether paired reads should map to the
jpayne@69:                         same strand or opposite strands.
jpayne@69: requirecorrectstrand=t  (rcs) Forbid pairing of reads without correct strand 
jpayne@69:                         orientation.  Set to false for long-mate-pair libraries.
jpayne@69: killbadpairs=f          (kbp) If a read pair is mapped with an inappropriate
jpayne@69:                         insert size or orientation, the read with the lower  
jpayne@69:                         mapping quality is marked unmapped.
jpayne@69: pairedonly=f            (po) Treat unpaired reads as unmapped.  Thus they will 
jpayne@69:                         be sent to 'outu' but not 'outm'.
jpayne@69: rcomp=f                 Reverse complement both reads prior to mapping (for LMP
jpayne@69:                         outward-facing libraries).
jpayne@69: rcompmate=f             Reverse complement read2 prior to mapping.
jpayne@69: pairlen=32000           Set max allowed distance between paired reads.  
jpayne@69:                         (insert size)=(pairlen)+(read1 length)+(read2 length)
jpayne@69: rescuedist=1200         Don't try to rescue paired reads if avg. insert size
jpayne@69:                         greater than this.  Lower is faster.
jpayne@69: rescuemismatches=32     Maximum mismatches allowed in a rescued read.  Lower
jpayne@69:                         is faster.
jpayne@69: averagepairdist=100     (apd) Initial average distance between paired reads.
jpayne@69:                         Varies dynamically; does not need to be specified.
jpayne@69: deterministic=f         Run in deterministic mode.  In this case it is good
jpayne@69:                         to set averagepairdist.  BBMap is deterministic
jpayne@69:                         without this flag if using single-ended reads,
jpayne@69:                         or run singlethreaded.
jpayne@69: bandwidthratio=0        (bwr) If above zero, restrict alignment band to this 
jpayne@69:                         fraction of read length.  Faster but less accurate.
jpayne@69: bandwidth=0             (bw) Set the bandwidth directly.
jpayne@69:                         fraction of read length.  Faster but less accurate.
jpayne@69: usejni=f                (jni) Do alignments faster, in C code.  Requires 
jpayne@69:                         compiling the C code; details are in /jni/README.txt.
jpayne@69: maxsites2=800           Don't analyze (or print) more than this many alignments 
jpayne@69:                         per read.
jpayne@69: ignorefrequentkmers=t   (ifk) Discard low-information kmers that occur often.
jpayne@69: excludefraction=0.03    (ef) Fraction of kmers to ignore.  For example, 0.03
jpayne@69:                         will ignore the most common 3% of kmers.
jpayne@69: greedy=t                Use a greedy algorithm to discard the least-useful
jpayne@69:                         kmers on a per-read basis.
jpayne@69: kfilter=0               If positive, potential mapping sites must have at
jpayne@69:                         least this many consecutive exact matches.
jpayne@69: 
jpayne@69: 
jpayne@69: Quality and Trimming Parameters:
jpayne@69: qin=auto                Set to 33 or 64 to specify input quality value ASCII
jpayne@69:                         offset. 33 is Sanger, 64 is old Solexa.
jpayne@69: qout=auto               Set to 33 or 64 to specify output quality value ASCII 
jpayne@69:                         offset (only if output format is fastq).
jpayne@69: qtrim=f                 Quality-trim ends before mapping.  Options are: 
jpayne@69:                         'f' (false), 'l' (left), 'r' (right), and 'lr' (both).
jpayne@69: untrim=f                Undo trimming after mapping.  Untrimmed bases will be 
jpayne@69:                         soft-clipped in cigar strings.
jpayne@69: trimq=6                 Trim regions with average quality below this 
jpayne@69:                         (phred algorithm).
jpayne@69: mintrimlength=60        (mintl) Don't trim reads to be shorter than this.
jpayne@69: fakefastaquality=-1     (ffq) Set to a positive number 1-50 to generate fake
jpayne@69:                         quality strings for fasta input reads.
jpayne@69: ignorebadquality=f      (ibq) Keep going, rather than crashing, if a read has 
jpayne@69:                         out-of-range quality values.
jpayne@69: usequality=t            Use quality scores when determining which read kmers
jpayne@69:                         to use as seeds.
jpayne@69: minaveragequality=0     (maq) Do not map reads with average quality below this.
jpayne@69: maqb=0                  If positive, calculate maq from this many initial bases.
jpayne@69: 
jpayne@69: Output Parameters:
jpayne@69: out=<file>              Write all reads to this file.
jpayne@69: outu=<file>             Write only unmapped reads to this file.  Does not 
jpayne@69:                         include unmapped paired reads with a mapped mate.
jpayne@69: outm=<file>             Write only mapped reads to this file.  Includes 
jpayne@69:                         unmapped paired reads with a mapped mate.
jpayne@69: mappedonly=f            If true, treats 'out' like 'outm'.
jpayne@69: bamscript=<file>        (bs) Write a shell script to <file> that will turn 
jpayne@69:                         the sam output into a sorted, indexed bam file.
jpayne@69: ordered=f               Set to true to output reads in same order as input.  
jpayne@69:                         Slower and uses more memory.
jpayne@69: overwrite=f             (ow) Allow process to overwrite existing files.
jpayne@69: secondary=f             Print secondary alignments.
jpayne@69: sssr=0.95               (secondarysitescoreratio) Print only secondary alignments
jpayne@69:                         with score of at least this fraction of primary.
jpayne@69: ssao=f                  (secondarysiteasambiguousonly) Only print secondary 
jpayne@69:                         alignments for ambiguously-mapped reads.
jpayne@69: maxsites=5              Maximum number of total alignments to print per read.
jpayne@69:                         Only relevant when secondary=t.
jpayne@69: quickmatch=f            Generate cigar strings more quickly.
jpayne@69: trimreaddescriptions=f  (trd) Truncate read and ref names at the first whitespace,
jpayne@69:                         assuming that the remainder is a comment or description.
jpayne@69: ziplevel=2              (zl) Compression level for zip or gzip output.
jpayne@69: pigz=f                  Spawn a pigz (parallel gzip) process for faster 
jpayne@69:                         compression than Java.  Requires pigz to be installed.
jpayne@69: machineout=f            Set to true to output statistics in machine-friendly 
jpayne@69:                         'key=value' format.
jpayne@69: printunmappedcount=f    Print the total number of unmapped reads and bases.
jpayne@69:                         If input is paired, the number will be of pairs
jpayne@69:                         for which both reads are unmapped.
jpayne@69: showprogress=0          If positive, print a '.' every X reads.
jpayne@69: showprogress2=0         If positive, print the number of seconds since the
jpayne@69:                         last progress update (instead of a '.').
jpayne@69: renamebyinsert=f        Renames reads based on their mapped insert size.
jpayne@69: 
jpayne@69: Bloom-Filtering Parameters (bloomfilter.sh is the standalone version).
jpayne@69: bloom=f                 Use a Bloom filter to ignore reads not sharing kmers
jpayne@69:                         with the reference.  This uses more memory, but speeds
jpayne@69:                         mapping when most reads don't match the reference.
jpayne@69: bloomhashes=2           Number of hash functions.
jpayne@69: bloomminhits=3          Number of consecutive hits to be considered matched.
jpayne@69: bloomk=31               Bloom filter kmer length.
jpayne@69: bloomserial=t           Use the serialized Bloom filter for greater loading
jpayne@69:                         speed, if available.  If not, generate and write one.
jpayne@69: 
jpayne@69: Post-Filtering Parameters:
jpayne@69: idfilter=0              Independant of minid; sets exact minimum identity 
jpayne@69:                         allowed for alignments to be printed.  Range 0 to 1.
jpayne@69: subfilter=-1            Ban alignments with more than this many substitutions.
jpayne@69: insfilter=-1            Ban alignments with more than this many insertions.
jpayne@69: delfilter=-1            Ban alignments with more than this many deletions.
jpayne@69: indelfilter=-1          Ban alignments with more than this many indels.
jpayne@69: editfilter=-1           Ban alignments with more than this many edits.
jpayne@69: inslenfilter=-1         Ban alignments with an insertion longer than this.
jpayne@69: dellenfilter=-1         Ban alignments with a deletion longer than this.
jpayne@69: nfilter=-1              Ban alignments with more than this many ns.  This 
jpayne@69:                         includes nocall, noref, and off scaffold ends.
jpayne@69: 
jpayne@69: Sam flags and settings:
jpayne@69: noheader=f              Disable generation of header lines.
jpayne@69: sam=1.4                 Set to 1.4 to write Sam version 1.4 cigar strings, 
jpayne@69:                         with = and X, or 1.3 to use M.
jpayne@69: saa=t                   (secondaryalignmentasterisks) Use asterisks instead of
jpayne@69:                         bases for sam secondary alignments.
jpayne@69: cigar=t                 Set to 'f' to skip generation of cigar strings (faster).
jpayne@69: keepnames=f             Keep original names of paired reads, rather than 
jpayne@69:                         ensuring both reads have the same name.
jpayne@69: intronlen=999999999     Set to a lower number like 10 to change 'D' to 'N' in 
jpayne@69:                         cigar strings for deletions of at least that length.
jpayne@69: rgid=                   Set readgroup ID.  All other readgroup fields 
jpayne@69:                         can be set similarly, with the flag rgXX=
jpayne@69:                         If you set a readgroup flag to the word 'filename',
jpayne@69:                         e.g. rgid=filename, the input file name will be used.
jpayne@69: mdtag=f                 Write MD tags.
jpayne@69: nhtag=f                 Write NH tags.
jpayne@69: xmtag=f                 Write XM tags (may only work correctly with ambig=all).
jpayne@69: amtag=f                 Write AM tags.
jpayne@69: nmtag=f                 Write NM tags.
jpayne@69: xstag=f                 Set to 'xs=fs', 'xs=ss', or 'xs=us' to write XS tags 
jpayne@69:                         for RNAseq using firststrand, secondstrand, or 
jpayne@69:                         unstranded libraries.  Needed by Cufflinks.  
jpayne@69:                         JGI mainly uses 'firststrand'.
jpayne@69: stoptag=f               Write a tag indicating read stop location, prefixed by YS:i:
jpayne@69: lengthtag=f             Write a tag indicating (query,ref) alignment lengths, 
jpayne@69:                         prefixed by YL:Z:
jpayne@69: idtag=f                 Write a tag indicating percent identity, prefixed by YI:f:
jpayne@69: inserttag=f             Write a tag indicating insert size, prefixed by X8:Z:
jpayne@69: scoretag=f              Write a tag indicating BBMap's raw score, prefixed by YR:i:
jpayne@69: timetag=f               Write a tag indicating this read's mapping time, prefixed by X0:i:
jpayne@69: boundstag=f             Write a tag indicating whether either read in the pair
jpayne@69:                         goes off the end of the reference, prefixed by XB:Z:
jpayne@69: notags=f                Turn off all optional tags.
jpayne@69: 
jpayne@69: Histogram and statistics output parameters:
jpayne@69: scafstats=<file>        Statistics on how many reads mapped to which scaffold.
jpayne@69: refstats=<file>         Statistics on how many reads mapped to which reference
jpayne@69:                         file; only for BBSplit.
jpayne@69: sortscafs=t             Sort scaffolds or references by read count.
jpayne@69: bhist=<file>            Base composition histogram by position.
jpayne@69: qhist=<file>            Quality histogram by position.
jpayne@69: aqhist=<file>           Histogram of average read quality.
jpayne@69: bqhist=<file>           Quality histogram designed for box plots.
jpayne@69: lhist=<file>            Read length histogram.
jpayne@69: ihist=<file>            Write histogram of insert sizes (for paired reads).
jpayne@69: ehist=<file>            Errors-per-read histogram.
jpayne@69: qahist=<file>           Quality accuracy histogram of error rates versus 
jpayne@69:                         quality score.
jpayne@69: indelhist=<file>        Indel length histogram.
jpayne@69: mhist=<file>            Histogram of match, sub, del, and ins rates by 
jpayne@69:                         read location.
jpayne@69: gchist=<file>           Read GC content histogram.
jpayne@69: gcbins=100              Number gchist bins.  Set to 'auto' to use read length.
jpayne@69: gcpairs=t               Use average GC of paired reads.
jpayne@69: idhist=<file>           Histogram of read count versus percent identity.
jpayne@69: idbins=100              Number idhist bins.  Set to 'auto' to use read length.
jpayne@69: statsfile=stderr        Mapping statistics are printed here.
jpayne@69: 
jpayne@69: Coverage output parameters (these may reduce speed and use more RAM):
jpayne@69: covstats=<file>         Per-scaffold coverage info.
jpayne@69: rpkm=<file>             Per-scaffold RPKM/FPKM counts.
jpayne@69: covhist=<file>          Histogram of # occurrences of each depth level.
jpayne@69: basecov=<file>          Coverage per base location.
jpayne@69: bincov=<file>           Print binned coverage per location (one line per X bases).
jpayne@69: covbinsize=1000         Set the binsize for binned coverage output.
jpayne@69: nzo=t                   Only print scaffolds with nonzero coverage.
jpayne@69: twocolumn=f             Change to true to print only ID and Avg_fold instead of 
jpayne@69:                         all 6 columns to the 'out=' file.
jpayne@69: 32bit=f                 Set to true if you need per-base coverage over 64k.
jpayne@69: strandedcov=f           Track coverage for plus and minus strand independently.
jpayne@69: startcov=f              Only track start positions of reads.
jpayne@69: secondarycov=t          Include coverage of secondary alignments.
jpayne@69: physcov=f               Calculate physical coverage for paired reads.
jpayne@69:                         This includes the unsequenced bases.
jpayne@69: delcoverage=t           (delcov) Count bases covered by deletions as covered.
jpayne@69:                         True is faster than false.
jpayne@69: covk=0                  If positive, calculate kmer coverage statistics.
jpayne@69: 
jpayne@69: Java Parameters:
jpayne@69: -Xmx                    This will set Java's memory usage, 
jpayne@69:                         overriding autodetection.
jpayne@69:                         -Xmx20g will specify 20 gigs of RAM, and -Xmx800m 
jpayne@69:                         will specify 800 megs.  The max is typically 85% of 
jpayne@69:                         physical memory.  The human genome requires around 24g,
jpayne@69:                         or 12g with the 'usemodulo' flag.  The index uses 
jpayne@69:                         roughly 6 bytes per reference base.
jpayne@69: -eoom                   This flag will cause the process to exit if an 
jpayne@69:                         out-of-memory exception occurs.  Requires Java 8u92+.
jpayne@69: -da                     Disable assertions.
jpayne@69: 
jpayne@69: Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter 
jpayne@69: any problems, or post at: http://seqanswers.com/forums/showthread.php?t=41057
jpayne@69: "   
jpayne@69: }
jpayne@69: 
jpayne@69: #This block allows symlinked shellscripts to correctly set classpath.
jpayne@69: pushd . > /dev/null
jpayne@69: DIR="${BASH_SOURCE[0]}"
jpayne@69: while [ -h "$DIR" ]; do
jpayne@69:   cd "$(dirname "$DIR")"
jpayne@69:   DIR="$(readlink "$(basename "$DIR")")"
jpayne@69: done
jpayne@69: cd "$(dirname "$DIR")"
jpayne@69: DIR="$(pwd)/"
jpayne@69: popd > /dev/null
jpayne@69: 
jpayne@69: #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
jpayne@69: CP="$DIR""current/"
jpayne@69: JNI="-Djava.library.path=""$DIR""jni/"
jpayne@69: JNI=""
jpayne@69: 
jpayne@69: z="-Xmx1g"
jpayne@69: z2="-Xms1g"
jpayne@69: set=0
jpayne@69: 
jpayne@69: if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
jpayne@69: 	usage
jpayne@69: 	exit
jpayne@69: fi
jpayne@69: 
jpayne@69: calcXmx () {
jpayne@69: 	source "$DIR""/calcmem.sh"
jpayne@69: 	setEnvironment
jpayne@69: 	parseXmx "$@"
jpayne@69: 	if [[ $set == 1 ]]; then
jpayne@69: 		return
jpayne@69: 	fi
jpayne@69: 	freeRam 3200m 84
jpayne@69: 	z="-Xmx${RAM}m"
jpayne@69: 	z2="-Xms${RAM}m"
jpayne@69: }
jpayne@69: calcXmx "$@"
jpayne@69: 
jpayne@69: 
jpayne@69: bbmap() {
jpayne@69: 	local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap build=1 overwrite=true fastareadlen=500 $@"
jpayne@69: 	echo $CMD >&2
jpayne@69: 	eval $CMD
jpayne@69: }
jpayne@69: 
jpayne@69: bbmap "$@"