jpayne@69: #!/bin/bash
jpayne@69: 
jpayne@69: usage(){
jpayne@69: echo "
jpayne@69: Written by Brian Bushnell
jpayne@69: Last modified July 28, 2022
jpayne@69: 
jpayne@69: Description:  Sorts sequences to put similar reads near each other.
jpayne@69: Can be used for increased compression or error correction.
jpayne@69: Please read bbmap/docs/guides/ClumpifyGuide.txt for more information.
jpayne@69: 
jpayne@69: Usage:   clumpify.sh in=<file> out=<file> reorder
jpayne@69: 
jpayne@69: Input may be fasta or fastq, compressed or uncompressed.  Cannot accept sam.
jpayne@69: 
jpayne@69: Parameters and their defaults:
jpayne@69: in=<file>           Input file.
jpayne@69: in2=<file>          Optional input for read 2 of twin paired files.
jpayne@69: out=<file>          Output file.  May not be standard out.
jpayne@69: out2=<file>         Optional output for read 2 of twin paired files.
jpayne@69: groups=auto         Use this many intermediate files (to save memory).
jpayne@69:                     1 group is fastest.  Auto will estimate the number
jpayne@69:                     of groups needed based on the file size, so it should
jpayne@69:                     not ever run out of memory.
jpayne@69: lowcomplexity=f     For compressed low-complexity libraries such as RNA-seq,
jpayne@69:                     this will more conservatively estimate how much memory
jpayne@69:                     is needed to automatically decide the number of groups.
jpayne@69: rcomp=f             Give read clumps the same orientation to increase 
jpayne@69:                     compression.  Should be disabled for paired reads.
jpayne@69: overwrite=f         (ow) Set to false to force the program to abort rather 
jpayne@69:                     than overwrite an existing file.
jpayne@69: qin=auto            Auto-detect input quality encoding.  May be set to:
jpayne@69:                        33:  ASCII-33 (Sanger) encoding.
jpayne@69:                        64:  ASCII-64 (old Illumina) encoding.
jpayne@69:                     All modern sequence is encoded as ASCII-33.
jpayne@69: qout=auto           Use input quality encoding as output quality encoding.
jpayne@69: changequality=f     (cq) If true, fix broken quality scores such as Ns with
jpayne@69:                     Q>0.  Default is false to ensure lossless compression.
jpayne@69: fastawrap=70        Set to a higher number like 4000 for longer lines in 
jpayne@69:                     fasta format, which increases compression.
jpayne@69: 
jpayne@69: Compression parameters:
jpayne@69: ziplevel=6          (zl) Gzip compression level (1-11).  Higher is slower.
jpayne@69:                     Level 11 is only available if pigz is installed and is
jpayne@69:                     extremely slow to compress, but faster to decompress.
jpayne@69:                     Naming the output file to *.bz2 will use bzip2 instead of
jpayne@69:                     gzip for ~9% additional compression, which requires
jpayne@69:                     bzip2, pbzip2, or lbzip2 in the path.
jpayne@69: blocksize=128       Size of blocks for pigz, in kb.  Higher gives slightly
jpayne@69:                     better compression.
jpayne@69: shortname=f         Make the names as short as possible.  'shortname=shrink'
jpayne@69:                     will shorten the names where possible, but retain the 
jpayne@69:                     flowcell and barcode information.
jpayne@69: reorder=f           Reorder clumps for additional compression.  Only valid
jpayne@69:                     when groups=1, passes=1, and ecc=f.  Possible modes:
jpayne@69:                        f:  Do not reorder clumps.
jpayne@69:                        c:  Reorder using consensus reads.  Uses additional
jpayne@69:                            time and memory.
jpayne@69:                        p:  Reorder using pair information.  Requires paired
jpayne@69:                            reads.  Yields the highest compression.
jpayne@69:                        a:  Automatically choose between 'c' and 'p'.  The
jpayne@69:                            flag reorder with no argument will set 'reorder=a'.
jpayne@69: quantize=f          Bin the quality scores, like NextSeq.  This greatly
jpayne@69:                     increases compression, but information is lost.
jpayne@69: 
jpayne@69: Temp file parameters:
jpayne@69: compresstemp=auto   (ct) Gzip temporary files.  By default temp files will be
jpayne@69:                     compressed if the output file is compressed.
jpayne@69: deletetemp=t        Delete temporary files.
jpayne@69: deleteinput=f       Delete input upon successful completion.
jpayne@69: usetmpdir=f         Use tmpdir for temp files.
jpayne@69: tmpdir=             By default, this is the environment variable TMPDIR.
jpayne@69: 
jpayne@69: Hashing parameters:
jpayne@69: k=31                Use kmers of this length (1-31).  Shorter kmers may
jpayne@69:                     increase compression, but 31 is recommended for error
jpayne@69:                     correction.
jpayne@69: mincount=0          Don't use pivot kmers with count less than this.
jpayne@69:                     Setting mincount=2 can increase compression.
jpayne@69:                     Increases time and memory usage.
jpayne@69: seed=1              Random number generator seed for hashing.  
jpayne@69:                     Set to a negative number to use a random seed.
jpayne@69: hashes=4            Use this many masks when hashing.  0 uses raw kmers.
jpayne@69:                     Often hashes=0 increases compression, but it should
jpayne@69:                     not be used with error-correction.
jpayne@69: border=1            Do not use kmers within this many bases of read ends.
jpayne@69: 
jpayne@69: Deduplication parameters:
jpayne@69: dedupe=f            Remove duplicate reads.  For pairs, both must match.
jpayne@69:                     By default, deduplication does not occur.
jpayne@69:                     If dedupe and markduplicates are both false, none of
jpayne@69:                     the other duplicate-related flags will have any effect.
jpayne@69: markduplicates=f    Don't remove; just append ' duplicate' to the name.
jpayne@69: allduplicates=f     Mark or remove all copies of duplicates, instead of
jpayne@69:                     keeping the highest-quality copy.
jpayne@69: addcount=f          Append the number of copies to the read name.
jpayne@69:                     Mutually exclusive with markduplicates or allduplicates.
jpayne@69: entryfilter=f       This assists in removing exact duplicates, which saves
jpayne@69:                     memory in libraries that split unevenly due to huge
jpayne@69:                     numbers of duplicates.  Enabled automatically as needed.
jpayne@69: subs=2              (s) Maximum substitutions allowed between duplicates.
jpayne@69: subrate=0.0         (dsr) If set, the number of substitutions allowed will be
jpayne@69:                     max(subs, subrate*min(length1, length2)) for 2 sequences.
jpayne@69: allowns=t           No-called bases will not be considered substitutions.
jpayne@69: scanlimit=5         (scan) Continue for this many reads after encountering a
jpayne@69:                     nonduplicate.  Improves detection of inexact duplicates.
jpayne@69: containment=f       Allow containments (where one sequence is shorter).
jpayne@69: affix=f             For containments, require one sequence to be an affix
jpayne@69:                     (prefix or suffix) of the other.
jpayne@69: optical=f           If true, mark or remove optical duplicates only.
jpayne@69:                     This means they are Illumina reads within a certain
jpayne@69:                     distance on the flowcell.  Normal Illumina names needed.
jpayne@69:                     Also for tile-edge and well duplicates.
jpayne@69: dupedist=40         (dist) Max distance to consider for optical duplicates.
jpayne@69:                     Higher removes more duplicates but is more likely to
jpayne@69:                     remove PCR rather than optical duplicates.
jpayne@69:                     This is platform-specific; recommendations:
jpayne@69:                        NextSeq      40  (and spany=t)
jpayne@69:                        HiSeq 1T     40
jpayne@69:                        HiSeq 2500   40
jpayne@69:                        HiSeq 3k/4k  2500
jpayne@69:                        Novaseq      12000
jpayne@69: spany=f             Allow reads to be considered optical duplicates if they
jpayne@69:                     are on different tiles, but are within dupedist in the
jpayne@69:                     y-axis.  Should only be enabled when looking for 
jpayne@69:                     tile-edge duplicates (as in NextSeq).
jpayne@69: spanx=f             Like spany, but for the x-axis.  Not necessary 
jpayne@69:                     for NextSeq.
jpayne@69: spantiles=f         Set both spanx and spany.
jpayne@69: adjacent=f          Limit tile-spanning to adjacent tiles (those with 
jpayne@69:                     consecutive numbers).
jpayne@69: *** Thus, for NextSeq, the recommended deduplication flags are: ***
jpayne@69: dedupe optical spany adjacent
jpayne@69: 
jpayne@69: Pairing/ordering parameters (for use with error-correction):
jpayne@69: unpair=f            For paired reads, clump all of them rather than just
jpayne@69:                     read 1.  Destroys pairing.  Without this flag, for paired
jpayne@69:                     reads, only read 1 will be error-corrected.
jpayne@69: repair=f            After clumping and error-correction, restore pairing.
jpayne@69:                     If groups>1 this will sort by name which will destroy
jpayne@69:                     clump ordering; with a single group, clumping will
jpayne@69:                     be retained.
jpayne@69: 
jpayne@69: Error-correction parameters:
jpayne@69: ecc=f               Error-correct reads.  Requires multiple passes for
jpayne@69:                     complete correction.
jpayne@69: ecco=f              Error-correct paired reads via overlap before clumping.
jpayne@69: passes=1            Use this many error-correction passes.  6 passes are 
jpayne@69:                     suggested.
jpayne@69: consensus=f         Output consensus sequence instead of clumps.
jpayne@69: 
jpayne@69: Advanced error-correction parameters:
jpayne@69: mincc=4             (mincountcorrect) Do not correct to alleles occuring less
jpayne@69:                     often than this.
jpayne@69: minss=4             (minsizesplit) Do not split into new clumps smaller than 
jpayne@69:                     this.
jpayne@69: minsfs=0.17         (minsizefractionsplit) Do not split on pivot alleles in
jpayne@69:                     areas with local depth less than this fraction of clump size.
jpayne@69: minsfc=0.20         (minsizefractioncorrect) Do not correct in areas with local
jpayne@69:                     depth less than this.
jpayne@69: minr=30.0           (minratio) Correct to the consensus if the ratio of the
jpayne@69:                     consensus allele to second-most-common allele is >=minr,
jpayne@69:                     for high depth.  Actual ratio used is:
jpayne@69:                     min(minr, minro+minorCount*minrm+quality*minrqm).
jpayne@69: minro=1.9           (minratiooffset) Base ratio.
jpayne@69: minrm=1.8           (minratiomult) Ratio multiplier for secondary allele count.
jpayne@69: minrqm=0.08         (minratioqmult) Ratio multiplier for base quality.
jpayne@69: minqr=2.8           (minqratio) Do not correct bases when cq*minqr>rqsum.
jpayne@69: minaqr=0.70         (minaqratio) Do not correct bases when cq*minaqr>5+rqavg.
jpayne@69: minid=0.97          (minidentity) Do not correct reads with identity to 
jpayne@69:                     consensus less than this.
jpayne@69: maxqadjust=0        Adjust quality scores by at most maxqadjust per pass.
jpayne@69: maxqi=-1            (maxqualityincorrect) Do not correct bases with quality 
jpayne@69:                     above this (if positive).
jpayne@69: maxci=-1            (maxcountincorrect) Do not correct alleles with count 
jpayne@69:                     above this (if positive).
jpayne@69: findcorrelations=t  Look for correlated SNPs in clumps to split into alleles.
jpayne@69: maxcorrelations=12  Maximum number of eligible SNPs per clump to consider for
jpayne@69:                     correlations.  Increasing this number can reduce false-
jpayne@69:                     positive corrections at the possible expense of speed.
jpayne@69: 
jpayne@69: Java Parameters:
jpayne@69: -Xmx                This will set Java's memory usage, overriding autodetection.
jpayne@69:                     -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
jpayne@69:                     The max is typically 85% of physical memory.
jpayne@69: -eoom               This flag will cause the process to exit if an
jpayne@69:                     out-of-memory exception occurs.  Requires Java 8u92+.
jpayne@69: -da                 Disable assertions.
jpayne@69: 
jpayne@69: Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
jpayne@69: "
jpayne@69: }
jpayne@69: 
jpayne@69: #This block allows symlinked shellscripts to correctly set classpath.
jpayne@69: pushd . > /dev/null
jpayne@69: DIR="${BASH_SOURCE[0]}"
jpayne@69: while [ -h "$DIR" ]; do
jpayne@69:   cd "$(dirname "$DIR")"
jpayne@69:   DIR="$(readlink "$(basename "$DIR")")"
jpayne@69: done
jpayne@69: cd "$(dirname "$DIR")"
jpayne@69: DIR="$(pwd)/"
jpayne@69: popd > /dev/null
jpayne@69: 
jpayne@69: #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
jpayne@69: CP="$DIR""current/"
jpayne@69: 
jpayne@69: z="-Xmx2g"
jpayne@69: z2="-Xms2g"
jpayne@69: set=0
jpayne@69: 
jpayne@69: if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
jpayne@69: 	usage
jpayne@69: 	exit
jpayne@69: fi
jpayne@69: 
jpayne@69: calcXmx () {
jpayne@69: 	source "$DIR""/calcmem.sh"
jpayne@69: 	setEnvironment
jpayne@69: 	parseXmx "$@"
jpayne@69: 	if [[ $set == 1 ]]; then
jpayne@69: 		return
jpayne@69: 	fi
jpayne@69: 	#Note that this uses slighly less (82%) of memory than normal to account for multiple pigz instances.
jpayne@69: 	freeRam 2000m 82
jpayne@69: 	z="-Xmx${RAM}m"
jpayne@69: 	z2="-Xms${RAM}m"
jpayne@69: }
jpayne@69: calcXmx "$@"
jpayne@69: 
jpayne@69: clumpify() {
jpayne@69: 	local CMD="java $EA $EOOM $z $z2 -cp $CP clump.Clumpify $@"
jpayne@69: 	echo $CMD >&2
jpayne@69: 	eval $CMD
jpayne@69: }
jpayne@69: 
jpayne@69: clumpify "$@"