diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/clumpify.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/clumpify.sh	Tue Mar 18 17:55:14 2025 -0400
@@ -0,0 +1,237 @@
+#!/bin/bash
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified July 28, 2022
+
+Description:  Sorts sequences to put similar reads near each other.
+Can be used for increased compression or error correction.
+Please read bbmap/docs/guides/ClumpifyGuide.txt for more information.
+
+Usage:   clumpify.sh in=<file> out=<file> reorder
+
+Input may be fasta or fastq, compressed or uncompressed.  Cannot accept sam.
+
+Parameters and their defaults:
+in=<file>           Input file.
+in2=<file>          Optional input for read 2 of twin paired files.
+out=<file>          Output file.  May not be standard out.
+out2=<file>         Optional output for read 2 of twin paired files.
+groups=auto         Use this many intermediate files (to save memory).
+                    1 group is fastest.  Auto will estimate the number
+                    of groups needed based on the file size, so it should
+                    not ever run out of memory.
+lowcomplexity=f     For compressed low-complexity libraries such as RNA-seq,
+                    this will more conservatively estimate how much memory
+                    is needed to automatically decide the number of groups.
+rcomp=f             Give read clumps the same orientation to increase 
+                    compression.  Should be disabled for paired reads.
+overwrite=f         (ow) Set to false to force the program to abort rather 
+                    than overwrite an existing file.
+qin=auto            Auto-detect input quality encoding.  May be set to:
+                       33:  ASCII-33 (Sanger) encoding.
+                       64:  ASCII-64 (old Illumina) encoding.
+                    All modern sequence is encoded as ASCII-33.
+qout=auto           Use input quality encoding as output quality encoding.
+changequality=f     (cq) If true, fix broken quality scores such as Ns with
+                    Q>0.  Default is false to ensure lossless compression.
+fastawrap=70        Set to a higher number like 4000 for longer lines in 
+                    fasta format, which increases compression.
+
+Compression parameters:
+ziplevel=6          (zl) Gzip compression level (1-11).  Higher is slower.
+                    Level 11 is only available if pigz is installed and is
+                    extremely slow to compress, but faster to decompress.
+                    Naming the output file to *.bz2 will use bzip2 instead of
+                    gzip for ~9% additional compression, which requires
+                    bzip2, pbzip2, or lbzip2 in the path.
+blocksize=128       Size of blocks for pigz, in kb.  Higher gives slightly
+                    better compression.
+shortname=f         Make the names as short as possible.  'shortname=shrink'
+                    will shorten the names where possible, but retain the 
+                    flowcell and barcode information.
+reorder=f           Reorder clumps for additional compression.  Only valid
+                    when groups=1, passes=1, and ecc=f.  Possible modes:
+                       f:  Do not reorder clumps.
+                       c:  Reorder using consensus reads.  Uses additional
+                           time and memory.
+                       p:  Reorder using pair information.  Requires paired
+                           reads.  Yields the highest compression.
+                       a:  Automatically choose between 'c' and 'p'.  The
+                           flag reorder with no argument will set 'reorder=a'.
+quantize=f          Bin the quality scores, like NextSeq.  This greatly
+                    increases compression, but information is lost.
+
+Temp file parameters:
+compresstemp=auto   (ct) Gzip temporary files.  By default temp files will be
+                    compressed if the output file is compressed.
+deletetemp=t        Delete temporary files.
+deleteinput=f       Delete input upon successful completion.
+usetmpdir=f         Use tmpdir for temp files.
+tmpdir=             By default, this is the environment variable TMPDIR.
+
+Hashing parameters:
+k=31                Use kmers of this length (1-31).  Shorter kmers may
+                    increase compression, but 31 is recommended for error
+                    correction.
+mincount=0          Don't use pivot kmers with count less than this.
+                    Setting mincount=2 can increase compression.
+                    Increases time and memory usage.
+seed=1              Random number generator seed for hashing.  
+                    Set to a negative number to use a random seed.
+hashes=4            Use this many masks when hashing.  0 uses raw kmers.
+                    Often hashes=0 increases compression, but it should
+                    not be used with error-correction.
+border=1            Do not use kmers within this many bases of read ends.
+
+Deduplication parameters:
+dedupe=f            Remove duplicate reads.  For pairs, both must match.
+                    By default, deduplication does not occur.
+                    If dedupe and markduplicates are both false, none of
+                    the other duplicate-related flags will have any effect.
+markduplicates=f    Don't remove; just append ' duplicate' to the name.
+allduplicates=f     Mark or remove all copies of duplicates, instead of
+                    keeping the highest-quality copy.
+addcount=f          Append the number of copies to the read name.
+                    Mutually exclusive with markduplicates or allduplicates.
+entryfilter=f       This assists in removing exact duplicates, which saves
+                    memory in libraries that split unevenly due to huge
+                    numbers of duplicates.  Enabled automatically as needed.
+subs=2              (s) Maximum substitutions allowed between duplicates.
+subrate=0.0         (dsr) If set, the number of substitutions allowed will be
+                    max(subs, subrate*min(length1, length2)) for 2 sequences.
+allowns=t           No-called bases will not be considered substitutions.
+scanlimit=5         (scan) Continue for this many reads after encountering a
+                    nonduplicate.  Improves detection of inexact duplicates.
+containment=f       Allow containments (where one sequence is shorter).
+affix=f             For containments, require one sequence to be an affix
+                    (prefix or suffix) of the other.
+optical=f           If true, mark or remove optical duplicates only.
+                    This means they are Illumina reads within a certain
+                    distance on the flowcell.  Normal Illumina names needed.
+                    Also for tile-edge and well duplicates.
+dupedist=40         (dist) Max distance to consider for optical duplicates.
+                    Higher removes more duplicates but is more likely to
+                    remove PCR rather than optical duplicates.
+                    This is platform-specific; recommendations:
+                       NextSeq      40  (and spany=t)
+                       HiSeq 1T     40
+                       HiSeq 2500   40
+                       HiSeq 3k/4k  2500
+                       Novaseq      12000
+spany=f             Allow reads to be considered optical duplicates if they
+                    are on different tiles, but are within dupedist in the
+                    y-axis.  Should only be enabled when looking for 
+                    tile-edge duplicates (as in NextSeq).
+spanx=f             Like spany, but for the x-axis.  Not necessary 
+                    for NextSeq.
+spantiles=f         Set both spanx and spany.
+adjacent=f          Limit tile-spanning to adjacent tiles (those with 
+                    consecutive numbers).
+*** Thus, for NextSeq, the recommended deduplication flags are: ***
+dedupe optical spany adjacent
+
+Pairing/ordering parameters (for use with error-correction):
+unpair=f            For paired reads, clump all of them rather than just
+                    read 1.  Destroys pairing.  Without this flag, for paired
+                    reads, only read 1 will be error-corrected.
+repair=f            After clumping and error-correction, restore pairing.
+                    If groups>1 this will sort by name which will destroy
+                    clump ordering; with a single group, clumping will
+                    be retained.
+
+Error-correction parameters:
+ecc=f               Error-correct reads.  Requires multiple passes for
+                    complete correction.
+ecco=f              Error-correct paired reads via overlap before clumping.
+passes=1            Use this many error-correction passes.  6 passes are 
+                    suggested.
+consensus=f         Output consensus sequence instead of clumps.
+
+Advanced error-correction parameters:
+mincc=4             (mincountcorrect) Do not correct to alleles occuring less
+                    often than this.
+minss=4             (minsizesplit) Do not split into new clumps smaller than 
+                    this.
+minsfs=0.17         (minsizefractionsplit) Do not split on pivot alleles in
+                    areas with local depth less than this fraction of clump size.
+minsfc=0.20         (minsizefractioncorrect) Do not correct in areas with local
+                    depth less than this.
+minr=30.0           (minratio) Correct to the consensus if the ratio of the
+                    consensus allele to second-most-common allele is >=minr,
+                    for high depth.  Actual ratio used is:
+                    min(minr, minro+minorCount*minrm+quality*minrqm).
+minro=1.9           (minratiooffset) Base ratio.
+minrm=1.8           (minratiomult) Ratio multiplier for secondary allele count.
+minrqm=0.08         (minratioqmult) Ratio multiplier for base quality.
+minqr=2.8           (minqratio) Do not correct bases when cq*minqr>rqsum.
+minaqr=0.70         (minaqratio) Do not correct bases when cq*minaqr>5+rqavg.
+minid=0.97          (minidentity) Do not correct reads with identity to 
+                    consensus less than this.
+maxqadjust=0        Adjust quality scores by at most maxqadjust per pass.
+maxqi=-1            (maxqualityincorrect) Do not correct bases with quality 
+                    above this (if positive).
+maxci=-1            (maxcountincorrect) Do not correct alleles with count 
+                    above this (if positive).
+findcorrelations=t  Look for correlated SNPs in clumps to split into alleles.
+maxcorrelations=12  Maximum number of eligible SNPs per clump to consider for
+                    correlations.  Increasing this number can reduce false-
+                    positive corrections at the possible expense of speed.
+
+Java Parameters:
+-Xmx                This will set Java's memory usage, overriding autodetection.
+                    -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
+                    The max is typically 85% of physical memory.
+-eoom               This flag will cause the process to exit if an
+                    out-of-memory exception occurs.  Requires Java 8u92+.
+-da                 Disable assertions.
+
+Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
+"
+}
+
+#This block allows symlinked shellscripts to correctly set classpath.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+  cd "$(dirname "$DIR")"
+  DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx2g"
+z2="-Xms2g"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+	usage
+	exit
+fi
+
+calcXmx () {
+	source "$DIR""/calcmem.sh"
+	setEnvironment
+	parseXmx "$@"
+	if [[ $set == 1 ]]; then
+		return
+	fi
+	#Note that this uses slighly less (82%) of memory than normal to account for multiple pigz instances.
+	freeRam 2000m 82
+	z="-Xmx${RAM}m"
+	z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+clumpify() {
+	local CMD="java $EA $EOOM $z $z2 -cp $CP clump.Clumpify $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+clumpify "$@"