csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/clumpify.sh annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/clumpify.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children

rev	line source
jpayne@69	1 #!/bin/bash
jpayne@69	2
jpayne@69	3 usage(){
jpayne@69	4 echo "
jpayne@69	5 Written by Brian Bushnell
jpayne@69	6 Last modified July 28, 2022
jpayne@69	7
jpayne@69	8 Description: Sorts sequences to put similar reads near each other.
jpayne@69	9 Can be used for increased compression or error correction.
jpayne@69	10 Please read bbmap/docs/guides/ClumpifyGuide.txt for more information.
jpayne@69	11
jpayne@69	12 Usage: clumpify.sh in=<file> out=<file> reorder
jpayne@69	13
jpayne@69	14 Input may be fasta or fastq, compressed or uncompressed. Cannot accept sam.
jpayne@69	15
jpayne@69	16 Parameters and their defaults:
jpayne@69	17 in=<file> Input file.
jpayne@69	18 in2=<file> Optional input for read 2 of twin paired files.
jpayne@69	19 out=<file> Output file. May not be standard out.
jpayne@69	20 out2=<file> Optional output for read 2 of twin paired files.
jpayne@69	21 groups=auto Use this many intermediate files (to save memory).
jpayne@69	22 1 group is fastest. Auto will estimate the number
jpayne@69	23 of groups needed based on the file size, so it should
jpayne@69	24 not ever run out of memory.
jpayne@69	25 lowcomplexity=f For compressed low-complexity libraries such as RNA-seq,
jpayne@69	26 this will more conservatively estimate how much memory
jpayne@69	27 is needed to automatically decide the number of groups.
jpayne@69	28 rcomp=f Give read clumps the same orientation to increase
jpayne@69	29 compression. Should be disabled for paired reads.
jpayne@69	30 overwrite=f (ow) Set to false to force the program to abort rather
jpayne@69	31 than overwrite an existing file.
jpayne@69	32 qin=auto Auto-detect input quality encoding. May be set to:
jpayne@69	33 33: ASCII-33 (Sanger) encoding.
jpayne@69	34 64: ASCII-64 (old Illumina) encoding.
jpayne@69	35 All modern sequence is encoded as ASCII-33.
jpayne@69	36 qout=auto Use input quality encoding as output quality encoding.
jpayne@69	37 changequality=f (cq) If true, fix broken quality scores such as Ns with
jpayne@69	38 Q>0. Default is false to ensure lossless compression.
jpayne@69	39 fastawrap=70 Set to a higher number like 4000 for longer lines in
jpayne@69	40 fasta format, which increases compression.
jpayne@69	41
jpayne@69	42 Compression parameters:
jpayne@69	43 ziplevel=6 (zl) Gzip compression level (1-11). Higher is slower.
jpayne@69	44 Level 11 is only available if pigz is installed and is
jpayne@69	45 extremely slow to compress, but faster to decompress.
jpayne@69	46 Naming the output file to *.bz2 will use bzip2 instead of
jpayne@69	47 gzip for ~9% additional compression, which requires
jpayne@69	48 bzip2, pbzip2, or lbzip2 in the path.
jpayne@69	49 blocksize=128 Size of blocks for pigz, in kb. Higher gives slightly
jpayne@69	50 better compression.
jpayne@69	51 shortname=f Make the names as short as possible. 'shortname=shrink'
jpayne@69	52 will shorten the names where possible, but retain the
jpayne@69	53 flowcell and barcode information.
jpayne@69	54 reorder=f Reorder clumps for additional compression. Only valid
jpayne@69	55 when groups=1, passes=1, and ecc=f. Possible modes:
jpayne@69	56 f: Do not reorder clumps.
jpayne@69	57 c: Reorder using consensus reads. Uses additional
jpayne@69	58 time and memory.
jpayne@69	59 p: Reorder using pair information. Requires paired
jpayne@69	60 reads. Yields the highest compression.
jpayne@69	61 a: Automatically choose between 'c' and 'p'. The
jpayne@69	62 flag reorder with no argument will set 'reorder=a'.
jpayne@69	63 quantize=f Bin the quality scores, like NextSeq. This greatly
jpayne@69	64 increases compression, but information is lost.
jpayne@69	65
jpayne@69	66 Temp file parameters:
jpayne@69	67 compresstemp=auto (ct) Gzip temporary files. By default temp files will be
jpayne@69	68 compressed if the output file is compressed.
jpayne@69	69 deletetemp=t Delete temporary files.
jpayne@69	70 deleteinput=f Delete input upon successful completion.
jpayne@69	71 usetmpdir=f Use tmpdir for temp files.
jpayne@69	72 tmpdir= By default, this is the environment variable TMPDIR.
jpayne@69	73
jpayne@69	74 Hashing parameters:
jpayne@69	75 k=31 Use kmers of this length (1-31). Shorter kmers may
jpayne@69	76 increase compression, but 31 is recommended for error
jpayne@69	77 correction.
jpayne@69	78 mincount=0 Don't use pivot kmers with count less than this.
jpayne@69	79 Setting mincount=2 can increase compression.
jpayne@69	80 Increases time and memory usage.
jpayne@69	81 seed=1 Random number generator seed for hashing.
jpayne@69	82 Set to a negative number to use a random seed.
jpayne@69	83 hashes=4 Use this many masks when hashing. 0 uses raw kmers.
jpayne@69	84 Often hashes=0 increases compression, but it should
jpayne@69	85 not be used with error-correction.
jpayne@69	86 border=1 Do not use kmers within this many bases of read ends.
jpayne@69	87
jpayne@69	88 Deduplication parameters:
jpayne@69	89 dedupe=f Remove duplicate reads. For pairs, both must match.
jpayne@69	90 By default, deduplication does not occur.
jpayne@69	91 If dedupe and markduplicates are both false, none of
jpayne@69	92 the other duplicate-related flags will have any effect.
jpayne@69	93 markduplicates=f Don't remove; just append ' duplicate' to the name.
jpayne@69	94 allduplicates=f Mark or remove all copies of duplicates, instead of
jpayne@69	95 keeping the highest-quality copy.
jpayne@69	96 addcount=f Append the number of copies to the read name.
jpayne@69	97 Mutually exclusive with markduplicates or allduplicates.
jpayne@69	98 entryfilter=f This assists in removing exact duplicates, which saves
jpayne@69	99 memory in libraries that split unevenly due to huge
jpayne@69	100 numbers of duplicates. Enabled automatically as needed.
jpayne@69	101 subs=2 (s) Maximum substitutions allowed between duplicates.
jpayne@69	102 subrate=0.0 (dsr) If set, the number of substitutions allowed will be
jpayne@69	103 max(subs, subrate*min(length1, length2)) for 2 sequences.
jpayne@69	104 allowns=t No-called bases will not be considered substitutions.
jpayne@69	105 scanlimit=5 (scan) Continue for this many reads after encountering a
jpayne@69	106 nonduplicate. Improves detection of inexact duplicates.
jpayne@69	107 containment=f Allow containments (where one sequence is shorter).
jpayne@69	108 affix=f For containments, require one sequence to be an affix
jpayne@69	109 (prefix or suffix) of the other.
jpayne@69	110 optical=f If true, mark or remove optical duplicates only.
jpayne@69	111 This means they are Illumina reads within a certain
jpayne@69	112 distance on the flowcell. Normal Illumina names needed.
jpayne@69	113 Also for tile-edge and well duplicates.
jpayne@69	114 dupedist=40 (dist) Max distance to consider for optical duplicates.
jpayne@69	115 Higher removes more duplicates but is more likely to
jpayne@69	116 remove PCR rather than optical duplicates.
jpayne@69	117 This is platform-specific; recommendations:
jpayne@69	118 NextSeq 40 (and spany=t)
jpayne@69	119 HiSeq 1T 40
jpayne@69	120 HiSeq 2500 40
jpayne@69	121 HiSeq 3k/4k 2500
jpayne@69	122 Novaseq 12000
jpayne@69	123 spany=f Allow reads to be considered optical duplicates if they
jpayne@69	124 are on different tiles, but are within dupedist in the
jpayne@69	125 y-axis. Should only be enabled when looking for
jpayne@69	126 tile-edge duplicates (as in NextSeq).
jpayne@69	127 spanx=f Like spany, but for the x-axis. Not necessary
jpayne@69	128 for NextSeq.
jpayne@69	129 spantiles=f Set both spanx and spany.
jpayne@69	130 adjacent=f Limit tile-spanning to adjacent tiles (those with
jpayne@69	131 consecutive numbers).
jpayne@69	132 * Thus, for NextSeq, the recommended deduplication flags are: *
jpayne@69	133 dedupe optical spany adjacent
jpayne@69	134
jpayne@69	135 Pairing/ordering parameters (for use with error-correction):
jpayne@69	136 unpair=f For paired reads, clump all of them rather than just
jpayne@69	137 read 1. Destroys pairing. Without this flag, for paired
jpayne@69	138 reads, only read 1 will be error-corrected.
jpayne@69	139 repair=f After clumping and error-correction, restore pairing.
jpayne@69	140 If groups>1 this will sort by name which will destroy
jpayne@69	141 clump ordering; with a single group, clumping will
jpayne@69	142 be retained.
jpayne@69	143
jpayne@69	144 Error-correction parameters:
jpayne@69	145 ecc=f Error-correct reads. Requires multiple passes for
jpayne@69	146 complete correction.
jpayne@69	147 ecco=f Error-correct paired reads via overlap before clumping.
jpayne@69	148 passes=1 Use this many error-correction passes. 6 passes are
jpayne@69	149 suggested.
jpayne@69	150 consensus=f Output consensus sequence instead of clumps.
jpayne@69	151
jpayne@69	152 Advanced error-correction parameters:
jpayne@69	153 mincc=4 (mincountcorrect) Do not correct to alleles occuring less
jpayne@69	154 often than this.
jpayne@69	155 minss=4 (minsizesplit) Do not split into new clumps smaller than
jpayne@69	156 this.
jpayne@69	157 minsfs=0.17 (minsizefractionsplit) Do not split on pivot alleles in
jpayne@69	158 areas with local depth less than this fraction of clump size.
jpayne@69	159 minsfc=0.20 (minsizefractioncorrect) Do not correct in areas with local
jpayne@69	160 depth less than this.
jpayne@69	161 minr=30.0 (minratio) Correct to the consensus if the ratio of the
jpayne@69	162 consensus allele to second-most-common allele is >=minr,
jpayne@69	163 for high depth. Actual ratio used is:
jpayne@69	164 min(minr, minro+minorCountminrm+qualityminrqm).
jpayne@69	165 minro=1.9 (minratiooffset) Base ratio.
jpayne@69	166 minrm=1.8 (minratiomult) Ratio multiplier for secondary allele count.
jpayne@69	167 minrqm=0.08 (minratioqmult) Ratio multiplier for base quality.
jpayne@69	168 minqr=2.8 (minqratio) Do not correct bases when cq*minqr>rqsum.
jpayne@69	169 minaqr=0.70 (minaqratio) Do not correct bases when cq*minaqr>5+rqavg.
jpayne@69	170 minid=0.97 (minidentity) Do not correct reads with identity to
jpayne@69	171 consensus less than this.
jpayne@69	172 maxqadjust=0 Adjust quality scores by at most maxqadjust per pass.
jpayne@69	173 maxqi=-1 (maxqualityincorrect) Do not correct bases with quality
jpayne@69	174 above this (if positive).
jpayne@69	175 maxci=-1 (maxcountincorrect) Do not correct alleles with count
jpayne@69	176 above this (if positive).
jpayne@69	177 findcorrelations=t Look for correlated SNPs in clumps to split into alleles.
jpayne@69	178 maxcorrelations=12 Maximum number of eligible SNPs per clump to consider for
jpayne@69	179 correlations. Increasing this number can reduce false-
jpayne@69	180 positive corrections at the possible expense of speed.
jpayne@69	181
jpayne@69	182 Java Parameters:
jpayne@69	183 -Xmx This will set Java's memory usage, overriding autodetection.
jpayne@69	184 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
jpayne@69	185 The max is typically 85% of physical memory.
jpayne@69	186 -eoom This flag will cause the process to exit if an
jpayne@69	187 out-of-memory exception occurs. Requires Java 8u92+.
jpayne@69	188 -da Disable assertions.
jpayne@69	189
jpayne@69	190 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
jpayne@69	191 "
jpayne@69	192 }
jpayne@69	193
jpayne@69	194 #This block allows symlinked shellscripts to correctly set classpath.
jpayne@69	195 pushd . > /dev/null
jpayne@69	196 DIR="${BASH_SOURCE[0]}"
jpayne@69	197 while [ -h "$DIR" ]; do
jpayne@69	198 cd "$(dirname "$DIR")"
jpayne@69	199 DIR="$(readlink "$(basename "$DIR")")"
jpayne@69	200 done
jpayne@69	201 cd "$(dirname "$DIR")"
jpayne@69	202 DIR="$(pwd)/"
jpayne@69	203 popd > /dev/null
jpayne@69	204
jpayne@69	205 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
jpayne@69	206 CP="$DIR""current/"
jpayne@69	207
jpayne@69	208 z="-Xmx2g"
jpayne@69	209 z2="-Xms2g"
jpayne@69	210 set=0
jpayne@69	211
jpayne@69	212 if [ -z "$1" ] \|\| [[ $1 == -h ]] \|\| [[ $1 == --help ]]; then
jpayne@69	213 usage
jpayne@69	214 exit
jpayne@69	215 fi
jpayne@69	216
jpayne@69	217 calcXmx () {
jpayne@69	218 source "$DIR""/calcmem.sh"
jpayne@69	219 setEnvironment
jpayne@69	220 parseXmx "$@"
jpayne@69	221 if [[ $set == 1 ]]; then
jpayne@69	222 return
jpayne@69	223 fi
jpayne@69	224 #Note that this uses slighly less (82%) of memory than normal to account for multiple pigz instances.
jpayne@69	225 freeRam 2000m 82
jpayne@69	226 z="-Xmx${RAM}m"
jpayne@69	227 z2="-Xms${RAM}m"
jpayne@69	228 }
jpayne@69	229 calcXmx "$@"
jpayne@69	230
jpayne@69	231 clumpify() {
jpayne@69	232 local CMD="java $EA $EOOM $z $z2 -cp $CP clump.Clumpify $@"
jpayne@69	233 echo $CMD >&2
jpayne@69	234 eval $CMD
jpayne@69	235 }
jpayne@69	236
jpayne@69	237 clumpify "$@"

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/clumpify.sh @ 69:33d812a61356