annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/tadpole.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 #!/bin/bash
jpayne@69 2
jpayne@69 3 usage(){
jpayne@69 4 echo "
jpayne@69 5 Written by Brian Bushnell
jpayne@69 6 Last modified February 3, 2021
jpayne@69 7
jpayne@69 8 Description: Uses kmer counts to assemble contigs, extend sequences,
jpayne@69 9 or error-correct reads. Tadpole has no upper bound for kmer length,
jpayne@69 10 but some values are not supported. Specifically, it allows 1-31,
jpayne@69 11 multiples of 2 from 32-62, multiples of 3 from 63-93, etc.
jpayne@69 12 Please read bbmap/docs/guides/TadpoleGuide.txt for more information.
jpayne@69 13
jpayne@69 14 Usage:
jpayne@69 15 Assembly: tadpole.sh in=<reads> out=<contigs>
jpayne@69 16 Extension: tadpole.sh in=<reads> out=<extended> mode=extend
jpayne@69 17 Correction: tadpole.sh in=<reads> out=<corrected> mode=correct
jpayne@69 18
jpayne@69 19 Recommended parameters for optimal assembly:
jpayne@69 20 tadpole.sh in=<reads> out=<contigs> shave rinse pop k=<50-70% of read length>
jpayne@69 21
jpayne@69 22 Extension and correction may be done simultaneously. Error correction on
jpayne@69 23 multiple files may be done like this:
jpayne@69 24
jpayne@69 25 tadpole.sh in=libA_r1.fq,libA_merged.fq in2=libA_r2.fq,null extra=libB_r1.fq out=ecc_libA_r1.fq,ecc_libA_merged.fq out2=ecc_libA_r2.fq,null mode=correct
jpayne@69 26
jpayne@69 27 Extending contigs with reads could be done like this:
jpayne@69 28
jpayne@69 29 tadpole.sh in=contigs.fa out=extended.fa el=100 er=100 mode=extend extra=reads.fq k=62
jpayne@69 30
jpayne@69 31
jpayne@69 32 Input parameters:
jpayne@69 33 in=<file> Primary input file for reads to use as kmer data.
jpayne@69 34 in2=<file> Second input file for paired data.
jpayne@69 35 extra=<file> Extra files for use as kmer data, but not for error-
jpayne@69 36 correction or extension.
jpayne@69 37 reads=-1 Only process this number of reads, then quit (-1 means all).
jpayne@69 38 NOTE: in, in2, and extra may also be comma-delimited lists of files.
jpayne@69 39
jpayne@69 40 Output parameters:
jpayne@69 41 out=<file> Write contigs (in contig mode) or corrected/extended
jpayne@69 42 reads (in other modes).
jpayne@69 43 out2=<file> Second output file for paired output.
jpayne@69 44 outd=<file> Write discarded reads, if using junk-removal flags.
jpayne@69 45 dot=<file> Write a contigs connectivity graph (partially implemented)
jpayne@69 46 dump=<file> Write kmers and their counts.
jpayne@69 47 fastadump=t Write kmers and counts as fasta versus 2-column tsv.
jpayne@69 48 mincounttodump=1 Only dump kmers with at least this depth.
jpayne@69 49 showstats=t Print assembly statistics after writing contigs.
jpayne@69 50
jpayne@69 51 Prefiltering parameters:
jpayne@69 52 prefilter=0 If set to a positive integer, use a countmin sketch
jpayne@69 53 to ignore kmers with depth of that value or lower.
jpayne@69 54 prehashes=2 Number of hashes for prefilter.
jpayne@69 55 prefiltersize=0.2 (pff) Fraction of memory to use for prefilter.
jpayne@69 56 minprobprefilter=t (mpp) Use minprob for the prefilter.
jpayne@69 57 prepasses=1 Use this many prefiltering passes; higher may be more thorough
jpayne@69 58 if the filter is very full. Set to 'auto' to iteratively
jpayne@69 59 prefilter until the remaining kmers will fit in memory.
jpayne@69 60 onepass=f If true, prefilter will be generated in same pass as kmer
jpayne@69 61 counts. Much faster but counts will be lower, by up to
jpayne@69 62 prefilter's depth limit.
jpayne@69 63 filtermem=0 Allows manually specifying prefilter memory in bytes, for
jpayne@69 64 deterministic runs. 0 will set it automatically.
jpayne@69 65
jpayne@69 66 Hashing parameters:
jpayne@69 67 k=31 Kmer length (1 to infinity). Memory use increases with K.
jpayne@69 68 prealloc=t Pre-allocate memory rather than dynamically growing;
jpayne@69 69 faster and more memory-efficient. A float fraction (0-1)
jpayne@69 70 may be specified; default is 1.
jpayne@69 71 minprob=0.5 Ignore kmers with overall probability of correctness below this.
jpayne@69 72 minprobmain=t (mpm) Use minprob for the primary kmer counts.
jpayne@69 73 threads=X Spawn X worker threads; default is number of logical processors.
jpayne@69 74 buildthreads=X Spawn X contig-building threads. If not set, defaults to the same
jpayne@69 75 as threads. Setting this to 1 will make contigs deterministic.
jpayne@69 76 rcomp=t Store and count each kmer together with its reverse-complement.
jpayne@69 77 coremask=t All kmer extensions share the same hashcode.
jpayne@69 78 fillfast=t Speed up kmer extension lookups.
jpayne@69 79
jpayne@69 80 Assembly parameters:
jpayne@69 81 mincountseed=3 (mcs) Minimum kmer count to seed a new contig or begin extension.
jpayne@69 82 mincountextend=2 (mce) Minimum kmer count to continue extension of a read or contig.
jpayne@69 83 It is recommended that mce=1 for low-depth metagenomes.
jpayne@69 84 mincountretain=0 (mincr) Discard kmers with count below this.
jpayne@69 85 maxcountretain=INF (maxcr) Discard kmers with count above this.
jpayne@69 86 branchmult1=20 (bm1) Min ratio of 1st to 2nd-greatest path depth at high depth.
jpayne@69 87 branchmult2=3 (bm2) Min ratio of 1st to 2nd-greatest path depth at low depth.
jpayne@69 88 branchlower=3 (blc) Max value of 2nd-greatest path depth to be considered low.
jpayne@69 89 minextension=2 (mine) Do not keep contigs that did not extend at least this much.
jpayne@69 90 mincontig=auto (minc) Do not write contigs shorter than this.
jpayne@69 91 mincoverage=1 (mincov) Do not write contigs with average coverage below this.
jpayne@69 92 maxcoverage=inf (maxcov) Do not write contigs with average coverage above this.
jpayne@69 93 trimends=0 (trim) Trim contig ends by this much. Trimming by K/2
jpayne@69 94 may yield more accurate genome size estimation.
jpayne@69 95 trimcircular=t Trim one end of contigs ending in LOOP/LOOP by K-1,
jpayne@69 96 to eliminate the overlapping portion.
jpayne@69 97 contigpasses=16 Build contigs with decreasing seed depth for this many iterations.
jpayne@69 98 contigpassmult=1.7 Ratio between seed depth of two iterations.
jpayne@69 99 ownership=auto For concurrency; do not touch.
jpayne@69 100 processcontigs=f Explore the contig connectivity graph.
jpayne@69 101 popbubbles=t (pop) Pop bubbles; increases contiguity. Requires
jpayne@69 102 additional time and memory and forces processcontigs=t.
jpayne@69 103
jpayne@69 104 Processing modes:
jpayne@69 105 mode=contig contig: Make contigs from kmers.
jpayne@69 106 extend: Extend sequences to be longer, and optionally
jpayne@69 107 perform error correction.
jpayne@69 108 correct: Error correct only.
jpayne@69 109 insert: Measure insert sizes.
jpayne@69 110 discard: Discard low-depth reads, without error correction.
jpayne@69 111
jpayne@69 112 Extension parameters:
jpayne@69 113 extendleft=100 (el) Extend to the left by at most this many bases.
jpayne@69 114 extendright=100 (er) Extend to the right by at most this many bases.
jpayne@69 115 ibb=t (ignorebackbranches) Do not stop at backward branches.
jpayne@69 116 extendrollback=3 Trim a random number of bases, up to this many, on reads
jpayne@69 117 that extend only partially. This prevents the creation
jpayne@69 118 of sharp coverage discontinuities at branches.
jpayne@69 119
jpayne@69 120 Error-correction parameters:
jpayne@69 121 ecc=f Error correct via kmer counts.
jpayne@69 122 reassemble=t If ecc is enabled, use the reassemble algorithm.
jpayne@69 123 pincer=f If ecc is enabled, use the pincer algorithm.
jpayne@69 124 tail=f If ecc is enabled, use the tail algorithm.
jpayne@69 125 eccfull=f If ecc is enabled, use tail over the entire read.
jpayne@69 126 aggressive=f (aecc) Use aggressive error correction settings.
jpayne@69 127 Overrides some other flags like errormult1 and deadzone.
jpayne@69 128 conservative=f (cecc) Use conservative error correction settings.
jpayne@69 129 Overrides some other flags like errormult1 and deadzone.
jpayne@69 130 rollback=t Undo changes to reads that have lower coverage for
jpayne@69 131 any kmer after correction.
jpayne@69 132 markbadbases=0 (mbb) Any base fully covered by kmers with count below
jpayne@69 133 this will have its quality reduced.
jpayne@69 134 markdeltaonly=t (mdo) Only mark bad bases adjacent to good bases.
jpayne@69 135 meo=t (markerrorreadsonly) Only mark bad bases in reads
jpayne@69 136 containing errors.
jpayne@69 137 markquality=0 (mq) Set quality scores for marked bases to this.
jpayne@69 138 A level of 0 will also convert the base to an N.
jpayne@69 139 errormult1=16 (em1) Min ratio between kmer depths to call an error.
jpayne@69 140 errormult2=2.6 (em2) Alternate ratio between low-depth kmers.
jpayne@69 141 errorlowerconst=3 (elc) Use mult2 when the lower kmer is at most this deep.
jpayne@69 142 mincountcorrect=3 (mcc) Don't correct to kmers with count under this.
jpayne@69 143 pathsimilarityfraction=0.45 (psf) Max difference ratio considered similar.
jpayne@69 144 Controls whether a path appears to be continuous.
jpayne@69 145 pathsimilarityconstant=3 (psc) Absolute differences below this are ignored.
jpayne@69 146 errorextensionreassemble=5 (eer) Verify this many kmers before the error as
jpayne@69 147 having similar depth, for reassemble.
jpayne@69 148 errorextensionpincer=5 (eep) Verify this many additional bases after the
jpayne@69 149 error as matching current bases, for pincer.
jpayne@69 150 errorextensiontail=9 (eet) Verify additional bases before and after
jpayne@69 151 the error as matching current bases, for tail.
jpayne@69 152 deadzone=0 (dz) Do not try to correct bases within this distance of
jpayne@69 153 read ends.
jpayne@69 154 window=12 (w) Length of window to use in reassemble mode.
jpayne@69 155 windowcount=6 (wc) If more than this many errors are found within a
jpayne@69 156 window, halt correction in that direction.
jpayne@69 157 qualsum=80 (qs) If the sum of the qualities of corrected bases within
jpayne@69 158 a window exceeds this, halt correction in that direction.
jpayne@69 159 rbi=t (requirebidirectional) Require agreement from both
jpayne@69 160 directions when correcting errors in the middle part of
jpayne@69 161 the read using the reassemble algorithm.
jpayne@69 162 errorpath=1 (ep) For debugging purposes.
jpayne@69 163
jpayne@69 164 Junk-removal parameters (to only remove junk, set mode=discard):
jpayne@69 165 tossjunk=f Remove reads that cannot be used for assembly.
jpayne@69 166 This means they have no kmers above depth 1 (2 for paired
jpayne@69 167 reads) and the outermost kmers cannot be extended.
jpayne@69 168 Pairs are removed only if both reads fail.
jpayne@69 169 tossdepth=-1 Remove reads containing kmers at or below this depth.
jpayne@69 170 Pairs are removed if either read fails.
jpayne@69 171 lowdepthfraction=0 (ldf) Require at least this fraction of kmers to be
jpayne@69 172 low-depth to discard a read; range 0-1. 0 still
jpayne@69 173 requires at least 1 low-depth kmer.
jpayne@69 174 requirebothbad=f (rbb) Only discard pairs if both reads are low-depth.
jpayne@69 175 tossuncorrectable (tu) Discard reads containing uncorrectable errors.
jpayne@69 176 Requires error-correction to be enabled.
jpayne@69 177
jpayne@69 178 Shaving parameters:
jpayne@69 179 shave=t Remove dead ends (aka hair).
jpayne@69 180 rinse=t Remove bubbles.
jpayne@69 181 wash= Set shave and rinse at the same time.
jpayne@69 182 maxshavedepth=1 (msd) Shave or rinse kmers at most this deep.
jpayne@69 183 exploredist=300 (sed) Quit after exploring this far.
jpayne@69 184 discardlength=150 (sdl) Discard shavings up to this long.
jpayne@69 185 Note: Shave and rinse can produce substantially better assemblies
jpayne@69 186 for low-depth data, but they are very slow for large metagenomes.
jpayne@69 187
jpayne@69 188 Overlap parameters (for overlapping paired-end reads only):
jpayne@69 189 merge=f Attempt to merge overlapping reads prior to
jpayne@69 190 kmer-counting, and again prior to correction. Output
jpayne@69 191 will still be unmerged pairs.
jpayne@69 192 ecco=f Error correct via overlap, but do not merge reads.
jpayne@69 193 testmerge=t Test kmer counts around the read merge junctions. If
jpayne@69 194 it appears that the merge created new errors, undo it.
jpayne@69 195
jpayne@69 196 Java Parameters:
jpayne@69 197 -Xmx This will set Java's memory usage, overriding autodetection.
jpayne@69 198 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
jpayne@69 199 The max is typically 85% of physical memory.
jpayne@69 200 -eoom This flag will cause the process to exit if an
jpayne@69 201 out-of-memory exception occurs. Requires Java 8u92+.
jpayne@69 202 -da Disable assertions.
jpayne@69 203 "
jpayne@69 204 }
jpayne@69 205
# Compute DIR, the absolute directory (with trailing slash) that holds this
# script, following symlinks one hop at a time, then derive the classpath CP.
jpayne@69 206 #This block allows symlinked shellscripts to correctly set classpath.
# pushd records the caller's working directory so the cd calls below can be
# undone by the matching popd.
jpayne@69 207 pushd . > /dev/null
jpayne@69 208 DIR="${BASH_SOURCE[0]}"
# While DIR is a symlink: cd into the link's directory first, so that a
# relative target returned by readlink is resolved from the right place on the
# next iteration.
# NOTE(review): the cd calls are unchecked; if one fails, DIR ends up derived
# from whatever directory the shell is currently in.
jpayne@69 209 while [ -h "$DIR" ]; do
jpayne@69 210 cd "$(dirname "$DIR")"
jpayne@69 211 DIR="$(readlink "$(basename "$DIR")")"
jpayne@69 212 done
# DIR is no longer a symlink; turn its directory into an absolute path.
jpayne@69 213 cd "$(dirname "$DIR")"
jpayne@69 214 DIR="$(pwd)/"
jpayne@69 215 popd > /dev/null
jpayne@69 216
jpayne@69 217 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
# CP is the "current/" directory next to this script; it is passed to java
# through -cp by the tadpole function.
jpayne@69 218 CP="$DIR""current/"
jpayne@69 219
# Default JVM heap flags (maximum and initial heap, both 14g). calcXmx
# reassigns z and z2 from $RAM unless it returns early because set == 1.
jpayne@69 220 z="-Xmx14g"
jpayne@69 221 z2="-Xms14g"
# 'set' is tested against 1 inside calcXmx. Nothing in this file assigns it
# again; presumably parseXmx (from calcmem.sh) sets it when the user supplies
# an explicit -Xmx flag -- TODO confirm.
jpayne@69 222 set=0
jpayne@69 223
# With no first argument, or with -h / --help as the first argument, print the
# help text and exit. The bare 'exit' reuses the status of the preceding
# command (usage).
jpayne@69 224 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
jpayne@69 225 usage
jpayne@69 226 exit
jpayne@69 227 fi
jpayne@69 228
# calcXmx: choose the JVM heap flags z (-Xmx...) and z2 (-Xms...).
# Arguments: "$@" - the script's command-line arguments, forwarded to parseXmx.
# Globals:   DIR (read), set (read), RAM (read), z and z2 (written).
# setEnvironment, parseXmx and freeRam are not defined in this file; they are
# presumably provided by the sourced calcmem.sh -- TODO confirm.
jpayne@69 229 calcXmx () {
# DIR already ends in "/", so the sourced path contains a doubled slash.
jpayne@69 230 source "$DIR""/calcmem.sh"
jpayne@69 231 setEnvironment
jpayne@69 232 parseXmx "$@"
# set == 1 skips the automatic sizing below and leaves z/z2 as they are at this
# point. Presumably parseXmx sets it when an explicit -Xmx flag was given --
# TODO confirm.
jpayne@69 233 if [[ $set == 1 ]]; then
jpayne@69 234 return
jpayne@69 235 fi
# NOTE(review): freeRam apparently leaves a megabyte count in RAM (it is
# suffixed with "m" below); the meaning of its arguments 15000m and 84 is
# defined in calcmem.sh -- verify there.
jpayne@69 236 freeRam 15000m 84
jpayne@69 237 z="-Xmx${RAM}m"
jpayne@69 238 z2="-Xms${RAM}m"
jpayne@69 239 }
# Run the heap sizing once at startup with the user's arguments.
jpayne@69 240 calcXmx "$@"
jpayne@69 241
#######################################
# Launch assemble.Tadpole in a JVM, forwarding this script's arguments.
# Globals:   EA, EOOM (read; not assigned in this script -- presumably set by
#            the sourced calcmem.sh, TODO confirm)
#            z, z2 (read; -Xmx / -Xms heap flags)
#            CP (read; classpath passed to -cp)
# Arguments: "$@" - handed to assemble.Tadpole, one JVM argument per word
# Outputs:   the command line on stderr before it is executed
# Returns:   the exit status of java
#######################################
tadpole() {
  # Build the command as an argv array and execute it directly instead of
  # flattening it into a string for eval. eval re-parsed the whole line, so an
  # argument or install path containing whitespace was split apart, and shell
  # metacharacters inside an argument (;, $(...), >, *) were interpreted.
  # NOTE(review): callers that embedded literal quotes in an argument to
  # survive the old eval (e.g. 'in="my file.fq"') should now pass plain
  # "in=my file.fq".
  #
  # EA, EOOM, z and z2 stay unquoted on purpose: each holds zero or more
  # whitespace-separated JVM flags and must disappear entirely when empty.
  # shellcheck disable=SC2206
  local -a cmd=(java $EA $EOOM $z $z2 -cp "$CP" assemble.Tadpole "$@")
  printf '%s\n' "${cmd[*]}" >&2
  "${cmd[@]}"
}
jpayne@69 247
# Entry point: forward every command-line argument, unchanged, to the launcher.
jpayne@69 248 tadpole "$@"