comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/bbmerge.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 #!/bin/bash
2
3 usage(){
4 echo "
5 Written by Brian Bushnell and Jonathan Rood
6 Last modified June 26, 2019
7
8 Description: Merges paired reads into single reads by overlap detection.
9 With sufficient coverage, can merge nonoverlapping reads by kmer extension.
10 Kmer modes (Tadpole or Bloom Filter) require much more memory, and should
11 be used with the bbmerge-auto.sh script rather than bbmerge.sh.
12 Please read bbmap/docs/guides/BBMergeGuide.txt for more information.
13
14 Usage for interleaved files: bbmerge.sh in=<reads> out=<merged reads> outu=<unmerged reads>
15 Usage for paired files: bbmerge.sh in1=<read1> in2=<read2> out=<merged reads> outu1=<unmerged1> outu2=<unmerged2>
16
17 Input may be stdin or a file, fasta or fastq, raw or gzipped.
18
19 Input parameters:
20 in=null Primary input. 'in2' will specify a second file.
21 interleaved=auto May be set to true or false to override autodetection of
22 whether the input file as interleaved.
23 reads=-1 Quit after this many read pairs (-1 means all).
24
25 Output parameters:
26 out=<file> File for merged reads. 'out2' will specify a second file.
27 outu=<file> File for unmerged reads. 'outu2' will specify a second file.
28 outinsert=<file> (outi) File to write read names and insert sizes.
29 outadapter=<file> (outa) File to write consensus adapter sequences.
30 outc=<file> File to write input read kmer cardinality estimate.
31 ihist=<file> (hist) Insert length histogram output file.
32 nzo=t Only print histogram bins with nonzero values.
33 showhiststats=t Print extra header lines with statistical information.
34 ziplevel=2 Set to 1 (lowest) through 9 (max) to change compression
35 level; lower compression is faster.
36 ordered=f Output reads in same order as input.
37 mix=f Output both the merged (or mergable) and unmerged reads
38 in the same file (out=). Useful for ecco mode.
39
40 Trimming/Filtering parameters:
41 qtrim=f Trim read ends to remove bases with quality below minq.
42 Trims BEFORE merging.
43 Values: t (trim both ends),
44 f (neither end),
45 r (right end only),
46 l (left end only).
47 qtrim2=f May be specified instead of qtrim to perform trimming
48 only if merging is unsuccessful, then retry merging.
49 trimq=10 Trim quality threshold. This may be a comma-delimited
50 list (ascending) to try multiple values.
51 minlength=1 (ml) Reads shorter than this after trimming, but before
52 merging, will be discarded. Pairs will be discarded only
53 if both are shorter.
54 maxlength=-1 Reads with longer insert sizes will be discarded.
55 tbo=f (trimbyoverlap) Trim overlapping reads to remove
56 rightmost (3') non-overlapping portion, instead of joining.
57 minavgquality=0 (maq) Reads with average quality below this, after
58 trimming, will not be attempted to be merged.
59 maxexpectederrors=0 (mee) If positive, reads with more combined expected
60 errors than this will not be attempted to be merged.
61 forcetrimleft=0 (ftl) If nonzero, trim left bases of the read to
62 this position (exclusive, 0-based).
63 forcetrimright=0 (ftr) If nonzero, trim right bases of the read
64 after this position (exclusive, 0-based).
65 forcetrimright2=0 (ftr2) If positive, trim this many bases on the right end.
66 forcetrimmod=5 (ftm) If positive, trim length to be equal to
67 zero modulo this number.
68 ooi=f Output only incorrectly merged reads, for testing.
69 trimpolya=t Trim trailing poly-A tail from adapter output. Only
70 affects outadapter. This also trims poly-A followed
71 by poly-G, which occurs on NextSeq.
72
73 Processing Parameters:
74 usejni=f (jni) Do overlapping in C code, which is faster. Requires
75 compiling the C code; details are in /jni/README.txt.
76 However, the jni path is currently disabled.
77 merge=t Create merged reads. If set to false, you can still
78 generate an insert histogram.
79 ecco=f Error-correct the overlapping part, but don't merge.
80 trimnonoverlapping=f (tno) Trim all non-overlapping portions, leaving only
81 consensus sequence. By default, only sequence to the
82 right of the overlap (adapter sequence) is trimmed.
83 useoverlap=t Attempt find the insert size using read overlap.
84 mininsert=35 Minimum insert size to merge reads.
85 mininsert0=35 Insert sizes less than this will not be considered.
86 Must be less than or equal to mininsert.
87 minoverlap=12 Minimum number of overlapping bases to allow merging.
88 minoverlap0=8 Overlaps shorter than this will not be considered.
89 Must be less than or equal to minoverlap.
90 minq=9 Ignore bases with quality below this.
91 maxq=41 Cap output quality scores at this.
92 entropy=t Increase the minimum overlap requirement for low-
93 complexity reads.
94 efilter=6 Ban overlaps with over this many times the expected
95 number of errors. Lower is more strict. -1 disables.
96 pfilter=0.00004 Ban improbable overlaps. Higher is more strict. 0 will
97 disable the filter; 1 will allow only perfect overlaps.
98 kfilter=0 Ban overlaps that create kmers with count below
99 this value (0 disables). If this is used minprob should
100 probably be set to 0. Requires good coverage.
101 ouq=f Calculate best overlap using quality values.
102 owq=t Calculate best overlap without using quality values.
103 usequality=t If disabled, quality values are completely ignored,
104 both for overlap detection and filtering. May be useful
105 for data with inaccurate quality values.
106 iupacton=f (itn) Change ambiguous IUPAC symbols to N.
107 adapter= Specify the adapter sequences used for these reads, if
108 known; this can be a fasta file or a literal sequence.
109 Read 1 and 2 can have adapters specified independently
110 with the adapter1 and adapter2 flags. adapter=default
111 will use a list of common adapter sequences.
112
113 Ratio Mode:
114 ratiomode=t Score overlaps based on the ratio of matching to
115 mismatching bases.
116 maxratio=0.09 Max error rate; higher increases merge rate.
117 ratiomargin=5.5 Lower increases merge rate; min is 1.
118 ratiooffset=0.55 Lower increases merge rate; min is 0.
119 maxmismatches=20 Maximum mismatches allowed in overlapping region.
120 ratiominoverlapreduction=3 This is the difference between minoverlap in
121 flat mode and minoverlap in ratio mode; generally,
122 minoverlap should be lower in ratio mode.
123 minsecondratio=0.1 Cutoff for second-best overlap ratio.
124 forcemerge=f Disable all filters and just merge everything
125 (not recommended).
126
127 Flat Mode:
128 flatmode=f Score overlaps based on the total number of mismatching
129 bases only.
130 margin=2 The best overlap must have at least 'margin' fewer
131 mismatches than the second best.
132 mismatches=3 Do not allow more than this many mismatches.
133 requireratiomatch=f (rrm) Require the answer from flat mode and ratio mode
134 to agree, reducing false positives if both are enabled.
135 trimonfailure=t (tof) If detecting insert size by overlap fails,
136 the reads will be trimmed and this will be re-attempted.
137
138
139 *** Ratio Mode and Flat Mode may be used alone or simultaneously. ***
140 *** Ratio Mode is usually more accurate and is the default mode. ***
141
142
143 Strictness (these are mutually exclusive macros that set other parameters):
144 strict=f Decrease false positive rate and merging rate.
145 verystrict=f (vstrict) Greatly decrease FP and merging rate.
146 ultrastrict=f (ustrict) Decrease FP and merging rate even more.
147 maxstrict=f (xstrict) Maximally decrease FP and merging rate.
148 loose=f Increase false positive rate and merging rate.
149 veryloose=f (vloose) Greatly increase FP and merging rate.
150 ultraloose=f (uloose) Increase FP and merging rate even more.
151 maxloose=f (xloose) Maximally decrease FP and merging rate.
152 fast=f Fastest possible mode; less accurate.
153
154 Tadpole Parameters (for read extension and error-correction):
155 *Note: These require more memory and should be run with bbmerge-auto.sh.*
156 k=31 Kmer length. 31 (or less) is fastest and uses the least
157 memory, but higher values may be more accurate.
158 60 tends to work well for 150bp reads.
159 extend=0 Extend reads to the right this much before merging.
160 Requires sufficient (>5x) kmer coverage.
161 extend2=0 Extend reads this much only after a failed merge attempt,
162 or in rem/rsem mode.
163 iterations=1 (ei) Iteratively attempt to extend by extend2 distance
164 and merge up to this many times.
165 rem=f (requireextensionmatch) Do not merge if the predicted
166 insert size differs before and after extension.
167 However, if only the extended reads overlap, then that
168 insert will be used. Requires setting extend2.
169 rsem=f (requirestrictextensionmatch) Similar to rem but stricter.
170 Reads will only merge if the predicted insert size before
171 and after extension match. Requires setting extend2.
172 Enables the lowest possible false-positive rate.
173 ecctadpole=f (ecct) If reads fail to merge, error-correct with Tadpole
174 and try again. This happens prior to extend2.
175 reassemble=t If ecct is enabled, use Tadpole's reassemble mode for
176 error correction. Alternatives are pincer and tail.
177 removedeadends (shave) Remove kmers leading to dead ends.
178 removebubbles (rinse) Remove kmers in error bubbles.
179 mindepthseed=3 (mds) Minimum kmer depth to begin extension.
180 mindepthextend=2 (mde) Minimum kmer depth continue extension.
181 branchmult1=20 Min ratio of 1st to 2nd-greatest path depth at high depth.
182 branchmult2=3 Min ratio of 1st to 2nd-greatest path depth at low depth.
183 branchlower=3 Max value of 2nd-greatest path depth to be considered low.
184 ibb=t Ignore backward branches when extending.
185 extra=<file> A file or comma-delimited list of files of reads to use
186 for kmer counting, but not for merging or output.
187 prealloc=f Pre-allocate memory rather than dynamically growing;
188 faster and more memory-efficient for large datasets.
189 A float fraction (0-1) may be specified, default 1.
190 prefilter=0 If set to a positive integer, use a countmin sketch to
191 ignore kmers with depth of that value or lower, to
192 reduce memory usage.
193 filtermem=0 Allows manually specifying prefilter memory in bytes, for
194 deterministic runs. 0 will set it automatically.
195 minprob=0.5 Ignore kmers with overall probability of correctness
196 below this, to reduce memory usage.
197 minapproxoverlap=26 For rem mode, do not merge reads if the extended reads
198 indicate that the raw reads should have overlapped by
199 at least this much, but no overlap was found.
200
201
202 Bloom Filter Parameters (for kmer operations with less memory than Tadpole)
203 *Note: These require more memory and should be run with bbmerge-auto.sh.*
204 eccbloom=f (eccb) If reads fail to merge, error-correct with bbcms
205 and try again.
206 testmerge=f Test kmer counts around the read merge junctions. If
207 it appears that the merge created new errors, undo it.
208 This reduces the false-positive rate, but not as much as
209 rem or rsem.
210
211 Java Parameters:
212 -Xmx This will set Java's memory usage,
213 overriding autodetection.
214 For example, -Xmx400m will specify 400 MB RAM.
215 -eoom This flag will cause the process to exit if an
216 out-of-memory exception occurs. Requires Java 8u92+.
217 -da Disable assertions.
218
219 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
220 "
221 }
222
223 #This block allows symlinked shellscripts to correctly set classpath.
224 pushd . > /dev/null
225 DIR="${BASH_SOURCE[0]}"
226 while [ -h "$DIR" ]; do
227 cd "$(dirname "$DIR")"
228 DIR="$(readlink "$(basename "$DIR")")"
229 done
230 cd "$(dirname "$DIR")"
231 DIR="$(pwd)/"
232 popd > /dev/null
233
234 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
235 CP="$DIR""current/"
236 JNI="-Djava.library.path=""$DIR""jni/"
237 #JNI=""
238
239 z="-Xmx1000m"
240 z2="-Xms1000m"
241 set=0
242
243 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
244 usage
245 exit
246 fi
247
248 calcXmx () {
249 source "$DIR""/calcmem.sh"
250 setEnvironment
251 parseXmx "$@"
252 }
253 calcXmx "$@"
254
255 function merge() {
256 local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP jgi.BBMerge $@"
257 echo $CMD >&2
258 eval $CMD
259 }
260
261 merge "$@"