Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/bbmerge.sh @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 #!/bin/bash | |
2 | |
3 usage(){ | |
4 echo " | |
5 Written by Brian Bushnell and Jonathan Rood | |
6 Last modified June 26, 2019 | |
7 | |
8 Description: Merges paired reads into single reads by overlap detection. | |
9 With sufficient coverage, can merge nonoverlapping reads by kmer extension. | |
10 Kmer modes (Tadpole or Bloom Filter) require much more memory, and should | |
11 be used with the bbmerge-auto.sh script rather than bbmerge.sh. | |
12 Please read bbmap/docs/guides/BBMergeGuide.txt for more information. | |
13 | |
14 Usage for interleaved files: bbmerge.sh in=<reads> out=<merged reads> outu=<unmerged reads> | |
15 Usage for paired files: bbmerge.sh in1=<read1> in2=<read2> out=<merged reads> outu1=<unmerged1> outu2=<unmerged2> | |
16 | |
17 Input may be stdin or a file, fasta or fastq, raw or gzipped. | |
18 | |
19 Input parameters: | |
20 in=null Primary input. 'in2' will specify a second file. | |
21 interleaved=auto May be set to true or false to override autodetection of | |
22 whether the input file as interleaved. | |
23 reads=-1 Quit after this many read pairs (-1 means all). | |
24 | |
25 Output parameters: | |
26 out=<file> File for merged reads. 'out2' will specify a second file. | |
27 outu=<file> File for unmerged reads. 'outu2' will specify a second file. | |
28 outinsert=<file> (outi) File to write read names and insert sizes. | |
29 outadapter=<file> (outa) File to write consensus adapter sequences. | |
30 outc=<file> File to write input read kmer cardinality estimate. | |
31 ihist=<file> (hist) Insert length histogram output file. | |
32 nzo=t Only print histogram bins with nonzero values. | |
33 showhiststats=t Print extra header lines with statistical information. | |
34 ziplevel=2 Set to 1 (lowest) through 9 (max) to change compression | |
35 level; lower compression is faster. | |
36 ordered=f Output reads in same order as input. | |
37 mix=f Output both the merged (or mergable) and unmerged reads | |
38 in the same file (out=). Useful for ecco mode. | |
39 | |
40 Trimming/Filtering parameters: | |
41 qtrim=f Trim read ends to remove bases with quality below minq. | |
42 Trims BEFORE merging. | |
43 Values: t (trim both ends), | |
44 f (neither end), | |
45 r (right end only), | |
46 l (left end only). | |
47 qtrim2=f May be specified instead of qtrim to perform trimming | |
48 only if merging is unsuccessful, then retry merging. | |
49 trimq=10 Trim quality threshold. This may be a comma-delimited | |
50 list (ascending) to try multiple values. | |
51 minlength=1 (ml) Reads shorter than this after trimming, but before | |
52 merging, will be discarded. Pairs will be discarded only | |
53 if both are shorter. | |
54 maxlength=-1 Reads with longer insert sizes will be discarded. | |
55 tbo=f (trimbyoverlap) Trim overlapping reads to remove | |
56 rightmost (3') non-overlapping portion, instead of joining. | |
57 minavgquality=0 (maq) Reads with average quality below this, after | |
58 trimming, will not be attempted to be merged. | |
59 maxexpectederrors=0 (mee) If positive, reads with more combined expected | |
60 errors than this will not be attempted to be merged. | |
61 forcetrimleft=0 (ftl) If nonzero, trim left bases of the read to | |
62 this position (exclusive, 0-based). | |
63 forcetrimright=0 (ftr) If nonzero, trim right bases of the read | |
64 after this position (exclusive, 0-based). | |
65 forcetrimright2=0 (ftr2) If positive, trim this many bases on the right end. | |
66 forcetrimmod=5 (ftm) If positive, trim length to be equal to | |
67 zero modulo this number. | |
68 ooi=f Output only incorrectly merged reads, for testing. | |
69 trimpolya=t Trim trailing poly-A tail from adapter output. Only | |
70 affects outadapter. This also trims poly-A followed | |
71 by poly-G, which occurs on NextSeq. | |
72 | |
73 Processing Parameters: | |
74 usejni=f (jni) Do overlapping in C code, which is faster. Requires | |
75 compiling the C code; details are in /jni/README.txt. | |
76 However, the jni path is currently disabled. | |
77 merge=t Create merged reads. If set to false, you can still | |
78 generate an insert histogram. | |
79 ecco=f Error-correct the overlapping part, but don't merge. | |
80 trimnonoverlapping=f (tno) Trim all non-overlapping portions, leaving only | |
81 consensus sequence. By default, only sequence to the | |
82 right of the overlap (adapter sequence) is trimmed. | |
83 useoverlap=t Attempt find the insert size using read overlap. | |
84 mininsert=35 Minimum insert size to merge reads. | |
85 mininsert0=35 Insert sizes less than this will not be considered. | |
86 Must be less than or equal to mininsert. | |
87 minoverlap=12 Minimum number of overlapping bases to allow merging. | |
88 minoverlap0=8 Overlaps shorter than this will not be considered. | |
89 Must be less than or equal to minoverlap. | |
90 minq=9 Ignore bases with quality below this. | |
91 maxq=41 Cap output quality scores at this. | |
92 entropy=t Increase the minimum overlap requirement for low- | |
93 complexity reads. | |
94 efilter=6 Ban overlaps with over this many times the expected | |
95 number of errors. Lower is more strict. -1 disables. | |
96 pfilter=0.00004 Ban improbable overlaps. Higher is more strict. 0 will | |
97 disable the filter; 1 will allow only perfect overlaps. | |
98 kfilter=0 Ban overlaps that create kmers with count below | |
99 this value (0 disables). If this is used minprob should | |
100 probably be set to 0. Requires good coverage. | |
101 ouq=f Calculate best overlap using quality values. | |
102 owq=t Calculate best overlap without using quality values. | |
103 usequality=t If disabled, quality values are completely ignored, | |
104 both for overlap detection and filtering. May be useful | |
105 for data with inaccurate quality values. | |
106 iupacton=f (itn) Change ambiguous IUPAC symbols to N. | |
107 adapter= Specify the adapter sequences used for these reads, if | |
108 known; this can be a fasta file or a literal sequence. | |
109 Read 1 and 2 can have adapters specified independently | |
110 with the adapter1 and adapter2 flags. adapter=default | |
111 will use a list of common adapter sequences. | |
112 | |
113 Ratio Mode: | |
114 ratiomode=t Score overlaps based on the ratio of matching to | |
115 mismatching bases. | |
116 maxratio=0.09 Max error rate; higher increases merge rate. | |
117 ratiomargin=5.5 Lower increases merge rate; min is 1. | |
118 ratiooffset=0.55 Lower increases merge rate; min is 0. | |
119 maxmismatches=20 Maximum mismatches allowed in overlapping region. | |
120 ratiominoverlapreduction=3 This is the difference between minoverlap in | |
121 flat mode and minoverlap in ratio mode; generally, | |
122 minoverlap should be lower in ratio mode. | |
123 minsecondratio=0.1 Cutoff for second-best overlap ratio. | |
124 forcemerge=f Disable all filters and just merge everything | |
125 (not recommended). | |
126 | |
127 Flat Mode: | |
128 flatmode=f Score overlaps based on the total number of mismatching | |
129 bases only. | |
130 margin=2 The best overlap must have at least 'margin' fewer | |
131 mismatches than the second best. | |
132 mismatches=3 Do not allow more than this many mismatches. | |
133 requireratiomatch=f (rrm) Require the answer from flat mode and ratio mode | |
134 to agree, reducing false positives if both are enabled. | |
135 trimonfailure=t (tof) If detecting insert size by overlap fails, | |
136 the reads will be trimmed and this will be re-attempted. | |
137 | |
138 | |
139 *** Ratio Mode and Flat Mode may be used alone or simultaneously. *** | |
140 *** Ratio Mode is usually more accurate and is the default mode. *** | |
141 | |
142 | |
143 Strictness (these are mutually exclusive macros that set other parameters): | |
144 strict=f Decrease false positive rate and merging rate. | |
145 verystrict=f (vstrict) Greatly decrease FP and merging rate. | |
146 ultrastrict=f (ustrict) Decrease FP and merging rate even more. | |
147 maxstrict=f (xstrict) Maximally decrease FP and merging rate. | |
148 loose=f Increase false positive rate and merging rate. | |
149 veryloose=f (vloose) Greatly increase FP and merging rate. | |
150 ultraloose=f (uloose) Increase FP and merging rate even more. | |
151 maxloose=f (xloose) Maximally decrease FP and merging rate. | |
152 fast=f Fastest possible mode; less accurate. | |
153 | |
154 Tadpole Parameters (for read extension and error-correction): | |
155 *Note: These require more memory and should be run with bbmerge-auto.sh.* | |
156 k=31 Kmer length. 31 (or less) is fastest and uses the least | |
157 memory, but higher values may be more accurate. | |
158 60 tends to work well for 150bp reads. | |
159 extend=0 Extend reads to the right this much before merging. | |
160 Requires sufficient (>5x) kmer coverage. | |
161 extend2=0 Extend reads this much only after a failed merge attempt, | |
162 or in rem/rsem mode. | |
163 iterations=1 (ei) Iteratively attempt to extend by extend2 distance | |
164 and merge up to this many times. | |
165 rem=f (requireextensionmatch) Do not merge if the predicted | |
166 insert size differs before and after extension. | |
167 However, if only the extended reads overlap, then that | |
168 insert will be used. Requires setting extend2. | |
169 rsem=f (requirestrictextensionmatch) Similar to rem but stricter. | |
170 Reads will only merge if the predicted insert size before | |
171 and after extension match. Requires setting extend2. | |
172 Enables the lowest possible false-positive rate. | |
173 ecctadpole=f (ecct) If reads fail to merge, error-correct with Tadpole | |
174 and try again. This happens prior to extend2. | |
175 reassemble=t If ecct is enabled, use Tadpole's reassemble mode for | |
176 error correction. Alternatives are pincer and tail. | |
177 removedeadends (shave) Remove kmers leading to dead ends. | |
178 removebubbles (rinse) Remove kmers in error bubbles. | |
179 mindepthseed=3 (mds) Minimum kmer depth to begin extension. | |
180 mindepthextend=2 (mde) Minimum kmer depth continue extension. | |
181 branchmult1=20 Min ratio of 1st to 2nd-greatest path depth at high depth. | |
182 branchmult2=3 Min ratio of 1st to 2nd-greatest path depth at low depth. | |
183 branchlower=3 Max value of 2nd-greatest path depth to be considered low. | |
184 ibb=t Ignore backward branches when extending. | |
185 extra=<file> A file or comma-delimited list of files of reads to use | |
186 for kmer counting, but not for merging or output. | |
187 prealloc=f Pre-allocate memory rather than dynamically growing; | |
188 faster and more memory-efficient for large datasets. | |
189 A float fraction (0-1) may be specified, default 1. | |
190 prefilter=0 If set to a positive integer, use a countmin sketch to | |
191 ignore kmers with depth of that value or lower, to | |
192 reduce memory usage. | |
193 filtermem=0 Allows manually specifying prefilter memory in bytes, for | |
194 deterministic runs. 0 will set it automatically. | |
195 minprob=0.5 Ignore kmers with overall probability of correctness | |
196 below this, to reduce memory usage. | |
197 minapproxoverlap=26 For rem mode, do not merge reads if the extended reads | |
198 indicate that the raw reads should have overlapped by | |
199 at least this much, but no overlap was found. | |
200 | |
201 | |
202 Bloom Filter Parameters (for kmer operations with less memory than Tadpole) | |
203 *Note: These require more memory and should be run with bbmerge-auto.sh.* | |
204 eccbloom=f (eccb) If reads fail to merge, error-correct with bbcms | |
205 and try again. | |
206 testmerge=f Test kmer counts around the read merge junctions. If | |
207 it appears that the merge created new errors, undo it. | |
208 This reduces the false-positive rate, but not as much as | |
209 rem or rsem. | |
210 | |
211 Java Parameters: | |
212 -Xmx This will set Java's memory usage, | |
213 overriding autodetection. | |
214 For example, -Xmx400m will specify 400 MB RAM. | |
215 -eoom This flag will cause the process to exit if an | |
216 out-of-memory exception occurs. Requires Java 8u92+. | |
217 -da Disable assertions. | |
218 | |
219 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems. | |
220 " | |
221 } | |
222 | |
223 #This block allows symlinked shellscripts to correctly set classpath. | |
224 pushd . > /dev/null | |
225 DIR="${BASH_SOURCE[0]}" | |
226 while [ -h "$DIR" ]; do | |
227 cd "$(dirname "$DIR")" | |
228 DIR="$(readlink "$(basename "$DIR")")" | |
229 done | |
230 cd "$(dirname "$DIR")" | |
231 DIR="$(pwd)/" | |
232 popd > /dev/null | |
233 | |
234 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" | |
235 CP="$DIR""current/" | |
236 JNI="-Djava.library.path=""$DIR""jni/" | |
237 #JNI="" | |
238 | |
239 z="-Xmx1000m" | |
240 z2="-Xms1000m" | |
241 set=0 | |
242 | |
243 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then | |
244 usage | |
245 exit | |
246 fi | |
247 | |
248 calcXmx () { | |
249 source "$DIR""/calcmem.sh" | |
250 setEnvironment | |
251 parseXmx "$@" | |
252 } | |
253 calcXmx "$@" | |
254 | |
255 function merge() { | |
256 local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP jgi.BBMerge $@" | |
257 echo $CMD >&2 | |
258 eval $CMD | |
259 } | |
260 | |
261 merge "$@" |