Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/bbmap.sh @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 #!/bin/bash | |
2 | |
3 usage(){ | |
4 echo " | |
5 BBMap | |
6 Written by Brian Bushnell, from Dec. 2010 - present | |
7 Last modified September 15, 2022 | |
8 | |
9 Description: Fast and accurate splice-aware read aligner. | |
10 Please read bbmap/docs/guides/BBMapGuide.txt for more information. | |
11 | |
12 To index: bbmap.sh ref=<reference fasta> | |
13 To map: bbmap.sh in=<reads> out=<output sam> | |
14 To map without writing an index: | |
15 bbmap.sh ref=<reference fasta> in=<reads> out=<output sam> nodisk | |
16 | |
17 in=stdin will accept reads from standard in, and out=stdout will write to | |
18 standard out, but file extensions are still needed to specify the format of the | |
19 input and output files e.g. in=stdin.fa.gz will read gzipped fasta from | |
20 standard in; out=stdout.sam.gz will write gzipped sam. | |
21 | |
22 Indexing Parameters (required when building the index): | |
23 nodisk=f Set to true to build index in memory and write nothing | |
24 to disk except output. | |
25 ref=<file> Specify the reference sequence. Only do this ONCE, | |
26 when building the index (unless using 'nodisk'). | |
27 build=1 If multiple references are indexed in the same directory, | |
28 each needs a unique numeric ID (unless using 'nodisk'). | |
29 k=13 Kmer length, range 8-15. Longer is faster but uses | |
30 more memory. Shorter is more sensitive. | |
31 If indexing and mapping are done in two steps, K should | |
32 be specified each time. | |
33 path=<.> Specify the location to write the index, if you don't | |
34 want it in the current working directory. | |
35 usemodulo=f Throw away ~80% of kmers based on remainder modulo a | |
36 number (reduces RAM by 50% and sensitivity slightly). | |
37 Should be enabled both when building the index AND | |
38 when mapping. | |
39 rebuild=f Force a rebuild of the index (ref= should be set). | |
40 | |
41 Input Parameters: | |
42 build=1 Designate index to use. Corresponds to the number | |
43 specified when building the index. | |
44 in=<file> Primary reads input; required parameter. | |
45 in2=<file> For paired reads in two files. | |
46 interleaved=auto True forces paired/interleaved input; false forces | |
47 single-ended mapping. If not specified, interleaved | |
48 status will be autodetected from read names. | |
49 fastareadlen=500 Break up FASTA reads longer than this. Max is 500 for | |
50 BBMap and 6000 for BBMapPacBio. Only works for FASTA | |
51 input (use 'maxlen' for FASTQ input). The default for | |
52 bbmap.sh is 500, and for mapPacBio.sh is 6000. | |
53 unpigz=f Spawn a pigz (parallel gzip) process for faster | |
54 decompression than using Java. | |
55 Requires pigz to be installed. | |
56 touppercase=t (tuc) Convert lowercase letters in reads to upper case | |
57 (otherwise they will not match the reference). | |
58 | |
59 Sampling Parameters: | |
60 | |
61 reads=-1 Set to a positive number N to only process the first N | |
62 reads (or pairs), then quit. -1 means use all reads. | |
63 samplerate=1 Set to a number from 0 to 1 to randomly select that | |
64 fraction of reads for mapping. 1 uses all reads. | |
65 skipreads=0 Set to a number N to skip the first N reads (or pairs), | |
66 then map the rest. | |
67 | |
68 Mapping Parameters: | |
69 fast=f This flag is a macro which sets other paramters to run | |
70 faster, at reduced sensitivity. Bad for RNA-seq. | |
71 slow=f This flag is a macro which sets other paramters to run | |
72 slower, at greater sensitivity. 'vslow' is even slower. | |
73 maxindel=16000 Don't look for indels longer than this. Lower is faster. | |
74 Set to >=100k for RNAseq with long introns like mammals. | |
75 strictmaxindel=f When enabled, do not allow indels longer than 'maxindel'. | |
76 By default these are not sought, but may be found anyway. | |
77 tipsearch=100 Look this far for read-end deletions with anchors | |
78 shorter than K, using brute force. | |
79 minid=0.76 Approximate minimum alignment identity to look for. | |
80 Higher is faster and less sensitive. | |
81 minhits=1 Minimum number of seed hits required for candidate sites. | |
82 Higher is faster. | |
83 local=f Set to true to use local, rather than global, alignments. | |
84 This will soft-clip ugly ends of poor alignments. | |
85 perfectmode=f Allow only perfect mappings when set to true (very fast). | |
86 semiperfectmode=f Allow only perfect and semiperfect (perfect except for | |
87 N's in the reference) mappings. | |
88 threads=auto (t) Set to number of threads desired. By default, uses | |
89 all cores available. | |
90 ambiguous=best (ambig) Set behavior on ambiguously-mapped reads (with | |
91 multiple top-scoring mapping locations). | |
92 best (use the first best site) | |
93 toss (consider unmapped) | |
94 random (select one top-scoring site randomly) | |
95 all (retain all top-scoring sites) | |
96 samestrandpairs=f (ssp) Specify whether paired reads should map to the | |
97 same strand or opposite strands. | |
98 requirecorrectstrand=t (rcs) Forbid pairing of reads without correct strand | |
99 orientation. Set to false for long-mate-pair libraries. | |
100 killbadpairs=f (kbp) If a read pair is mapped with an inappropriate | |
101 insert size or orientation, the read with the lower | |
102 mapping quality is marked unmapped. | |
103 pairedonly=f (po) Treat unpaired reads as unmapped. Thus they will | |
104 be sent to 'outu' but not 'outm'. | |
105 rcomp=f Reverse complement both reads prior to mapping (for LMP | |
106 outward-facing libraries). | |
107 rcompmate=f Reverse complement read2 prior to mapping. | |
108 pairlen=32000 Set max allowed distance between paired reads. | |
109 (insert size)=(pairlen)+(read1 length)+(read2 length) | |
110 rescuedist=1200 Don't try to rescue paired reads if avg. insert size | |
111 greater than this. Lower is faster. | |
112 rescuemismatches=32 Maximum mismatches allowed in a rescued read. Lower | |
113 is faster. | |
114 averagepairdist=100 (apd) Initial average distance between paired reads. | |
115 Varies dynamically; does not need to be specified. | |
116 deterministic=f Run in deterministic mode. In this case it is good | |
117 to set averagepairdist. BBMap is deterministic | |
118 without this flag if using single-ended reads, | |
119 or run singlethreaded. | |
120 bandwidthratio=0 (bwr) If above zero, restrict alignment band to this | |
121 fraction of read length. Faster but less accurate. | |
122 bandwidth=0 (bw) Set the bandwidth directly. | |
123 fraction of read length. Faster but less accurate. | |
124 usejni=f (jni) Do alignments faster, in C code. Requires | |
125 compiling the C code; details are in /jni/README.txt. | |
126 maxsites2=800 Don't analyze (or print) more than this many alignments | |
127 per read. | |
128 ignorefrequentkmers=t (ifk) Discard low-information kmers that occur often. | |
129 excludefraction=0.03 (ef) Fraction of kmers to ignore. For example, 0.03 | |
130 will ignore the most common 3% of kmers. | |
131 greedy=t Use a greedy algorithm to discard the least-useful | |
132 kmers on a per-read basis. | |
133 kfilter=0 If positive, potential mapping sites must have at | |
134 least this many consecutive exact matches. | |
135 | |
136 | |
137 Quality and Trimming Parameters: | |
138 qin=auto Set to 33 or 64 to specify input quality value ASCII | |
139 offset. 33 is Sanger, 64 is old Solexa. | |
140 qout=auto Set to 33 or 64 to specify output quality value ASCII | |
141 offset (only if output format is fastq). | |
142 qtrim=f Quality-trim ends before mapping. Options are: | |
143 'f' (false), 'l' (left), 'r' (right), and 'lr' (both). | |
144 untrim=f Undo trimming after mapping. Untrimmed bases will be | |
145 soft-clipped in cigar strings. | |
146 trimq=6 Trim regions with average quality below this | |
147 (phred algorithm). | |
148 mintrimlength=60 (mintl) Don't trim reads to be shorter than this. | |
149 fakefastaquality=-1 (ffq) Set to a positive number 1-50 to generate fake | |
150 quality strings for fasta input reads. | |
151 ignorebadquality=f (ibq) Keep going, rather than crashing, if a read has | |
152 out-of-range quality values. | |
153 usequality=t Use quality scores when determining which read kmers | |
154 to use as seeds. | |
155 minaveragequality=0 (maq) Do not map reads with average quality below this. | |
156 maqb=0 If positive, calculate maq from this many initial bases. | |
157 | |
158 Output Parameters: | |
159 out=<file> Write all reads to this file. | |
160 outu=<file> Write only unmapped reads to this file. Does not | |
161 include unmapped paired reads with a mapped mate. | |
162 outm=<file> Write only mapped reads to this file. Includes | |
163 unmapped paired reads with a mapped mate. | |
164 mappedonly=f If true, treats 'out' like 'outm'. | |
165 bamscript=<file> (bs) Write a shell script to <file> that will turn | |
166 the sam output into a sorted, indexed bam file. | |
167 ordered=f Set to true to output reads in same order as input. | |
168 Slower and uses more memory. | |
169 overwrite=f (ow) Allow process to overwrite existing files. | |
170 secondary=f Print secondary alignments. | |
171 sssr=0.95 (secondarysitescoreratio) Print only secondary alignments | |
172 with score of at least this fraction of primary. | |
173 ssao=f (secondarysiteasambiguousonly) Only print secondary | |
174 alignments for ambiguously-mapped reads. | |
175 maxsites=5 Maximum number of total alignments to print per read. | |
176 Only relevant when secondary=t. | |
177 quickmatch=f Generate cigar strings more quickly. | |
178 trimreaddescriptions=f (trd) Truncate read and ref names at the first whitespace, | |
179 assuming that the remainder is a comment or description. | |
180 ziplevel=2 (zl) Compression level for zip or gzip output. | |
181 pigz=f Spawn a pigz (parallel gzip) process for faster | |
182 compression than Java. Requires pigz to be installed. | |
183 machineout=f Set to true to output statistics in machine-friendly | |
184 'key=value' format. | |
185 printunmappedcount=f Print the total number of unmapped reads and bases. | |
186 If input is paired, the number will be of pairs | |
187 for which both reads are unmapped. | |
188 showprogress=0 If positive, print a '.' every X reads. | |
189 showprogress2=0 If positive, print the number of seconds since the | |
190 last progress update (instead of a '.'). | |
191 renamebyinsert=f Renames reads based on their mapped insert size. | |
192 | |
193 Bloom-Filtering Parameters (bloomfilter.sh is the standalone version). | |
194 bloom=f Use a Bloom filter to ignore reads not sharing kmers | |
195 with the reference. This uses more memory, but speeds | |
196 mapping when most reads don't match the reference. | |
197 bloomhashes=2 Number of hash functions. | |
198 bloomminhits=3 Number of consecutive hits to be considered matched. | |
199 bloomk=31 Bloom filter kmer length. | |
200 bloomserial=t Use the serialized Bloom filter for greater loading | |
201 speed, if available. If not, generate and write one. | |
202 | |
203 Post-Filtering Parameters: | |
204 idfilter=0 Independant of minid; sets exact minimum identity | |
205 allowed for alignments to be printed. Range 0 to 1. | |
206 subfilter=-1 Ban alignments with more than this many substitutions. | |
207 insfilter=-1 Ban alignments with more than this many insertions. | |
208 delfilter=-1 Ban alignments with more than this many deletions. | |
209 indelfilter=-1 Ban alignments with more than this many indels. | |
210 editfilter=-1 Ban alignments with more than this many edits. | |
211 inslenfilter=-1 Ban alignments with an insertion longer than this. | |
212 dellenfilter=-1 Ban alignments with a deletion longer than this. | |
213 nfilter=-1 Ban alignments with more than this many ns. This | |
214 includes nocall, noref, and off scaffold ends. | |
215 | |
216 Sam flags and settings: | |
217 noheader=f Disable generation of header lines. | |
218 sam=1.4 Set to 1.4 to write Sam version 1.4 cigar strings, | |
219 with = and X, or 1.3 to use M. | |
220 saa=t (secondaryalignmentasterisks) Use asterisks instead of | |
221 bases for sam secondary alignments. | |
222 cigar=t Set to 'f' to skip generation of cigar strings (faster). | |
223 keepnames=f Keep original names of paired reads, rather than | |
224 ensuring both reads have the same name. | |
225 intronlen=999999999 Set to a lower number like 10 to change 'D' to 'N' in | |
226 cigar strings for deletions of at least that length. | |
227 rgid= Set readgroup ID. All other readgroup fields | |
228 can be set similarly, with the flag rgXX= | |
229 If you set a readgroup flag to the word 'filename', | |
230 e.g. rgid=filename, the input file name will be used. | |
231 mdtag=f Write MD tags. | |
232 nhtag=f Write NH tags. | |
233 xmtag=f Write XM tags (may only work correctly with ambig=all). | |
234 amtag=f Write AM tags. | |
235 nmtag=f Write NM tags. | |
236 xstag=f Set to 'xs=fs', 'xs=ss', or 'xs=us' to write XS tags | |
237 for RNAseq using firststrand, secondstrand, or | |
238 unstranded libraries. Needed by Cufflinks. | |
239 JGI mainly uses 'firststrand'. | |
240 stoptag=f Write a tag indicating read stop location, prefixed by YS:i: | |
241 lengthtag=f Write a tag indicating (query,ref) alignment lengths, | |
242 prefixed by YL:Z: | |
243 idtag=f Write a tag indicating percent identity, prefixed by YI:f: | |
244 inserttag=f Write a tag indicating insert size, prefixed by X8:Z: | |
245 scoretag=f Write a tag indicating BBMap's raw score, prefixed by YR:i: | |
246 timetag=f Write a tag indicating this read's mapping time, prefixed by X0:i: | |
247 boundstag=f Write a tag indicating whether either read in the pair | |
248 goes off the end of the reference, prefixed by XB:Z: | |
249 notags=f Turn off all optional tags. | |
250 | |
251 Histogram and statistics output parameters: | |
252 scafstats=<file> Statistics on how many reads mapped to which scaffold. | |
253 refstats=<file> Statistics on how many reads mapped to which reference | |
254 file; only for BBSplit. | |
255 sortscafs=t Sort scaffolds or references by read count. | |
256 bhist=<file> Base composition histogram by position. | |
257 qhist=<file> Quality histogram by position. | |
258 aqhist=<file> Histogram of average read quality. | |
259 bqhist=<file> Quality histogram designed for box plots. | |
260 lhist=<file> Read length histogram. | |
261 ihist=<file> Write histogram of insert sizes (for paired reads). | |
262 ehist=<file> Errors-per-read histogram. | |
263 qahist=<file> Quality accuracy histogram of error rates versus | |
264 quality score. | |
265 indelhist=<file> Indel length histogram. | |
266 mhist=<file> Histogram of match, sub, del, and ins rates by | |
267 read location. | |
268 gchist=<file> Read GC content histogram. | |
269 gcbins=100 Number gchist bins. Set to 'auto' to use read length. | |
270 gcpairs=t Use average GC of paired reads. | |
271 idhist=<file> Histogram of read count versus percent identity. | |
272 idbins=100 Number idhist bins. Set to 'auto' to use read length. | |
273 statsfile=stderr Mapping statistics are printed here. | |
274 | |
275 Coverage output parameters (these may reduce speed and use more RAM): | |
276 covstats=<file> Per-scaffold coverage info. | |
277 rpkm=<file> Per-scaffold RPKM/FPKM counts. | |
278 covhist=<file> Histogram of # occurrences of each depth level. | |
279 basecov=<file> Coverage per base location. | |
280 bincov=<file> Print binned coverage per location (one line per X bases). | |
281 covbinsize=1000 Set the binsize for binned coverage output. | |
282 nzo=t Only print scaffolds with nonzero coverage. | |
283 twocolumn=f Change to true to print only ID and Avg_fold instead of | |
284 all 6 columns to the 'out=' file. | |
285 32bit=f Set to true if you need per-base coverage over 64k. | |
286 strandedcov=f Track coverage for plus and minus strand independently. | |
287 startcov=f Only track start positions of reads. | |
288 secondarycov=t Include coverage of secondary alignments. | |
289 physcov=f Calculate physical coverage for paired reads. | |
290 This includes the unsequenced bases. | |
291 delcoverage=t (delcov) Count bases covered by deletions as covered. | |
292 True is faster than false. | |
293 covk=0 If positive, calculate kmer coverage statistics. | |
294 | |
295 Java Parameters: | |
296 -Xmx This will set Java's memory usage, | |
297 overriding autodetection. | |
298 -Xmx20g will specify 20 gigs of RAM, and -Xmx800m | |
299 will specify 800 megs. The max is typically 85% of | |
300 physical memory. The human genome requires around 24g, | |
301 or 12g with the 'usemodulo' flag. The index uses | |
302 roughly 6 bytes per reference base. | |
303 -eoom This flag will cause the process to exit if an | |
304 out-of-memory exception occurs. Requires Java 8u92+. | |
305 -da Disable assertions. | |
306 | |
307 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter | |
308 any problems, or post at: http://seqanswers.com/forums/showthread.php?t=41057 | |
309 " | |
310 } | |
311 | |
312 #This block allows symlinked shellscripts to correctly set classpath. | |
313 pushd . > /dev/null | |
314 DIR="${BASH_SOURCE[0]}" | |
315 while [ -h "$DIR" ]; do | |
316 cd "$(dirname "$DIR")" | |
317 DIR="$(readlink "$(basename "$DIR")")" | |
318 done | |
319 cd "$(dirname "$DIR")" | |
320 DIR="$(pwd)/" | |
321 popd > /dev/null | |
322 | |
323 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" | |
324 CP="$DIR""current/" | |
325 JNI="-Djava.library.path=""$DIR""jni/" | |
326 JNI="" | |
327 | |
328 z="-Xmx1g" | |
329 z2="-Xms1g" | |
330 set=0 | |
331 | |
332 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then | |
333 usage | |
334 exit | |
335 fi | |
336 | |
337 calcXmx () { | |
338 source "$DIR""/calcmem.sh" | |
339 setEnvironment | |
340 parseXmx "$@" | |
341 if [[ $set == 1 ]]; then | |
342 return | |
343 fi | |
344 freeRam 3200m 84 | |
345 z="-Xmx${RAM}m" | |
346 z2="-Xms${RAM}m" | |
347 } | |
348 calcXmx "$@" | |
349 | |
350 | |
351 bbmap() { | |
352 local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap build=1 overwrite=true fastareadlen=500 $@" | |
353 echo $CMD >&2 | |
354 eval $CMD | |
355 } | |
356 | |
357 bbmap "$@" |