Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/clumpify.sh @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 #!/bin/bash | |
2 | |
3 usage(){ | |
4 echo " | |
5 Written by Brian Bushnell | |
6 Last modified July 28, 2022 | |
7 | |
8 Description: Sorts sequences to put similar reads near each other. | |
9 Can be used for increased compression or error correction. | |
10 Please read bbmap/docs/guides/ClumpifyGuide.txt for more information. | |
11 | |
12 Usage: clumpify.sh in=<file> out=<file> reorder | |
13 | |
14 Input may be fasta or fastq, compressed or uncompressed. Cannot accept sam. | |
15 | |
16 Parameters and their defaults: | |
17 in=<file> Input file. | |
18 in2=<file> Optional input for read 2 of twin paired files. | |
19 out=<file> Output file. May not be standard out. | |
20 out2=<file> Optional output for read 2 of twin paired files. | |
21 groups=auto Use this many intermediate files (to save memory). | |
22 1 group is fastest. Auto will estimate the number | |
23 of groups needed based on the file size, so it should | |
24 not ever run out of memory. | |
25 lowcomplexity=f For compressed low-complexity libraries such as RNA-seq, | |
26 this will more conservatively estimate how much memory | |
27 is needed to automatically decide the number of groups. | |
28 rcomp=f Give read clumps the same orientation to increase | |
29 compression. Should be disabled for paired reads. | |
30 overwrite=f (ow) Set to false to force the program to abort rather | |
31 than overwrite an existing file. | |
32 qin=auto Auto-detect input quality encoding. May be set to: | |
33 33: ASCII-33 (Sanger) encoding. | |
34 64: ASCII-64 (old Illumina) encoding. | |
35 All modern sequence is encoded as ASCII-33. | |
36 qout=auto Use input quality encoding as output quality encoding. | |
37 changequality=f (cq) If true, fix broken quality scores such as Ns with | |
38 Q>0. Default is false to ensure lossless compression. | |
39 fastawrap=70 Set to a higher number like 4000 for longer lines in | |
40 fasta format, which increases compression. | |
41 | |
42 Compression parameters: | |
43 ziplevel=6 (zl) Gzip compression level (1-11). Higher is slower. | |
44 Level 11 is only available if pigz is installed and is | |
45 extremely slow to compress, but faster to decompress. | |
46 Naming the output file to *.bz2 will use bzip2 instead of | |
47 gzip for ~9% additional compression, which requires | |
48 bzip2, pbzip2, or lbzip2 in the path. | |
49 blocksize=128 Size of blocks for pigz, in kb. Higher gives slightly | |
50 better compression. | |
51 shortname=f Make the names as short as possible. 'shortname=shrink' | |
52 will shorten the names where possible, but retain the | |
53 flowcell and barcode information. | |
54 reorder=f Reorder clumps for additional compression. Only valid | |
55 when groups=1, passes=1, and ecc=f. Possible modes: | |
56 f: Do not reorder clumps. | |
57 c: Reorder using consensus reads. Uses additional | |
58 time and memory. | |
59 p: Reorder using pair information. Requires paired | |
60 reads. Yields the highest compression. | |
61 a: Automatically choose between 'c' and 'p'. The | |
62 flag reorder with no argument will set 'reorder=a'. | |
63 quantize=f Bin the quality scores, like NextSeq. This greatly | |
64 increases compression, but information is lost. | |
65 | |
66 Temp file parameters: | |
67 compresstemp=auto (ct) Gzip temporary files. By default temp files will be | |
68 compressed if the output file is compressed. | |
69 deletetemp=t Delete temporary files. | |
70 deleteinput=f Delete input upon successful completion. | |
71 usetmpdir=f Use tmpdir for temp files. | |
72 tmpdir= By default, this is the environment variable TMPDIR. | |
73 | |
74 Hashing parameters: | |
75 k=31 Use kmers of this length (1-31). Shorter kmers may | |
76 increase compression, but 31 is recommended for error | |
77 correction. | |
78 mincount=0 Don't use pivot kmers with count less than this. | |
79 Setting mincount=2 can increase compression. | |
80 Increases time and memory usage. | |
81 seed=1 Random number generator seed for hashing. | |
82 Set to a negative number to use a random seed. | |
83 hashes=4 Use this many masks when hashing. 0 uses raw kmers. | |
84 Often hashes=0 increases compression, but it should | |
85 not be used with error-correction. | |
86 border=1 Do not use kmers within this many bases of read ends. | |
87 | |
88 Deduplication parameters: | |
89 dedupe=f Remove duplicate reads. For pairs, both must match. | |
90 By default, deduplication does not occur. | |
91 If dedupe and markduplicates are both false, none of | |
92 the other duplicate-related flags will have any effect. | |
93 markduplicates=f Don't remove; just append ' duplicate' to the name. | |
94 allduplicates=f Mark or remove all copies of duplicates, instead of | |
95 keeping the highest-quality copy. | |
96 addcount=f Append the number of copies to the read name. | |
97 Mutually exclusive with markduplicates or allduplicates. | |
98 entryfilter=f This assists in removing exact duplicates, which saves | |
99 memory in libraries that split unevenly due to huge | |
100 numbers of duplicates. Enabled automatically as needed. | |
101 subs=2 (s) Maximum substitutions allowed between duplicates. | |
102 subrate=0.0 (dsr) If set, the number of substitutions allowed will be | |
103 max(subs, subrate*min(length1, length2)) for 2 sequences. | |
104 allowns=t No-called bases will not be considered substitutions. | |
105 scanlimit=5 (scan) Continue for this many reads after encountering a | |
106 nonduplicate. Improves detection of inexact duplicates. | |
107 containment=f Allow containments (where one sequence is shorter). | |
108 affix=f For containments, require one sequence to be an affix | |
109 (prefix or suffix) of the other. | |
110 optical=f If true, mark or remove optical duplicates only. | |
111 This means they are Illumina reads within a certain | |
112 distance on the flowcell. Normal Illumina names needed. | |
113 Also for tile-edge and well duplicates. | |
114 dupedist=40 (dist) Max distance to consider for optical duplicates. | |
115 Higher removes more duplicates but is more likely to | |
116 remove PCR rather than optical duplicates. | |
117 This is platform-specific; recommendations: | |
118 NextSeq 40 (and spany=t) | |
119 HiSeq 1T 40 | |
120 HiSeq 2500 40 | |
121 HiSeq 3k/4k 2500 | |
122 Novaseq 12000 | |
123 spany=f Allow reads to be considered optical duplicates if they | |
124 are on different tiles, but are within dupedist in the | |
125 y-axis. Should only be enabled when looking for | |
126 tile-edge duplicates (as in NextSeq). | |
127 spanx=f Like spany, but for the x-axis. Not necessary | |
128 for NextSeq. | |
129 spantiles=f Set both spanx and spany. | |
130 adjacent=f Limit tile-spanning to adjacent tiles (those with | |
131 consecutive numbers). | |
132 *** Thus, for NextSeq, the recommended deduplication flags are: *** | |
133 dedupe optical spany adjacent | |
134 | |
135 Pairing/ordering parameters (for use with error-correction): | |
136 unpair=f For paired reads, clump all of them rather than just | |
137 read 1. Destroys pairing. Without this flag, for paired | |
138 reads, only read 1 will be error-corrected. | |
139 repair=f After clumping and error-correction, restore pairing. | |
140 If groups>1 this will sort by name which will destroy | |
141 clump ordering; with a single group, clumping will | |
142 be retained. | |
143 | |
144 Error-correction parameters: | |
145 ecc=f Error-correct reads. Requires multiple passes for | |
146 complete correction. | |
147 ecco=f Error-correct paired reads via overlap before clumping. | |
148 passes=1 Use this many error-correction passes. 6 passes are | |
149 suggested. | |
150 consensus=f Output consensus sequence instead of clumps. | |
151 | |
152 Advanced error-correction parameters: | |
153 mincc=4 (mincountcorrect) Do not correct to alleles occuring less | |
154 often than this. | |
155 minss=4 (minsizesplit) Do not split into new clumps smaller than | |
156 this. | |
157 minsfs=0.17 (minsizefractionsplit) Do not split on pivot alleles in | |
158 areas with local depth less than this fraction of clump size. | |
159 minsfc=0.20 (minsizefractioncorrect) Do not correct in areas with local | |
160 depth less than this. | |
161 minr=30.0 (minratio) Correct to the consensus if the ratio of the | |
162 consensus allele to second-most-common allele is >=minr, | |
163 for high depth. Actual ratio used is: | |
164 min(minr, minro+minorCount*minrm+quality*minrqm). | |
165 minro=1.9 (minratiooffset) Base ratio. | |
166 minrm=1.8 (minratiomult) Ratio multiplier for secondary allele count. | |
167 minrqm=0.08 (minratioqmult) Ratio multiplier for base quality. | |
168 minqr=2.8 (minqratio) Do not correct bases when cq*minqr>rqsum. | |
169 minaqr=0.70 (minaqratio) Do not correct bases when cq*minaqr>5+rqavg. | |
170 minid=0.97 (minidentity) Do not correct reads with identity to | |
171 consensus less than this. | |
172 maxqadjust=0 Adjust quality scores by at most maxqadjust per pass. | |
173 maxqi=-1 (maxqualityincorrect) Do not correct bases with quality | |
174 above this (if positive). | |
175 maxci=-1 (maxcountincorrect) Do not correct alleles with count | |
176 above this (if positive). | |
177 findcorrelations=t Look for correlated SNPs in clumps to split into alleles. | |
178 maxcorrelations=12 Maximum number of eligible SNPs per clump to consider for | |
179 correlations. Increasing this number can reduce false- | |
180 positive corrections at the possible expense of speed. | |
181 | |
182 Java Parameters: | |
183 -Xmx This will set Java's memory usage, overriding autodetection. | |
184 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. | |
185 The max is typically 85% of physical memory. | |
186 -eoom This flag will cause the process to exit if an | |
187 out-of-memory exception occurs. Requires Java 8u92+. | |
188 -da Disable assertions. | |
189 | |
190 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems. | |
191 " | |
192 } | |
193 | |
194 #This block allows symlinked shellscripts to correctly set classpath. | |
195 pushd . > /dev/null | |
196 DIR="${BASH_SOURCE[0]}" | |
197 while [ -h "$DIR" ]; do | |
198 cd "$(dirname "$DIR")" | |
199 DIR="$(readlink "$(basename "$DIR")")" | |
200 done | |
201 cd "$(dirname "$DIR")" | |
202 DIR="$(pwd)/" | |
203 popd > /dev/null | |
204 | |
205 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" | |
206 CP="$DIR""current/" | |
207 | |
208 z="-Xmx2g" | |
209 z2="-Xms2g" | |
210 set=0 | |
211 | |
212 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then | |
213 usage | |
214 exit | |
215 fi | |
216 | |
217 calcXmx () { | |
218 source "$DIR""/calcmem.sh" | |
219 setEnvironment | |
220 parseXmx "$@" | |
221 if [[ $set == 1 ]]; then | |
222 return | |
223 fi | |
224 #Note that this uses slighly less (82%) of memory than normal to account for multiple pigz instances. | |
225 freeRam 2000m 82 | |
226 z="-Xmx${RAM}m" | |
227 z2="-Xms${RAM}m" | |
228 } | |
229 calcXmx "$@" | |
230 | |
231 clumpify() { | |
232 local CMD="java $EA $EOOM $z $z2 -cp $CP clump.Clumpify $@" | |
233 echo $CMD >&2 | |
234 eval $CMD | |
235 } | |
236 | |
237 clumpify "$@" |