jpayne@69
|
1 #!/bin/bash
|
jpayne@69
|
2
|
jpayne@69
|
3 usage(){
|
jpayne@69
|
4 echo "
|
jpayne@69
|
5 Written by Brian Bushnell
|
jpayne@69
|
6 Last modified July 28, 2022
|
jpayne@69
|
7
|
jpayne@69
|
8 Description: Sorts sequences to put similar reads near each other.
|
jpayne@69
|
9 Can be used for increased compression or error correction.
|
jpayne@69
|
10 Please read bbmap/docs/guides/ClumpifyGuide.txt for more information.
|
jpayne@69
|
11
|
jpayne@69
|
12 Usage: clumpify.sh in=<file> out=<file> reorder
|
jpayne@69
|
13
|
jpayne@69
|
14 Input may be fasta or fastq, compressed or uncompressed. Cannot accept sam.
|
jpayne@69
|
15
|
jpayne@69
|
16 Parameters and their defaults:
|
jpayne@69
|
17 in=<file> Input file.
|
jpayne@69
|
18 in2=<file> Optional input for read 2 of twin paired files.
|
jpayne@69
|
19 out=<file> Output file. May not be standard out.
|
jpayne@69
|
20 out2=<file> Optional output for read 2 of twin paired files.
|
jpayne@69
|
21 groups=auto Use this many intermediate files (to save memory).
|
jpayne@69
|
22 1 group is fastest. Auto will estimate the number
|
jpayne@69
|
23 of groups needed based on the file size, so it should
|
jpayne@69
|
24 not ever run out of memory.
|
jpayne@69
|
25 lowcomplexity=f For compressed low-complexity libraries such as RNA-seq,
|
jpayne@69
|
26 this will more conservatively estimate how much memory
|
jpayne@69
|
27 is needed to automatically decide the number of groups.
|
jpayne@69
|
28 rcomp=f Give read clumps the same orientation to increase
|
jpayne@69
|
29 compression. Should be disabled for paired reads.
|
jpayne@69
|
30 overwrite=f (ow) Set to false to force the program to abort rather
|
jpayne@69
|
31 than overwrite an existing file.
|
jpayne@69
|
32 qin=auto Auto-detect input quality encoding. May be set to:
|
jpayne@69
|
33 33: ASCII-33 (Sanger) encoding.
|
jpayne@69
|
34 64: ASCII-64 (old Illumina) encoding.
|
jpayne@69
|
35 All modern sequence is encoded as ASCII-33.
|
jpayne@69
|
36 qout=auto Use input quality encoding as output quality encoding.
|
jpayne@69
|
37 changequality=f (cq) If true, fix broken quality scores such as Ns with
|
jpayne@69
|
38 Q>0. Default is false to ensure lossless compression.
|
jpayne@69
|
39 fastawrap=70 Set to a higher number like 4000 for longer lines in
|
jpayne@69
|
40 fasta format, which increases compression.
|
jpayne@69
|
41
|
jpayne@69
|
42 Compression parameters:
|
jpayne@69
|
43 ziplevel=6 (zl) Gzip compression level (1-11). Higher is slower.
|
jpayne@69
|
44 Level 11 is only available if pigz is installed and is
|
jpayne@69
|
45 extremely slow to compress, but faster to decompress.
|
jpayne@69
|
46 Naming the output file to *.bz2 will use bzip2 instead of
|
jpayne@69
|
47 gzip for ~9% additional compression, which requires
|
jpayne@69
|
48 bzip2, pbzip2, or lbzip2 in the path.
|
jpayne@69
|
49 blocksize=128 Size of blocks for pigz, in kb. Higher gives slightly
|
jpayne@69
|
50 better compression.
|
jpayne@69
|
51 shortname=f Make the names as short as possible. 'shortname=shrink'
|
jpayne@69
|
52 will shorten the names where possible, but retain the
|
jpayne@69
|
53 flowcell and barcode information.
|
jpayne@69
|
54 reorder=f Reorder clumps for additional compression. Only valid
|
jpayne@69
|
55 when groups=1, passes=1, and ecc=f. Possible modes:
|
jpayne@69
|
56 f: Do not reorder clumps.
|
jpayne@69
|
57 c: Reorder using consensus reads. Uses additional
|
jpayne@69
|
58 time and memory.
|
jpayne@69
|
59 p: Reorder using pair information. Requires paired
|
jpayne@69
|
60 reads. Yields the highest compression.
|
jpayne@69
|
61 a: Automatically choose between 'c' and 'p'. The
|
jpayne@69
|
62 flag reorder with no argument will set 'reorder=a'.
|
jpayne@69
|
63 quantize=f Bin the quality scores, like NextSeq. This greatly
|
jpayne@69
|
64 increases compression, but information is lost.
|
jpayne@69
|
65
|
jpayne@69
|
66 Temp file parameters:
|
jpayne@69
|
67 compresstemp=auto (ct) Gzip temporary files. By default temp files will be
|
jpayne@69
|
68 compressed if the output file is compressed.
|
jpayne@69
|
69 deletetemp=t Delete temporary files.
|
jpayne@69
|
70 deleteinput=f Delete input upon successful completion.
|
jpayne@69
|
71 usetmpdir=f Use tmpdir for temp files.
|
jpayne@69
|
72 tmpdir= By default, this is the environment variable TMPDIR.
|
jpayne@69
|
73
|
jpayne@69
|
74 Hashing parameters:
|
jpayne@69
|
75 k=31 Use kmers of this length (1-31). Shorter kmers may
|
jpayne@69
|
76 increase compression, but 31 is recommended for error
|
jpayne@69
|
77 correction.
|
jpayne@69
|
78 mincount=0 Don't use pivot kmers with count less than this.
|
jpayne@69
|
79 Setting mincount=2 can increase compression.
|
jpayne@69
|
80 Increases time and memory usage.
|
jpayne@69
|
81 seed=1 Random number generator seed for hashing.
|
jpayne@69
|
82 Set to a negative number to use a random seed.
|
jpayne@69
|
83 hashes=4 Use this many masks when hashing. 0 uses raw kmers.
|
jpayne@69
|
84 Often hashes=0 increases compression, but it should
|
jpayne@69
|
85 not be used with error-correction.
|
jpayne@69
|
86 border=1 Do not use kmers within this many bases of read ends.
|
jpayne@69
|
87
|
jpayne@69
|
88 Deduplication parameters:
|
jpayne@69
|
89 dedupe=f Remove duplicate reads. For pairs, both must match.
|
jpayne@69
|
90 By default, deduplication does not occur.
|
jpayne@69
|
91 If dedupe and markduplicates are both false, none of
|
jpayne@69
|
92 the other duplicate-related flags will have any effect.
|
jpayne@69
|
93 markduplicates=f Don't remove; just append ' duplicate' to the name.
|
jpayne@69
|
94 allduplicates=f Mark or remove all copies of duplicates, instead of
|
jpayne@69
|
95 keeping the highest-quality copy.
|
jpayne@69
|
96 addcount=f Append the number of copies to the read name.
|
jpayne@69
|
97 Mutually exclusive with markduplicates or allduplicates.
|
jpayne@69
|
98 entryfilter=f This assists in removing exact duplicates, which saves
|
jpayne@69
|
99 memory in libraries that split unevenly due to huge
|
jpayne@69
|
100 numbers of duplicates. Enabled automatically as needed.
|
jpayne@69
|
101 subs=2 (s) Maximum substitutions allowed between duplicates.
|
jpayne@69
|
102 subrate=0.0 (dsr) If set, the number of substitutions allowed will be
|
jpayne@69
|
103 max(subs, subrate*min(length1, length2)) for 2 sequences.
|
jpayne@69
|
104 allowns=t No-called bases will not be considered substitutions.
|
jpayne@69
|
105 scanlimit=5 (scan) Continue for this many reads after encountering a
|
jpayne@69
|
106 nonduplicate. Improves detection of inexact duplicates.
|
jpayne@69
|
107 containment=f Allow containments (where one sequence is shorter).
|
jpayne@69
|
108 affix=f For containments, require one sequence to be an affix
|
jpayne@69
|
109 (prefix or suffix) of the other.
|
jpayne@69
|
110 optical=f If true, mark or remove optical duplicates only.
|
jpayne@69
|
111 This means they are Illumina reads within a certain
|
jpayne@69
|
112 distance on the flowcell. Normal Illumina names needed.
|
jpayne@69
|
113 Also for tile-edge and well duplicates.
|
jpayne@69
|
114 dupedist=40 (dist) Max distance to consider for optical duplicates.
|
jpayne@69
|
115 Higher removes more duplicates but is more likely to
|
jpayne@69
|
116 remove PCR rather than optical duplicates.
|
jpayne@69
|
117 This is platform-specific; recommendations:
|
jpayne@69
|
118 NextSeq 40 (and spany=t)
|
jpayne@69
|
119 HiSeq 1T 40
|
jpayne@69
|
120 HiSeq 2500 40
|
jpayne@69
|
121 HiSeq 3k/4k 2500
|
jpayne@69
|
122 Novaseq 12000
|
jpayne@69
|
123 spany=f Allow reads to be considered optical duplicates if they
|
jpayne@69
|
124 are on different tiles, but are within dupedist in the
|
jpayne@69
|
125 y-axis. Should only be enabled when looking for
|
jpayne@69
|
126 tile-edge duplicates (as in NextSeq).
|
jpayne@69
|
127 spanx=f Like spany, but for the x-axis. Not necessary
|
jpayne@69
|
128 for NextSeq.
|
jpayne@69
|
129 spantiles=f Set both spanx and spany.
|
jpayne@69
|
130 adjacent=f Limit tile-spanning to adjacent tiles (those with
|
jpayne@69
|
131 consecutive numbers).
|
jpayne@69
|
132 *** Thus, for NextSeq, the recommended deduplication flags are: ***
|
jpayne@69
|
133 dedupe optical spany adjacent
|
jpayne@69
|
134
|
jpayne@69
|
135 Pairing/ordering parameters (for use with error-correction):
|
jpayne@69
|
136 unpair=f For paired reads, clump all of them rather than just
|
jpayne@69
|
137 read 1. Destroys pairing. Without this flag, for paired
|
jpayne@69
|
138 reads, only read 1 will be error-corrected.
|
jpayne@69
|
139 repair=f After clumping and error-correction, restore pairing.
|
jpayne@69
|
140 If groups>1 this will sort by name which will destroy
|
jpayne@69
|
141 clump ordering; with a single group, clumping will
|
jpayne@69
|
142 be retained.
|
jpayne@69
|
143
|
jpayne@69
|
144 Error-correction parameters:
|
jpayne@69
|
145 ecc=f Error-correct reads. Requires multiple passes for
|
jpayne@69
|
146 complete correction.
|
jpayne@69
|
147 ecco=f Error-correct paired reads via overlap before clumping.
|
jpayne@69
|
148 passes=1 Use this many error-correction passes. 6 passes are
|
jpayne@69
|
149 suggested.
|
jpayne@69
|
150 consensus=f Output consensus sequence instead of clumps.
|
jpayne@69
|
151
|
jpayne@69
|
152 Advanced error-correction parameters:
|
jpayne@69
|
153 mincc=4 (mincountcorrect) Do not correct to alleles occuring less
|
jpayne@69
|
154 often than this.
|
jpayne@69
|
155 minss=4 (minsizesplit) Do not split into new clumps smaller than
|
jpayne@69
|
156 this.
|
jpayne@69
|
157 minsfs=0.17 (minsizefractionsplit) Do not split on pivot alleles in
|
jpayne@69
|
158 areas with local depth less than this fraction of clump size.
|
jpayne@69
|
159 minsfc=0.20 (minsizefractioncorrect) Do not correct in areas with local
|
jpayne@69
|
160 depth less than this.
|
jpayne@69
|
161 minr=30.0 (minratio) Correct to the consensus if the ratio of the
|
jpayne@69
|
162 consensus allele to second-most-common allele is >=minr,
|
jpayne@69
|
163 for high depth. Actual ratio used is:
|
jpayne@69
|
164 min(minr, minro+minorCount*minrm+quality*minrqm).
|
jpayne@69
|
165 minro=1.9 (minratiooffset) Base ratio.
|
jpayne@69
|
166 minrm=1.8 (minratiomult) Ratio multiplier for secondary allele count.
|
jpayne@69
|
167 minrqm=0.08 (minratioqmult) Ratio multiplier for base quality.
|
jpayne@69
|
168 minqr=2.8 (minqratio) Do not correct bases when cq*minqr>rqsum.
|
jpayne@69
|
169 minaqr=0.70 (minaqratio) Do not correct bases when cq*minaqr>5+rqavg.
|
jpayne@69
|
170 minid=0.97 (minidentity) Do not correct reads with identity to
|
jpayne@69
|
171 consensus less than this.
|
jpayne@69
|
172 maxqadjust=0 Adjust quality scores by at most maxqadjust per pass.
|
jpayne@69
|
173 maxqi=-1 (maxqualityincorrect) Do not correct bases with quality
|
jpayne@69
|
174 above this (if positive).
|
jpayne@69
|
175 maxci=-1 (maxcountincorrect) Do not correct alleles with count
|
jpayne@69
|
176 above this (if positive).
|
jpayne@69
|
177 findcorrelations=t Look for correlated SNPs in clumps to split into alleles.
|
jpayne@69
|
178 maxcorrelations=12 Maximum number of eligible SNPs per clump to consider for
|
jpayne@69
|
179 correlations. Increasing this number can reduce false-
|
jpayne@69
|
180 positive corrections at the possible expense of speed.
|
jpayne@69
|
181
|
jpayne@69
|
182 Java Parameters:
|
jpayne@69
|
183 -Xmx This will set Java's memory usage, overriding autodetection.
|
jpayne@69
|
184 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
|
jpayne@69
|
185 The max is typically 85% of physical memory.
|
jpayne@69
|
186 -eoom This flag will cause the process to exit if an
|
jpayne@69
|
187 out-of-memory exception occurs. Requires Java 8u92+.
|
jpayne@69
|
188 -da Disable assertions.
|
jpayne@69
|
189
|
jpayne@69
|
190 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
|
jpayne@69
|
191 "
|
jpayne@69
|
192 }
|
jpayne@69
|
193
|
jpayne@69
|
194 #This block allows symlinked shellscripts to correctly set classpath.
|
jpayne@69
|
195 pushd . > /dev/null
|
jpayne@69
|
196 DIR="${BASH_SOURCE[0]}"
|
jpayne@69
|
197 while [ -h "$DIR" ]; do
|
jpayne@69
|
198 cd "$(dirname "$DIR")"
|
jpayne@69
|
199 DIR="$(readlink "$(basename "$DIR")")"
|
jpayne@69
|
200 done
|
jpayne@69
|
201 cd "$(dirname "$DIR")"
|
jpayne@69
|
202 DIR="$(pwd)/"
|
jpayne@69
|
203 popd > /dev/null
|
jpayne@69
|
204
|
jpayne@69
|
205 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
|
jpayne@69
|
206 CP="$DIR""current/"
|
jpayne@69
|
207
|
jpayne@69
|
208 z="-Xmx2g"
|
jpayne@69
|
209 z2="-Xms2g"
|
jpayne@69
|
210 set=0
|
jpayne@69
|
211
|
jpayne@69
|
212 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
|
jpayne@69
|
213 usage
|
jpayne@69
|
214 exit
|
jpayne@69
|
215 fi
|
jpayne@69
|
216
|
jpayne@69
|
217 calcXmx () {
|
jpayne@69
|
218 source "$DIR""/calcmem.sh"
|
jpayne@69
|
219 setEnvironment
|
jpayne@69
|
220 parseXmx "$@"
|
jpayne@69
|
221 if [[ $set == 1 ]]; then
|
jpayne@69
|
222 return
|
jpayne@69
|
223 fi
|
jpayne@69
|
224 #Note that this uses slighly less (82%) of memory than normal to account for multiple pigz instances.
|
jpayne@69
|
225 freeRam 2000m 82
|
jpayne@69
|
226 z="-Xmx${RAM}m"
|
jpayne@69
|
227 z2="-Xms${RAM}m"
|
jpayne@69
|
228 }
|
jpayne@69
|
229 calcXmx "$@"
|
jpayne@69
|
230
|
jpayne@69
|
231 clumpify() {
|
jpayne@69
|
232 local CMD="java $EA $EOOM $z $z2 -cp $CP clump.Clumpify $@"
|
jpayne@69
|
233 echo $CMD >&2
|
jpayne@69
|
234 eval $CMD
|
jpayne@69
|
235 }
|
jpayne@69
|
236
|
jpayne@69
|
237 clumpify "$@"
|