Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/shared/TrimRead.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package shared; | |
2 | |
3 import java.io.Serializable; | |
4 import java.util.ArrayList; | |
5 import java.util.Arrays; | |
6 | |
7 import align2.QualityTools; | |
8 import dna.AminoAcid; | |
9 import stream.Read; | |
10 import stream.SamLine; | |
11 import stream.SiteScore; | |
12 import structures.ByteBuilder; | |
13 | |
14 /** | |
15 * Helper class for processes that do inline quality trimming. | |
16 * @author Brian Bushnell | |
17 * @date Mar 15, 2013 | |
18 * | |
19 */ | |
20 public final class TrimRead implements Serializable { | |
21 | |
22 /** | |
23 * | |
24 */ | |
25 private static final long serialVersionUID = 8791743639124592480L; | |
26 | |
27 public static void main(String[] args){ | |
28 byte[] bases=args[0].getBytes(); | |
29 byte[] quals=(args.length<2 ? null : args[1].getBytes()); | |
30 if(quals!=null){ | |
31 for(int i=0; i<quals.length; i++){quals[i]-=32;} | |
32 } | |
33 byte[] match=(args.length<3 ? null : args[2].getBytes()); | |
34 float minq=(args.length<4 ? 5 : Float.parseFloat(args[3])); | |
35 float minE=(float)QualityTools.phredToProbError(minq); | |
36 Read r=new Read(bases, quals, 1); | |
37 r.match=match; | |
38 System.out.println("Before trim:\n"+r.toFastq()+(r.match==null ? "" : "\n"+new String(r.match))); | |
39 System.out.println(Arrays.toString(r.quality)); | |
40 TrimRead tr=trim(r, true, true, minq, minE, 1); | |
41 System.out.println("\nAfter trim:\n"+r.toFastq()+(r.match==null ? "" : "\n"+new String(r.match))); | |
42 if(r.match==null){ | |
43 r.match=new byte[r.length()]; | |
44 for(int i=0; i<r.length(); i++){r.match[i]='m';} | |
45 } | |
46 tr.untrim(); | |
47 System.out.println("\nAfter untrim:\n"+r.toFastq()+(r.match==null ? "" : "\n"+new String(r.match))); | |
48 } | |
49 | |
50 // public static TrimRead trim(Read r, boolean trimLeft, boolean trimRight, float trimq, int minlen){ | |
51 // if(r==null || r.bases==null){return null;} | |
52 // | |
53 // final int a, b; | |
54 // if(optimalMode){ | |
55 // long packed=testOptimal(r.bases, r.quality, QualityTools.PROB_ERROR[trimq]); | |
56 // a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0; | |
57 // b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0; | |
58 // }else if(windowMode){ | |
59 // a=0; | |
60 // b=(trimRight ? testRightWindow(r.bases, r.quality, (byte)trimq, windowLength) : 0); | |
61 // }else{ | |
62 // a=(trimLeft ? testLeft(r.bases, r.quality, (byte)trimq) : 0); | |
63 // b=(trimRight ? testRight(r.bases, r.quality, (byte)trimq) : 0); | |
64 // } | |
65 // return (a+b==0 ? null : new TrimRead(r, a, b, trimq, minlen)); | |
66 // } | |
67 | |
68 public static TrimRead trim(Read r, boolean trimLeft, boolean trimRight, | |
69 float trimq, float avgErrorRate, int minlen){ | |
70 if(r==null || r.bases==null){return null;} | |
71 | |
72 final int a, b; | |
73 if(optimalMode){ | |
74 long packed=testOptimal(r.bases, r.quality, avgErrorRate); | |
75 a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0; | |
76 b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0; | |
77 }else if(windowMode){ | |
78 a=0; | |
79 b=(trimRight ? testRightWindow(r.bases, r.quality, (byte)trimq, windowLength) : 0); | |
80 }else{ | |
81 a=(trimLeft ? testLeft(r.bases, r.quality, (byte)trimq) : 0); | |
82 b=(trimRight ? testRight(r.bases, r.quality, (byte)trimq) : 0); | |
83 } | |
84 return (a+b==0 ? null : new TrimRead(r, a, b, trimq, minlen)); | |
85 } | |
86 | |
87 /** | |
88 * @param r Read to trim | |
89 * @param trimLeft Trim left side | |
90 * @param trimRight Trim right side | |
91 * @param trimq Maximum quality to trim | |
92 * @param minResult Ensure trimmed read is at least this long | |
93 * @return Number of bases trimmed | |
94 */ | |
95 public static int trimFast(Read r, boolean trimLeft, boolean trimRight, | |
96 float trimq, float avgErrorRate, int minResult){ | |
97 return trimFast(r, trimLeft, trimRight, trimq, avgErrorRate, minResult, false); | |
98 } | |
99 | |
100 /** | |
101 * @param r Read to trim | |
102 * @param trimLeft Trim left side | |
103 * @param trimRight Trim right side | |
104 * @param trimq Maximum quality to trim | |
105 * @param minResult Ensure trimmed read is at least this long | |
106 * @return Number of bases trimmed | |
107 */ | |
108 public static int trimFast(Read r, boolean trimLeft, boolean trimRight, | |
109 float trimq, float avgErrorRate, int minResult, boolean trimClip){ | |
110 return trimFast(r, trimLeft, trimRight, trimq, avgErrorRate, minResult, 0, trimClip); | |
111 } | |
112 | |
113 /** | |
114 * This method allows a "discardUnder" parameter, so that qtrim=r will still discard | |
115 * reads that if left-trimmed, would be shorter than the desired minimum length. | |
116 * It is not presently used. | |
117 * @param r Read to trim | |
118 * @param trimLeft Trim left side | |
119 * @param trimRight Trim right side | |
120 * @param trimq Maximum quality to trim | |
121 * @param minResult Ensure trimmed read is at least this long | |
122 * @param discardUnder Resulting reads shorter than this are not wanted | |
123 * @return Number of bases trimmed | |
124 */ | |
125 public static int trimFast(Read r, boolean trimLeft, boolean trimRight, | |
126 float trimq, float avgErrorRate, int minResult, int discardUnder, boolean trimClip){ | |
127 // assert(avgErrorRate==(float)QualityTools.phredToProbError(trimq)) : trimq+", "+avgErrorRate+", "+(float)QualityTools.phredToProbError(trimq); | |
128 final byte[] bases=r.bases, qual=r.quality; | |
129 if(bases==null || bases.length<1){return 0;} | |
130 | |
131 // assert(false) : trimLeft+", "+trimRight+", "+trimq+", "+minResult+", "+discardUnder; | |
132 | |
133 final int len=r.length(); | |
134 final int a0, b0; | |
135 final int a, b; | |
136 if(optimalMode){ | |
137 long packed=testOptimal(bases, qual, avgErrorRate); | |
138 a0=(int)((packed>>32)&0xFFFFFFFFL); | |
139 b0=(int)((packed)&0xFFFFFFFFL); | |
140 if(trimLeft!=trimRight && discardUnder>0 && len-a0-b0<discardUnder){ | |
141 a=trimLeft ? len : 0; | |
142 b=trimRight ? len : 0; | |
143 }else{ | |
144 a=trimLeft ? a0 : 0; | |
145 b=trimRight ? b0 : 0; | |
146 } | |
147 // assert(false) : a0+", "+b0+" -> "+a+", "+b; | |
148 }else if(windowMode){ | |
149 a=0; | |
150 b=(trimRight ? testRightWindow(bases, qual, (byte)trimq, windowLength) : 0); | |
151 }else{ | |
152 a=(trimLeft ? testLeft(bases, qual, (byte)trimq) : 0); | |
153 b=(trimRight ? testRight(bases, qual, (byte)trimq) : 0); | |
154 } | |
155 return trimByAmount(r, a, b, minResult, trimClip); | |
156 } | |
157 | |
158 public static boolean untrim(Read r){ | |
159 if(r==null || r.obj==null){return false;} | |
160 if(r.obj.getClass()!=TrimRead.class){return false;} | |
161 TrimRead tr=(TrimRead)r.obj; | |
162 return tr.untrim(); | |
163 } | |
164 | |
165 // public TrimRead(Read r_, boolean trimLeft, boolean trimRight, float trimq_, int minlen_){ | |
166 // this(r_, (trimLeft ? testLeft(r_.bases, r_.quality, (byte)trimq_) : 0), (trimRight ? testRight(r_.bases, r_.quality, (byte)trimq_) : 0), trimq_, minlen_); | |
167 // } | |
168 | |
169 public TrimRead(Read r_, int trimLeft, int trimRight, float trimq_, int minlen_){ | |
170 minlen_=Tools.max(minlen_, 0); | |
171 r=r_; | |
172 bases1=r.bases; | |
173 qual1=r.quality; | |
174 trimq=(byte)trimq_; | |
175 assert(bases1!=null || qual1==null) : "\n"+new String(bases1)+"\n"+new String(qual1)+"\n"; | |
176 assert(bases1==null || qual1==null || bases1.length==qual1.length) : "\n"+new String(bases1)+"\n"+new String(qual1)+"\n"; | |
177 int trimmed=trim(trimLeft, trimRight, minlen_); | |
178 if(trimmed>0){ | |
179 assert(bases2==null || bases2.length>=minlen_ || bases1.length<minlen_) : bases1.length+", "+bases2.length+", "+minlen_+", "+trimLeft+", "+trimRight; | |
180 r.bases=bases2; | |
181 r.quality=qual2; | |
182 r.obj=this; | |
183 trimMatch(r); | |
184 } | |
185 } | |
186 | |
187 // /** Trim the left end of the read, from left to right */ | |
188 // private int trim(final boolean trimLeft, final boolean trimRight, final int minlen){ | |
189 // final int a, b; | |
190 // if(optimalMode){ | |
191 // long packed=testOptimal(bases1, qual1, QualityTools.PROB_ERROR[trimq]); | |
192 // a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0; | |
193 // b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0; | |
194 // }else{ | |
195 // a=(trimLeft ? testLeft(bases1, qual1, (byte)trimq) : 0); | |
196 // b=(trimRight ? testRight(bases1, qual1, (byte)trimq) : 0); | |
197 // } | |
198 // return trim(a, b, minlen); | |
199 // } | |
200 | |
201 /** Trim the left end of the read, from left to right */ | |
202 private int trim(int trimLeft, int trimRight, final int minlen){ | |
203 assert(trimLeft>=0 && trimRight>=0) : "trimLeft="+trimLeft+", trimRight="+trimRight+", minlen="+minlen+", len="+bases1.length; | |
204 assert(trimLeft>0 || trimRight>0) : "trimLeft="+trimLeft+", trimRight="+trimRight+", minlen="+minlen+", len="+bases1.length; | |
205 final int maxTrim=Tools.min(bases1.length, bases1.length-minlen); | |
206 if(trimLeft+trimRight>maxTrim){ | |
207 int excess=trimLeft+trimRight-maxTrim; | |
208 if(trimLeft>0 && excess>0){ | |
209 trimLeft=Tools.max(0, trimLeft-excess); | |
210 excess=trimLeft+trimRight-maxTrim; | |
211 } | |
212 if(trimRight>0 && excess>0){ | |
213 trimRight=Tools.max(0, trimRight-excess); | |
214 excess=trimLeft+trimRight-maxTrim; | |
215 } | |
216 | |
217 } | |
218 | |
219 leftTrimmed=trimLeft; | |
220 rightTrimmed=trimRight; | |
221 final int sum=leftTrimmed+rightTrimmed; | |
222 | |
223 if(verbose){ | |
224 System.err.println("leftTrimmed="+leftTrimmed+", rightTrimmed="+rightTrimmed+", sum="+sum); | |
225 } | |
226 | |
227 if(sum==0){ | |
228 bases2=bases1; | |
229 qual2=qual1; | |
230 }else{ | |
231 bases2=KillSwitch.copyOfRange(bases1, trimLeft, bases1.length-trimRight); | |
232 qual2=((qual1==null || (trimLeft+trimRight>=qual1.length)) ? null : KillSwitch.copyOfRange(qual1, trimLeft, qual1.length-trimRight)); | |
233 } | |
234 return sum; | |
235 } | |
236 | |
237 /** Trim bases outside of leftLoc and rightLoc, excluding leftLoc and rightLoc */ | |
238 public static int trimToPosition(Read r, int leftLoc, int rightLoc, int minResultingLength){ | |
239 final int len=r.length(); | |
240 return trimByAmount(r, leftLoc, len-rightLoc-1, minResultingLength, false); | |
241 } | |
242 | |
243 /** Remove non-genetic-code from reads */ | |
244 public static int trimBadSequence(Read r){ | |
245 final byte[] bases=r.bases, quals=r.quality; | |
246 if(bases==null){return 0;} | |
247 final int minGenetic=20; | |
248 int lastNon=-1; | |
249 for(int i=0; i<bases.length; i++){ | |
250 byte b=bases[i]; | |
251 if(!AminoAcid.isACGTN(b)){lastNon=i;} | |
252 if(i-lastNon>minGenetic){break;} | |
253 } | |
254 if(lastNon>=0){ | |
255 r.bases=KillSwitch.copyOfRange(bases, lastNon+1, bases.length); | |
256 if(quals!=null){ | |
257 r.quality=KillSwitch.copyOfRange(quals, lastNon+1, quals.length); | |
258 } | |
259 } | |
260 return lastNon+1; | |
261 } | |
262 | |
263 /** Trim this many bases from each end */ | |
264 public static int trimByAmount(Read r, int leftTrimAmount, int rightTrimAmount, int minResultingLength){ | |
265 return trimByAmount(r, leftTrimAmount, rightTrimAmount, minResultingLength, false); | |
266 } | |
267 | |
268 /** Trim this many bases from each end */ | |
269 public static int trimByAmount(Read r, int leftTrimAmount, int rightTrimAmount, int minResultingLength, boolean trimClip){ | |
270 | |
271 leftTrimAmount=Tools.max(leftTrimAmount, 0); | |
272 rightTrimAmount=Tools.max(rightTrimAmount, 0); | |
273 | |
274 //These assertions are unnecessary if the mapping information will never be used or output. | |
275 // assert(r.match==null) : "TODO: Handle trimming of reads with match strings."; | |
276 assert(r.sites==null) : "TODO: Handle trimming of reads with SiteScores."; | |
277 | |
278 if(r.match!=null){ | |
279 return trimReadWithMatch(r, r.samline, leftTrimAmount, rightTrimAmount, minResultingLength, Integer.MAX_VALUE, trimClip); | |
280 }else if(r.samline!=null && r.samline.cigar!=null && r.samline.cigarContainsOnlyME()){ | |
281 return trimReadWithMatchFast(r, r.samline, leftTrimAmount, rightTrimAmount, minResultingLength); | |
282 } | |
283 | |
284 final byte[] bases=r.bases, qual=r.quality; | |
285 final int len=(bases==null ? 0 : bases.length), qlen=(qual==null ? 0 : qual.length); | |
286 if(len<1){return 0;} | |
287 minResultingLength=Tools.min(len, Tools.max(minResultingLength, 0)); | |
288 if(leftTrimAmount+rightTrimAmount+minResultingLength>len){ | |
289 rightTrimAmount=Tools.max(1, len-minResultingLength); | |
290 leftTrimAmount=0; | |
291 } | |
292 | |
293 final int total=leftTrimAmount+rightTrimAmount; | |
294 // System.err.println("D: L="+leftTrimAmount+", R="+rightTrimAmount+", len="+r.length()+", tot="+total); | |
295 if(total>0){ | |
296 r.bases=KillSwitch.copyOfRange(bases, leftTrimAmount, len-rightTrimAmount); | |
297 r.quality=(leftTrimAmount+rightTrimAmount>=qlen ? null : KillSwitch.copyOfRange(qual, leftTrimAmount, qlen-rightTrimAmount)); | |
298 trimMatch(r); | |
299 if(r.stop>r.start){ //TODO: Fixing mapped coordinates needs more work. | |
300 r.start+=leftTrimAmount; | |
301 r.stop-=rightTrimAmount; | |
302 } | |
303 } | |
304 | |
305 if(verbose){ | |
306 System.err.println("leftTrimmed="+leftTrimAmount+", rightTrimmed="+rightTrimAmount+ | |
307 ", sum="+total+", final length="+r.length()); | |
308 } | |
309 return total; | |
310 } | |
311 | |
312 /** Count number of bases that need trimming on each side, and pack into a long */ | |
313 public static long testOptimal(byte[] bases, byte[] qual, float avgErrorRate){ | |
314 if(optimalBias>=0){avgErrorRate=optimalBias;}//Override | |
315 assert(avgErrorRate>0 && avgErrorRate<=1) : "Average error rate ("+avgErrorRate+") must be between 0 (exclusive) and 1 (inclusive)"; | |
316 if(bases==null || bases.length==0){return 0;} | |
317 if(qual==null){return avgErrorRate>=1 ? 0 : ((((long)testLeftN(bases))<<32) | (((long)testRightN(bases))&0xFFFFFFFFL));} | |
318 | |
319 float maxScore=0; | |
320 float score=0; | |
321 int maxLoc=-1; | |
322 int maxCount=-1; | |
323 int count=0; | |
324 | |
325 final float nprob=Tools.max(Tools.min(avgErrorRate*1.1f, 1), NPROB); | |
326 | |
327 for(int i=0; i<bases.length; i++){ | |
328 final byte b=bases[i]; | |
329 byte q=qual[i]; | |
330 // float probError=(b=='N' ? nprob : ADJUST_QUALITY ? CalcTrueQuality.estimateErrorProbAvg(qual, bases, i) : QualityTools.PROB_ERROR[q]); | |
331 // float probError=(b=='N' ? nprob : ADJUST_QUALITY ? CalcTrueQuality.estimateErrorProbGeoAvg(qual, bases, i) : QualityTools.PROB_ERROR[q]); | |
332 // float probError=(b=='N' ? nprob : ADJUST_QUALITY ? CalcTrueQuality.estimateErrorProb2(qual, bases, i) : QualityTools.PROB_ERROR[q]); | |
333 | |
334 // float probError=(b=='N' ? nprob : q==1 ? PROB1 : QualityTools.PROB_ERROR[q]); | |
335 // float probError=(b=='N' ? nprob : QualityTools.PROB_ERROR[q]); | |
336 float probError=((b=='N' || q<1) ? nprob : QualityTools.PROB_ERROR[q]); | |
337 | |
338 // assert(q>0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n"; | |
339 | |
340 float delta=avgErrorRate-probError; | |
341 score=score+delta; | |
342 if(score>0){ | |
343 count++; | |
344 if(score>maxScore || (score==maxScore && count>maxCount)){ | |
345 maxScore=score; | |
346 maxCount=count; | |
347 maxLoc=i; | |
348 } | |
349 }else{ | |
350 score=0; | |
351 count=0; | |
352 } | |
353 } | |
354 | |
355 final int left, right; | |
356 if(maxScore>0){ | |
357 assert(maxLoc>=0); | |
358 assert(maxCount>0); | |
359 left=maxLoc-maxCount+1; | |
360 assert(left>=0 && left<=bases.length); | |
361 right=bases.length-maxLoc-1; | |
362 }else{ | |
363 left=0; | |
364 right=bases.length; | |
365 } | |
366 final long packed=((((long)left)<<32) | (((long)right)&0xFFFFFFFFL)); | |
367 | |
368 if(verbose){ | |
369 System.err.println(Arrays.toString(qual)); | |
370 System.err.println("After testLocal: maxScore="+maxScore+", maxLoc="+maxLoc+", maxCount="+maxCount+ | |
371 ", left="+left+", right="+right+", returning "+Long.toHexString(packed)); | |
372 } | |
373 return packed; | |
374 } | |
375 | |
376 /** Count number of bases that need trimming on left side */ | |
377 private static int testLeft(byte[] bases, byte[] qual, final byte trimq){ | |
378 if(bases==null || bases.length==0){return 0;} | |
379 if(qual==null){return trimq<0 ? 0 : testLeftN(bases);} | |
380 int good=0; | |
381 int lastBad=-1; | |
382 int i=0; | |
383 for(; i<bases.length && good<minGoodInterval; i++){ | |
384 final byte q=qual[i]; | |
385 final byte b=bases[i]; | |
386 assert(q>0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n"; | |
387 if(q>trimq){good++;} | |
388 else{good=0; lastBad=i;} | |
389 } | |
390 if(verbose){ | |
391 // System.err.println(Arrays.toString(qual)); | |
392 System.err.println("After testLeft: good="+good+", lastBad="+lastBad+", i="+i+", returning "+(lastBad+1)); | |
393 // assert(false); | |
394 } | |
395 return lastBad+1; | |
396 } | |
397 | |
398 /** Count number of bases that need trimming on right side using a sliding window */ | |
399 private static int testRightWindow(byte[] bases, byte[] qual, final byte trimq, final int window){ | |
400 if(bases==null || bases.length==0){return 0;} | |
401 if(qual==null || qual.length<window){return trimq>0 ? 0 : testRightN(bases);} | |
402 final int thresh=Tools.max(window*trimq, 1); | |
403 int sum=0; | |
404 for(int i=0, j=-window; i<qual.length; i++, j++){ | |
405 final byte q=qual[i]; | |
406 sum+=q; | |
407 if(j>=-1){ | |
408 if(j>=0){sum-=qual[j];} | |
409 if(sum<thresh){ | |
410 return qual.length-j-1; | |
411 } | |
412 } | |
413 } | |
414 return 0; | |
415 } | |
416 | |
417 /** Count number of bases that need trimming on right side */ | |
418 private static int testRight(byte[] bases, byte[] qual, final byte trimq){ | |
419 if(bases==null || bases.length==0){return 0;} | |
420 if(qual==null){return trimq<0 ? 0 : testRightN(bases);} | |
421 int good=0; | |
422 int lastBad=bases.length; | |
423 int i=bases.length-1; | |
424 for(; i>=0 && good<minGoodInterval; i--){ | |
425 final byte q=qual[i]; | |
426 final byte b=bases[i]; | |
427 assert(q>0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n"; | |
428 if(q>trimq){good++;} | |
429 else{good=0; lastBad=i;} | |
430 } | |
431 if(verbose){ | |
432 System.err.println("After trimLeft: good="+good+", lastBad="+lastBad+", i="+i+", returning "+(bases.length-lastBad)); | |
433 } | |
434 return bases.length-lastBad; | |
435 } | |
436 | |
437 /** Count number of bases that need trimming on left side, considering only N as bad */ | |
438 public static int testLeftN(byte[] bases){ | |
439 if(bases==null || bases.length==0){return 0;} | |
440 int good=0; | |
441 int lastBad=-1; | |
442 for(int i=0; i<bases.length && good<minGoodInterval; i++){ | |
443 final byte b=bases[i]; | |
444 //if(dna.AminoAcid.isFullyDefined(b)){good++;} | |
445 if(b!=((byte)'N')){good++;} | |
446 else{good=0; lastBad=i;} | |
447 } | |
448 return lastBad+1; | |
449 } | |
450 | |
451 /** Count number of bases that need trimming on right side, considering only N as bad */ | |
452 public static int testRightN(byte[] bases){ | |
453 if(bases==null || bases.length==0){return 0;} | |
454 int good=0; | |
455 int lastBad=bases.length; | |
456 for(int i=bases.length-1; i>=0 && good<minGoodInterval; i--){ | |
457 final byte b=bases[i]; | |
458 //if(dna.AminoAcid.isFullyDefined(b)){good++;} | |
459 if(b!=((byte)'N')){good++;} | |
460 else{good=0; lastBad=i;} | |
461 } | |
462 return bases.length-lastBad; | |
463 } | |
464 | |
465 public boolean untrim(){ | |
466 if(leftTrimmed==0 && rightTrimmed==0){return false;} | |
467 r.setPerfect(false); | |
468 | |
469 final int lt, rt; | |
470 if(r.strand()==Shared.PLUS){ | |
471 lt=leftTrimmed; | |
472 rt=rightTrimmed; | |
473 }else{ | |
474 lt=rightTrimmed; | |
475 rt=leftTrimmed; | |
476 } | |
477 | |
478 boolean returnToShort=false; | |
479 if(verbose){System.err.println("Untrimming");} | |
480 if(r.match!=null){ | |
481 if(r.shortmatch()){ | |
482 r.toLongMatchString(false); | |
483 returnToShort=true; | |
484 } | |
485 byte[] match2=new byte[r.match.length+lt+rt]; | |
486 int i=0; | |
487 for(; i<lt; i++){ | |
488 match2[i]='C'; | |
489 } | |
490 for(int j=0; j<r.match.length; i++, j++){ | |
491 match2[i]=r.match[j]; | |
492 } | |
493 for(; i<match2.length; i++){ | |
494 match2[i]='C'; | |
495 } | |
496 r.match=match2; | |
497 } | |
498 r.bases=bases1; | |
499 r.quality=qual1; | |
500 r.start-=lt; | |
501 r.stop+=rt; | |
502 if(returnToShort){ | |
503 r.toShortMatchString(true); | |
504 } | |
505 | |
506 if(r.sites!=null){ | |
507 for(SiteScore ss : r.sites){untrim(ss);} | |
508 } | |
509 | |
510 return true; | |
511 } | |
512 | |
513 private boolean untrim(SiteScore ss){ | |
514 if(ss==null){return false;} | |
515 if(leftTrimmed==0 && rightTrimmed==0){return false;} | |
516 ss.perfect=ss.semiperfect=false; | |
517 | |
518 final int lt, rt; | |
519 if(ss.strand==Shared.PLUS){ | |
520 lt=leftTrimmed; | |
521 rt=rightTrimmed; | |
522 }else{ | |
523 lt=rightTrimmed; | |
524 rt=leftTrimmed; | |
525 } | |
526 | |
527 boolean returnToShort=false; | |
528 if(verbose){System.err.println("Untrimming ss "+ss);} | |
529 if(ss.match!=null){ | |
530 | |
531 boolean shortmatch=false; | |
532 for(byte b : ss.match){ | |
533 if(Tools.isDigit(b)){shortmatch=true; break;} | |
534 } | |
535 | |
536 if(shortmatch){ | |
537 ss.match=Read.toLongMatchString(ss.match); | |
538 returnToShort=true; | |
539 } | |
540 byte[] match2=new byte[ss.match.length+lt+rt]; | |
541 int i=0; | |
542 for(; i<lt; i++){ | |
543 match2[i]='C'; | |
544 } | |
545 for(int j=0; j<ss.match.length; i++, j++){ | |
546 match2[i]=ss.match[j]; | |
547 } | |
548 for(; i<match2.length; i++){ | |
549 match2[i]='C'; | |
550 } | |
551 ss.match=match2; | |
552 } | |
553 ss.setLimits(ss.start-lt, ss.stop+rt); | |
554 if(returnToShort){ss.match=Read.toShortMatchString(ss.match);} | |
555 return true; | |
556 } | |
557 | |
558 private static boolean trimMatch(Read r){ | |
559 if(r.match==null && r.sites==null){return false;} | |
560 | |
561 //Easy mode! | |
562 r.match=null; | |
563 if(r.sites!=null){ | |
564 for(SiteScore ss : r.sites){ | |
565 if(ss!=null){ss.match=null;} | |
566 } | |
567 } | |
568 return true; | |
569 | |
570 //TODO - need to adjust read start and stop based on this information. Also check strand! | |
571 // byte[] match=r.match; | |
572 // if(r.shortmatch()){match=Read.toLongMatchString(match);} | |
573 // byte[] match2=new byte[match.length-leftTrimmed-rightTrimmed]; | |
574 // for(int mpos=0, bpos=0; bpos<leftTrimmed; mpos++){ | |
575 // byte m=match[mpos]; | |
576 // if(m=='D'){ | |
577 // //do nothing | |
578 // }else{ | |
579 // bpos++; | |
580 // } | |
581 // } | |
582 } | |
583 | |
584 /** Special case of 100% match, or no match string */ | |
585 public static int trimReadWithMatchFast(final Read r, final SamLine sl, final int leftTrimAmount, final int rightTrimAmount, final int minFinalLength){ | |
586 assert(r.match!=null || sl!=null); | |
587 if(r.bases==null){return 0;} | |
588 if(leftTrimAmount<1 && rightTrimAmount<1){return 0;} | |
589 if(leftTrimAmount+rightTrimAmount>=r.length()){return -leftTrimAmount-rightTrimAmount;} | |
590 | |
591 final boolean shortmatch=r.shortmatch(); | |
592 final byte[] oldMatch=r.match; | |
593 r.match=null; | |
594 r.samline=null; | |
595 final int trimmed; | |
596 // System.err.println(rightTrimAmount+", "+leftTrimAmount); | |
597 if(sl!=null && sl.strand()==Shared.MINUS){ | |
598 trimmed=trimByAmount(r, rightTrimAmount, leftTrimAmount, minFinalLength, false); | |
599 }else{ | |
600 trimmed=trimByAmount(r, leftTrimAmount, rightTrimAmount, minFinalLength, false); | |
601 } | |
602 r.samline=sl; | |
603 if(trimmed<1){ | |
604 r.match=oldMatch; | |
605 return 0; | |
606 } | |
607 | |
608 final int len=r.length(); | |
609 ByteBuilder bb=new ByteBuilder(); | |
610 if(oldMatch!=null){ | |
611 if(shortmatch){ | |
612 bb.append((byte)'m'); | |
613 if(len>1){bb.append(len);} | |
614 }else{ | |
615 for(int i=0; i<len; i++){bb.append((byte)'m');} | |
616 } | |
617 r.match=bb.toBytes(); | |
618 bb.clear(); | |
619 } | |
620 | |
621 if(sl!=null){ | |
622 sl.pos+=leftTrimAmount; | |
623 if(sl.cigar!=null && sl.cigar.length()>0){ | |
624 char c=sl.cigar.charAt(sl.cigar.length()-1); | |
625 assert(c=='M' || c=='=') : c+"; "+sl.cigar+"\n"+sl; | |
626 bb.append(len); | |
627 bb.append(SamLine.VERSION>1.3 ? '=' : 'M'); | |
628 sl.cigar=bb.toString(); | |
629 } | |
630 sl.seq=r.bases; | |
631 sl.qual=r.quality; | |
632 if(trimmed>0 && sl.optional!=null && sl.optional.size()>0){ | |
633 ArrayList<String> list=new ArrayList<String>(2); | |
634 for(int i=0; i<sl.optional.size(); i++){ | |
635 String s=sl.optional.get(i); | |
636 if(s.startsWith("PG:") || s.startsWith("RG:") || s.startsWith("X") || s.startsWith("Y") || s.startsWith("Z")){list.add(s);} //Only keep safe flags. | |
637 } | |
638 sl.optional.clear(); | |
639 sl.optional.addAll(list); | |
640 } | |
641 } | |
642 return trimmed; | |
643 } | |
644 | |
645 // //Should be unneeded, just use the above function | |
646 // public static int trimReadWithoutMatch(final Read r, final SamLine sl, final int leftTrimAmount, final int rightTrimAmount, final int minFinalLength){ | |
647 // if(r.bases==null){return 0;} | |
648 // if(leftTrimAmount<1 && rightTrimAmount<1){return 0;} | |
649 // if(leftTrimAmount+rightTrimAmount>=r.length()){return -leftTrimAmount-rightTrimAmount;} | |
650 // | |
651 // assert(r.match==null); | |
652 // final int trimmed; | |
653 // if(sl!=null && sl.strand()==Shared.MINUS){ | |
654 // trimmed=trimByAmount(r, rightTrimAmount, leftTrimAmount, minFinalLength, false); | |
655 // }else{ | |
656 // trimmed=trimByAmount(r, leftTrimAmount, rightTrimAmount, minFinalLength, false); | |
657 // } | |
658 // if(trimmed<1){return 0;} | |
659 // | |
660 // if(sl!=null){ | |
661 // sl.pos+=leftTrimAmount; | |
662 // assert(sl.cigar==null); | |
663 // sl.seq=r.bases; | |
664 // sl.qual=r.quality; | |
665 // if(trimmed>0 && sl.optional!=null && sl.optional.size()>0){ | |
666 // ArrayList<String> list=new ArrayList<String>(2); | |
667 // for(int i=0; i<sl.optional.size(); i++){ | |
668 // String s=sl.optional.get(i); | |
669 // if(s.startsWith("PG:") || s.startsWith("RG:") || s.startsWith("X") || s.startsWith("Y") || s.startsWith("Z")){list.add(s);} //Only keep safe flags. | |
670 // } | |
671 // sl.optional.clear(); | |
672 // sl.optional.addAll(list); | |
673 // } | |
674 // } | |
675 // return trimmed; | |
676 // } | |
677 | |
678 //TODO: This is slow | |
679 //TODO: Note, returns a negative number if the whole read is supposed to be trimmed | |
680 public static int trimReadWithMatch(final Read r, final SamLine sl, | |
681 int leftTrimAmount, int rightTrimAmount, int minFinalLength, long scafLen, boolean trimClip){ | |
682 if(r.bases==null || (leftTrimAmount<1 && rightTrimAmount<1 && !trimClip)){return 0;} | |
683 if(!r.containsNonM() && !trimClip){//TODO: H not handled | |
684 return trimReadWithMatchFast(r, sl, leftTrimAmount, rightTrimAmount, minFinalLength); | |
685 } | |
686 | |
687 // if(r.match==null){ | |
688 // assert(sl.cigar==null); | |
689 // int trimmed=trimByAmount(r, leftTrimAmount, rightTrimAmount, minFinalLength, false); | |
690 // if(sl!=null){ | |
691 // assert(sl.cigar==null); | |
692 // int start=sl.pos-1; | |
693 // int stop=start+sl.seq.length; | |
694 // sl.seq=r.bases; | |
695 // sl.qual=r.quality; | |
696 // if(trimmed>0 && sl.optional!=null && sl.optional.size()>0){ | |
697 // ArrayList<String> list=new ArrayList<String>(2); | |
698 // for(int i=0; i<sl.optional.size(); i++){ | |
699 // String s=sl.optional.get(i); | |
700 // if(s.startsWith("PG:") || s.startsWith("RG:") || s.startsWith("X") || s.startsWith("Y") || s.startsWith("Z")){list.add(s);} //Only keep safe flags. | |
701 // } | |
702 // sl.optional.clear(); | |
703 // sl.optional.addAll(list); | |
704 // } | |
705 // } | |
706 // } | |
707 | |
708 if(trimClip){ | |
709 r.toLongMatchString(false); | |
710 byte[] match=r.match; | |
711 int leftClip=0, rightClip=0; | |
712 for(int i=0; i<match.length; i++){ | |
713 if(match[i]=='C'){leftClip++;} | |
714 else{break;} | |
715 } | |
716 for(int i=match.length-1; i>=0; i--){ | |
717 if(match[i]=='C'){rightClip++;} | |
718 else{break;} | |
719 } | |
720 leftTrimAmount=Tools.max(leftTrimAmount, leftClip); | |
721 rightTrimAmount=Tools.max(rightTrimAmount, rightClip); | |
722 } | |
723 | |
724 if(leftTrimAmount<1 && rightTrimAmount<1){return 0;} | |
725 if(leftTrimAmount+rightTrimAmount>=r.length()){return -leftTrimAmount-rightTrimAmount;} | |
726 | |
727 final int oldPos=(sl==null ? 1 : sl.pos); | |
728 | |
729 r.toLongMatchString(false); | |
730 byte[] match=r.match; | |
731 | |
732 assert(!r.shortmatch()); | |
733 | |
734 // System.err.println("Q: "+sl); | |
735 // System.err.println(new String(match)); | |
736 | |
737 ByteBuilder bb=new ByteBuilder(match.length); | |
738 // System.err.println("leftTrimAmount="+leftTrimAmount+", rightTrimAmount="+rightTrimAmount); | |
739 | |
740 int leftTrim=leftTrimAmount>0 ? r.length() : 0, rightTrim=rightTrimAmount>0 ? r.length() : 0; | |
741 // System.err.println("leftTrim="+leftTrim+", rightTrim="+rightTrim); | |
742 { | |
743 final int mlen=match.length; | |
744 boolean keep=leftTrimAmount<1; | |
745 int rpos=(sl==null ? 1 : sl.pos); | |
746 for(int mpos=0, cpos=0; mpos<mlen; mpos++){ | |
747 final byte m=match[mpos]; | |
748 if(m=='m' || m=='S' || m=='V' || m=='N'){ | |
749 cpos++; | |
750 rpos++; | |
751 }else if(m=='D'){ | |
752 rpos++; | |
753 }else if(m=='I' || m=='C'){ | |
754 cpos++; | |
755 }else if(m=='d'){ | |
756 rpos++; | |
757 }else if(m=='i'){ | |
758 cpos++; | |
759 }else{ | |
760 assert(false) : "Unknown symbol "+(char)m; | |
761 } | |
762 | |
763 if(keep){ | |
764 bb.append(m); | |
765 }else if(cpos>=leftTrimAmount){ | |
766 byte next=(mpos<mlen-1 ? match[mpos+1] : (byte)'m'); | |
767 if(m=='m' || m=='S' || m=='V' || m=='N' || next=='m' || next=='S' || next=='V' || next=='N'){ | |
768 keep=true; | |
769 if(sl!=null){sl.pos=rpos;} | |
770 leftTrim=cpos; | |
771 } | |
772 } | |
773 } | |
774 } | |
775 // System.err.println("R: "+sl); | |
776 match=bb.toBytes(); | |
777 Tools.reverseInPlace(match); | |
778 bb.clear(); | |
779 { | |
780 final int mlen=match.length; | |
781 boolean keep=rightTrimAmount<1; | |
782 for(int mpos=0, cpos=0; mpos<mlen; mpos++){ | |
783 final byte m=match[mpos]; | |
784 if(m=='m' || m=='S' || m=='V' || m=='N' || m=='I' || m=='C'){ | |
785 cpos++; | |
786 }else if(m=='D'){ | |
787 }else if(m=='d'){ | |
788 }else if(m=='i'){ | |
789 cpos++; | |
790 }else{ | |
791 assert(false) : "Unknown symbol "+(char)m; | |
792 } | |
793 | |
794 if(keep){ | |
795 bb.append(m); | |
796 }else if(cpos>=rightTrimAmount){ | |
797 byte next=(mpos<mlen-1 ? match[mpos+1] : (byte)'m'); | |
798 // if(m=='m' || m=='S' || m=='V' || m=='N' || next=='m' || next=='S' || next=='V' || next=='N'){ | |
799 if(next!='I' && next!='D' && next!='i' && next!='d'){ | |
800 keep=true; | |
801 rightTrim=cpos; | |
802 } | |
803 // } | |
804 } | |
805 } | |
806 } | |
807 // System.err.println("S: "+sl); | |
808 match=bb.toBytes(); | |
809 Tools.reverseInPlace(match); | |
810 | |
811 // System.err.println("leftTrim="+leftTrim+", rightTrim="+rightTrim); | |
812 | |
813 if(leftTrim+rightTrim>=r.length()){ | |
814 if(sl!=null){sl.pos=oldPos;} | |
815 return -leftTrim-rightTrim; | |
816 } | |
817 | |
818 // System.err.println("T: "+sl); | |
819 r.match=null; | |
820 if(sl!=null && sl.strand()==Shared.MINUS){ | |
821 int temp=leftTrim; | |
822 leftTrim=rightTrim; | |
823 rightTrim=temp; | |
824 } | |
825 final int trimmed=trimByAmount(r, leftTrim, rightTrim, minFinalLength, false); | |
826 // System.err.println("tba: "+leftTrim+", "+rightTrim+", "+minFinalLength); | |
827 r.match=match; | |
828 | |
829 if(sl!=null){ | |
830 int start=sl.pos-1; | |
831 int stop=start+Read.calcMatchLength(match)-1; | |
832 if(SamLine.VERSION>1.3){ | |
833 sl.cigar=SamLine.toCigar14(match, start, stop, scafLen, r.bases); | |
834 }else{ | |
835 sl.cigar=SamLine.toCigar13(match, start, stop, scafLen, r.bases); | |
836 } | |
837 sl.seq=r.bases; | |
838 sl.qual=r.quality; | |
839 if(trimmed>0 && sl.optional!=null && sl.optional.size()>0){ | |
840 ArrayList<String> list=new ArrayList<String>(2); | |
841 for(int i=0; i<sl.optional.size(); i++){ | |
842 String s=sl.optional.get(i); | |
843 if(s.startsWith("PG:") || s.startsWith("RG:") || s.startsWith("X") || s.startsWith("Y") || s.startsWith("Z")){list.add(s);} //Only keep safe flags. | |
844 } | |
845 sl.optional.clear(); | |
846 sl.optional.addAll(list); | |
847 } | |
848 } | |
849 return trimmed; | |
850 } | |
851 | |
852 public int trimmed() { | |
853 return leftTrimmed+rightTrimmed; | |
854 } | |
855 | |
856 public final Read r; | |
857 | |
858 /** untrimmed bases */ | |
859 public final byte[] bases1; | |
860 /** untrimmed qualities */ | |
861 public final byte[] qual1; | |
862 /** trimmed bases */ | |
863 public byte[] bases2; | |
864 /** trimmed qualities */ | |
865 public byte[] qual2; | |
866 | |
867 | |
868 public final float trimq; | |
869 public int leftTrimmed; | |
870 public int rightTrimmed; | |
871 | |
872 /** Require this many consecutive good bases to stop trimming. Minimum is 1. | |
873 * This is for the old trimming mode and not really used anymore */ | |
874 public static int minGoodInterval=2; | |
875 | |
876 public static boolean verbose=false; | |
877 public static boolean optimalMode=true; | |
878 public static boolean windowMode=false; | |
879 public static float optimalBias=-1f; | |
880 | |
881 public static int windowLength=4; | |
882 | |
883 private static final float NPROB=0.75f; | |
884 // public static float PROB1=QualityTools.PROB_ERROR[1]; | |
885 | |
886 | |
887 } |