comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/shared/TrimRead.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package shared;
2
3 import java.io.Serializable;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6
7 import align2.QualityTools;
8 import dna.AminoAcid;
9 import stream.Read;
10 import stream.SamLine;
11 import stream.SiteScore;
12 import structures.ByteBuilder;
13
14 /**
15 * Helper class for processes that do inline quality trimming.
16 * @author Brian Bushnell
17 * @date Mar 15, 2013
18 *
19 */
20 public final class TrimRead implements Serializable {
21
22 /**
23 *
24 */
25 private static final long serialVersionUID = 8791743639124592480L;
26
27 public static void main(String[] args){
28 byte[] bases=args[0].getBytes();
29 byte[] quals=(args.length<2 ? null : args[1].getBytes());
30 if(quals!=null){
31 for(int i=0; i<quals.length; i++){quals[i]-=32;}
32 }
33 byte[] match=(args.length<3 ? null : args[2].getBytes());
34 float minq=(args.length<4 ? 5 : Float.parseFloat(args[3]));
35 float minE=(float)QualityTools.phredToProbError(minq);
36 Read r=new Read(bases, quals, 1);
37 r.match=match;
38 System.out.println("Before trim:\n"+r.toFastq()+(r.match==null ? "" : "\n"+new String(r.match)));
39 System.out.println(Arrays.toString(r.quality));
40 TrimRead tr=trim(r, true, true, minq, minE, 1);
41 System.out.println("\nAfter trim:\n"+r.toFastq()+(r.match==null ? "" : "\n"+new String(r.match)));
42 if(r.match==null){
43 r.match=new byte[r.length()];
44 for(int i=0; i<r.length(); i++){r.match[i]='m';}
45 }
46 tr.untrim();
47 System.out.println("\nAfter untrim:\n"+r.toFastq()+(r.match==null ? "" : "\n"+new String(r.match)));
48 }
49
50 // public static TrimRead trim(Read r, boolean trimLeft, boolean trimRight, float trimq, int minlen){
51 // if(r==null || r.bases==null){return null;}
52 //
53 // final int a, b;
54 // if(optimalMode){
55 // long packed=testOptimal(r.bases, r.quality, QualityTools.PROB_ERROR[trimq]);
56 // a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0;
57 // b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0;
58 // }else if(windowMode){
59 // a=0;
60 // b=(trimRight ? testRightWindow(r.bases, r.quality, (byte)trimq, windowLength) : 0);
61 // }else{
62 // a=(trimLeft ? testLeft(r.bases, r.quality, (byte)trimq) : 0);
63 // b=(trimRight ? testRight(r.bases, r.quality, (byte)trimq) : 0);
64 // }
65 // return (a+b==0 ? null : new TrimRead(r, a, b, trimq, minlen));
66 // }
67
68 public static TrimRead trim(Read r, boolean trimLeft, boolean trimRight,
69 float trimq, float avgErrorRate, int minlen){
70 if(r==null || r.bases==null){return null;}
71
72 final int a, b;
73 if(optimalMode){
74 long packed=testOptimal(r.bases, r.quality, avgErrorRate);
75 a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0;
76 b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0;
77 }else if(windowMode){
78 a=0;
79 b=(trimRight ? testRightWindow(r.bases, r.quality, (byte)trimq, windowLength) : 0);
80 }else{
81 a=(trimLeft ? testLeft(r.bases, r.quality, (byte)trimq) : 0);
82 b=(trimRight ? testRight(r.bases, r.quality, (byte)trimq) : 0);
83 }
84 return (a+b==0 ? null : new TrimRead(r, a, b, trimq, minlen));
85 }
86
87 /**
88 * @param r Read to trim
89 * @param trimLeft Trim left side
90 * @param trimRight Trim right side
91 * @param trimq Maximum quality to trim
92 * @param minResult Ensure trimmed read is at least this long
93 * @return Number of bases trimmed
94 */
95 public static int trimFast(Read r, boolean trimLeft, boolean trimRight,
96 float trimq, float avgErrorRate, int minResult){
97 return trimFast(r, trimLeft, trimRight, trimq, avgErrorRate, minResult, false);
98 }
99
100 /**
101 * @param r Read to trim
102 * @param trimLeft Trim left side
103 * @param trimRight Trim right side
104 * @param trimq Maximum quality to trim
105 * @param minResult Ensure trimmed read is at least this long
106 * @return Number of bases trimmed
107 */
108 public static int trimFast(Read r, boolean trimLeft, boolean trimRight,
109 float trimq, float avgErrorRate, int minResult, boolean trimClip){
110 return trimFast(r, trimLeft, trimRight, trimq, avgErrorRate, minResult, 0, trimClip);
111 }
112
113 /**
114 * This method allows a "discardUnder" parameter, so that qtrim=r will still discard
115 * reads that if left-trimmed, would be shorter than the desired minimum length.
116 * It is not presently used.
117 * @param r Read to trim
118 * @param trimLeft Trim left side
119 * @param trimRight Trim right side
120 * @param trimq Maximum quality to trim
121 * @param minResult Ensure trimmed read is at least this long
122 * @param discardUnder Resulting reads shorter than this are not wanted
123 * @return Number of bases trimmed
124 */
125 public static int trimFast(Read r, boolean trimLeft, boolean trimRight,
126 float trimq, float avgErrorRate, int minResult, int discardUnder, boolean trimClip){
127 // assert(avgErrorRate==(float)QualityTools.phredToProbError(trimq)) : trimq+", "+avgErrorRate+", "+(float)QualityTools.phredToProbError(trimq);
128 final byte[] bases=r.bases, qual=r.quality;
129 if(bases==null || bases.length<1){return 0;}
130
131 // assert(false) : trimLeft+", "+trimRight+", "+trimq+", "+minResult+", "+discardUnder;
132
133 final int len=r.length();
134 final int a0, b0;
135 final int a, b;
136 if(optimalMode){
137 long packed=testOptimal(bases, qual, avgErrorRate);
138 a0=(int)((packed>>32)&0xFFFFFFFFL);
139 b0=(int)((packed)&0xFFFFFFFFL);
140 if(trimLeft!=trimRight && discardUnder>0 && len-a0-b0<discardUnder){
141 a=trimLeft ? len : 0;
142 b=trimRight ? len : 0;
143 }else{
144 a=trimLeft ? a0 : 0;
145 b=trimRight ? b0 : 0;
146 }
147 // assert(false) : a0+", "+b0+" -> "+a+", "+b;
148 }else if(windowMode){
149 a=0;
150 b=(trimRight ? testRightWindow(bases, qual, (byte)trimq, windowLength) : 0);
151 }else{
152 a=(trimLeft ? testLeft(bases, qual, (byte)trimq) : 0);
153 b=(trimRight ? testRight(bases, qual, (byte)trimq) : 0);
154 }
155 return trimByAmount(r, a, b, minResult, trimClip);
156 }
157
158 public static boolean untrim(Read r){
159 if(r==null || r.obj==null){return false;}
160 if(r.obj.getClass()!=TrimRead.class){return false;}
161 TrimRead tr=(TrimRead)r.obj;
162 return tr.untrim();
163 }
164
165 // public TrimRead(Read r_, boolean trimLeft, boolean trimRight, float trimq_, int minlen_){
166 // this(r_, (trimLeft ? testLeft(r_.bases, r_.quality, (byte)trimq_) : 0), (trimRight ? testRight(r_.bases, r_.quality, (byte)trimq_) : 0), trimq_, minlen_);
167 // }
168
169 public TrimRead(Read r_, int trimLeft, int trimRight, float trimq_, int minlen_){
170 minlen_=Tools.max(minlen_, 0);
171 r=r_;
172 bases1=r.bases;
173 qual1=r.quality;
174 trimq=(byte)trimq_;
175 assert(bases1!=null || qual1==null) : "\n"+new String(bases1)+"\n"+new String(qual1)+"\n";
176 assert(bases1==null || qual1==null || bases1.length==qual1.length) : "\n"+new String(bases1)+"\n"+new String(qual1)+"\n";
177 int trimmed=trim(trimLeft, trimRight, minlen_);
178 if(trimmed>0){
179 assert(bases2==null || bases2.length>=minlen_ || bases1.length<minlen_) : bases1.length+", "+bases2.length+", "+minlen_+", "+trimLeft+", "+trimRight;
180 r.bases=bases2;
181 r.quality=qual2;
182 r.obj=this;
183 trimMatch(r);
184 }
185 }
186
187 // /** Trim the left end of the read, from left to right */
188 // private int trim(final boolean trimLeft, final boolean trimRight, final int minlen){
189 // final int a, b;
190 // if(optimalMode){
191 // long packed=testOptimal(bases1, qual1, QualityTools.PROB_ERROR[trimq]);
192 // a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0;
193 // b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0;
194 // }else{
195 // a=(trimLeft ? testLeft(bases1, qual1, (byte)trimq) : 0);
196 // b=(trimRight ? testRight(bases1, qual1, (byte)trimq) : 0);
197 // }
198 // return trim(a, b, minlen);
199 // }
200
201 /** Trim the left end of the read, from left to right */
202 private int trim(int trimLeft, int trimRight, final int minlen){
203 assert(trimLeft>=0 && trimRight>=0) : "trimLeft="+trimLeft+", trimRight="+trimRight+", minlen="+minlen+", len="+bases1.length;
204 assert(trimLeft>0 || trimRight>0) : "trimLeft="+trimLeft+", trimRight="+trimRight+", minlen="+minlen+", len="+bases1.length;
205 final int maxTrim=Tools.min(bases1.length, bases1.length-minlen);
206 if(trimLeft+trimRight>maxTrim){
207 int excess=trimLeft+trimRight-maxTrim;
208 if(trimLeft>0 && excess>0){
209 trimLeft=Tools.max(0, trimLeft-excess);
210 excess=trimLeft+trimRight-maxTrim;
211 }
212 if(trimRight>0 && excess>0){
213 trimRight=Tools.max(0, trimRight-excess);
214 excess=trimLeft+trimRight-maxTrim;
215 }
216
217 }
218
219 leftTrimmed=trimLeft;
220 rightTrimmed=trimRight;
221 final int sum=leftTrimmed+rightTrimmed;
222
223 if(verbose){
224 System.err.println("leftTrimmed="+leftTrimmed+", rightTrimmed="+rightTrimmed+", sum="+sum);
225 }
226
227 if(sum==0){
228 bases2=bases1;
229 qual2=qual1;
230 }else{
231 bases2=KillSwitch.copyOfRange(bases1, trimLeft, bases1.length-trimRight);
232 qual2=((qual1==null || (trimLeft+trimRight>=qual1.length)) ? null : KillSwitch.copyOfRange(qual1, trimLeft, qual1.length-trimRight));
233 }
234 return sum;
235 }
236
237 /** Trim bases outside of leftLoc and rightLoc, excluding leftLoc and rightLoc */
238 public static int trimToPosition(Read r, int leftLoc, int rightLoc, int minResultingLength){
239 final int len=r.length();
240 return trimByAmount(r, leftLoc, len-rightLoc-1, minResultingLength, false);
241 }
242
243 /** Remove non-genetic-code from reads */
244 public static int trimBadSequence(Read r){
245 final byte[] bases=r.bases, quals=r.quality;
246 if(bases==null){return 0;}
247 final int minGenetic=20;
248 int lastNon=-1;
249 for(int i=0; i<bases.length; i++){
250 byte b=bases[i];
251 if(!AminoAcid.isACGTN(b)){lastNon=i;}
252 if(i-lastNon>minGenetic){break;}
253 }
254 if(lastNon>=0){
255 r.bases=KillSwitch.copyOfRange(bases, lastNon+1, bases.length);
256 if(quals!=null){
257 r.quality=KillSwitch.copyOfRange(quals, lastNon+1, quals.length);
258 }
259 }
260 return lastNon+1;
261 }
262
263 /** Trim this many bases from each end */
264 public static int trimByAmount(Read r, int leftTrimAmount, int rightTrimAmount, int minResultingLength){
265 return trimByAmount(r, leftTrimAmount, rightTrimAmount, minResultingLength, false);
266 }
267
268 /** Trim this many bases from each end */
269 public static int trimByAmount(Read r, int leftTrimAmount, int rightTrimAmount, int minResultingLength, boolean trimClip){
270
271 leftTrimAmount=Tools.max(leftTrimAmount, 0);
272 rightTrimAmount=Tools.max(rightTrimAmount, 0);
273
274 //These assertions are unnecessary if the mapping information will never be used or output.
275 // assert(r.match==null) : "TODO: Handle trimming of reads with match strings.";
276 assert(r.sites==null) : "TODO: Handle trimming of reads with SiteScores.";
277
278 if(r.match!=null){
279 return trimReadWithMatch(r, r.samline, leftTrimAmount, rightTrimAmount, minResultingLength, Integer.MAX_VALUE, trimClip);
280 }else if(r.samline!=null && r.samline.cigar!=null && r.samline.cigarContainsOnlyME()){
281 return trimReadWithMatchFast(r, r.samline, leftTrimAmount, rightTrimAmount, minResultingLength);
282 }
283
284 final byte[] bases=r.bases, qual=r.quality;
285 final int len=(bases==null ? 0 : bases.length), qlen=(qual==null ? 0 : qual.length);
286 if(len<1){return 0;}
287 minResultingLength=Tools.min(len, Tools.max(minResultingLength, 0));
288 if(leftTrimAmount+rightTrimAmount+minResultingLength>len){
289 rightTrimAmount=Tools.max(1, len-minResultingLength);
290 leftTrimAmount=0;
291 }
292
293 final int total=leftTrimAmount+rightTrimAmount;
294 // System.err.println("D: L="+leftTrimAmount+", R="+rightTrimAmount+", len="+r.length()+", tot="+total);
295 if(total>0){
296 r.bases=KillSwitch.copyOfRange(bases, leftTrimAmount, len-rightTrimAmount);
297 r.quality=(leftTrimAmount+rightTrimAmount>=qlen ? null : KillSwitch.copyOfRange(qual, leftTrimAmount, qlen-rightTrimAmount));
298 trimMatch(r);
299 if(r.stop>r.start){ //TODO: Fixing mapped coordinates needs more work.
300 r.start+=leftTrimAmount;
301 r.stop-=rightTrimAmount;
302 }
303 }
304
305 if(verbose){
306 System.err.println("leftTrimmed="+leftTrimAmount+", rightTrimmed="+rightTrimAmount+
307 ", sum="+total+", final length="+r.length());
308 }
309 return total;
310 }
311
312 /** Count number of bases that need trimming on each side, and pack into a long */
313 public static long testOptimal(byte[] bases, byte[] qual, float avgErrorRate){
314 if(optimalBias>=0){avgErrorRate=optimalBias;}//Override
315 assert(avgErrorRate>0 && avgErrorRate<=1) : "Average error rate ("+avgErrorRate+") must be between 0 (exclusive) and 1 (inclusive)";
316 if(bases==null || bases.length==0){return 0;}
317 if(qual==null){return avgErrorRate>=1 ? 0 : ((((long)testLeftN(bases))<<32) | (((long)testRightN(bases))&0xFFFFFFFFL));}
318
319 float maxScore=0;
320 float score=0;
321 int maxLoc=-1;
322 int maxCount=-1;
323 int count=0;
324
325 final float nprob=Tools.max(Tools.min(avgErrorRate*1.1f, 1), NPROB);
326
327 for(int i=0; i<bases.length; i++){
328 final byte b=bases[i];
329 byte q=qual[i];
330 // float probError=(b=='N' ? nprob : ADJUST_QUALITY ? CalcTrueQuality.estimateErrorProbAvg(qual, bases, i) : QualityTools.PROB_ERROR[q]);
331 // float probError=(b=='N' ? nprob : ADJUST_QUALITY ? CalcTrueQuality.estimateErrorProbGeoAvg(qual, bases, i) : QualityTools.PROB_ERROR[q]);
332 // float probError=(b=='N' ? nprob : ADJUST_QUALITY ? CalcTrueQuality.estimateErrorProb2(qual, bases, i) : QualityTools.PROB_ERROR[q]);
333
334 // float probError=(b=='N' ? nprob : q==1 ? PROB1 : QualityTools.PROB_ERROR[q]);
335 // float probError=(b=='N' ? nprob : QualityTools.PROB_ERROR[q]);
336 float probError=((b=='N' || q<1) ? nprob : QualityTools.PROB_ERROR[q]);
337
338 // assert(q>0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n";
339
340 float delta=avgErrorRate-probError;
341 score=score+delta;
342 if(score>0){
343 count++;
344 if(score>maxScore || (score==maxScore && count>maxCount)){
345 maxScore=score;
346 maxCount=count;
347 maxLoc=i;
348 }
349 }else{
350 score=0;
351 count=0;
352 }
353 }
354
355 final int left, right;
356 if(maxScore>0){
357 assert(maxLoc>=0);
358 assert(maxCount>0);
359 left=maxLoc-maxCount+1;
360 assert(left>=0 && left<=bases.length);
361 right=bases.length-maxLoc-1;
362 }else{
363 left=0;
364 right=bases.length;
365 }
366 final long packed=((((long)left)<<32) | (((long)right)&0xFFFFFFFFL));
367
368 if(verbose){
369 System.err.println(Arrays.toString(qual));
370 System.err.println("After testLocal: maxScore="+maxScore+", maxLoc="+maxLoc+", maxCount="+maxCount+
371 ", left="+left+", right="+right+", returning "+Long.toHexString(packed));
372 }
373 return packed;
374 }
375
376 /** Count number of bases that need trimming on left side */
377 private static int testLeft(byte[] bases, byte[] qual, final byte trimq){
378 if(bases==null || bases.length==0){return 0;}
379 if(qual==null){return trimq<0 ? 0 : testLeftN(bases);}
380 int good=0;
381 int lastBad=-1;
382 int i=0;
383 for(; i<bases.length && good<minGoodInterval; i++){
384 final byte q=qual[i];
385 final byte b=bases[i];
386 assert(q>0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n";
387 if(q>trimq){good++;}
388 else{good=0; lastBad=i;}
389 }
390 if(verbose){
391 // System.err.println(Arrays.toString(qual));
392 System.err.println("After testLeft: good="+good+", lastBad="+lastBad+", i="+i+", returning "+(lastBad+1));
393 // assert(false);
394 }
395 return lastBad+1;
396 }
397
398 /** Count number of bases that need trimming on right side using a sliding window */
399 private static int testRightWindow(byte[] bases, byte[] qual, final byte trimq, final int window){
400 if(bases==null || bases.length==0){return 0;}
401 if(qual==null || qual.length<window){return trimq>0 ? 0 : testRightN(bases);}
402 final int thresh=Tools.max(window*trimq, 1);
403 int sum=0;
404 for(int i=0, j=-window; i<qual.length; i++, j++){
405 final byte q=qual[i];
406 sum+=q;
407 if(j>=-1){
408 if(j>=0){sum-=qual[j];}
409 if(sum<thresh){
410 return qual.length-j-1;
411 }
412 }
413 }
414 return 0;
415 }
416
417 /** Count number of bases that need trimming on right side */
418 private static int testRight(byte[] bases, byte[] qual, final byte trimq){
419 if(bases==null || bases.length==0){return 0;}
420 if(qual==null){return trimq<0 ? 0 : testRightN(bases);}
421 int good=0;
422 int lastBad=bases.length;
423 int i=bases.length-1;
424 for(; i>=0 && good<minGoodInterval; i--){
425 final byte q=qual[i];
426 final byte b=bases[i];
427 assert(q>0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n";
428 if(q>trimq){good++;}
429 else{good=0; lastBad=i;}
430 }
431 if(verbose){
432 System.err.println("After trimLeft: good="+good+", lastBad="+lastBad+", i="+i+", returning "+(bases.length-lastBad));
433 }
434 return bases.length-lastBad;
435 }
436
437 /** Count number of bases that need trimming on left side, considering only N as bad */
438 public static int testLeftN(byte[] bases){
439 if(bases==null || bases.length==0){return 0;}
440 int good=0;
441 int lastBad=-1;
442 for(int i=0; i<bases.length && good<minGoodInterval; i++){
443 final byte b=bases[i];
444 //if(dna.AminoAcid.isFullyDefined(b)){good++;}
445 if(b!=((byte)'N')){good++;}
446 else{good=0; lastBad=i;}
447 }
448 return lastBad+1;
449 }
450
451 /** Count number of bases that need trimming on right side, considering only N as bad */
452 public static int testRightN(byte[] bases){
453 if(bases==null || bases.length==0){return 0;}
454 int good=0;
455 int lastBad=bases.length;
456 for(int i=bases.length-1; i>=0 && good<minGoodInterval; i--){
457 final byte b=bases[i];
458 //if(dna.AminoAcid.isFullyDefined(b)){good++;}
459 if(b!=((byte)'N')){good++;}
460 else{good=0; lastBad=i;}
461 }
462 return bases.length-lastBad;
463 }
464
465 public boolean untrim(){
466 if(leftTrimmed==0 && rightTrimmed==0){return false;}
467 r.setPerfect(false);
468
469 final int lt, rt;
470 if(r.strand()==Shared.PLUS){
471 lt=leftTrimmed;
472 rt=rightTrimmed;
473 }else{
474 lt=rightTrimmed;
475 rt=leftTrimmed;
476 }
477
478 boolean returnToShort=false;
479 if(verbose){System.err.println("Untrimming");}
480 if(r.match!=null){
481 if(r.shortmatch()){
482 r.toLongMatchString(false);
483 returnToShort=true;
484 }
485 byte[] match2=new byte[r.match.length+lt+rt];
486 int i=0;
487 for(; i<lt; i++){
488 match2[i]='C';
489 }
490 for(int j=0; j<r.match.length; i++, j++){
491 match2[i]=r.match[j];
492 }
493 for(; i<match2.length; i++){
494 match2[i]='C';
495 }
496 r.match=match2;
497 }
498 r.bases=bases1;
499 r.quality=qual1;
500 r.start-=lt;
501 r.stop+=rt;
502 if(returnToShort){
503 r.toShortMatchString(true);
504 }
505
506 if(r.sites!=null){
507 for(SiteScore ss : r.sites){untrim(ss);}
508 }
509
510 return true;
511 }
512
513 private boolean untrim(SiteScore ss){
514 if(ss==null){return false;}
515 if(leftTrimmed==0 && rightTrimmed==0){return false;}
516 ss.perfect=ss.semiperfect=false;
517
518 final int lt, rt;
519 if(ss.strand==Shared.PLUS){
520 lt=leftTrimmed;
521 rt=rightTrimmed;
522 }else{
523 lt=rightTrimmed;
524 rt=leftTrimmed;
525 }
526
527 boolean returnToShort=false;
528 if(verbose){System.err.println("Untrimming ss "+ss);}
529 if(ss.match!=null){
530
531 boolean shortmatch=false;
532 for(byte b : ss.match){
533 if(Tools.isDigit(b)){shortmatch=true; break;}
534 }
535
536 if(shortmatch){
537 ss.match=Read.toLongMatchString(ss.match);
538 returnToShort=true;
539 }
540 byte[] match2=new byte[ss.match.length+lt+rt];
541 int i=0;
542 for(; i<lt; i++){
543 match2[i]='C';
544 }
545 for(int j=0; j<ss.match.length; i++, j++){
546 match2[i]=ss.match[j];
547 }
548 for(; i<match2.length; i++){
549 match2[i]='C';
550 }
551 ss.match=match2;
552 }
553 ss.setLimits(ss.start-lt, ss.stop+rt);
554 if(returnToShort){ss.match=Read.toShortMatchString(ss.match);}
555 return true;
556 }
557
558 private static boolean trimMatch(Read r){
559 if(r.match==null && r.sites==null){return false;}
560
561 //Easy mode!
562 r.match=null;
563 if(r.sites!=null){
564 for(SiteScore ss : r.sites){
565 if(ss!=null){ss.match=null;}
566 }
567 }
568 return true;
569
570 //TODO - need to adjust read start and stop based on this information. Also check strand!
571 // byte[] match=r.match;
572 // if(r.shortmatch()){match=Read.toLongMatchString(match);}
573 // byte[] match2=new byte[match.length-leftTrimmed-rightTrimmed];
574 // for(int mpos=0, bpos=0; bpos<leftTrimmed; mpos++){
575 // byte m=match[mpos];
576 // if(m=='D'){
577 // //do nothing
578 // }else{
579 // bpos++;
580 // }
581 // }
582 }
583
584 /** Special case of 100% match, or no match string */
585 public static int trimReadWithMatchFast(final Read r, final SamLine sl, final int leftTrimAmount, final int rightTrimAmount, final int minFinalLength){
586 assert(r.match!=null || sl!=null);
587 if(r.bases==null){return 0;}
588 if(leftTrimAmount<1 && rightTrimAmount<1){return 0;}
589 if(leftTrimAmount+rightTrimAmount>=r.length()){return -leftTrimAmount-rightTrimAmount;}
590
591 final boolean shortmatch=r.shortmatch();
592 final byte[] oldMatch=r.match;
593 r.match=null;
594 r.samline=null;
595 final int trimmed;
596 // System.err.println(rightTrimAmount+", "+leftTrimAmount);
597 if(sl!=null && sl.strand()==Shared.MINUS){
598 trimmed=trimByAmount(r, rightTrimAmount, leftTrimAmount, minFinalLength, false);
599 }else{
600 trimmed=trimByAmount(r, leftTrimAmount, rightTrimAmount, minFinalLength, false);
601 }
602 r.samline=sl;
603 if(trimmed<1){
604 r.match=oldMatch;
605 return 0;
606 }
607
608 final int len=r.length();
609 ByteBuilder bb=new ByteBuilder();
610 if(oldMatch!=null){
611 if(shortmatch){
612 bb.append((byte)'m');
613 if(len>1){bb.append(len);}
614 }else{
615 for(int i=0; i<len; i++){bb.append((byte)'m');}
616 }
617 r.match=bb.toBytes();
618 bb.clear();
619 }
620
621 if(sl!=null){
622 sl.pos+=leftTrimAmount;
623 if(sl.cigar!=null && sl.cigar.length()>0){
624 char c=sl.cigar.charAt(sl.cigar.length()-1);
625 assert(c=='M' || c=='=') : c+"; "+sl.cigar+"\n"+sl;
626 bb.append(len);
627 bb.append(SamLine.VERSION>1.3 ? '=' : 'M');
628 sl.cigar=bb.toString();
629 }
630 sl.seq=r.bases;
631 sl.qual=r.quality;
632 if(trimmed>0 && sl.optional!=null && sl.optional.size()>0){
633 ArrayList<String> list=new ArrayList<String>(2);
634 for(int i=0; i<sl.optional.size(); i++){
635 String s=sl.optional.get(i);
636 if(s.startsWith("PG:") || s.startsWith("RG:") || s.startsWith("X") || s.startsWith("Y") || s.startsWith("Z")){list.add(s);} //Only keep safe flags.
637 }
638 sl.optional.clear();
639 sl.optional.addAll(list);
640 }
641 }
642 return trimmed;
643 }
644
645 // //Should be unneeded, just use the above function
646 // public static int trimReadWithoutMatch(final Read r, final SamLine sl, final int leftTrimAmount, final int rightTrimAmount, final int minFinalLength){
647 // if(r.bases==null){return 0;}
648 // if(leftTrimAmount<1 && rightTrimAmount<1){return 0;}
649 // if(leftTrimAmount+rightTrimAmount>=r.length()){return -leftTrimAmount-rightTrimAmount;}
650 //
651 // assert(r.match==null);
652 // final int trimmed;
653 // if(sl!=null && sl.strand()==Shared.MINUS){
654 // trimmed=trimByAmount(r, rightTrimAmount, leftTrimAmount, minFinalLength, false);
655 // }else{
656 // trimmed=trimByAmount(r, leftTrimAmount, rightTrimAmount, minFinalLength, false);
657 // }
658 // if(trimmed<1){return 0;}
659 //
660 // if(sl!=null){
661 // sl.pos+=leftTrimAmount;
662 // assert(sl.cigar==null);
663 // sl.seq=r.bases;
664 // sl.qual=r.quality;
665 // if(trimmed>0 && sl.optional!=null && sl.optional.size()>0){
666 // ArrayList<String> list=new ArrayList<String>(2);
667 // for(int i=0; i<sl.optional.size(); i++){
668 // String s=sl.optional.get(i);
669 // if(s.startsWith("PG:") || s.startsWith("RG:") || s.startsWith("X") || s.startsWith("Y") || s.startsWith("Z")){list.add(s);} //Only keep safe flags.
670 // }
671 // sl.optional.clear();
672 // sl.optional.addAll(list);
673 // }
674 // }
675 // return trimmed;
676 // }
677
678 //TODO: This is slow
679 //TODO: Note, returns a negative number if the whole read is supposed to be trimmed
680 public static int trimReadWithMatch(final Read r, final SamLine sl,
681 int leftTrimAmount, int rightTrimAmount, int minFinalLength, long scafLen, boolean trimClip){
682 if(r.bases==null || (leftTrimAmount<1 && rightTrimAmount<1 && !trimClip)){return 0;}
683 if(!r.containsNonM() && !trimClip){//TODO: H not handled
684 return trimReadWithMatchFast(r, sl, leftTrimAmount, rightTrimAmount, minFinalLength);
685 }
686
687 // if(r.match==null){
688 // assert(sl.cigar==null);
689 // int trimmed=trimByAmount(r, leftTrimAmount, rightTrimAmount, minFinalLength, false);
690 // if(sl!=null){
691 // assert(sl.cigar==null);
692 // int start=sl.pos-1;
693 // int stop=start+sl.seq.length;
694 // sl.seq=r.bases;
695 // sl.qual=r.quality;
696 // if(trimmed>0 && sl.optional!=null && sl.optional.size()>0){
697 // ArrayList<String> list=new ArrayList<String>(2);
698 // for(int i=0; i<sl.optional.size(); i++){
699 // String s=sl.optional.get(i);
700 // if(s.startsWith("PG:") || s.startsWith("RG:") || s.startsWith("X") || s.startsWith("Y") || s.startsWith("Z")){list.add(s);} //Only keep safe flags.
701 // }
702 // sl.optional.clear();
703 // sl.optional.addAll(list);
704 // }
705 // }
706 // }
707
708 if(trimClip){
709 r.toLongMatchString(false);
710 byte[] match=r.match;
711 int leftClip=0, rightClip=0;
712 for(int i=0; i<match.length; i++){
713 if(match[i]=='C'){leftClip++;}
714 else{break;}
715 }
716 for(int i=match.length-1; i>=0; i--){
717 if(match[i]=='C'){rightClip++;}
718 else{break;}
719 }
720 leftTrimAmount=Tools.max(leftTrimAmount, leftClip);
721 rightTrimAmount=Tools.max(rightTrimAmount, rightClip);
722 }
723
724 if(leftTrimAmount<1 && rightTrimAmount<1){return 0;}
725 if(leftTrimAmount+rightTrimAmount>=r.length()){return -leftTrimAmount-rightTrimAmount;}
726
727 final int oldPos=(sl==null ? 1 : sl.pos);
728
729 r.toLongMatchString(false);
730 byte[] match=r.match;
731
732 assert(!r.shortmatch());
733
734 // System.err.println("Q: "+sl);
735 // System.err.println(new String(match));
736
737 ByteBuilder bb=new ByteBuilder(match.length);
738 // System.err.println("leftTrimAmount="+leftTrimAmount+", rightTrimAmount="+rightTrimAmount);
739
740 int leftTrim=leftTrimAmount>0 ? r.length() : 0, rightTrim=rightTrimAmount>0 ? r.length() : 0;
741 // System.err.println("leftTrim="+leftTrim+", rightTrim="+rightTrim);
742 {
743 final int mlen=match.length;
744 boolean keep=leftTrimAmount<1;
745 int rpos=(sl==null ? 1 : sl.pos);
746 for(int mpos=0, cpos=0; mpos<mlen; mpos++){
747 final byte m=match[mpos];
748 if(m=='m' || m=='S' || m=='V' || m=='N'){
749 cpos++;
750 rpos++;
751 }else if(m=='D'){
752 rpos++;
753 }else if(m=='I' || m=='C'){
754 cpos++;
755 }else if(m=='d'){
756 rpos++;
757 }else if(m=='i'){
758 cpos++;
759 }else{
760 assert(false) : "Unknown symbol "+(char)m;
761 }
762
763 if(keep){
764 bb.append(m);
765 }else if(cpos>=leftTrimAmount){
766 byte next=(mpos<mlen-1 ? match[mpos+1] : (byte)'m');
767 if(m=='m' || m=='S' || m=='V' || m=='N' || next=='m' || next=='S' || next=='V' || next=='N'){
768 keep=true;
769 if(sl!=null){sl.pos=rpos;}
770 leftTrim=cpos;
771 }
772 }
773 }
774 }
775 // System.err.println("R: "+sl);
776 match=bb.toBytes();
777 Tools.reverseInPlace(match);
778 bb.clear();
779 {
780 final int mlen=match.length;
781 boolean keep=rightTrimAmount<1;
782 for(int mpos=0, cpos=0; mpos<mlen; mpos++){
783 final byte m=match[mpos];
784 if(m=='m' || m=='S' || m=='V' || m=='N' || m=='I' || m=='C'){
785 cpos++;
786 }else if(m=='D'){
787 }else if(m=='d'){
788 }else if(m=='i'){
789 cpos++;
790 }else{
791 assert(false) : "Unknown symbol "+(char)m;
792 }
793
794 if(keep){
795 bb.append(m);
796 }else if(cpos>=rightTrimAmount){
797 byte next=(mpos<mlen-1 ? match[mpos+1] : (byte)'m');
798 // if(m=='m' || m=='S' || m=='V' || m=='N' || next=='m' || next=='S' || next=='V' || next=='N'){
799 if(next!='I' && next!='D' && next!='i' && next!='d'){
800 keep=true;
801 rightTrim=cpos;
802 }
803 // }
804 }
805 }
806 }
807 // System.err.println("S: "+sl);
808 match=bb.toBytes();
809 Tools.reverseInPlace(match);
810
811 // System.err.println("leftTrim="+leftTrim+", rightTrim="+rightTrim);
812
813 if(leftTrim+rightTrim>=r.length()){
814 if(sl!=null){sl.pos=oldPos;}
815 return -leftTrim-rightTrim;
816 }
817
818 // System.err.println("T: "+sl);
819 r.match=null;
820 if(sl!=null && sl.strand()==Shared.MINUS){
821 int temp=leftTrim;
822 leftTrim=rightTrim;
823 rightTrim=temp;
824 }
825 final int trimmed=trimByAmount(r, leftTrim, rightTrim, minFinalLength, false);
826 // System.err.println("tba: "+leftTrim+", "+rightTrim+", "+minFinalLength);
827 r.match=match;
828
829 if(sl!=null){
830 int start=sl.pos-1;
831 int stop=start+Read.calcMatchLength(match)-1;
832 if(SamLine.VERSION>1.3){
833 sl.cigar=SamLine.toCigar14(match, start, stop, scafLen, r.bases);
834 }else{
835 sl.cigar=SamLine.toCigar13(match, start, stop, scafLen, r.bases);
836 }
837 sl.seq=r.bases;
838 sl.qual=r.quality;
839 if(trimmed>0 && sl.optional!=null && sl.optional.size()>0){
840 ArrayList<String> list=new ArrayList<String>(2);
841 for(int i=0; i<sl.optional.size(); i++){
842 String s=sl.optional.get(i);
843 if(s.startsWith("PG:") || s.startsWith("RG:") || s.startsWith("X") || s.startsWith("Y") || s.startsWith("Z")){list.add(s);} //Only keep safe flags.
844 }
845 sl.optional.clear();
846 sl.optional.addAll(list);
847 }
848 }
849 return trimmed;
850 }
851
852 public int trimmed() {
853 return leftTrimmed+rightTrimmed;
854 }
855
856 public final Read r;
857
858 /** untrimmed bases */
859 public final byte[] bases1;
860 /** untrimmed qualities */
861 public final byte[] qual1;
862 /** trimmed bases */
863 public byte[] bases2;
864 /** trimmed qualities */
865 public byte[] qual2;
866
867
868 public final float trimq;
869 public int leftTrimmed;
870 public int rightTrimmed;
871
872 /** Require this many consecutive good bases to stop trimming. Minimum is 1.
873 * This is for the old trimming mode and not really used anymore */
874 public static int minGoodInterval=2;
875
876 public static boolean verbose=false;
877 public static boolean optimalMode=true;
878 public static boolean windowMode=false;
879 public static float optimalBias=-1f;
880
881 public static int windowLength=4;
882
883 private static final float NPROB=0.75f;
884 // public static float PROB1=QualityTools.PROB_ERROR[1];
885
886
887 }