comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/Clump.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package clump;
2
3 import java.util.ArrayList;
4
5 import dna.AminoAcid;
6 import shared.KillSwitch;
7 import shared.Parse;
8 import shared.Tools;
9 import stream.Read;
10 import structures.ByteBuilder;
11
12 /**
13 * A list of reads sharing a kmer.
14 * @author Brian Bushnell
15 * @date Nov 7, 2015
16 *
17 */
18 public class Clump extends ArrayList<Read> implements Comparable<Clump> {
19
20 public static Clump makeClump(long kmer){
21 try {
22 return new Clump(kmer);
23 } catch (OutOfMemoryError e) {
24 KillSwitch.memKill(e);
25 throw new RuntimeException();
26 }
27 }
28
29 private Clump(long kmer_){
30 this(kmer_, 4);
31 }
32
33 private Clump(long kmer_, int size){
34 super(size);
35 kmer=kmer_;
36 }
37
38 @Override
39 public boolean add(Read r){
40 ReadKey key=(ReadKey) r.obj;
41 assert(key.kmer==kmer);
42 key.clump=this;
43 return super.add(r);
44 }
45
46 private void setMaxima(){
47 maxLeft=-1;
48 maxRight=-1;
49 width=-1;
50 for(Read r : this){
51 ReadKey key=(ReadKey) r.obj;
52 final int pos=key.position;
53 maxLeft=Tools.max(maxLeft, pos);
54 maxRight=Tools.max(maxRight, r.length()-pos);
55 }
56 width=maxLeft+maxRight;
57 }
58
59 /** This will create counts of bases, or sums of qualities, at each position in the cluster. */
60 private int[][] count(final boolean qualityScores){
61 if(width<0){setMaxima();}
62
63 // System.err.println("\n\n");
64 final int[][] counts=new int[4][width];
65 for(Read r : this){
66 ReadKey key=(ReadKey) r.obj;
67 final int pos=key.position;
68 byte[] bases=r.bases, quals=r.quality;
69 if(quals==null){useQuality=false;}
70
71 // System.err.println("pos="+pos+", maxLeft="+maxLeft);
72 for(int cloc=0, rloc=maxLeft-pos; cloc<bases.length; cloc++, rloc++){
73 // System.err.println("cloc="+cloc+"/"+bases.length+", rloc="+rloc+"/"+width);
74 int x=AminoAcid.baseToNumber[bases[cloc]];
75 if(x>-1){
76 final int q;
77 if(qualityScores){q=(quals==null ? 20 : quals[cloc]);}
78 else{q=1;}
79 counts[x][rloc]+=q;
80 }
81 }
82 }
83 return counts;
84 }
85
86 /*--------------------------------------------------------------*/
87
88 public Read makeSimpleConsensus(){
89 // assert(Splitter.findBestPivot(this)<0) : Splitter.findBestPivot(this); //TODO: Slow
90 if(size()==1){
91 Read r=get(0);
92 if(renameConsensus){r.id=r.numericID+" size=1";}
93 return r;
94 }
95 final int[][] bcounts=baseCounts();
96 final int[][] qcounts=qualityCounts();
97
98 final byte[] bases=new byte[width], quals=new byte[width];
99 for(int i=0; i<width; i++){
100 int x=getConsensusAtPosition(qcounts, i);
101 int y=getSecondHighest(qcounts, i);
102 if(x>=0 && qcounts[x][i]==qcounts[y][i]){//Tie-breaker
103 x=getConsensusAtPosition(bcounts, i);
104 y=getSecondHighest(bcounts, i);
105 }
106
107
108 if(x<0){
109 // System.err.println("q="+0+", x="+x+"; A="+counts[0][i]+", C="+counts[1][i]+", G="+counts[2][i]+", T="+counts[3][i]);
110 bases[i]='N';
111 quals[i]=0;
112 assert(getSumAtPosition(qcounts, i)==0) : "\n"+bcounts[0][i]+", "+bcounts[1][i]+", "+bcounts[2][i]+", "+bcounts[3][i]+
113 "\n"+qcounts[0][i]+", "+qcounts[1][i]+", "+qcounts[2][i]+", "+qcounts[3][i]+
114 "\nwidth="+width+", i="+i+", size="+size()+"\n"+new String(bases, 0, i)+"\n"+get(0)+"\n"+get(1)+"\n";
115 // assert(getSumAtPosition(bcounts, i)==0) : "\n"+bcounts[0][i]+", "+bcounts[1][i]+", "+bcounts[2][i]+", "+bcounts[3][i]+
116 // "\n"+qcounts[0][i]+", "+qcounts[1][i]+", "+qcounts[2][i]+", "+qcounts[3][i]+
117 // "\nwidth="+width+", i="+i+", size="+size()+"\n"+new String(bases, 0, i)+"\n"+get(0)+"\n"+get(1)+"\n";
118 }else{
119 final long bsum=getSumAtPosition(bcounts, i);
120 final long bmajor=bcounts[x][i];
121 final long bminor=bsum-bcounts[x][i];
122 final long bsecond=bcounts[y][i];
123
124 final long qsum=getSumAtPosition(qcounts, i);
125 final long qmajor=qcounts[x][i];
126 final long qminor=qsum-qcounts[x][i];
127 final long qsecond=qcounts[y][i];
128
129 float bratio=bminor/(float)(bmajor+bminor);
130 double phred=(bminor==0 ? Read.MAX_CALLED_QUALITY() : -10*Math.log10(bratio));
131 phred=Tools.min(phred, qmajor-qminor);
132 // if(phred<0 || phred>127){
133 // assert(false) : i+","+x+","+bsum+","+qsum+","+bmajor+","+bminor+","+bratio+","+phred+"\n"+
134 // bcounts[0][i]+","+bcounts[1][i]+","+bcounts[2][i]+","+bcounts[3][i]+"\n"+
135 // qcounts[0][i]+","+qcounts[1][i]+","+qcounts[2][i]+","+qcounts[3][i]+"\n"+
136 // this.toStringStaggered()+"\n";
137 // }
138 // assert(phred>=0 && phred<=127) : phred+","+x+","+i+","+bratio+","+bcounts[x][i]+","+bcounts[0][i]+
139 // ","+bcounts[1][i]+","+bcounts[2][i]+","+bcounts[3][i];
140 // assert(phred>=0 && phred<=127) : bmajor+", "+bminor+", "+phred+", "+Read.MAX_CALLED_QUALITY;
141 byte q=Read.capQuality((long)Math.round(phred));
142 bases[i]=AminoAcid.numberToBase[x];
143 quals[i]=q;
144 assert(q>0);
145 assert(x>-1);
146 assert(bases[i]!='N');
147 }
148 }
149 Read leftmost=this.get(0);//TODO: Actually rightmost!
150 Read r=new Read(bases, quals, leftmost.numericID+" size="+size(), 0);
151 //TODO: Attach the long pair, and make sure the kmer location is correct.
152 // assert(false) : "\n"+r.toFastq()+"\nCheck kmer location.";
153 // assert(size()==1) : "\n"+r.toFastq()+"\n"+get(0).toFastq()+"\n"+get(size()-1).toFastq()+"\n";
154 return r;
155 }
156
157 /*--------------------------------------------------------------*/
158
159 public int removeDuplicates(){
160 assert(KmerComparator.compareSequence);
161 if(size()<2){return 0;}
162
163 int removedTotal=0, removed=0;
164
165 final boolean sortXY=(forceSortXY || sortYEarly() || (opticalOnly && (sortX || sortY) && size()>=sortXYSize));
166
167 final int maxDiscarded;
168 final int scan;
169
170 if(opticalOnly && sortXY){
171 scan=Tools.max(scanLimit, 200);
172 maxDiscarded=scan+10;
173 }else if(!sortXY && ((maxSubstitutions<1 && dupeSubRate<=0) || scanLimit<0)){
174 scan=0;
175 maxDiscarded=0;
176 }else{
177 scan=scanLimit;
178 maxDiscarded=scan+10;
179 }
180
181 if(sortXY){
182 assert(sortX || sortY);
183
184 if(sortY){
185 if(!sortYEarly()){
186 this.sort(KmerComparatorY.comparator);
187 }else{
188 // for(int i=1; i<this.size(); i++){
189 // Read a=get(i-1);
190 // Read b=get(i);
191 // assert(KmerComparatorY.comparator.compare(a, b)<=0) : a.obj+" vs "+b.obj;
192 // }
193 // //Otherwise, it was already Y-sorted.
194 }
195 // assert(false) : sortY();
196 removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance);
197 removedTotal+=removed;
198 // System.err.println("RemovedY: "+removed);
199 while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){
200 removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance);
201 removedTotal+=removed;
202 // System.err.println("RemovedY: "+removed);
203 }
204 }
205 if(sortX && (ReadKey.spanTilesX || !sortY)){
206 this.sort(KmerComparatorX.comparator);
207 removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance);
208 removedTotal+=removed;
209 // System.err.println("RemovedX: "+removed);
210 while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){
211 removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance);
212 removedTotal+=removed;
213 // System.err.println("RemovedX: "+removed);
214 }
215 }
216 }else{
217 removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, false, markOnly, markAll, renameByCount, maxOpticalDistance);
218 removedTotal+=removed;
219 while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){
220 removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, false, markOnly, markAll, renameByCount, maxOpticalDistance);
221 removedTotal+=removed;
222 }
223 }
224
225 return removedTotal;
226 }
227
228 private int removeDuplicates_inner(final int maxSubs, final float subRate, final int scanLimit, final int maxDiscarded,
229 final boolean optical, final boolean xySorted, final boolean mark, final boolean markAll, final boolean rename, final float dist){
230 final int size=size();
231 if(size<2){return 0;}
232 int dupePairs=0;
233 int dupeReads=0;
234
235 // final boolean breakOnTile=(optical && !FlowcellCoordinate.spanTiles);
236
237 // final long start=System.nanoTime();
238
239 for(int i=0, lim=size-1; i<lim; i++){
240 final Read a=get(i);
241 if(!a.discarded()){
242 final ReadKey keyA=(ReadKey) a.obj;
243 final int aLen=a.length();
244 int unequals=0;
245 int discarded=0;
246 for(int j=i+1; j<size && unequals<=scanLimit && discarded<=maxDiscarded && (!a.discarded() || markAll); j++){
247 final Read b=get(j);
248 if(b.discarded()){
249 discarded++;
250 }else{
251 final int bLen=b.length();
252 final ReadKey keyB=(ReadKey) b.obj;
253 if(!containment && !keyA.equals(keyB)){break;}
254 // if(containment && affix && !keyA.physicalAffix(keyB, aLen, bLen)){break;}
255 // if(optical && keyA.lane!=keyB.lane){break;} //Already in equals method
256 // if(breakOnTile && keyA.tile!=keyB.tile){break;} //Already in equals method
257 if(optical && xySorted && !keyA.nearXY(keyB, dist)){break;}
258 // if(System.nanoTime()-start>200000000000L){
259 // TextStreamWriter tsw=new TextStreamWriter("foo.fq", true, false, false);
260 // tsw.start();
261 // for(Read x : this){
262 // tsw.println(x.toFastq());
263 // }
264 // tsw.poisonAndWait();
265 // assert(false) : "kmer="+kmer+", size="+size();
266 // }
267 if(equals(a, b, maxSubs, subRate)){
268 if(!optical || keyA.near(keyB, dist)){
269 if(printDuplicates){
270 System.out.println(a.toFasta());
271 System.out.println(b.toFasta());
272 }
273 float errA=a.expectedErrorsIncludingMate(true);
274 float errB=b.expectedErrorsIncludingMate(true);
275 if(markAll){
276 b.setDiscarded(true);
277 assert(!a.discarded() || markAll);
278 dupePairs++;
279 dupeReads+=1+b.mateCount();
280 unequals=0;
281 if(!a.discarded()){
282 a.setDiscarded(true);
283 dupePairs++;
284 dupeReads+=1+a.mateCount();
285 }
286 }else if(containment || errB>=errA){
287 b.setDiscarded(true);
288 assert(!a.discarded() || markAll);
289 a.copies+=b.copies+parseExtraCopies(b);
290 dupePairs++;
291 dupeReads+=1+b.mateCount();
292 unequals=0;
293 }else{
294 a.setDiscarded(true);
295 assert(!b.discarded() || markAll);
296 b.copies+=a.copies+parseExtraCopies(a);
297 dupePairs++;
298 dupeReads+=1+a.mateCount();
299 }
300 }
301 }else{
302 unequals++;
303 }
304 }
305 }
306 }
307 }
308 if(dupeReads>0){
309 for(int i=0; i<size; i++){
310 Read a=get(i);
311 if(a.discarded()){
312 if(mark){
313 if(!a.id.endsWith(" duplicate")){
314 a.id+=" duplicate";
315 if(a.mate!=null){a.mate.id+=" duplicate";}
316 }
317 }else{
318 set(i, null);
319 }
320 }else if(rename && a.copies>1){
321 renameFromCount(a);
322 }
323 a.copies=1;
324 }
325 if(!mark){
326 int x=Tools.condenseStrict(this);
327 assert(x==dupePairs) : size()+", "+size+", "+dupePairs+", "+x;
328 assert((size()>0 || markAll) && size()==size-dupePairs) : size()+", "+size+", "+dupePairs;
329 }
330 }
331
332 if(containment){
333 dupeReads+=removeDuplicates_backwards(maxSubs, subRate, scanLimit, maxDiscarded, optical, xySorted, mark, markAll, rename, dist);
334 }
335
336 return dupeReads;
337 }
338
339 /** Only for containments */
340 private int removeDuplicates_backwards(final int maxSubs, final float subRate, final int scanLimit, final int maxDiscarded,
341 final boolean optical, final boolean xySorted, final boolean mark, final boolean markAll, final boolean rename, final float dist){
342 final int size=size();
343 if(size<2){return 0;}
344 int dupePairs=0;
345 int dupeReads=0;
346
347 // final boolean breakOnTile=(optical && !FlowcellCoordinate.spanTiles);
348
349 // final long start=System.nanoTime();
350
351 for(int i=size-1; i>0; i--){
352 final Read a=get(i);
353 if(!a.discarded()){
354 final ReadKey keyA=(ReadKey) a.obj;
355 final int aLen=a.length();
356 int unequals=0;
357 int discarded=0;
358 for(int j=i-1; j>=0 && unequals<=scanLimit && discarded<=maxDiscarded && (!a.discarded() || markAll); j--){
359 final Read b=get(j);
360 if(b.discarded()){
361 discarded++;
362 }else{
363 final int bLen=b.length();
364 final ReadKey keyB=(ReadKey) b.obj;
365 if(!containment && !keyA.equals(keyB)){break;}
366 // if(containment && affix && !keyA.physicalAffix(keyB, aLen, bLen)){break;}
367 // if(optical && keyA.lane!=keyB.lane){break;} //Already in equals method
368 // if(breakOnTile && keyA.tile!=keyB.tile){break;} //Already in equals method
369 if(optical && xySorted && !keyA.nearXY(keyB, dist)){break;}
370 // if(System.nanoTime()-start>200000000000L){
371 // TextStreamWriter tsw=new TextStreamWriter("foo.fq", true, false, false);
372 // tsw.start();
373 // for(Read x : this){
374 // tsw.println(x.toFastq());
375 // }
376 // tsw.poisonAndWait();
377 // assert(false) : "kmer="+kmer+", size="+size();
378 // }
379 if(equals(a, b, maxSubs, subRate)){
380 if(!optical || keyA.near(keyB, dist)){
381 if(printDuplicates){
382 System.out.println(a.toFasta());
383 System.out.println(b.toFasta());
384 }
385 float errA=a.expectedErrorsIncludingMate(true);
386 float errB=b.expectedErrorsIncludingMate(true);
387 if(markAll){
388 b.setDiscarded(true);
389 assert(!a.discarded() || markAll);
390 dupePairs++;
391 dupeReads+=1+b.mateCount();
392 unequals=0;
393 if(!a.discarded()){
394 a.setDiscarded(true);
395 dupePairs++;
396 dupeReads+=1+a.mateCount();
397 }
398 }else if(containment || errB>=errA){
399 b.setDiscarded(true);
400 assert(!a.discarded() || markAll);
401 a.copies+=b.copies+parseExtraCopies(b);
402 dupePairs++;
403 dupeReads+=1+b.mateCount();
404 unequals=0;
405 }else{
406 a.setDiscarded(true);
407 assert(!b.discarded() || markAll);
408 b.copies+=a.copies+parseExtraCopies(a);
409 dupePairs++;
410 dupeReads+=1+a.mateCount();
411 }
412 }
413 }else{
414 unequals++;
415 }
416 }
417 }
418 }
419 }
420 if(dupeReads>0){
421 for(int i=0; i<size; i++){
422 Read a=get(i);
423 if(a.discarded()){
424 if(mark){
425 if(!a.id.endsWith(" duplicate")){
426 a.id+=" duplicate";
427 if(a.mate!=null){a.mate.id+=" duplicate";}
428 }
429 }else{
430 set(i, null);
431 }
432 }else if(rename && a.copies>1){
433 renameFromCount(a);
434 }
435 a.copies=1;
436 }
437 if(!mark){
438 int x=Tools.condenseStrict(this);
439 assert(x==dupePairs) : size()+", "+size+", "+dupePairs+", "+x;
440 assert((size()>0 || markAll) && size()==size-dupePairs) : size()+", "+size+", "+dupePairs;
441 }
442 }
443 return dupeReads;
444 }
445
446 public int parseExtraCopies(final Read a){
447 final String id=a.id;
448 final int space=id.lastIndexOf(' ');
449 int extraCopies=0;
450 if(space<0){return 0;}
451 if(space>=0 && Tools.contains(id, "copies=", space+1)){
452 extraCopies=Integer.parseInt(id.substring(space+8))-1;
453 }
454 return extraCopies;
455 }
456
457 private void renameFromCount(final Read a){
458 final int newExtraCopies=a.copies-1;
459 assert(newExtraCopies>0) : newExtraCopies;
460 final int oldExtraCopies=parseExtraCopies(a);
461 final int total=1+newExtraCopies+oldExtraCopies;
462 renameToTotal(a, total);
463 if(a.pairnum()==0 && a.mate!=null){
464 assert(a.mate.pairnum()==1);
465 renameToTotal(a.mate, total);
466 }
467 }
468
469 private static void renameToTotal(final Read a, final int total){
470 final String id=a.id;
471 final int space=id.lastIndexOf(' ');
472 if(space>=0 && Tools.contains(id, "copies=", space+1)){
473 a.id=a.id.substring(0, space);
474 }
475 a.id=a.id+" copies="+total;
476 }
477
478 // public boolean nearby(Read a, Read b, FlowcellCoordinate fca, FlowcellCoordinate fcb, float maxDist){
479 //// fca.setFrom(a.id);
480 // fcb.setFrom(b.id);
481 // float dist=fca.distance(fcb);
482 // return dist<=maxDist;
483 // }
484
485 // public boolean nearby(ReadKey ka, ReadKey kb, float maxDist){
486 // float dist=ka.distance(kb);
487 // return dist<=maxDist;
488 // }
489
490 public static boolean equals(Read a, Read b, int maxSubs, float dupeSubRate){
491 if(a.numericID==b.numericID){return false;}
492 if(dupeSubRate>0){maxSubs=Tools.max(maxSubs, (int)(dupeSubRate*Tools.min(a.length(), b.length())));}
493 if(containment){
494 return contains(a, b, maxSubs);
495 }
496 if(!equals(a.bases, b.bases, maxSubs)){return false;}
497 if(a.mate!=null && !equals(a.mate.bases, b.mate.bases, maxSubs)){return false;}
498 return true;
499 }
500
501 public static boolean equals(byte[] a, byte[] b, int maxSubs){
502 if(a==b){return false;}//Nothing should subsume itself
503 if(a==null || b==null){return false;}
504 if(a.length!=b.length){return false;}
505 int subs=0;
506 if(allowNs){
507 for(int i=0; i<a.length; i++){
508 if(a[i]!=b[i] && (a[i]!='N' && b[i]!='N')){
509 subs++;
510 if(subs>maxSubs){return false;}
511 }
512 }
513 }else{
514 for(int i=0; i<a.length; i++){
515 if(a[i]!=b[i]){
516 subs++;
517 if(subs>maxSubs){return false;}
518 }
519 }
520 }
521 return true;
522 }
523
524 public static boolean contains(Read a, Read b, int maxSubs){
525 if(a.numericID==b.numericID){return false;}
526 boolean ok=contains_inner(a, b, maxSubs);
527 if(!ok || a.mate==null){return ok;}
528 ok=contains_inner(a.mate, b.mate, maxSubs);
529 if(!ok){return false;}
530 ReadKey rka1=(ReadKey)a.obj;
531 ReadKey rkb1=(ReadKey)b.obj;
532 ReadKey rka2=(ReadKey)a.mate.obj; //TODO: In containment mode, mates need to always get keys.
533 ReadKey rkb2=(ReadKey)b.mate.obj;
534 return ((rka1.kmerMinusStrand==rkb1.kmerMinusStrand) && (rka2.kmerMinusStrand==rkb2.kmerMinusStrand)); //Ensures that both reads have the same directionality.
535 }
536
537 public static boolean contains_inner(Read a, Read b, int maxSubs){
538 // if(a.length()==b.length()){return equals(a.bases, b.bases, maxSubs);}
539 ReadKey rka=(ReadKey)a.obj;
540 ReadKey rkb=(ReadKey)b.obj;
541 if(affix ? !rka.physicalAffix(rkb, a.length(), b.length()) : !rka.physicallyContains(rkb, a.length(), b.length())){return false;}
542 boolean flipped=false;
543 // if(a.mate!=null){//In paired mode, need synchronization if the reads are in difference clumps. But... just don't do that.
544 // Read max, min;
545 // if(a.numericID>b.numericID){max=a; min=b;}//Protects against deadlocks.
546 // else{max=b; min=a;}
547 // synchronized(min){
548 // synchronized(max){
549 // if(rka.kmerMinusStrand!=rkb.kmerMinusStrand){
550 // rkb.flip(b, k);
551 // flipped=true;
552 // }
553 // boolean ok=contains(a.bases, b.bases, rka.position, rkb.position, maxSubs);
554 // if(flipped){rkb.flip(b, k);}
555 // return ok;
556 // }
557 // }
558 // }
559 if(rka.kmerMinusStrand!=rkb.kmerMinusStrand){
560 rkb.flip(b, k);
561 flipped=true;
562 }
563 boolean ok=contains(a.bases, b.bases, rka.position, rkb.position, maxSubs);
564 if(flipped){rkb.flip(b, k);}
565 return ok;
566 }
567
568 public static boolean contains(byte[] a, byte[] b, int posA, int posB, int maxSubs){
569 if(a==null || b==null){return false;}
570 assert(a.length>=b.length);
571 if(a==b){return false;} //Nothing should contain itself
572 int subs=0;
573
574 int ai, bi;
575 final int dif=posA-posB;
576 if(dif>0){
577 ai=0;
578 bi=-dif;
579 }else{
580 ai=dif;
581 bi=0;
582 }
583
584 if(allowNs){
585 for(; ai<a.length && bi<b.length; ai++, bi++){
586 if(ai>=0 && bi>=0){
587 final byte na=a[ai], nb=b[bi];
588 if(na!=nb && na!='N' && nb!='N'){
589 subs++;
590 if(subs>maxSubs){return false;}
591 }
592 }
593 }
594 }else{
595 for(; ai<a.length && bi<b.length; ai++, bi++){
596 if(ai>=0 && bi>=0 && a[ai]!=b[bi]){
597 subs++;
598 if(subs>maxSubs){return false;}
599 }
600 }
601 }
602 return true;
603 }
604
605 /*--------------------------------------------------------------*/
606
607 public long splitAndErrorCorrect(){
608 if(size()<Splitter.minSizeSplit){
609 return errorCorrect();
610 }
611 long sum=0;
612 ArrayList<Clump> list=Splitter.splitOnPivot(this);
613 for(Clump c : list){
614 sum+=c.errorCorrect();
615 }
616 return sum;
617 }
618
619 public long errorCorrect(){
620 if(size()<=minCountCorrect){return 0;}
621 // assert(Splitter.findBestPivot(this)<0); //TODO: Slow
622 Read consen=makeSimpleConsensus();
623 long sum=0;
624 final int[] rvector=KillSwitch.allocInt1D(2);
625 for(Read r : this){
626 sum+=errorCorrect(r, consen, rvector);
627 }
628 return sum;
629 }
630
631 private int errorCorrect(Read call, Read ref, int[] rvector){
632
633 // assert(call.validated());
634 // assert(call.checkQuality());
635 // assert(ref.checkQuality());
636
637 final float identity=identity(call, ref.bases, rvector);
638 if((identity<minIdentity && (rvector[1]>0 || rvector[0]<50)) || (identity==1 && !call.containsNocalls()/* && !ref.containsNocalls()*/)){
639 //TODO: Strange, the identity==1 clause causes different behavior, though it does give a speedup.
640 return 0;
641 }
642 final byte[] cbases=call.bases, cquals=call.quality;
643 final byte[] rbases=ref.bases, rquals=ref.quality;
644
645 ReadKey key=(ReadKey) call.obj;
646 final int pos=key.position;
647
648 final int[][] bcounts=baseCounts();
649 final int[][] qcounts=qualityCounts();
650 final float[][] qAvgs=qualityAverages();
651
652 final int minDepth=(int)Tools.max(minCountCorrect, minSizeFractionCorrect*size());
653
654 int corrections=0;
655
656 final int cStart=0, rStart=maxLeft-pos, max=cbases.length;
657 for(int cloc=cStart, rloc=rStart; cloc<max; cloc++, rloc++){
658 //Called base, ref base
659 final byte cb=cbases[cloc], rb=rbases[rloc];
660 //Called quality, ref quality
661 final byte cq=(cquals==null ? 20 : cquals[cloc]), rq=rquals[rloc];
662 //Called number
663 final byte cx=AminoAcid.baseToNumber[cb];
664 //Ref number
665 final byte rx=AminoAcid.baseToNumber[rb];
666
667 // assert((cb=='N') == (cquals[cloc]==0));
668
669 final byte b, q;
670 if(cx<0){
671 b=rb;
672 q=(byte)Tools.min(20, rq);
673 }else if(cb==rb){
674 b=cb;
675 q=(byte)Tools.mid(cq, cq+1, rq);//Adjust upward
676 assert(q>=cq && (q<=rq || q<=cq));
677 }else{
678 final int cCount=bcounts[cx][rloc];
679 final int rCount=bcounts[rx][rloc];
680 final int altQSum=qcounts[cx][rloc];
681 final int rQSum=qcounts[rx][rloc];
682 final float rQAvg=qAvgs[rx][rloc];
683 if(cCount<=maxCIncorrect && cq<=maxQIncorrect && cq*minQRatio<rQSum && cq*minAQRatio<8+rQAvg){
684 final byte pminor=getSecondHighest(bcounts, rloc);
685
686 assert(rx>=0 && rx<bcounts.length) : rx+", "+rloc+", "+bcounts.length+"\n"+call.toFastq()+"\n"+ref.toFastq();
687 assert(rloc>=0 && rloc<bcounts[rx].length) : rx+", "+rloc+", "+bcounts[rloc].length+"\n"+call.toFastq()+"\n"+ref.toFastq();
688 final int minorCount=bcounts[pminor][rloc];
689
690 final long sum=getSumAtPosition(bcounts, rloc);
691 // final long altCount=sum-rCount;
692
693 boolean change=false;
694 if(rCount>=minCountCorrect && sum>=minDepth){
695 final float ratioNeeded=Tools.min(minRatio, minRatioMult*minorCount+minRatioOffset+minRatioQMult*cq);
696 // final float qratioNeeded=Tools.min(minRatio, minRatioMult*altQSum+minRatioOffset+minRatioQMult*cq); //altQSum is slightly different than minorQCount
697 if(minorCount*ratioNeeded<=rCount && altQSum*ratioNeeded<=rQSum){
698 change=true;
699 }
700
701 // else if(minorCount*ratioNeeded<=rCount){
702 // assert(false) : "\n"+altQSum+", "+rQSum+", "+qratioNeeded+"\n"+cCount+", "+rCount+", "+sum+", "+ratioNeeded+"\n"+(altQSum*qratioNeeded);
703 // }
704 }
705 if(change){
706 corrections++;
707 b=rb;
708 q=(byte)Tools.min(cq+1, 25, rq);
709 // assert(q==25 || (q<=rq && q>=cq)) : q+", "+cq+", "+rq;
710 }else{
711 b=cb;
712 q=(byte)Tools.mid(cq, cq-1, 6);//Adjust downward
713 assert(q<=cq || q>=6) : q+","+cq;
714 }
715 }else{
716 b=cb;
717 q=cq;
718 }
719 }
720 cbases[cloc]=b;
721 if(cquals!=null){
722 byte q2=(byte)Tools.mid(q, cq+maxQAdjust, cq-maxQAdjust);
723 if(q2==0 && AminoAcid.isFullyDefined(b)){
724 assert(!AminoAcid.isFullyDefined(cb));
725 q2=(byte)Tools.mid(2, 25, (rq+25)/2);
726 }else if(!AminoAcid.isFullyDefined(b)){
727 q2=0;
728 }
729 cquals[cloc]=q2;
730 assert((b=='N') == (cquals[cloc]==0)) : "b="+(char)b+", cb="+(char)cb+", rb="+(char)rb+", cx="+cx+", "
731 + "new="+cquals[cloc]+", q="+q+", old="+cq+", rq="+rq+", loc="+rloc+"\n"+call.toFastq()+"\n"+ref.toFastq();
732 }
733 }
734 return corrections;
735 }
736
737 /*--------------------------------------------------------------*/
738
739 //Only used by condense mode.
740 public ArrayList<Read> makeConsensus(){
741 if(size()==1){
742 Read r=get(0);
743 r.id=r.numericID+" size=1";
744 return this;
745 }
746
747 ArrayList<Clump> clumps=Splitter.splitOnPivot(this);//TODO: Really, this should return null if there is no pivot
748
749 ArrayList<Read> list=new ArrayList<Read>(clumps.size());
750 for(Clump c : clumps){
751 Read temp=c.makeSimpleConsensus();
752 list.add(temp);
753 }
754 return list;
755 }
756
757 /*--------------------------------------------------------------*/
758
759 private float identity(Read call, byte[] rbases, int[] rvector){
760 ReadKey key=(ReadKey) call.obj;
761 final int pos=key.position;
762 byte[] cbases=call.bases, quals=call.quality;
763 int good=0, bad=0;
764
765 final int cStart=0, rStart=maxLeft-pos, max=cbases.length;
766 for(int cloc=cStart, rloc=rStart; cloc<max; cloc++, rloc++){
767 final byte cb=cbases[cloc], rb=rbases[rloc];
768 if(AminoAcid.isFullyDefined(cb) && AminoAcid.isFullyDefined(rb)){
769 if(cb==rb){good++;}
770 else{bad++;}
771 }
772 }
773 rvector[0]=good;
774 rvector[1]=bad;
775 return good==0 ? 0 : good/(float)(good+bad);
776 }
777
778 /*--------------------------------------------------------------*/
779
780 long getSumAtPosition(int[][] counts, int pos){
781 long sum=0;
782 for(int x=0; x<4; x++){
783 sum+=counts[x][pos];
784 }
785 return sum;
786 }
787
788 byte getConsensusAtPosition(int[][] counts, int pos){
789 byte xMax=0;
790 for(byte x=1; x<4; x++){
791 // System.err.println("x="+x+", max="+max+", Checking "+counts[x][pos]+" vs "+counts[x][max]);
792 if(counts[x][pos]>counts[xMax][pos]){xMax=x;}
793 }
794 // assert(counts[max][pos]>=counts[0][pos]);
795 // assert(counts[max][pos]>=counts[1][pos]);
796 // assert(counts[max][pos]>=counts[2][pos]) : max+", "+counts[max][pos]+", ["+counts[0][pos]+", "+counts[1][pos]+", "+counts[2][pos]+", "+counts[3][pos]+"]";
797 // assert(counts[max][pos]>=counts[3][pos]);
798 return (counts[xMax][pos]>0 ? xMax : -1);
799 }
800
801 byte getSecondHighest(int[][] counts, int pos){
802 byte first=0;
803 byte second=1;
804 if(counts[first][pos]<counts[second][pos]){
805 first=1; second=0;
806 }
807 for(byte x=2; x<4; x++){
808 // System.err.println("x="+x+", max="+max+", Checking "+counts[x][pos]+" vs "+counts[x][max]);
809 if(counts[x][pos]>counts[first][pos]){
810 second=first;
811 first=x;
812 }else if(counts[x][pos]>counts[second][pos]){
813 second=x;
814 }
815 }
816 // assert(counts[max][pos]>=counts[0][pos]);
817 // assert(counts[max][pos]>=counts[1][pos]);
818 // assert(counts[max][pos]>=counts[2][pos]) : max+", "+counts[max][pos]+", ["+counts[0][pos]+", "+counts[1][pos]+", "+counts[2][pos]+", "+counts[3][pos]+"]";
819 // assert(counts[max][pos]>=counts[3][pos]);
820
821 return second; //may be actually 0.
822 //return (counts[second][pos]>0 ? second : -1);
823 }
824
825 /*--------------------------------------------------------------*/
826
827 public String toStringStaggered(){
828 ByteBuilder sb=new ByteBuilder();
829 for(Read r : this){
830 ReadKey key=(ReadKey) r.obj;
831 final int pos=key.position;
832 byte[] bases=r.bases, quals=r.quality;
833 int rloc=0, cloc=rloc+pos-maxLeft;
834 while(cloc<0){sb.append(' '); cloc++;}
835 sb.append(bases);
836 sb.append('\n');
837 }
838 return sb.toString();
839 }
840
841 public Read consensusRead(){
842 if(consensusRead==null){
843 consensusRead=makeSimpleConsensus();
844 }
845 return consensusRead;
846 }
847
848 public int width(){
849 assert(width>=0) : width;
850 return width;
851 }
852
853 public int maxLeft(){
854 assert(maxLeft>=0);
855 return maxLeft;
856 }
857
858 public int maxRight(){
859 assert(maxRight>=0);
860 return maxRight;
861 }
862
863 /*--------------------------------------------------------------*/
864
865 int[][] baseCounts(){
866 if(baseCounts==null){
867 baseCounts=count(false);
868 int len=baseCounts[0].length;
869 assert(width==-1 || width==len);
870 }
871 return baseCounts;
872 }
873
874 int[][] qualityCounts(){
875 if(qualityCounts==null){
876 qualityCounts=count(true);
877 int len=qualityCounts[0].length;
878 assert(width==-1 || width==len);
879 }
880 return qualityCounts;
881 }
882
883 float[][] qualityAverages(){
884 if(qualityAverages==null){
885 qualityAverages=new float[4][width];
886 for(int i=0; i<4; i++){
887 for(int j=0; j<width; j++){
888 int b=baseCounts[i][j];
889 int q=qualityCounts[i][j];
890 qualityAverages[i][j]=(b==0 ? 0 : q/(float)b);
891 }
892 }
893 }
894 return qualityAverages;
895 }
896
897 void clearCounts(){
898 baseCounts=qualityCounts=null;
899 qualityAverages=null;
900 }
901
902 private void clearConsensus(){
903 consensusRead=null;
904 }
905
906 /*--------------------------------------------------------------*/
907
908 @Override
909 public boolean equals(Object b){
910 return this==b;
911 }
912
913 @Override
914 public int hashCode(){
915 return Long.hashCode(kmer);
916 }
917
918 @Override
919 public int compareTo(Clump o) {
920 int x=Long.compare(kmer, o.kmer);
921 return x!=0 ? x : o.size()-size();
922 }
923
924 /*--------------------------------------------------------------*/
925
926 public final long kmer;
927
928 private int width=-1;
929 private int maxLeft=-1;
930 private int maxRight=-1;
931
932 private int[][] baseCounts;
933 private int[][] qualityCounts;
934 private float[][] qualityAverages;
935
936 private Read consensusRead;
937
938 boolean useQuality(){return useQuality;}
939 private boolean useQuality=true;
940
941 boolean added=false;
942
943 public static final int overhead=overhead();
944 private static int overhead(){
945 return 16+ //self
946 16+ //Backing array
947 4+ //Backing array size
948 4*(8)+ //Backing array initial capacity
949 1*(8)+ //kmer
950 3*(4)+ //int fields
951 4*(8)+ //pointers
952 2*(4); //booleans
953 }
954
955 /*--------------------------------------------------------------*/
956
957 public static boolean parseStatic(String arg, String a, String b){
958 if(a.equals("mincountcorrect") || a.equals("mincc")){
959 minCountCorrect=Integer.parseInt(b);
960 }else if(a.equals("minsizesplit") || a.equals("minss")){
961 Splitter.minSizeSplit=Integer.parseInt(b);
962 }else if(a.equals("minsizefractionsplit") || a.equals("minsfs")){
963 Splitter.minSizeFractionSplit=Float.parseFloat(b);
964 }else if(a.equals("minsizefractioncorrect") || a.equals("minsfc")){
965 minSizeFractionCorrect=Float.parseFloat(b);
966 }else if(a.equals("minratio") || a.equals("minr")){
967 minRatio=Float.parseFloat(b);
968 }else if(a.equals("minqratio") || a.equals("minqr")){
969 minQRatio=Float.parseFloat(b);
970 }else if(a.equals("minaqratio") || a.equals("minaqr")){
971 minAQRatio=Float.parseFloat(b);
972 }else if(a.equals("minratiooffset") || a.equals("minro")){
973 minRatioOffset=Float.parseFloat(b);
974 }else if(a.equals("minratiomult") || a.equals("minrm")){
975 minRatioMult=Float.parseFloat(b);
976 }else if(a.equals("minratioqmult") || a.equals("minrqm")){
977 minRatioQMult=Float.parseFloat(b);
978 }else if(a.equals("minidentity") || a.equals("minid")){
979 minIdentity=Float.parseFloat(b);
980 }else if(a.equals("maxqadjust")){
981 maxQAdjust=(byte)Integer.parseInt(b);
982 }else if(a.equals("maxqi") || a.equals("maxqincorrect") || a.equals("maxqualityincorrect")){
983 maxQIncorrect=Integer.parseInt(b);
984 if(maxCIncorrect<0){maxQIncorrect=Integer.MAX_VALUE;}
985 }else if(a.equals("maxci") || a.equals("maxcincorrect") || a.equals("maxcountincorrect")){
986 maxCIncorrect=Integer.parseInt(b);
987 if(maxCIncorrect<0){maxCIncorrect=Integer.MAX_VALUE;}
988 }else if(a.equals("border")){
989 KmerComparator.defaultBorder=Integer.parseInt(b);
990 }else if(a.equals("conservative")){
991 conservativeFlag=Parse.parseBoolean(b);
992 }else if(a.equals("aggressive")){
993 aggressiveFlag=Parse.parseBoolean(b);
994 }else if(a.equals("forceprocess")){
995 forceProcess=Parse.parseBoolean(b);
996 }else if(a.equals("mergefirst")){
997 KmerComparator.mergeFirst=Parse.parseBoolean(b);
998 }else if(a.equals("findcorrelations")){
999 Splitter.FIND_CORRELATIONS=Parse.parseBoolean(b);
1000 }else if(a.equals("maxcorrelations")){
1001 Splitter.MAX_CORRELATIONS=Integer.parseInt(b);
1002 }
1003
1004 else if(a.equals("sortx")){
1005 sortX=Parse.parseBoolean(b);
1006 }else if(a.equals("sorty")){
1007 sortY=Parse.parseBoolean(b);
1008 }else if(a.equals("sortxy")){
1009 sortX=sortY=Parse.parseBoolean(b);
1010 }else if(a.equals("forcesortxy") || a.equals("forcexy")){
1011 forceSortXY=Parse.parseBoolean(b);
1012 }else if(a.equals("sortxysize") || a.equals("xysize")){
1013 sortXYSize=Integer.parseInt(b);
1014 }
1015
1016 else if(a.equals("removeallduplicates") || a.equals("allduplicates")){
1017 markAll=Parse.parseBoolean(b);
1018 }else if(a.equals("addcount") || a.equals("renamebycount")){
1019 renameByCount=Parse.parseBoolean(b);
1020 }else if(a.equals("optical") || a.equals("opticalonly")){
1021 opticalOnly=Parse.parseBoolean(b);
1022 }else if(a.equals("dupesubs") || a.equals("duplicatesubs") || a.equals("dsubs") || a.equals("subs") || a.equals("s")){
1023 maxSubstitutions=Integer.parseInt(b);
1024 }else if(a.equals("dupedist") || a.equals("duplicatedistance") || a.equals("ddist") || a.equals("dist") || a.equals("opticaldist") || a.equals("distance")){
1025 maxOpticalDistance=Float.parseFloat(b);
1026 opticalOnly=maxOpticalDistance>=0;
1027 }else if(a.equals("scanlimit") || a.equals("scan")){
1028 scanLimit=Integer.parseInt(b);
1029 }else if(a.equals("allowns")){
1030 allowNs=Parse.parseBoolean(b);
1031 }else if(a.equals("containment") || a.equals("absorbcontainment") || a.equals("ac") || a.equals("contains")){
1032 containment=Parse.parseBoolean(b);
1033 }else if(a.equalsIgnoreCase("prefixOrSuffix") || a.equalsIgnoreCase("suffixOrPrefix") || a.equals("affix") || a.equals("pos")){
1034 affix=Parse.parseBoolean(b);
1035 }else if(a.equals("printduplicates")){
1036 printDuplicates=Parse.parseBoolean(b);
1037 }else if(a.equals("dupeidentity")){
1038 float dupeIdentity=Float.parseFloat(b);
1039 if(dupeIdentity>1){dupeIdentity=dupeIdentity/100;}
1040 assert(dupeIdentity<=1);
1041 dupeSubRate=1-dupeIdentity;
1042 }else if(a.equals("dupesubrate") || a.equals("dsr") || a.equals("subrate")){
1043 dupeSubRate=Float.parseFloat(b);
1044 if(dupeSubRate>1){dupeSubRate=dupeSubRate/100;}
1045 assert(dupeSubRate<=1);
1046 }
1047
1048 else if(a.equals("minsizesplit")){
1049 Splitter.minSizeSplit=Integer.parseInt(b);
1050 }else if(a.equals("minsizefractionsplit")){
1051 Splitter.minSizeFractionSplit=Float.parseFloat(b);
1052 }else{
1053 return false;
1054 }
1055
1056 return true;
1057 }
1058
1059 static void setConservative(boolean newState){
1060
1061 if(aggressiveFlag){return;}
1062 if(newState==conservativeMode){return;}
1063 conservativeMode=newState;
1064
1065 Splitter.conservative=conservativeMode;
1066
1067 if(conservativeMode){
1068 minCountCorrect++;
1069 minSizeFractionCorrect*=1.5f;
1070 minRatio*=1.25f;
1071 minQRatio*=1.5f;
1072 minAQRatio*=1.4f;
1073 minRatioOffset*=1.25f;
1074 minRatioQMult*=1.50f;
1075 minRatioMult*=1.4f;
1076 minIdentity=1-((1-minIdentity)*0.7f);
1077 if(maxQIncorrect==Integer.MAX_VALUE){maxQIncorrect=35;}
1078 }else{
1079 minCountCorrect--;
1080 minSizeFractionCorrect/=1.5f;
1081 minRatio/=1.25f;
1082 minQRatio/=1.5f;
1083 minAQRatio/=1.4f;
1084 minRatioOffset/=1.25f;
1085 minRatioQMult/=1.50f;
1086 minRatioMult/=1.4f;
1087 minIdentity=1-((1-minIdentity)/0.7f);
1088 if(maxQIncorrect==35){maxQIncorrect=Integer.MAX_VALUE;}
1089 }
1090 }
1091
1092 /*--------------------------------------------------------------*/
1093
1094 public static void setXY() {
1095 if(ReadKey.spanTilesX){sortX=true;}
1096 if(ReadKey.spanTilesY){sortY=true;}
1097 }
1098
1099 static boolean allowNs=true;
1100 static boolean markAll=false;
1101 static boolean markOnly=false;
1102 static boolean opticalOnly=false;
1103 static boolean containment=false;
1104 static boolean affix=false;
1105 static boolean printDuplicates=false; //For debugging
1106
1107 private static boolean renameByCount=false;
1108 private static int scanLimit=5;
1109 private static int maxSubstitutions=2;
1110 private static float dupeSubRate=0;
1111 private static float maxOpticalDistance=40;
1112
1113 static boolean forceProcess=false;
1114 static boolean conservativeFlag=false;
1115 static boolean aggressiveFlag=false;
1116 static boolean conservativeMode=false;
1117 static boolean renameConsensus=false;
1118 static int minCountCorrect=4; //mcc=4 was slightly better than 3
1119 static float minSizeFractionCorrect=0.20f; //0.11 is very slightly better than 0.14
1120 static float minRatio=30.0f;
1121 static float minQRatio=2.8f; //Does nothing?
1122 static float minAQRatio=0.7f;
1123 static float minRatioOffset=1.9f; //3 is far worse than 2; 1 is better
1124 static float minRatioQMult=0.08f;
1125 static float minRatioMult=1.80f; //2.5 is far worse than 2; 1.5 is better
1126 static float minIdentity=0.97f; //0.98 is slightly better than 0.96; 0.94 is substantially worse
1127 static byte maxQAdjust=0;
1128 static int maxQIncorrect=Integer.MAX_VALUE;
1129 static int maxCIncorrect=Integer.MAX_VALUE;
1130
1131 static boolean sortX=false; //Not needed for NextSeq
1132 static boolean sortY=true;
1133 static boolean forceSortXY=false; //Mainly for testing
1134 static int sortXYSize=6;
1135
1136 /** May slightly increase speed for optical dedupe. Can be safely disabled. */
1137 static boolean sortYEarly(){return sortY && (forceSortXY || opticalOnly);}
1138
1139 // private static final boolean countQuality=false;
1140 public static int k=31;
1141 private static final long serialVersionUID = 1L;
1142
1143 }