annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/Clump.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package clump;
jpayne@68 2
jpayne@68 3 import java.util.ArrayList;
jpayne@68 4
jpayne@68 5 import dna.AminoAcid;
jpayne@68 6 import shared.KillSwitch;
jpayne@68 7 import shared.Parse;
jpayne@68 8 import shared.Tools;
jpayne@68 9 import stream.Read;
jpayne@68 10 import structures.ByteBuilder;
jpayne@68 11
jpayne@68 12 /**
jpayne@68 13 * A list of reads sharing a kmer.
jpayne@68 14 * @author Brian Bushnell
jpayne@68 15 * @date Nov 7, 2015
jpayne@68 16 *
jpayne@68 17 */
jpayne@68 18 public class Clump extends ArrayList<Read> implements Comparable<Clump> {
jpayne@68 19
jpayne@68 20 public static Clump makeClump(long kmer){
jpayne@68 21 try {
jpayne@68 22 return new Clump(kmer);
jpayne@68 23 } catch (OutOfMemoryError e) {
jpayne@68 24 KillSwitch.memKill(e);
jpayne@68 25 throw new RuntimeException();
jpayne@68 26 }
jpayne@68 27 }
jpayne@68 28
jpayne@68 29 private Clump(long kmer_){
jpayne@68 30 this(kmer_, 4);
jpayne@68 31 }
jpayne@68 32
jpayne@68 33 private Clump(long kmer_, int size){
jpayne@68 34 super(size);
jpayne@68 35 kmer=kmer_;
jpayne@68 36 }
jpayne@68 37
jpayne@68 38 @Override
jpayne@68 39 public boolean add(Read r){
jpayne@68 40 ReadKey key=(ReadKey) r.obj;
jpayne@68 41 assert(key.kmer==kmer);
jpayne@68 42 key.clump=this;
jpayne@68 43 return super.add(r);
jpayne@68 44 }
jpayne@68 45
jpayne@68 46 private void setMaxima(){
jpayne@68 47 maxLeft=-1;
jpayne@68 48 maxRight=-1;
jpayne@68 49 width=-1;
jpayne@68 50 for(Read r : this){
jpayne@68 51 ReadKey key=(ReadKey) r.obj;
jpayne@68 52 final int pos=key.position;
jpayne@68 53 maxLeft=Tools.max(maxLeft, pos);
jpayne@68 54 maxRight=Tools.max(maxRight, r.length()-pos);
jpayne@68 55 }
jpayne@68 56 width=maxLeft+maxRight;
jpayne@68 57 }
jpayne@68 58
jpayne@68 59 /** This will create counts of bases, or sums of qualities, at each position in the cluster. */
jpayne@68 60 private int[][] count(final boolean qualityScores){
jpayne@68 61 if(width<0){setMaxima();}
jpayne@68 62
jpayne@68 63 // System.err.println("\n\n");
jpayne@68 64 final int[][] counts=new int[4][width];
jpayne@68 65 for(Read r : this){
jpayne@68 66 ReadKey key=(ReadKey) r.obj;
jpayne@68 67 final int pos=key.position;
jpayne@68 68 byte[] bases=r.bases, quals=r.quality;
jpayne@68 69 if(quals==null){useQuality=false;}
jpayne@68 70
jpayne@68 71 // System.err.println("pos="+pos+", maxLeft="+maxLeft);
jpayne@68 72 for(int cloc=0, rloc=maxLeft-pos; cloc<bases.length; cloc++, rloc++){
jpayne@68 73 // System.err.println("cloc="+cloc+"/"+bases.length+", rloc="+rloc+"/"+width);
jpayne@68 74 int x=AminoAcid.baseToNumber[bases[cloc]];
jpayne@68 75 if(x>-1){
jpayne@68 76 final int q;
jpayne@68 77 if(qualityScores){q=(quals==null ? 20 : quals[cloc]);}
jpayne@68 78 else{q=1;}
jpayne@68 79 counts[x][rloc]+=q;
jpayne@68 80 }
jpayne@68 81 }
jpayne@68 82 }
jpayne@68 83 return counts;
jpayne@68 84 }
jpayne@68 85
jpayne@68 86 /*--------------------------------------------------------------*/
jpayne@68 87
jpayne@68 88 public Read makeSimpleConsensus(){
jpayne@68 89 // assert(Splitter.findBestPivot(this)<0) : Splitter.findBestPivot(this); //TODO: Slow
jpayne@68 90 if(size()==1){
jpayne@68 91 Read r=get(0);
jpayne@68 92 if(renameConsensus){r.id=r.numericID+" size=1";}
jpayne@68 93 return r;
jpayne@68 94 }
jpayne@68 95 final int[][] bcounts=baseCounts();
jpayne@68 96 final int[][] qcounts=qualityCounts();
jpayne@68 97
jpayne@68 98 final byte[] bases=new byte[width], quals=new byte[width];
jpayne@68 99 for(int i=0; i<width; i++){
jpayne@68 100 int x=getConsensusAtPosition(qcounts, i);
jpayne@68 101 int y=getSecondHighest(qcounts, i);
jpayne@68 102 if(x>=0 && qcounts[x][i]==qcounts[y][i]){//Tie-breaker
jpayne@68 103 x=getConsensusAtPosition(bcounts, i);
jpayne@68 104 y=getSecondHighest(bcounts, i);
jpayne@68 105 }
jpayne@68 106
jpayne@68 107
jpayne@68 108 if(x<0){
jpayne@68 109 // System.err.println("q="+0+", x="+x+"; A="+counts[0][i]+", C="+counts[1][i]+", G="+counts[2][i]+", T="+counts[3][i]);
jpayne@68 110 bases[i]='N';
jpayne@68 111 quals[i]=0;
jpayne@68 112 assert(getSumAtPosition(qcounts, i)==0) : "\n"+bcounts[0][i]+", "+bcounts[1][i]+", "+bcounts[2][i]+", "+bcounts[3][i]+
jpayne@68 113 "\n"+qcounts[0][i]+", "+qcounts[1][i]+", "+qcounts[2][i]+", "+qcounts[3][i]+
jpayne@68 114 "\nwidth="+width+", i="+i+", size="+size()+"\n"+new String(bases, 0, i)+"\n"+get(0)+"\n"+get(1)+"\n";
jpayne@68 115 // assert(getSumAtPosition(bcounts, i)==0) : "\n"+bcounts[0][i]+", "+bcounts[1][i]+", "+bcounts[2][i]+", "+bcounts[3][i]+
jpayne@68 116 // "\n"+qcounts[0][i]+", "+qcounts[1][i]+", "+qcounts[2][i]+", "+qcounts[3][i]+
jpayne@68 117 // "\nwidth="+width+", i="+i+", size="+size()+"\n"+new String(bases, 0, i)+"\n"+get(0)+"\n"+get(1)+"\n";
jpayne@68 118 }else{
jpayne@68 119 final long bsum=getSumAtPosition(bcounts, i);
jpayne@68 120 final long bmajor=bcounts[x][i];
jpayne@68 121 final long bminor=bsum-bcounts[x][i];
jpayne@68 122 final long bsecond=bcounts[y][i];
jpayne@68 123
jpayne@68 124 final long qsum=getSumAtPosition(qcounts, i);
jpayne@68 125 final long qmajor=qcounts[x][i];
jpayne@68 126 final long qminor=qsum-qcounts[x][i];
jpayne@68 127 final long qsecond=qcounts[y][i];
jpayne@68 128
jpayne@68 129 float bratio=bminor/(float)(bmajor+bminor);
jpayne@68 130 double phred=(bminor==0 ? Read.MAX_CALLED_QUALITY() : -10*Math.log10(bratio));
jpayne@68 131 phred=Tools.min(phred, qmajor-qminor);
jpayne@68 132 // if(phred<0 || phred>127){
jpayne@68 133 // assert(false) : i+","+x+","+bsum+","+qsum+","+bmajor+","+bminor+","+bratio+","+phred+"\n"+
jpayne@68 134 // bcounts[0][i]+","+bcounts[1][i]+","+bcounts[2][i]+","+bcounts[3][i]+"\n"+
jpayne@68 135 // qcounts[0][i]+","+qcounts[1][i]+","+qcounts[2][i]+","+qcounts[3][i]+"\n"+
jpayne@68 136 // this.toStringStaggered()+"\n";
jpayne@68 137 // }
jpayne@68 138 // assert(phred>=0 && phred<=127) : phred+","+x+","+i+","+bratio+","+bcounts[x][i]+","+bcounts[0][i]+
jpayne@68 139 // ","+bcounts[1][i]+","+bcounts[2][i]+","+bcounts[3][i];
jpayne@68 140 // assert(phred>=0 && phred<=127) : bmajor+", "+bminor+", "+phred+", "+Read.MAX_CALLED_QUALITY;
jpayne@68 141 byte q=Read.capQuality((long)Math.round(phred));
jpayne@68 142 bases[i]=AminoAcid.numberToBase[x];
jpayne@68 143 quals[i]=q;
jpayne@68 144 assert(q>0);
jpayne@68 145 assert(x>-1);
jpayne@68 146 assert(bases[i]!='N');
jpayne@68 147 }
jpayne@68 148 }
jpayne@68 149 Read leftmost=this.get(0);//TODO: Actually rightmost!
jpayne@68 150 Read r=new Read(bases, quals, leftmost.numericID+" size="+size(), 0);
jpayne@68 151 //TODO: Attach the long pair, and make sure the kmer location is correct.
jpayne@68 152 // assert(false) : "\n"+r.toFastq()+"\nCheck kmer location.";
jpayne@68 153 // assert(size()==1) : "\n"+r.toFastq()+"\n"+get(0).toFastq()+"\n"+get(size()-1).toFastq()+"\n";
jpayne@68 154 return r;
jpayne@68 155 }
jpayne@68 156
jpayne@68 157 /*--------------------------------------------------------------*/
jpayne@68 158
jpayne@68 159 public int removeDuplicates(){
jpayne@68 160 assert(KmerComparator.compareSequence);
jpayne@68 161 if(size()<2){return 0;}
jpayne@68 162
jpayne@68 163 int removedTotal=0, removed=0;
jpayne@68 164
jpayne@68 165 final boolean sortXY=(forceSortXY || sortYEarly() || (opticalOnly && (sortX || sortY) && size()>=sortXYSize));
jpayne@68 166
jpayne@68 167 final int maxDiscarded;
jpayne@68 168 final int scan;
jpayne@68 169
jpayne@68 170 if(opticalOnly && sortXY){
jpayne@68 171 scan=Tools.max(scanLimit, 200);
jpayne@68 172 maxDiscarded=scan+10;
jpayne@68 173 }else if(!sortXY && ((maxSubstitutions<1 && dupeSubRate<=0) || scanLimit<0)){
jpayne@68 174 scan=0;
jpayne@68 175 maxDiscarded=0;
jpayne@68 176 }else{
jpayne@68 177 scan=scanLimit;
jpayne@68 178 maxDiscarded=scan+10;
jpayne@68 179 }
jpayne@68 180
jpayne@68 181 if(sortXY){
jpayne@68 182 assert(sortX || sortY);
jpayne@68 183
jpayne@68 184 if(sortY){
jpayne@68 185 if(!sortYEarly()){
jpayne@68 186 this.sort(KmerComparatorY.comparator);
jpayne@68 187 }else{
jpayne@68 188 // for(int i=1; i<this.size(); i++){
jpayne@68 189 // Read a=get(i-1);
jpayne@68 190 // Read b=get(i);
jpayne@68 191 // assert(KmerComparatorY.comparator.compare(a, b)<=0) : a.obj+" vs "+b.obj;
jpayne@68 192 // }
jpayne@68 193 // //Otherwise, it was already Y-sorted.
jpayne@68 194 }
jpayne@68 195 // assert(false) : sortY();
jpayne@68 196 removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance);
jpayne@68 197 removedTotal+=removed;
jpayne@68 198 // System.err.println("RemovedY: "+removed);
jpayne@68 199 while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){
jpayne@68 200 removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance);
jpayne@68 201 removedTotal+=removed;
jpayne@68 202 // System.err.println("RemovedY: "+removed);
jpayne@68 203 }
jpayne@68 204 }
jpayne@68 205 if(sortX && (ReadKey.spanTilesX || !sortY)){
jpayne@68 206 this.sort(KmerComparatorX.comparator);
jpayne@68 207 removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance);
jpayne@68 208 removedTotal+=removed;
jpayne@68 209 // System.err.println("RemovedX: "+removed);
jpayne@68 210 while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){
jpayne@68 211 removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance);
jpayne@68 212 removedTotal+=removed;
jpayne@68 213 // System.err.println("RemovedX: "+removed);
jpayne@68 214 }
jpayne@68 215 }
jpayne@68 216 }else{
jpayne@68 217 removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, false, markOnly, markAll, renameByCount, maxOpticalDistance);
jpayne@68 218 removedTotal+=removed;
jpayne@68 219 while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){
jpayne@68 220 removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, false, markOnly, markAll, renameByCount, maxOpticalDistance);
jpayne@68 221 removedTotal+=removed;
jpayne@68 222 }
jpayne@68 223 }
jpayne@68 224
jpayne@68 225 return removedTotal;
jpayne@68 226 }
jpayne@68 227
jpayne@68 228 private int removeDuplicates_inner(final int maxSubs, final float subRate, final int scanLimit, final int maxDiscarded,
jpayne@68 229 final boolean optical, final boolean xySorted, final boolean mark, final boolean markAll, final boolean rename, final float dist){
jpayne@68 230 final int size=size();
jpayne@68 231 if(size<2){return 0;}
jpayne@68 232 int dupePairs=0;
jpayne@68 233 int dupeReads=0;
jpayne@68 234
jpayne@68 235 // final boolean breakOnTile=(optical && !FlowcellCoordinate.spanTiles);
jpayne@68 236
jpayne@68 237 // final long start=System.nanoTime();
jpayne@68 238
jpayne@68 239 for(int i=0, lim=size-1; i<lim; i++){
jpayne@68 240 final Read a=get(i);
jpayne@68 241 if(!a.discarded()){
jpayne@68 242 final ReadKey keyA=(ReadKey) a.obj;
jpayne@68 243 final int aLen=a.length();
jpayne@68 244 int unequals=0;
jpayne@68 245 int discarded=0;
jpayne@68 246 for(int j=i+1; j<size && unequals<=scanLimit && discarded<=maxDiscarded && (!a.discarded() || markAll); j++){
jpayne@68 247 final Read b=get(j);
jpayne@68 248 if(b.discarded()){
jpayne@68 249 discarded++;
jpayne@68 250 }else{
jpayne@68 251 final int bLen=b.length();
jpayne@68 252 final ReadKey keyB=(ReadKey) b.obj;
jpayne@68 253 if(!containment && !keyA.equals(keyB)){break;}
jpayne@68 254 // if(containment && affix && !keyA.physicalAffix(keyB, aLen, bLen)){break;}
jpayne@68 255 // if(optical && keyA.lane!=keyB.lane){break;} //Already in equals method
jpayne@68 256 // if(breakOnTile && keyA.tile!=keyB.tile){break;} //Already in equals method
jpayne@68 257 if(optical && xySorted && !keyA.nearXY(keyB, dist)){break;}
jpayne@68 258 // if(System.nanoTime()-start>200000000000L){
jpayne@68 259 // TextStreamWriter tsw=new TextStreamWriter("foo.fq", true, false, false);
jpayne@68 260 // tsw.start();
jpayne@68 261 // for(Read x : this){
jpayne@68 262 // tsw.println(x.toFastq());
jpayne@68 263 // }
jpayne@68 264 // tsw.poisonAndWait();
jpayne@68 265 // assert(false) : "kmer="+kmer+", size="+size();
jpayne@68 266 // }
jpayne@68 267 if(equals(a, b, maxSubs, subRate)){
jpayne@68 268 if(!optical || keyA.near(keyB, dist)){
jpayne@68 269 if(printDuplicates){
jpayne@68 270 System.out.println(a.toFasta());
jpayne@68 271 System.out.println(b.toFasta());
jpayne@68 272 }
jpayne@68 273 float errA=a.expectedErrorsIncludingMate(true);
jpayne@68 274 float errB=b.expectedErrorsIncludingMate(true);
jpayne@68 275 if(markAll){
jpayne@68 276 b.setDiscarded(true);
jpayne@68 277 assert(!a.discarded() || markAll);
jpayne@68 278 dupePairs++;
jpayne@68 279 dupeReads+=1+b.mateCount();
jpayne@68 280 unequals=0;
jpayne@68 281 if(!a.discarded()){
jpayne@68 282 a.setDiscarded(true);
jpayne@68 283 dupePairs++;
jpayne@68 284 dupeReads+=1+a.mateCount();
jpayne@68 285 }
jpayne@68 286 }else if(containment || errB>=errA){
jpayne@68 287 b.setDiscarded(true);
jpayne@68 288 assert(!a.discarded() || markAll);
jpayne@68 289 a.copies+=b.copies+parseExtraCopies(b);
jpayne@68 290 dupePairs++;
jpayne@68 291 dupeReads+=1+b.mateCount();
jpayne@68 292 unequals=0;
jpayne@68 293 }else{
jpayne@68 294 a.setDiscarded(true);
jpayne@68 295 assert(!b.discarded() || markAll);
jpayne@68 296 b.copies+=a.copies+parseExtraCopies(a);
jpayne@68 297 dupePairs++;
jpayne@68 298 dupeReads+=1+a.mateCount();
jpayne@68 299 }
jpayne@68 300 }
jpayne@68 301 }else{
jpayne@68 302 unequals++;
jpayne@68 303 }
jpayne@68 304 }
jpayne@68 305 }
jpayne@68 306 }
jpayne@68 307 }
jpayne@68 308 if(dupeReads>0){
jpayne@68 309 for(int i=0; i<size; i++){
jpayne@68 310 Read a=get(i);
jpayne@68 311 if(a.discarded()){
jpayne@68 312 if(mark){
jpayne@68 313 if(!a.id.endsWith(" duplicate")){
jpayne@68 314 a.id+=" duplicate";
jpayne@68 315 if(a.mate!=null){a.mate.id+=" duplicate";}
jpayne@68 316 }
jpayne@68 317 }else{
jpayne@68 318 set(i, null);
jpayne@68 319 }
jpayne@68 320 }else if(rename && a.copies>1){
jpayne@68 321 renameFromCount(a);
jpayne@68 322 }
jpayne@68 323 a.copies=1;
jpayne@68 324 }
jpayne@68 325 if(!mark){
jpayne@68 326 int x=Tools.condenseStrict(this);
jpayne@68 327 assert(x==dupePairs) : size()+", "+size+", "+dupePairs+", "+x;
jpayne@68 328 assert((size()>0 || markAll) && size()==size-dupePairs) : size()+", "+size+", "+dupePairs;
jpayne@68 329 }
jpayne@68 330 }
jpayne@68 331
jpayne@68 332 if(containment){
jpayne@68 333 dupeReads+=removeDuplicates_backwards(maxSubs, subRate, scanLimit, maxDiscarded, optical, xySorted, mark, markAll, rename, dist);
jpayne@68 334 }
jpayne@68 335
jpayne@68 336 return dupeReads;
jpayne@68 337 }
jpayne@68 338
jpayne@68 339 /** Only for containments */
jpayne@68 340 private int removeDuplicates_backwards(final int maxSubs, final float subRate, final int scanLimit, final int maxDiscarded,
jpayne@68 341 final boolean optical, final boolean xySorted, final boolean mark, final boolean markAll, final boolean rename, final float dist){
jpayne@68 342 final int size=size();
jpayne@68 343 if(size<2){return 0;}
jpayne@68 344 int dupePairs=0;
jpayne@68 345 int dupeReads=0;
jpayne@68 346
jpayne@68 347 // final boolean breakOnTile=(optical && !FlowcellCoordinate.spanTiles);
jpayne@68 348
jpayne@68 349 // final long start=System.nanoTime();
jpayne@68 350
jpayne@68 351 for(int i=size-1; i>0; i--){
jpayne@68 352 final Read a=get(i);
jpayne@68 353 if(!a.discarded()){
jpayne@68 354 final ReadKey keyA=(ReadKey) a.obj;
jpayne@68 355 final int aLen=a.length();
jpayne@68 356 int unequals=0;
jpayne@68 357 int discarded=0;
jpayne@68 358 for(int j=i-1; j>=0 && unequals<=scanLimit && discarded<=maxDiscarded && (!a.discarded() || markAll); j--){
jpayne@68 359 final Read b=get(j);
jpayne@68 360 if(b.discarded()){
jpayne@68 361 discarded++;
jpayne@68 362 }else{
jpayne@68 363 final int bLen=b.length();
jpayne@68 364 final ReadKey keyB=(ReadKey) b.obj;
jpayne@68 365 if(!containment && !keyA.equals(keyB)){break;}
jpayne@68 366 // if(containment && affix && !keyA.physicalAffix(keyB, aLen, bLen)){break;}
jpayne@68 367 // if(optical && keyA.lane!=keyB.lane){break;} //Already in equals method
jpayne@68 368 // if(breakOnTile && keyA.tile!=keyB.tile){break;} //Already in equals method
jpayne@68 369 if(optical && xySorted && !keyA.nearXY(keyB, dist)){break;}
jpayne@68 370 // if(System.nanoTime()-start>200000000000L){
jpayne@68 371 // TextStreamWriter tsw=new TextStreamWriter("foo.fq", true, false, false);
jpayne@68 372 // tsw.start();
jpayne@68 373 // for(Read x : this){
jpayne@68 374 // tsw.println(x.toFastq());
jpayne@68 375 // }
jpayne@68 376 // tsw.poisonAndWait();
jpayne@68 377 // assert(false) : "kmer="+kmer+", size="+size();
jpayne@68 378 // }
jpayne@68 379 if(equals(a, b, maxSubs, subRate)){
jpayne@68 380 if(!optical || keyA.near(keyB, dist)){
jpayne@68 381 if(printDuplicates){
jpayne@68 382 System.out.println(a.toFasta());
jpayne@68 383 System.out.println(b.toFasta());
jpayne@68 384 }
jpayne@68 385 float errA=a.expectedErrorsIncludingMate(true);
jpayne@68 386 float errB=b.expectedErrorsIncludingMate(true);
jpayne@68 387 if(markAll){
jpayne@68 388 b.setDiscarded(true);
jpayne@68 389 assert(!a.discarded() || markAll);
jpayne@68 390 dupePairs++;
jpayne@68 391 dupeReads+=1+b.mateCount();
jpayne@68 392 unequals=0;
jpayne@68 393 if(!a.discarded()){
jpayne@68 394 a.setDiscarded(true);
jpayne@68 395 dupePairs++;
jpayne@68 396 dupeReads+=1+a.mateCount();
jpayne@68 397 }
jpayne@68 398 }else if(containment || errB>=errA){
jpayne@68 399 b.setDiscarded(true);
jpayne@68 400 assert(!a.discarded() || markAll);
jpayne@68 401 a.copies+=b.copies+parseExtraCopies(b);
jpayne@68 402 dupePairs++;
jpayne@68 403 dupeReads+=1+b.mateCount();
jpayne@68 404 unequals=0;
jpayne@68 405 }else{
jpayne@68 406 a.setDiscarded(true);
jpayne@68 407 assert(!b.discarded() || markAll);
jpayne@68 408 b.copies+=a.copies+parseExtraCopies(a);
jpayne@68 409 dupePairs++;
jpayne@68 410 dupeReads+=1+a.mateCount();
jpayne@68 411 }
jpayne@68 412 }
jpayne@68 413 }else{
jpayne@68 414 unequals++;
jpayne@68 415 }
jpayne@68 416 }
jpayne@68 417 }
jpayne@68 418 }
jpayne@68 419 }
jpayne@68 420 if(dupeReads>0){
jpayne@68 421 for(int i=0; i<size; i++){
jpayne@68 422 Read a=get(i);
jpayne@68 423 if(a.discarded()){
jpayne@68 424 if(mark){
jpayne@68 425 if(!a.id.endsWith(" duplicate")){
jpayne@68 426 a.id+=" duplicate";
jpayne@68 427 if(a.mate!=null){a.mate.id+=" duplicate";}
jpayne@68 428 }
jpayne@68 429 }else{
jpayne@68 430 set(i, null);
jpayne@68 431 }
jpayne@68 432 }else if(rename && a.copies>1){
jpayne@68 433 renameFromCount(a);
jpayne@68 434 }
jpayne@68 435 a.copies=1;
jpayne@68 436 }
jpayne@68 437 if(!mark){
jpayne@68 438 int x=Tools.condenseStrict(this);
jpayne@68 439 assert(x==dupePairs) : size()+", "+size+", "+dupePairs+", "+x;
jpayne@68 440 assert((size()>0 || markAll) && size()==size-dupePairs) : size()+", "+size+", "+dupePairs;
jpayne@68 441 }
jpayne@68 442 }
jpayne@68 443 return dupeReads;
jpayne@68 444 }
jpayne@68 445
jpayne@68 446 public int parseExtraCopies(final Read a){
jpayne@68 447 final String id=a.id;
jpayne@68 448 final int space=id.lastIndexOf(' ');
jpayne@68 449 int extraCopies=0;
jpayne@68 450 if(space<0){return 0;}
jpayne@68 451 if(space>=0 && Tools.contains(id, "copies=", space+1)){
jpayne@68 452 extraCopies=Integer.parseInt(id.substring(space+8))-1;
jpayne@68 453 }
jpayne@68 454 return extraCopies;
jpayne@68 455 }
jpayne@68 456
jpayne@68 457 private void renameFromCount(final Read a){
jpayne@68 458 final int newExtraCopies=a.copies-1;
jpayne@68 459 assert(newExtraCopies>0) : newExtraCopies;
jpayne@68 460 final int oldExtraCopies=parseExtraCopies(a);
jpayne@68 461 final int total=1+newExtraCopies+oldExtraCopies;
jpayne@68 462 renameToTotal(a, total);
jpayne@68 463 if(a.pairnum()==0 && a.mate!=null){
jpayne@68 464 assert(a.mate.pairnum()==1);
jpayne@68 465 renameToTotal(a.mate, total);
jpayne@68 466 }
jpayne@68 467 }
jpayne@68 468
jpayne@68 469 private static void renameToTotal(final Read a, final int total){
jpayne@68 470 final String id=a.id;
jpayne@68 471 final int space=id.lastIndexOf(' ');
jpayne@68 472 if(space>=0 && Tools.contains(id, "copies=", space+1)){
jpayne@68 473 a.id=a.id.substring(0, space);
jpayne@68 474 }
jpayne@68 475 a.id=a.id+" copies="+total;
jpayne@68 476 }
jpayne@68 477
jpayne@68 478 // public boolean nearby(Read a, Read b, FlowcellCoordinate fca, FlowcellCoordinate fcb, float maxDist){
jpayne@68 479 //// fca.setFrom(a.id);
jpayne@68 480 // fcb.setFrom(b.id);
jpayne@68 481 // float dist=fca.distance(fcb);
jpayne@68 482 // return dist<=maxDist;
jpayne@68 483 // }
jpayne@68 484
jpayne@68 485 // public boolean nearby(ReadKey ka, ReadKey kb, float maxDist){
jpayne@68 486 // float dist=ka.distance(kb);
jpayne@68 487 // return dist<=maxDist;
jpayne@68 488 // }
jpayne@68 489
jpayne@68 490 public static boolean equals(Read a, Read b, int maxSubs, float dupeSubRate){
jpayne@68 491 if(a.numericID==b.numericID){return false;}
jpayne@68 492 if(dupeSubRate>0){maxSubs=Tools.max(maxSubs, (int)(dupeSubRate*Tools.min(a.length(), b.length())));}
jpayne@68 493 if(containment){
jpayne@68 494 return contains(a, b, maxSubs);
jpayne@68 495 }
jpayne@68 496 if(!equals(a.bases, b.bases, maxSubs)){return false;}
jpayne@68 497 if(a.mate!=null && !equals(a.mate.bases, b.mate.bases, maxSubs)){return false;}
jpayne@68 498 return true;
jpayne@68 499 }
jpayne@68 500
jpayne@68 501 public static boolean equals(byte[] a, byte[] b, int maxSubs){
jpayne@68 502 if(a==b){return false;}//Nothing should subsume itself
jpayne@68 503 if(a==null || b==null){return false;}
jpayne@68 504 if(a.length!=b.length){return false;}
jpayne@68 505 int subs=0;
jpayne@68 506 if(allowNs){
jpayne@68 507 for(int i=0; i<a.length; i++){
jpayne@68 508 if(a[i]!=b[i] && (a[i]!='N' && b[i]!='N')){
jpayne@68 509 subs++;
jpayne@68 510 if(subs>maxSubs){return false;}
jpayne@68 511 }
jpayne@68 512 }
jpayne@68 513 }else{
jpayne@68 514 for(int i=0; i<a.length; i++){
jpayne@68 515 if(a[i]!=b[i]){
jpayne@68 516 subs++;
jpayne@68 517 if(subs>maxSubs){return false;}
jpayne@68 518 }
jpayne@68 519 }
jpayne@68 520 }
jpayne@68 521 return true;
jpayne@68 522 }
jpayne@68 523
jpayne@68 524 public static boolean contains(Read a, Read b, int maxSubs){
jpayne@68 525 if(a.numericID==b.numericID){return false;}
jpayne@68 526 boolean ok=contains_inner(a, b, maxSubs);
jpayne@68 527 if(!ok || a.mate==null){return ok;}
jpayne@68 528 ok=contains_inner(a.mate, b.mate, maxSubs);
jpayne@68 529 if(!ok){return false;}
jpayne@68 530 ReadKey rka1=(ReadKey)a.obj;
jpayne@68 531 ReadKey rkb1=(ReadKey)b.obj;
jpayne@68 532 ReadKey rka2=(ReadKey)a.mate.obj; //TODO: In containment mode, mates need to always get keys.
jpayne@68 533 ReadKey rkb2=(ReadKey)b.mate.obj;
jpayne@68 534 return ((rka1.kmerMinusStrand==rkb1.kmerMinusStrand) && (rka2.kmerMinusStrand==rkb2.kmerMinusStrand)); //Ensures that both reads have the same directionality.
jpayne@68 535 }
jpayne@68 536
jpayne@68 537 public static boolean contains_inner(Read a, Read b, int maxSubs){
jpayne@68 538 // if(a.length()==b.length()){return equals(a.bases, b.bases, maxSubs);}
jpayne@68 539 ReadKey rka=(ReadKey)a.obj;
jpayne@68 540 ReadKey rkb=(ReadKey)b.obj;
jpayne@68 541 if(affix ? !rka.physicalAffix(rkb, a.length(), b.length()) : !rka.physicallyContains(rkb, a.length(), b.length())){return false;}
jpayne@68 542 boolean flipped=false;
jpayne@68 543 // if(a.mate!=null){//In paired mode, need synchronization if the reads are in difference clumps. But... just don't do that.
jpayne@68 544 // Read max, min;
jpayne@68 545 // if(a.numericID>b.numericID){max=a; min=b;}//Protects against deadlocks.
jpayne@68 546 // else{max=b; min=a;}
jpayne@68 547 // synchronized(min){
jpayne@68 548 // synchronized(max){
jpayne@68 549 // if(rka.kmerMinusStrand!=rkb.kmerMinusStrand){
jpayne@68 550 // rkb.flip(b, k);
jpayne@68 551 // flipped=true;
jpayne@68 552 // }
jpayne@68 553 // boolean ok=contains(a.bases, b.bases, rka.position, rkb.position, maxSubs);
jpayne@68 554 // if(flipped){rkb.flip(b, k);}
jpayne@68 555 // return ok;
jpayne@68 556 // }
jpayne@68 557 // }
jpayne@68 558 // }
jpayne@68 559 if(rka.kmerMinusStrand!=rkb.kmerMinusStrand){
jpayne@68 560 rkb.flip(b, k);
jpayne@68 561 flipped=true;
jpayne@68 562 }
jpayne@68 563 boolean ok=contains(a.bases, b.bases, rka.position, rkb.position, maxSubs);
jpayne@68 564 if(flipped){rkb.flip(b, k);}
jpayne@68 565 return ok;
jpayne@68 566 }
jpayne@68 567
jpayne@68 568 public static boolean contains(byte[] a, byte[] b, int posA, int posB, int maxSubs){
jpayne@68 569 if(a==null || b==null){return false;}
jpayne@68 570 assert(a.length>=b.length);
jpayne@68 571 if(a==b){return false;} //Nothing should contain itself
jpayne@68 572 int subs=0;
jpayne@68 573
jpayne@68 574 int ai, bi;
jpayne@68 575 final int dif=posA-posB;
jpayne@68 576 if(dif>0){
jpayne@68 577 ai=0;
jpayne@68 578 bi=-dif;
jpayne@68 579 }else{
jpayne@68 580 ai=dif;
jpayne@68 581 bi=0;
jpayne@68 582 }
jpayne@68 583
jpayne@68 584 if(allowNs){
jpayne@68 585 for(; ai<a.length && bi<b.length; ai++, bi++){
jpayne@68 586 if(ai>=0 && bi>=0){
jpayne@68 587 final byte na=a[ai], nb=b[bi];
jpayne@68 588 if(na!=nb && na!='N' && nb!='N'){
jpayne@68 589 subs++;
jpayne@68 590 if(subs>maxSubs){return false;}
jpayne@68 591 }
jpayne@68 592 }
jpayne@68 593 }
jpayne@68 594 }else{
jpayne@68 595 for(; ai<a.length && bi<b.length; ai++, bi++){
jpayne@68 596 if(ai>=0 && bi>=0 && a[ai]!=b[bi]){
jpayne@68 597 subs++;
jpayne@68 598 if(subs>maxSubs){return false;}
jpayne@68 599 }
jpayne@68 600 }
jpayne@68 601 }
jpayne@68 602 return true;
jpayne@68 603 }
jpayne@68 604
jpayne@68 605 /*--------------------------------------------------------------*/
jpayne@68 606
jpayne@68 607 public long splitAndErrorCorrect(){
jpayne@68 608 if(size()<Splitter.minSizeSplit){
jpayne@68 609 return errorCorrect();
jpayne@68 610 }
jpayne@68 611 long sum=0;
jpayne@68 612 ArrayList<Clump> list=Splitter.splitOnPivot(this);
jpayne@68 613 for(Clump c : list){
jpayne@68 614 sum+=c.errorCorrect();
jpayne@68 615 }
jpayne@68 616 return sum;
jpayne@68 617 }
jpayne@68 618
jpayne@68 619 public long errorCorrect(){
jpayne@68 620 if(size()<=minCountCorrect){return 0;}
jpayne@68 621 // assert(Splitter.findBestPivot(this)<0); //TODO: Slow
jpayne@68 622 Read consen=makeSimpleConsensus();
jpayne@68 623 long sum=0;
jpayne@68 624 final int[] rvector=KillSwitch.allocInt1D(2);
jpayne@68 625 for(Read r : this){
jpayne@68 626 sum+=errorCorrect(r, consen, rvector);
jpayne@68 627 }
jpayne@68 628 return sum;
jpayne@68 629 }
jpayne@68 630
jpayne@68 631 private int errorCorrect(Read call, Read ref, int[] rvector){
jpayne@68 632
jpayne@68 633 // assert(call.validated());
jpayne@68 634 // assert(call.checkQuality());
jpayne@68 635 // assert(ref.checkQuality());
jpayne@68 636
jpayne@68 637 final float identity=identity(call, ref.bases, rvector);
jpayne@68 638 if((identity<minIdentity && (rvector[1]>0 || rvector[0]<50)) || (identity==1 && !call.containsNocalls()/* && !ref.containsNocalls()*/)){
jpayne@68 639 //TODO: Strange, the identity==1 clause causes different behavior, though it does give a speedup.
jpayne@68 640 return 0;
jpayne@68 641 }
jpayne@68 642 final byte[] cbases=call.bases, cquals=call.quality;
jpayne@68 643 final byte[] rbases=ref.bases, rquals=ref.quality;
jpayne@68 644
jpayne@68 645 ReadKey key=(ReadKey) call.obj;
jpayne@68 646 final int pos=key.position;
jpayne@68 647
jpayne@68 648 final int[][] bcounts=baseCounts();
jpayne@68 649 final int[][] qcounts=qualityCounts();
jpayne@68 650 final float[][] qAvgs=qualityAverages();
jpayne@68 651
jpayne@68 652 final int minDepth=(int)Tools.max(minCountCorrect, minSizeFractionCorrect*size());
jpayne@68 653
jpayne@68 654 int corrections=0;
jpayne@68 655
jpayne@68 656 final int cStart=0, rStart=maxLeft-pos, max=cbases.length;
jpayne@68 657 for(int cloc=cStart, rloc=rStart; cloc<max; cloc++, rloc++){
jpayne@68 658 //Called base, ref base
jpayne@68 659 final byte cb=cbases[cloc], rb=rbases[rloc];
jpayne@68 660 //Called quality, ref quality
jpayne@68 661 final byte cq=(cquals==null ? 20 : cquals[cloc]), rq=rquals[rloc];
jpayne@68 662 //Called number
jpayne@68 663 final byte cx=AminoAcid.baseToNumber[cb];
jpayne@68 664 //Ref number
jpayne@68 665 final byte rx=AminoAcid.baseToNumber[rb];
jpayne@68 666
jpayne@68 667 // assert((cb=='N') == (cquals[cloc]==0));
jpayne@68 668
jpayne@68 669 final byte b, q;
jpayne@68 670 if(cx<0){
jpayne@68 671 b=rb;
jpayne@68 672 q=(byte)Tools.min(20, rq);
jpayne@68 673 }else if(cb==rb){
jpayne@68 674 b=cb;
jpayne@68 675 q=(byte)Tools.mid(cq, cq+1, rq);//Adjust upward
jpayne@68 676 assert(q>=cq && (q<=rq || q<=cq));
jpayne@68 677 }else{
jpayne@68 678 final int cCount=bcounts[cx][rloc];
jpayne@68 679 final int rCount=bcounts[rx][rloc];
jpayne@68 680 final int altQSum=qcounts[cx][rloc];
jpayne@68 681 final int rQSum=qcounts[rx][rloc];
jpayne@68 682 final float rQAvg=qAvgs[rx][rloc];
jpayne@68 683 if(cCount<=maxCIncorrect && cq<=maxQIncorrect && cq*minQRatio<rQSum && cq*minAQRatio<8+rQAvg){
jpayne@68 684 final byte pminor=getSecondHighest(bcounts, rloc);
jpayne@68 685
jpayne@68 686 assert(rx>=0 && rx<bcounts.length) : rx+", "+rloc+", "+bcounts.length+"\n"+call.toFastq()+"\n"+ref.toFastq();
jpayne@68 687 assert(rloc>=0 && rloc<bcounts[rx].length) : rx+", "+rloc+", "+bcounts[rloc].length+"\n"+call.toFastq()+"\n"+ref.toFastq();
jpayne@68 688 final int minorCount=bcounts[pminor][rloc];
jpayne@68 689
jpayne@68 690 final long sum=getSumAtPosition(bcounts, rloc);
jpayne@68 691 // final long altCount=sum-rCount;
jpayne@68 692
jpayne@68 693 boolean change=false;
jpayne@68 694 if(rCount>=minCountCorrect && sum>=minDepth){
jpayne@68 695 final float ratioNeeded=Tools.min(minRatio, minRatioMult*minorCount+minRatioOffset+minRatioQMult*cq);
jpayne@68 696 // final float qratioNeeded=Tools.min(minRatio, minRatioMult*altQSum+minRatioOffset+minRatioQMult*cq); //altQSum is slightly different than minorQCount
jpayne@68 697 if(minorCount*ratioNeeded<=rCount && altQSum*ratioNeeded<=rQSum){
jpayne@68 698 change=true;
jpayne@68 699 }
jpayne@68 700
jpayne@68 701 // else if(minorCount*ratioNeeded<=rCount){
jpayne@68 702 // assert(false) : "\n"+altQSum+", "+rQSum+", "+qratioNeeded+"\n"+cCount+", "+rCount+", "+sum+", "+ratioNeeded+"\n"+(altQSum*qratioNeeded);
jpayne@68 703 // }
jpayne@68 704 }
jpayne@68 705 if(change){
jpayne@68 706 corrections++;
jpayne@68 707 b=rb;
jpayne@68 708 q=(byte)Tools.min(cq+1, 25, rq);
jpayne@68 709 // assert(q==25 || (q<=rq && q>=cq)) : q+", "+cq+", "+rq;
jpayne@68 710 }else{
jpayne@68 711 b=cb;
jpayne@68 712 q=(byte)Tools.mid(cq, cq-1, 6);//Adjust downward
jpayne@68 713 assert(q<=cq || q>=6) : q+","+cq;
jpayne@68 714 }
jpayne@68 715 }else{
jpayne@68 716 b=cb;
jpayne@68 717 q=cq;
jpayne@68 718 }
jpayne@68 719 }
jpayne@68 720 cbases[cloc]=b;
jpayne@68 721 if(cquals!=null){
jpayne@68 722 byte q2=(byte)Tools.mid(q, cq+maxQAdjust, cq-maxQAdjust);
jpayne@68 723 if(q2==0 && AminoAcid.isFullyDefined(b)){
jpayne@68 724 assert(!AminoAcid.isFullyDefined(cb));
jpayne@68 725 q2=(byte)Tools.mid(2, 25, (rq+25)/2);
jpayne@68 726 }else if(!AminoAcid.isFullyDefined(b)){
jpayne@68 727 q2=0;
jpayne@68 728 }
jpayne@68 729 cquals[cloc]=q2;
jpayne@68 730 assert((b=='N') == (cquals[cloc]==0)) : "b="+(char)b+", cb="+(char)cb+", rb="+(char)rb+", cx="+cx+", "
jpayne@68 731 + "new="+cquals[cloc]+", q="+q+", old="+cq+", rq="+rq+", loc="+rloc+"\n"+call.toFastq()+"\n"+ref.toFastq();
jpayne@68 732 }
jpayne@68 733 }
jpayne@68 734 return corrections;
jpayne@68 735 }
jpayne@68 736
jpayne@68 737 /*--------------------------------------------------------------*/
jpayne@68 738
jpayne@68 739 //Only used by condense mode.
jpayne@68 740 public ArrayList<Read> makeConsensus(){
jpayne@68 741 if(size()==1){
jpayne@68 742 Read r=get(0);
jpayne@68 743 r.id=r.numericID+" size=1";
jpayne@68 744 return this;
jpayne@68 745 }
jpayne@68 746
jpayne@68 747 ArrayList<Clump> clumps=Splitter.splitOnPivot(this);//TODO: Really, this should return null if there is no pivot
jpayne@68 748
jpayne@68 749 ArrayList<Read> list=new ArrayList<Read>(clumps.size());
jpayne@68 750 for(Clump c : clumps){
jpayne@68 751 Read temp=c.makeSimpleConsensus();
jpayne@68 752 list.add(temp);
jpayne@68 753 }
jpayne@68 754 return list;
jpayne@68 755 }
jpayne@68 756
jpayne@68 757 /*--------------------------------------------------------------*/
jpayne@68 758
jpayne@68 759 private float identity(Read call, byte[] rbases, int[] rvector){
jpayne@68 760 ReadKey key=(ReadKey) call.obj;
jpayne@68 761 final int pos=key.position;
jpayne@68 762 byte[] cbases=call.bases, quals=call.quality;
jpayne@68 763 int good=0, bad=0;
jpayne@68 764
jpayne@68 765 final int cStart=0, rStart=maxLeft-pos, max=cbases.length;
jpayne@68 766 for(int cloc=cStart, rloc=rStart; cloc<max; cloc++, rloc++){
jpayne@68 767 final byte cb=cbases[cloc], rb=rbases[rloc];
jpayne@68 768 if(AminoAcid.isFullyDefined(cb) && AminoAcid.isFullyDefined(rb)){
jpayne@68 769 if(cb==rb){good++;}
jpayne@68 770 else{bad++;}
jpayne@68 771 }
jpayne@68 772 }
jpayne@68 773 rvector[0]=good;
jpayne@68 774 rvector[1]=bad;
jpayne@68 775 return good==0 ? 0 : good/(float)(good+bad);
jpayne@68 776 }
jpayne@68 777
jpayne@68 778 /*--------------------------------------------------------------*/
jpayne@68 779
jpayne@68 780 long getSumAtPosition(int[][] counts, int pos){
jpayne@68 781 long sum=0;
jpayne@68 782 for(int x=0; x<4; x++){
jpayne@68 783 sum+=counts[x][pos];
jpayne@68 784 }
jpayne@68 785 return sum;
jpayne@68 786 }
jpayne@68 787
jpayne@68 788 byte getConsensusAtPosition(int[][] counts, int pos){
jpayne@68 789 byte xMax=0;
jpayne@68 790 for(byte x=1; x<4; x++){
jpayne@68 791 // System.err.println("x="+x+", max="+max+", Checking "+counts[x][pos]+" vs "+counts[x][max]);
jpayne@68 792 if(counts[x][pos]>counts[xMax][pos]){xMax=x;}
jpayne@68 793 }
jpayne@68 794 // assert(counts[max][pos]>=counts[0][pos]);
jpayne@68 795 // assert(counts[max][pos]>=counts[1][pos]);
jpayne@68 796 // assert(counts[max][pos]>=counts[2][pos]) : max+", "+counts[max][pos]+", ["+counts[0][pos]+", "+counts[1][pos]+", "+counts[2][pos]+", "+counts[3][pos]+"]";
jpayne@68 797 // assert(counts[max][pos]>=counts[3][pos]);
jpayne@68 798 return (counts[xMax][pos]>0 ? xMax : -1);
jpayne@68 799 }
jpayne@68 800
jpayne@68 801 byte getSecondHighest(int[][] counts, int pos){
jpayne@68 802 byte first=0;
jpayne@68 803 byte second=1;
jpayne@68 804 if(counts[first][pos]<counts[second][pos]){
jpayne@68 805 first=1; second=0;
jpayne@68 806 }
jpayne@68 807 for(byte x=2; x<4; x++){
jpayne@68 808 // System.err.println("x="+x+", max="+max+", Checking "+counts[x][pos]+" vs "+counts[x][max]);
jpayne@68 809 if(counts[x][pos]>counts[first][pos]){
jpayne@68 810 second=first;
jpayne@68 811 first=x;
jpayne@68 812 }else if(counts[x][pos]>counts[second][pos]){
jpayne@68 813 second=x;
jpayne@68 814 }
jpayne@68 815 }
jpayne@68 816 // assert(counts[max][pos]>=counts[0][pos]);
jpayne@68 817 // assert(counts[max][pos]>=counts[1][pos]);
jpayne@68 818 // assert(counts[max][pos]>=counts[2][pos]) : max+", "+counts[max][pos]+", ["+counts[0][pos]+", "+counts[1][pos]+", "+counts[2][pos]+", "+counts[3][pos]+"]";
jpayne@68 819 // assert(counts[max][pos]>=counts[3][pos]);
jpayne@68 820
jpayne@68 821 return second; //may be actually 0.
jpayne@68 822 //return (counts[second][pos]>0 ? second : -1);
jpayne@68 823 }
jpayne@68 824
jpayne@68 825 /*--------------------------------------------------------------*/
jpayne@68 826
jpayne@68 827 public String toStringStaggered(){
jpayne@68 828 ByteBuilder sb=new ByteBuilder();
jpayne@68 829 for(Read r : this){
jpayne@68 830 ReadKey key=(ReadKey) r.obj;
jpayne@68 831 final int pos=key.position;
jpayne@68 832 byte[] bases=r.bases, quals=r.quality;
jpayne@68 833 int rloc=0, cloc=rloc+pos-maxLeft;
jpayne@68 834 while(cloc<0){sb.append(' '); cloc++;}
jpayne@68 835 sb.append(bases);
jpayne@68 836 sb.append('\n');
jpayne@68 837 }
jpayne@68 838 return sb.toString();
jpayne@68 839 }
jpayne@68 840
jpayne@68 841 public Read consensusRead(){
jpayne@68 842 if(consensusRead==null){
jpayne@68 843 consensusRead=makeSimpleConsensus();
jpayne@68 844 }
jpayne@68 845 return consensusRead;
jpayne@68 846 }
jpayne@68 847
jpayne@68 848 public int width(){
jpayne@68 849 assert(width>=0) : width;
jpayne@68 850 return width;
jpayne@68 851 }
jpayne@68 852
jpayne@68 853 public int maxLeft(){
jpayne@68 854 assert(maxLeft>=0);
jpayne@68 855 return maxLeft;
jpayne@68 856 }
jpayne@68 857
jpayne@68 858 public int maxRight(){
jpayne@68 859 assert(maxRight>=0);
jpayne@68 860 return maxRight;
jpayne@68 861 }
jpayne@68 862
jpayne@68 863 /*--------------------------------------------------------------*/
jpayne@68 864
jpayne@68 865 int[][] baseCounts(){
jpayne@68 866 if(baseCounts==null){
jpayne@68 867 baseCounts=count(false);
jpayne@68 868 int len=baseCounts[0].length;
jpayne@68 869 assert(width==-1 || width==len);
jpayne@68 870 }
jpayne@68 871 return baseCounts;
jpayne@68 872 }
jpayne@68 873
jpayne@68 874 int[][] qualityCounts(){
jpayne@68 875 if(qualityCounts==null){
jpayne@68 876 qualityCounts=count(true);
jpayne@68 877 int len=qualityCounts[0].length;
jpayne@68 878 assert(width==-1 || width==len);
jpayne@68 879 }
jpayne@68 880 return qualityCounts;
jpayne@68 881 }
jpayne@68 882
jpayne@68 883 float[][] qualityAverages(){
jpayne@68 884 if(qualityAverages==null){
jpayne@68 885 qualityAverages=new float[4][width];
jpayne@68 886 for(int i=0; i<4; i++){
jpayne@68 887 for(int j=0; j<width; j++){
jpayne@68 888 int b=baseCounts[i][j];
jpayne@68 889 int q=qualityCounts[i][j];
jpayne@68 890 qualityAverages[i][j]=(b==0 ? 0 : q/(float)b);
jpayne@68 891 }
jpayne@68 892 }
jpayne@68 893 }
jpayne@68 894 return qualityAverages;
jpayne@68 895 }
jpayne@68 896
jpayne@68 897 void clearCounts(){
jpayne@68 898 baseCounts=qualityCounts=null;
jpayne@68 899 qualityAverages=null;
jpayne@68 900 }
jpayne@68 901
jpayne@68 902 private void clearConsensus(){
jpayne@68 903 consensusRead=null;
jpayne@68 904 }
jpayne@68 905
jpayne@68 906 /*--------------------------------------------------------------*/
jpayne@68 907
jpayne@68 908 @Override
jpayne@68 909 public boolean equals(Object b){
jpayne@68 910 return this==b;
jpayne@68 911 }
jpayne@68 912
jpayne@68 913 @Override
jpayne@68 914 public int hashCode(){
jpayne@68 915 return Long.hashCode(kmer);
jpayne@68 916 }
jpayne@68 917
jpayne@68 918 @Override
jpayne@68 919 public int compareTo(Clump o) {
jpayne@68 920 int x=Long.compare(kmer, o.kmer);
jpayne@68 921 return x!=0 ? x : o.size()-size();
jpayne@68 922 }
jpayne@68 923
jpayne@68 924 /*--------------------------------------------------------------*/
jpayne@68 925
jpayne@68 926 public final long kmer;
jpayne@68 927
jpayne@68 928 private int width=-1;
jpayne@68 929 private int maxLeft=-1;
jpayne@68 930 private int maxRight=-1;
jpayne@68 931
jpayne@68 932 private int[][] baseCounts;
jpayne@68 933 private int[][] qualityCounts;
jpayne@68 934 private float[][] qualityAverages;
jpayne@68 935
jpayne@68 936 private Read consensusRead;
jpayne@68 937
jpayne@68 938 boolean useQuality(){return useQuality;}
jpayne@68 939 private boolean useQuality=true;
jpayne@68 940
jpayne@68 941 boolean added=false;
jpayne@68 942
jpayne@68 943 public static final int overhead=overhead();
jpayne@68 944 private static int overhead(){
jpayne@68 945 return 16+ //self
jpayne@68 946 16+ //Backing array
jpayne@68 947 4+ //Backing array size
jpayne@68 948 4*(8)+ //Backing array initial capacity
jpayne@68 949 1*(8)+ //kmer
jpayne@68 950 3*(4)+ //int fields
jpayne@68 951 4*(8)+ //pointers
jpayne@68 952 2*(4); //booleans
jpayne@68 953 }
jpayne@68 954
jpayne@68 955 /*--------------------------------------------------------------*/
jpayne@68 956
jpayne@68 957 public static boolean parseStatic(String arg, String a, String b){
jpayne@68 958 if(a.equals("mincountcorrect") || a.equals("mincc")){
jpayne@68 959 minCountCorrect=Integer.parseInt(b);
jpayne@68 960 }else if(a.equals("minsizesplit") || a.equals("minss")){
jpayne@68 961 Splitter.minSizeSplit=Integer.parseInt(b);
jpayne@68 962 }else if(a.equals("minsizefractionsplit") || a.equals("minsfs")){
jpayne@68 963 Splitter.minSizeFractionSplit=Float.parseFloat(b);
jpayne@68 964 }else if(a.equals("minsizefractioncorrect") || a.equals("minsfc")){
jpayne@68 965 minSizeFractionCorrect=Float.parseFloat(b);
jpayne@68 966 }else if(a.equals("minratio") || a.equals("minr")){
jpayne@68 967 minRatio=Float.parseFloat(b);
jpayne@68 968 }else if(a.equals("minqratio") || a.equals("minqr")){
jpayne@68 969 minQRatio=Float.parseFloat(b);
jpayne@68 970 }else if(a.equals("minaqratio") || a.equals("minaqr")){
jpayne@68 971 minAQRatio=Float.parseFloat(b);
jpayne@68 972 }else if(a.equals("minratiooffset") || a.equals("minro")){
jpayne@68 973 minRatioOffset=Float.parseFloat(b);
jpayne@68 974 }else if(a.equals("minratiomult") || a.equals("minrm")){
jpayne@68 975 minRatioMult=Float.parseFloat(b);
jpayne@68 976 }else if(a.equals("minratioqmult") || a.equals("minrqm")){
jpayne@68 977 minRatioQMult=Float.parseFloat(b);
jpayne@68 978 }else if(a.equals("minidentity") || a.equals("minid")){
jpayne@68 979 minIdentity=Float.parseFloat(b);
jpayne@68 980 }else if(a.equals("maxqadjust")){
jpayne@68 981 maxQAdjust=(byte)Integer.parseInt(b);
jpayne@68 982 }else if(a.equals("maxqi") || a.equals("maxqincorrect") || a.equals("maxqualityincorrect")){
jpayne@68 983 maxQIncorrect=Integer.parseInt(b);
jpayne@68 984 if(maxCIncorrect<0){maxQIncorrect=Integer.MAX_VALUE;}
jpayne@68 985 }else if(a.equals("maxci") || a.equals("maxcincorrect") || a.equals("maxcountincorrect")){
jpayne@68 986 maxCIncorrect=Integer.parseInt(b);
jpayne@68 987 if(maxCIncorrect<0){maxCIncorrect=Integer.MAX_VALUE;}
jpayne@68 988 }else if(a.equals("border")){
jpayne@68 989 KmerComparator.defaultBorder=Integer.parseInt(b);
jpayne@68 990 }else if(a.equals("conservative")){
jpayne@68 991 conservativeFlag=Parse.parseBoolean(b);
jpayne@68 992 }else if(a.equals("aggressive")){
jpayne@68 993 aggressiveFlag=Parse.parseBoolean(b);
jpayne@68 994 }else if(a.equals("forceprocess")){
jpayne@68 995 forceProcess=Parse.parseBoolean(b);
jpayne@68 996 }else if(a.equals("mergefirst")){
jpayne@68 997 KmerComparator.mergeFirst=Parse.parseBoolean(b);
jpayne@68 998 }else if(a.equals("findcorrelations")){
jpayne@68 999 Splitter.FIND_CORRELATIONS=Parse.parseBoolean(b);
jpayne@68 1000 }else if(a.equals("maxcorrelations")){
jpayne@68 1001 Splitter.MAX_CORRELATIONS=Integer.parseInt(b);
jpayne@68 1002 }
jpayne@68 1003
jpayne@68 1004 else if(a.equals("sortx")){
jpayne@68 1005 sortX=Parse.parseBoolean(b);
jpayne@68 1006 }else if(a.equals("sorty")){
jpayne@68 1007 sortY=Parse.parseBoolean(b);
jpayne@68 1008 }else if(a.equals("sortxy")){
jpayne@68 1009 sortX=sortY=Parse.parseBoolean(b);
jpayne@68 1010 }else if(a.equals("forcesortxy") || a.equals("forcexy")){
jpayne@68 1011 forceSortXY=Parse.parseBoolean(b);
jpayne@68 1012 }else if(a.equals("sortxysize") || a.equals("xysize")){
jpayne@68 1013 sortXYSize=Integer.parseInt(b);
jpayne@68 1014 }
jpayne@68 1015
jpayne@68 1016 else if(a.equals("removeallduplicates") || a.equals("allduplicates")){
jpayne@68 1017 markAll=Parse.parseBoolean(b);
jpayne@68 1018 }else if(a.equals("addcount") || a.equals("renamebycount")){
jpayne@68 1019 renameByCount=Parse.parseBoolean(b);
jpayne@68 1020 }else if(a.equals("optical") || a.equals("opticalonly")){
jpayne@68 1021 opticalOnly=Parse.parseBoolean(b);
jpayne@68 1022 }else if(a.equals("dupesubs") || a.equals("duplicatesubs") || a.equals("dsubs") || a.equals("subs") || a.equals("s")){
jpayne@68 1023 maxSubstitutions=Integer.parseInt(b);
jpayne@68 1024 }else if(a.equals("dupedist") || a.equals("duplicatedistance") || a.equals("ddist") || a.equals("dist") || a.equals("opticaldist") || a.equals("distance")){
jpayne@68 1025 maxOpticalDistance=Float.parseFloat(b);
jpayne@68 1026 opticalOnly=maxOpticalDistance>=0;
jpayne@68 1027 }else if(a.equals("scanlimit") || a.equals("scan")){
jpayne@68 1028 scanLimit=Integer.parseInt(b);
jpayne@68 1029 }else if(a.equals("allowns")){
jpayne@68 1030 allowNs=Parse.parseBoolean(b);
jpayne@68 1031 }else if(a.equals("containment") || a.equals("absorbcontainment") || a.equals("ac") || a.equals("contains")){
jpayne@68 1032 containment=Parse.parseBoolean(b);
jpayne@68 1033 }else if(a.equalsIgnoreCase("prefixOrSuffix") || a.equalsIgnoreCase("suffixOrPrefix") || a.equals("affix") || a.equals("pos")){
jpayne@68 1034 affix=Parse.parseBoolean(b);
jpayne@68 1035 }else if(a.equals("printduplicates")){
jpayne@68 1036 printDuplicates=Parse.parseBoolean(b);
jpayne@68 1037 }else if(a.equals("dupeidentity")){
jpayne@68 1038 float dupeIdentity=Float.parseFloat(b);
jpayne@68 1039 if(dupeIdentity>1){dupeIdentity=dupeIdentity/100;}
jpayne@68 1040 assert(dupeIdentity<=1);
jpayne@68 1041 dupeSubRate=1-dupeIdentity;
jpayne@68 1042 }else if(a.equals("dupesubrate") || a.equals("dsr") || a.equals("subrate")){
jpayne@68 1043 dupeSubRate=Float.parseFloat(b);
jpayne@68 1044 if(dupeSubRate>1){dupeSubRate=dupeSubRate/100;}
jpayne@68 1045 assert(dupeSubRate<=1);
jpayne@68 1046 }
jpayne@68 1047
jpayne@68 1048 else if(a.equals("minsizesplit")){
jpayne@68 1049 Splitter.minSizeSplit=Integer.parseInt(b);
jpayne@68 1050 }else if(a.equals("minsizefractionsplit")){
jpayne@68 1051 Splitter.minSizeFractionSplit=Float.parseFloat(b);
jpayne@68 1052 }else{
jpayne@68 1053 return false;
jpayne@68 1054 }
jpayne@68 1055
jpayne@68 1056 return true;
jpayne@68 1057 }
jpayne@68 1058
jpayne@68 1059 static void setConservative(boolean newState){
jpayne@68 1060
jpayne@68 1061 if(aggressiveFlag){return;}
jpayne@68 1062 if(newState==conservativeMode){return;}
jpayne@68 1063 conservativeMode=newState;
jpayne@68 1064
jpayne@68 1065 Splitter.conservative=conservativeMode;
jpayne@68 1066
jpayne@68 1067 if(conservativeMode){
jpayne@68 1068 minCountCorrect++;
jpayne@68 1069 minSizeFractionCorrect*=1.5f;
jpayne@68 1070 minRatio*=1.25f;
jpayne@68 1071 minQRatio*=1.5f;
jpayne@68 1072 minAQRatio*=1.4f;
jpayne@68 1073 minRatioOffset*=1.25f;
jpayne@68 1074 minRatioQMult*=1.50f;
jpayne@68 1075 minRatioMult*=1.4f;
jpayne@68 1076 minIdentity=1-((1-minIdentity)*0.7f);
jpayne@68 1077 if(maxQIncorrect==Integer.MAX_VALUE){maxQIncorrect=35;}
jpayne@68 1078 }else{
jpayne@68 1079 minCountCorrect--;
jpayne@68 1080 minSizeFractionCorrect/=1.5f;
jpayne@68 1081 minRatio/=1.25f;
jpayne@68 1082 minQRatio/=1.5f;
jpayne@68 1083 minAQRatio/=1.4f;
jpayne@68 1084 minRatioOffset/=1.25f;
jpayne@68 1085 minRatioQMult/=1.50f;
jpayne@68 1086 minRatioMult/=1.4f;
jpayne@68 1087 minIdentity=1-((1-minIdentity)/0.7f);
jpayne@68 1088 if(maxQIncorrect==35){maxQIncorrect=Integer.MAX_VALUE;}
jpayne@68 1089 }
jpayne@68 1090 }
jpayne@68 1091
jpayne@68 1092 /*--------------------------------------------------------------*/
jpayne@68 1093
jpayne@68 1094 public static void setXY() {
jpayne@68 1095 if(ReadKey.spanTilesX){sortX=true;}
jpayne@68 1096 if(ReadKey.spanTilesY){sortY=true;}
jpayne@68 1097 }
jpayne@68 1098
jpayne@68 1099 static boolean allowNs=true;
jpayne@68 1100 static boolean markAll=false;
jpayne@68 1101 static boolean markOnly=false;
jpayne@68 1102 static boolean opticalOnly=false;
jpayne@68 1103 static boolean containment=false;
jpayne@68 1104 static boolean affix=false;
jpayne@68 1105 static boolean printDuplicates=false; //For debugging
jpayne@68 1106
jpayne@68 1107 private static boolean renameByCount=false;
jpayne@68 1108 private static int scanLimit=5;
jpayne@68 1109 private static int maxSubstitutions=2;
jpayne@68 1110 private static float dupeSubRate=0;
jpayne@68 1111 private static float maxOpticalDistance=40;
jpayne@68 1112
jpayne@68 1113 static boolean forceProcess=false;
jpayne@68 1114 static boolean conservativeFlag=false;
jpayne@68 1115 static boolean aggressiveFlag=false;
jpayne@68 1116 static boolean conservativeMode=false;
jpayne@68 1117 static boolean renameConsensus=false;
jpayne@68 1118 static int minCountCorrect=4; //mcc=4 was slightly better than 3
jpayne@68 1119 static float minSizeFractionCorrect=0.20f; //0.11 is very slightly better than 0.14
jpayne@68 1120 static float minRatio=30.0f;
jpayne@68 1121 static float minQRatio=2.8f; //Does nothing?
jpayne@68 1122 static float minAQRatio=0.7f;
jpayne@68 1123 static float minRatioOffset=1.9f; //3 is far worse than 2; 1 is better
jpayne@68 1124 static float minRatioQMult=0.08f;
jpayne@68 1125 static float minRatioMult=1.80f; //2.5 is far worse than 2; 1.5 is better
jpayne@68 1126 static float minIdentity=0.97f; //0.98 is slightly better than 0.96; 0.94 is substantially worse
jpayne@68 1127 static byte maxQAdjust=0;
jpayne@68 1128 static int maxQIncorrect=Integer.MAX_VALUE;
jpayne@68 1129 static int maxCIncorrect=Integer.MAX_VALUE;
jpayne@68 1130
jpayne@68 1131 static boolean sortX=false; //Not needed for NextSeq
jpayne@68 1132 static boolean sortY=true;
jpayne@68 1133 static boolean forceSortXY=false; //Mainly for testing
jpayne@68 1134 static int sortXYSize=6;
jpayne@68 1135
jpayne@68 1136 /** May slightly increase speed for optical dedupe. Can be safely disabled. */
jpayne@68 1137 static boolean sortYEarly(){return sortY && (forceSortXY || opticalOnly);}
jpayne@68 1138
jpayne@68 1139 // private static final boolean countQuality=false;
jpayne@68 1140 public static int k=31;
jpayne@68 1141 private static final long serialVersionUID = 1L;
jpayne@68 1142
jpayne@68 1143 }