Mercurial > repos > rliterman > csp2
view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/Clump.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line source
package clump; import java.util.ArrayList; import dna.AminoAcid; import shared.KillSwitch; import shared.Parse; import shared.Tools; import stream.Read; import structures.ByteBuilder; /** * A list of reads sharing a kmer. * @author Brian Bushnell * @date Nov 7, 2015 * */ public class Clump extends ArrayList<Read> implements Comparable<Clump> { public static Clump makeClump(long kmer){ try { return new Clump(kmer); } catch (OutOfMemoryError e) { KillSwitch.memKill(e); throw new RuntimeException(); } } private Clump(long kmer_){ this(kmer_, 4); } private Clump(long kmer_, int size){ super(size); kmer=kmer_; } @Override public boolean add(Read r){ ReadKey key=(ReadKey) r.obj; assert(key.kmer==kmer); key.clump=this; return super.add(r); } private void setMaxima(){ maxLeft=-1; maxRight=-1; width=-1; for(Read r : this){ ReadKey key=(ReadKey) r.obj; final int pos=key.position; maxLeft=Tools.max(maxLeft, pos); maxRight=Tools.max(maxRight, r.length()-pos); } width=maxLeft+maxRight; } /** This will create counts of bases, or sums of qualities, at each position in the cluster. */ private int[][] count(final boolean qualityScores){ if(width<0){setMaxima();} // System.err.println("\n\n"); final int[][] counts=new int[4][width]; for(Read r : this){ ReadKey key=(ReadKey) r.obj; final int pos=key.position; byte[] bases=r.bases, quals=r.quality; if(quals==null){useQuality=false;} // System.err.println("pos="+pos+", maxLeft="+maxLeft); for(int cloc=0, rloc=maxLeft-pos; cloc<bases.length; cloc++, rloc++){ // System.err.println("cloc="+cloc+"/"+bases.length+", rloc="+rloc+"/"+width); int x=AminoAcid.baseToNumber[bases[cloc]]; if(x>-1){ final int q; if(qualityScores){q=(quals==null ? 20 : quals[cloc]);} else{q=1;} counts[x][rloc]+=q; } } } return counts; } /*--------------------------------------------------------------*/ public Read makeSimpleConsensus(){ // assert(Splitter.findBestPivot(this)<0) : Splitter.findBestPivot(this); //TODO: Slow if(size()==1){ Read r=get(0); if(renameConsensus){r.id=r.numericID+" size=1";} return r; } final int[][] bcounts=baseCounts(); final int[][] qcounts=qualityCounts(); final byte[] bases=new byte[width], quals=new byte[width]; for(int i=0; i<width; i++){ int x=getConsensusAtPosition(qcounts, i); int y=getSecondHighest(qcounts, i); if(x>=0 && qcounts[x][i]==qcounts[y][i]){//Tie-breaker x=getConsensusAtPosition(bcounts, i); y=getSecondHighest(bcounts, i); } if(x<0){ // System.err.println("q="+0+", x="+x+"; A="+counts[0][i]+", C="+counts[1][i]+", G="+counts[2][i]+", T="+counts[3][i]); bases[i]='N'; quals[i]=0; assert(getSumAtPosition(qcounts, i)==0) : "\n"+bcounts[0][i]+", "+bcounts[1][i]+", "+bcounts[2][i]+", "+bcounts[3][i]+ "\n"+qcounts[0][i]+", "+qcounts[1][i]+", "+qcounts[2][i]+", "+qcounts[3][i]+ "\nwidth="+width+", i="+i+", size="+size()+"\n"+new String(bases, 0, i)+"\n"+get(0)+"\n"+get(1)+"\n"; // assert(getSumAtPosition(bcounts, i)==0) : "\n"+bcounts[0][i]+", "+bcounts[1][i]+", "+bcounts[2][i]+", "+bcounts[3][i]+ // "\n"+qcounts[0][i]+", "+qcounts[1][i]+", "+qcounts[2][i]+", "+qcounts[3][i]+ // "\nwidth="+width+", i="+i+", size="+size()+"\n"+new String(bases, 0, i)+"\n"+get(0)+"\n"+get(1)+"\n"; }else{ final long bsum=getSumAtPosition(bcounts, i); final long bmajor=bcounts[x][i]; final long bminor=bsum-bcounts[x][i]; final long bsecond=bcounts[y][i]; final long qsum=getSumAtPosition(qcounts, i); final long qmajor=qcounts[x][i]; final long qminor=qsum-qcounts[x][i]; final long qsecond=qcounts[y][i]; float bratio=bminor/(float)(bmajor+bminor); double phred=(bminor==0 ? Read.MAX_CALLED_QUALITY() : -10*Math.log10(bratio)); phred=Tools.min(phred, qmajor-qminor); // if(phred<0 || phred>127){ // assert(false) : i+","+x+","+bsum+","+qsum+","+bmajor+","+bminor+","+bratio+","+phred+"\n"+ // bcounts[0][i]+","+bcounts[1][i]+","+bcounts[2][i]+","+bcounts[3][i]+"\n"+ // qcounts[0][i]+","+qcounts[1][i]+","+qcounts[2][i]+","+qcounts[3][i]+"\n"+ // this.toStringStaggered()+"\n"; // } // assert(phred>=0 && phred<=127) : phred+","+x+","+i+","+bratio+","+bcounts[x][i]+","+bcounts[0][i]+ // ","+bcounts[1][i]+","+bcounts[2][i]+","+bcounts[3][i]; // assert(phred>=0 && phred<=127) : bmajor+", "+bminor+", "+phred+", "+Read.MAX_CALLED_QUALITY; byte q=Read.capQuality((long)Math.round(phred)); bases[i]=AminoAcid.numberToBase[x]; quals[i]=q; assert(q>0); assert(x>-1); assert(bases[i]!='N'); } } Read leftmost=this.get(0);//TODO: Actually rightmost! Read r=new Read(bases, quals, leftmost.numericID+" size="+size(), 0); //TODO: Attach the long pair, and make sure the kmer location is correct. // assert(false) : "\n"+r.toFastq()+"\nCheck kmer location."; // assert(size()==1) : "\n"+r.toFastq()+"\n"+get(0).toFastq()+"\n"+get(size()-1).toFastq()+"\n"; return r; } /*--------------------------------------------------------------*/ public int removeDuplicates(){ assert(KmerComparator.compareSequence); if(size()<2){return 0;} int removedTotal=0, removed=0; final boolean sortXY=(forceSortXY || sortYEarly() || (opticalOnly && (sortX || sortY) && size()>=sortXYSize)); final int maxDiscarded; final int scan; if(opticalOnly && sortXY){ scan=Tools.max(scanLimit, 200); maxDiscarded=scan+10; }else if(!sortXY && ((maxSubstitutions<1 && dupeSubRate<=0) || scanLimit<0)){ scan=0; maxDiscarded=0; }else{ scan=scanLimit; maxDiscarded=scan+10; } if(sortXY){ assert(sortX || sortY); if(sortY){ if(!sortYEarly()){ this.sort(KmerComparatorY.comparator); }else{ // for(int i=1; i<this.size(); i++){ // Read a=get(i-1); // Read b=get(i); // assert(KmerComparatorY.comparator.compare(a, b)<=0) : a.obj+" vs "+b.obj; // } // //Otherwise, it was already Y-sorted. } // assert(false) : sortY(); removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance); removedTotal+=removed; // System.err.println("RemovedY: "+removed); while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){ removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance); removedTotal+=removed; // System.err.println("RemovedY: "+removed); } } if(sortX && (ReadKey.spanTilesX || !sortY)){ this.sort(KmerComparatorX.comparator); removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance); removedTotal+=removed; // System.err.println("RemovedX: "+removed); while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){ removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance); removedTotal+=removed; // System.err.println("RemovedX: "+removed); } } }else{ removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, false, markOnly, markAll, renameByCount, maxOpticalDistance); removedTotal+=removed; while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){ removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, false, markOnly, markAll, renameByCount, maxOpticalDistance); removedTotal+=removed; } } return removedTotal; } private int removeDuplicates_inner(final int maxSubs, final float subRate, final int scanLimit, final int maxDiscarded, final boolean optical, final boolean xySorted, final boolean mark, final boolean markAll, final boolean rename, final float dist){ final int size=size(); if(size<2){return 0;} int dupePairs=0; int dupeReads=0; // final boolean breakOnTile=(optical && !FlowcellCoordinate.spanTiles); // final long start=System.nanoTime(); for(int i=0, lim=size-1; i<lim; i++){ final Read a=get(i); if(!a.discarded()){ final ReadKey keyA=(ReadKey) a.obj; final int aLen=a.length(); int unequals=0; int discarded=0; for(int j=i+1; j<size && unequals<=scanLimit && discarded<=maxDiscarded && (!a.discarded() || markAll); j++){ final Read b=get(j); if(b.discarded()){ discarded++; }else{ final int bLen=b.length(); final ReadKey keyB=(ReadKey) b.obj; if(!containment && !keyA.equals(keyB)){break;} // if(containment && affix && !keyA.physicalAffix(keyB, aLen, bLen)){break;} // if(optical && keyA.lane!=keyB.lane){break;} //Already in equals method // if(breakOnTile && keyA.tile!=keyB.tile){break;} //Already in equals method if(optical && xySorted && !keyA.nearXY(keyB, dist)){break;} // if(System.nanoTime()-start>200000000000L){ // TextStreamWriter tsw=new TextStreamWriter("foo.fq", true, false, false); // tsw.start(); // for(Read x : this){ // tsw.println(x.toFastq()); // } // tsw.poisonAndWait(); // assert(false) : "kmer="+kmer+", size="+size(); // } if(equals(a, b, maxSubs, subRate)){ if(!optical || keyA.near(keyB, dist)){ if(printDuplicates){ System.out.println(a.toFasta()); System.out.println(b.toFasta()); } float errA=a.expectedErrorsIncludingMate(true); float errB=b.expectedErrorsIncludingMate(true); if(markAll){ b.setDiscarded(true); assert(!a.discarded() || markAll); dupePairs++; dupeReads+=1+b.mateCount(); unequals=0; if(!a.discarded()){ a.setDiscarded(true); dupePairs++; dupeReads+=1+a.mateCount(); } }else if(containment || errB>=errA){ b.setDiscarded(true); assert(!a.discarded() || markAll); a.copies+=b.copies+parseExtraCopies(b); dupePairs++; dupeReads+=1+b.mateCount(); unequals=0; }else{ a.setDiscarded(true); assert(!b.discarded() || markAll); b.copies+=a.copies+parseExtraCopies(a); dupePairs++; dupeReads+=1+a.mateCount(); } } }else{ unequals++; } } } } } if(dupeReads>0){ for(int i=0; i<size; i++){ Read a=get(i); if(a.discarded()){ if(mark){ if(!a.id.endsWith(" duplicate")){ a.id+=" duplicate"; if(a.mate!=null){a.mate.id+=" duplicate";} } }else{ set(i, null); } }else if(rename && a.copies>1){ renameFromCount(a); } a.copies=1; } if(!mark){ int x=Tools.condenseStrict(this); assert(x==dupePairs) : size()+", "+size+", "+dupePairs+", "+x; assert((size()>0 || markAll) && size()==size-dupePairs) : size()+", "+size+", "+dupePairs; } } if(containment){ dupeReads+=removeDuplicates_backwards(maxSubs, subRate, scanLimit, maxDiscarded, optical, xySorted, mark, markAll, rename, dist); } return dupeReads; } /** Only for containments */ private int removeDuplicates_backwards(final int maxSubs, final float subRate, final int scanLimit, final int maxDiscarded, final boolean optical, final boolean xySorted, final boolean mark, final boolean markAll, final boolean rename, final float dist){ final int size=size(); if(size<2){return 0;} int dupePairs=0; int dupeReads=0; // final boolean breakOnTile=(optical && !FlowcellCoordinate.spanTiles); // final long start=System.nanoTime(); for(int i=size-1; i>0; i--){ final Read a=get(i); if(!a.discarded()){ final ReadKey keyA=(ReadKey) a.obj; final int aLen=a.length(); int unequals=0; int discarded=0; for(int j=i-1; j>=0 && unequals<=scanLimit && discarded<=maxDiscarded && (!a.discarded() || markAll); j--){ final Read b=get(j); if(b.discarded()){ discarded++; }else{ final int bLen=b.length(); final ReadKey keyB=(ReadKey) b.obj; if(!containment && !keyA.equals(keyB)){break;} // if(containment && affix && !keyA.physicalAffix(keyB, aLen, bLen)){break;} // if(optical && keyA.lane!=keyB.lane){break;} //Already in equals method // if(breakOnTile && keyA.tile!=keyB.tile){break;} //Already in equals method if(optical && xySorted && !keyA.nearXY(keyB, dist)){break;} // if(System.nanoTime()-start>200000000000L){ // TextStreamWriter tsw=new TextStreamWriter("foo.fq", true, false, false); // tsw.start(); // for(Read x : this){ // tsw.println(x.toFastq()); // } // tsw.poisonAndWait(); // assert(false) : "kmer="+kmer+", size="+size(); // } if(equals(a, b, maxSubs, subRate)){ if(!optical || keyA.near(keyB, dist)){ if(printDuplicates){ System.out.println(a.toFasta()); System.out.println(b.toFasta()); } float errA=a.expectedErrorsIncludingMate(true); float errB=b.expectedErrorsIncludingMate(true); if(markAll){ b.setDiscarded(true); assert(!a.discarded() || markAll); dupePairs++; dupeReads+=1+b.mateCount(); unequals=0; if(!a.discarded()){ a.setDiscarded(true); dupePairs++; dupeReads+=1+a.mateCount(); } }else if(containment || errB>=errA){ b.setDiscarded(true); assert(!a.discarded() || markAll); a.copies+=b.copies+parseExtraCopies(b); dupePairs++; dupeReads+=1+b.mateCount(); unequals=0; }else{ a.setDiscarded(true); assert(!b.discarded() || markAll); b.copies+=a.copies+parseExtraCopies(a); dupePairs++; dupeReads+=1+a.mateCount(); } } }else{ unequals++; } } } } } if(dupeReads>0){ for(int i=0; i<size; i++){ Read a=get(i); if(a.discarded()){ if(mark){ if(!a.id.endsWith(" duplicate")){ a.id+=" duplicate"; if(a.mate!=null){a.mate.id+=" duplicate";} } }else{ set(i, null); } }else if(rename && a.copies>1){ renameFromCount(a); } a.copies=1; } if(!mark){ int x=Tools.condenseStrict(this); assert(x==dupePairs) : size()+", "+size+", "+dupePairs+", "+x; assert((size()>0 || markAll) && size()==size-dupePairs) : size()+", "+size+", "+dupePairs; } } return dupeReads; } public int parseExtraCopies(final Read a){ final String id=a.id; final int space=id.lastIndexOf(' '); int extraCopies=0; if(space<0){return 0;} if(space>=0 && Tools.contains(id, "copies=", space+1)){ extraCopies=Integer.parseInt(id.substring(space+8))-1; } return extraCopies; } private void renameFromCount(final Read a){ final int newExtraCopies=a.copies-1; assert(newExtraCopies>0) : newExtraCopies; final int oldExtraCopies=parseExtraCopies(a); final int total=1+newExtraCopies+oldExtraCopies; renameToTotal(a, total); if(a.pairnum()==0 && a.mate!=null){ assert(a.mate.pairnum()==1); renameToTotal(a.mate, total); } } private static void renameToTotal(final Read a, final int total){ final String id=a.id; final int space=id.lastIndexOf(' '); if(space>=0 && Tools.contains(id, "copies=", space+1)){ a.id=a.id.substring(0, space); } a.id=a.id+" copies="+total; } // public boolean nearby(Read a, Read b, FlowcellCoordinate fca, FlowcellCoordinate fcb, float maxDist){ //// fca.setFrom(a.id); // fcb.setFrom(b.id); // float dist=fca.distance(fcb); // return dist<=maxDist; // } // public boolean nearby(ReadKey ka, ReadKey kb, float maxDist){ // float dist=ka.distance(kb); // return dist<=maxDist; // } public static boolean equals(Read a, Read b, int maxSubs, float dupeSubRate){ if(a.numericID==b.numericID){return false;} if(dupeSubRate>0){maxSubs=Tools.max(maxSubs, (int)(dupeSubRate*Tools.min(a.length(), b.length())));} if(containment){ return contains(a, b, maxSubs); } if(!equals(a.bases, b.bases, maxSubs)){return false;} if(a.mate!=null && !equals(a.mate.bases, b.mate.bases, maxSubs)){return false;} return true; } public static boolean equals(byte[] a, byte[] b, int maxSubs){ if(a==b){return false;}//Nothing should subsume itself if(a==null || b==null){return false;} if(a.length!=b.length){return false;} int subs=0; if(allowNs){ for(int i=0; i<a.length; i++){ if(a[i]!=b[i] && (a[i]!='N' && b[i]!='N')){ subs++; if(subs>maxSubs){return false;} } } }else{ for(int i=0; i<a.length; i++){ if(a[i]!=b[i]){ subs++; if(subs>maxSubs){return false;} } } } return true; } public static boolean contains(Read a, Read b, int maxSubs){ if(a.numericID==b.numericID){return false;} boolean ok=contains_inner(a, b, maxSubs); if(!ok || a.mate==null){return ok;} ok=contains_inner(a.mate, b.mate, maxSubs); if(!ok){return false;} ReadKey rka1=(ReadKey)a.obj; ReadKey rkb1=(ReadKey)b.obj; ReadKey rka2=(ReadKey)a.mate.obj; //TODO: In containment mode, mates need to always get keys. ReadKey rkb2=(ReadKey)b.mate.obj; return ((rka1.kmerMinusStrand==rkb1.kmerMinusStrand) && (rka2.kmerMinusStrand==rkb2.kmerMinusStrand)); //Ensures that both reads have the same directionality. } public static boolean contains_inner(Read a, Read b, int maxSubs){ // if(a.length()==b.length()){return equals(a.bases, b.bases, maxSubs);} ReadKey rka=(ReadKey)a.obj; ReadKey rkb=(ReadKey)b.obj; if(affix ? !rka.physicalAffix(rkb, a.length(), b.length()) : !rka.physicallyContains(rkb, a.length(), b.length())){return false;} boolean flipped=false; // if(a.mate!=null){//In paired mode, need synchronization if the reads are in difference clumps. But... just don't do that. // Read max, min; // if(a.numericID>b.numericID){max=a; min=b;}//Protects against deadlocks. // else{max=b; min=a;} // synchronized(min){ // synchronized(max){ // if(rka.kmerMinusStrand!=rkb.kmerMinusStrand){ // rkb.flip(b, k); // flipped=true; // } // boolean ok=contains(a.bases, b.bases, rka.position, rkb.position, maxSubs); // if(flipped){rkb.flip(b, k);} // return ok; // } // } // } if(rka.kmerMinusStrand!=rkb.kmerMinusStrand){ rkb.flip(b, k); flipped=true; } boolean ok=contains(a.bases, b.bases, rka.position, rkb.position, maxSubs); if(flipped){rkb.flip(b, k);} return ok; } public static boolean contains(byte[] a, byte[] b, int posA, int posB, int maxSubs){ if(a==null || b==null){return false;} assert(a.length>=b.length); if(a==b){return false;} //Nothing should contain itself int subs=0; int ai, bi; final int dif=posA-posB; if(dif>0){ ai=0; bi=-dif; }else{ ai=dif; bi=0; } if(allowNs){ for(; ai<a.length && bi<b.length; ai++, bi++){ if(ai>=0 && bi>=0){ final byte na=a[ai], nb=b[bi]; if(na!=nb && na!='N' && nb!='N'){ subs++; if(subs>maxSubs){return false;} } } } }else{ for(; ai<a.length && bi<b.length; ai++, bi++){ if(ai>=0 && bi>=0 && a[ai]!=b[bi]){ subs++; if(subs>maxSubs){return false;} } } } return true; } /*--------------------------------------------------------------*/ public long splitAndErrorCorrect(){ if(size()<Splitter.minSizeSplit){ return errorCorrect(); } long sum=0; ArrayList<Clump> list=Splitter.splitOnPivot(this); for(Clump c : list){ sum+=c.errorCorrect(); } return sum; } public long errorCorrect(){ if(size()<=minCountCorrect){return 0;} // assert(Splitter.findBestPivot(this)<0); //TODO: Slow Read consen=makeSimpleConsensus(); long sum=0; final int[] rvector=KillSwitch.allocInt1D(2); for(Read r : this){ sum+=errorCorrect(r, consen, rvector); } return sum; } private int errorCorrect(Read call, Read ref, int[] rvector){ // assert(call.validated()); // assert(call.checkQuality()); // assert(ref.checkQuality()); final float identity=identity(call, ref.bases, rvector); if((identity<minIdentity && (rvector[1]>0 || rvector[0]<50)) || (identity==1 && !call.containsNocalls()/* && !ref.containsNocalls()*/)){ //TODO: Strange, the identity==1 clause causes different behavior, though it does give a speedup. return 0; } final byte[] cbases=call.bases, cquals=call.quality; final byte[] rbases=ref.bases, rquals=ref.quality; ReadKey key=(ReadKey) call.obj; final int pos=key.position; final int[][] bcounts=baseCounts(); final int[][] qcounts=qualityCounts(); final float[][] qAvgs=qualityAverages(); final int minDepth=(int)Tools.max(minCountCorrect, minSizeFractionCorrect*size()); int corrections=0; final int cStart=0, rStart=maxLeft-pos, max=cbases.length; for(int cloc=cStart, rloc=rStart; cloc<max; cloc++, rloc++){ //Called base, ref base final byte cb=cbases[cloc], rb=rbases[rloc]; //Called quality, ref quality final byte cq=(cquals==null ? 20 : cquals[cloc]), rq=rquals[rloc]; //Called number final byte cx=AminoAcid.baseToNumber[cb]; //Ref number final byte rx=AminoAcid.baseToNumber[rb]; // assert((cb=='N') == (cquals[cloc]==0)); final byte b, q; if(cx<0){ b=rb; q=(byte)Tools.min(20, rq); }else if(cb==rb){ b=cb; q=(byte)Tools.mid(cq, cq+1, rq);//Adjust upward assert(q>=cq && (q<=rq || q<=cq)); }else{ final int cCount=bcounts[cx][rloc]; final int rCount=bcounts[rx][rloc]; final int altQSum=qcounts[cx][rloc]; final int rQSum=qcounts[rx][rloc]; final float rQAvg=qAvgs[rx][rloc]; if(cCount<=maxCIncorrect && cq<=maxQIncorrect && cq*minQRatio<rQSum && cq*minAQRatio<8+rQAvg){ final byte pminor=getSecondHighest(bcounts, rloc); assert(rx>=0 && rx<bcounts.length) : rx+", "+rloc+", "+bcounts.length+"\n"+call.toFastq()+"\n"+ref.toFastq(); assert(rloc>=0 && rloc<bcounts[rx].length) : rx+", "+rloc+", "+bcounts[rloc].length+"\n"+call.toFastq()+"\n"+ref.toFastq(); final int minorCount=bcounts[pminor][rloc]; final long sum=getSumAtPosition(bcounts, rloc); // final long altCount=sum-rCount; boolean change=false; if(rCount>=minCountCorrect && sum>=minDepth){ final float ratioNeeded=Tools.min(minRatio, minRatioMult*minorCount+minRatioOffset+minRatioQMult*cq); // final float qratioNeeded=Tools.min(minRatio, minRatioMult*altQSum+minRatioOffset+minRatioQMult*cq); //altQSum is slightly different than minorQCount if(minorCount*ratioNeeded<=rCount && altQSum*ratioNeeded<=rQSum){ change=true; } // else if(minorCount*ratioNeeded<=rCount){ // assert(false) : "\n"+altQSum+", "+rQSum+", "+qratioNeeded+"\n"+cCount+", "+rCount+", "+sum+", "+ratioNeeded+"\n"+(altQSum*qratioNeeded); // } } if(change){ corrections++; b=rb; q=(byte)Tools.min(cq+1, 25, rq); // assert(q==25 || (q<=rq && q>=cq)) : q+", "+cq+", "+rq; }else{ b=cb; q=(byte)Tools.mid(cq, cq-1, 6);//Adjust downward assert(q<=cq || q>=6) : q+","+cq; } }else{ b=cb; q=cq; } } cbases[cloc]=b; if(cquals!=null){ byte q2=(byte)Tools.mid(q, cq+maxQAdjust, cq-maxQAdjust); if(q2==0 && AminoAcid.isFullyDefined(b)){ assert(!AminoAcid.isFullyDefined(cb)); q2=(byte)Tools.mid(2, 25, (rq+25)/2); }else if(!AminoAcid.isFullyDefined(b)){ q2=0; } cquals[cloc]=q2; assert((b=='N') == (cquals[cloc]==0)) : "b="+(char)b+", cb="+(char)cb+", rb="+(char)rb+", cx="+cx+", " + "new="+cquals[cloc]+", q="+q+", old="+cq+", rq="+rq+", loc="+rloc+"\n"+call.toFastq()+"\n"+ref.toFastq(); } } return corrections; } /*--------------------------------------------------------------*/ //Only used by condense mode. public ArrayList<Read> makeConsensus(){ if(size()==1){ Read r=get(0); r.id=r.numericID+" size=1"; return this; } ArrayList<Clump> clumps=Splitter.splitOnPivot(this);//TODO: Really, this should return null if there is no pivot ArrayList<Read> list=new ArrayList<Read>(clumps.size()); for(Clump c : clumps){ Read temp=c.makeSimpleConsensus(); list.add(temp); } return list; } /*--------------------------------------------------------------*/ private float identity(Read call, byte[] rbases, int[] rvector){ ReadKey key=(ReadKey) call.obj; final int pos=key.position; byte[] cbases=call.bases, quals=call.quality; int good=0, bad=0; final int cStart=0, rStart=maxLeft-pos, max=cbases.length; for(int cloc=cStart, rloc=rStart; cloc<max; cloc++, rloc++){ final byte cb=cbases[cloc], rb=rbases[rloc]; if(AminoAcid.isFullyDefined(cb) && AminoAcid.isFullyDefined(rb)){ if(cb==rb){good++;} else{bad++;} } } rvector[0]=good; rvector[1]=bad; return good==0 ? 0 : good/(float)(good+bad); } /*--------------------------------------------------------------*/ long getSumAtPosition(int[][] counts, int pos){ long sum=0; for(int x=0; x<4; x++){ sum+=counts[x][pos]; } return sum; } byte getConsensusAtPosition(int[][] counts, int pos){ byte xMax=0; for(byte x=1; x<4; x++){ // System.err.println("x="+x+", max="+max+", Checking "+counts[x][pos]+" vs "+counts[x][max]); if(counts[x][pos]>counts[xMax][pos]){xMax=x;} } // assert(counts[max][pos]>=counts[0][pos]); // assert(counts[max][pos]>=counts[1][pos]); // assert(counts[max][pos]>=counts[2][pos]) : max+", "+counts[max][pos]+", ["+counts[0][pos]+", "+counts[1][pos]+", "+counts[2][pos]+", "+counts[3][pos]+"]"; // assert(counts[max][pos]>=counts[3][pos]); return (counts[xMax][pos]>0 ? xMax : -1); } byte getSecondHighest(int[][] counts, int pos){ byte first=0; byte second=1; if(counts[first][pos]<counts[second][pos]){ first=1; second=0; } for(byte x=2; x<4; x++){ // System.err.println("x="+x+", max="+max+", Checking "+counts[x][pos]+" vs "+counts[x][max]); if(counts[x][pos]>counts[first][pos]){ second=first; first=x; }else if(counts[x][pos]>counts[second][pos]){ second=x; } } // assert(counts[max][pos]>=counts[0][pos]); // assert(counts[max][pos]>=counts[1][pos]); // assert(counts[max][pos]>=counts[2][pos]) : max+", "+counts[max][pos]+", ["+counts[0][pos]+", "+counts[1][pos]+", "+counts[2][pos]+", "+counts[3][pos]+"]"; // assert(counts[max][pos]>=counts[3][pos]); return second; //may be actually 0. //return (counts[second][pos]>0 ? second : -1); } /*--------------------------------------------------------------*/ public String toStringStaggered(){ ByteBuilder sb=new ByteBuilder(); for(Read r : this){ ReadKey key=(ReadKey) r.obj; final int pos=key.position; byte[] bases=r.bases, quals=r.quality; int rloc=0, cloc=rloc+pos-maxLeft; while(cloc<0){sb.append(' '); cloc++;} sb.append(bases); sb.append('\n'); } return sb.toString(); } public Read consensusRead(){ if(consensusRead==null){ consensusRead=makeSimpleConsensus(); } return consensusRead; } public int width(){ assert(width>=0) : width; return width; } public int maxLeft(){ assert(maxLeft>=0); return maxLeft; } public int maxRight(){ assert(maxRight>=0); return maxRight; } /*--------------------------------------------------------------*/ int[][] baseCounts(){ if(baseCounts==null){ baseCounts=count(false); int len=baseCounts[0].length; assert(width==-1 || width==len); } return baseCounts; } int[][] qualityCounts(){ if(qualityCounts==null){ qualityCounts=count(true); int len=qualityCounts[0].length; assert(width==-1 || width==len); } return qualityCounts; } float[][] qualityAverages(){ if(qualityAverages==null){ qualityAverages=new float[4][width]; for(int i=0; i<4; i++){ for(int j=0; j<width; j++){ int b=baseCounts[i][j]; int q=qualityCounts[i][j]; qualityAverages[i][j]=(b==0 ? 0 : q/(float)b); } } } return qualityAverages; } void clearCounts(){ baseCounts=qualityCounts=null; qualityAverages=null; } private void clearConsensus(){ consensusRead=null; } /*--------------------------------------------------------------*/ @Override public boolean equals(Object b){ return this==b; } @Override public int hashCode(){ return Long.hashCode(kmer); } @Override public int compareTo(Clump o) { int x=Long.compare(kmer, o.kmer); return x!=0 ? x : o.size()-size(); } /*--------------------------------------------------------------*/ public final long kmer; private int width=-1; private int maxLeft=-1; private int maxRight=-1; private int[][] baseCounts; private int[][] qualityCounts; private float[][] qualityAverages; private Read consensusRead; boolean useQuality(){return useQuality;} private boolean useQuality=true; boolean added=false; public static final int overhead=overhead(); private static int overhead(){ return 16+ //self 16+ //Backing array 4+ //Backing array size 4*(8)+ //Backing array initial capacity 1*(8)+ //kmer 3*(4)+ //int fields 4*(8)+ //pointers 2*(4); //booleans } /*--------------------------------------------------------------*/ public static boolean parseStatic(String arg, String a, String b){ if(a.equals("mincountcorrect") || a.equals("mincc")){ minCountCorrect=Integer.parseInt(b); }else if(a.equals("minsizesplit") || a.equals("minss")){ Splitter.minSizeSplit=Integer.parseInt(b); }else if(a.equals("minsizefractionsplit") || a.equals("minsfs")){ Splitter.minSizeFractionSplit=Float.parseFloat(b); }else if(a.equals("minsizefractioncorrect") || a.equals("minsfc")){ minSizeFractionCorrect=Float.parseFloat(b); }else if(a.equals("minratio") || a.equals("minr")){ minRatio=Float.parseFloat(b); }else if(a.equals("minqratio") || a.equals("minqr")){ minQRatio=Float.parseFloat(b); }else if(a.equals("minaqratio") || a.equals("minaqr")){ minAQRatio=Float.parseFloat(b); }else if(a.equals("minratiooffset") || a.equals("minro")){ minRatioOffset=Float.parseFloat(b); }else if(a.equals("minratiomult") || a.equals("minrm")){ minRatioMult=Float.parseFloat(b); }else if(a.equals("minratioqmult") || a.equals("minrqm")){ minRatioQMult=Float.parseFloat(b); }else if(a.equals("minidentity") || a.equals("minid")){ minIdentity=Float.parseFloat(b); }else if(a.equals("maxqadjust")){ maxQAdjust=(byte)Integer.parseInt(b); }else if(a.equals("maxqi") || a.equals("maxqincorrect") || a.equals("maxqualityincorrect")){ maxQIncorrect=Integer.parseInt(b); if(maxCIncorrect<0){maxQIncorrect=Integer.MAX_VALUE;} }else if(a.equals("maxci") || a.equals("maxcincorrect") || a.equals("maxcountincorrect")){ maxCIncorrect=Integer.parseInt(b); if(maxCIncorrect<0){maxCIncorrect=Integer.MAX_VALUE;} }else if(a.equals("border")){ KmerComparator.defaultBorder=Integer.parseInt(b); }else if(a.equals("conservative")){ conservativeFlag=Parse.parseBoolean(b); }else if(a.equals("aggressive")){ aggressiveFlag=Parse.parseBoolean(b); }else if(a.equals("forceprocess")){ forceProcess=Parse.parseBoolean(b); }else if(a.equals("mergefirst")){ KmerComparator.mergeFirst=Parse.parseBoolean(b); }else if(a.equals("findcorrelations")){ Splitter.FIND_CORRELATIONS=Parse.parseBoolean(b); }else if(a.equals("maxcorrelations")){ Splitter.MAX_CORRELATIONS=Integer.parseInt(b); } else if(a.equals("sortx")){ sortX=Parse.parseBoolean(b); }else if(a.equals("sorty")){ sortY=Parse.parseBoolean(b); }else if(a.equals("sortxy")){ sortX=sortY=Parse.parseBoolean(b); }else if(a.equals("forcesortxy") || a.equals("forcexy")){ forceSortXY=Parse.parseBoolean(b); }else if(a.equals("sortxysize") || a.equals("xysize")){ sortXYSize=Integer.parseInt(b); } else if(a.equals("removeallduplicates") || a.equals("allduplicates")){ markAll=Parse.parseBoolean(b); }else if(a.equals("addcount") || a.equals("renamebycount")){ renameByCount=Parse.parseBoolean(b); }else if(a.equals("optical") || a.equals("opticalonly")){ opticalOnly=Parse.parseBoolean(b); }else if(a.equals("dupesubs") || a.equals("duplicatesubs") || a.equals("dsubs") || a.equals("subs") || a.equals("s")){ maxSubstitutions=Integer.parseInt(b); }else if(a.equals("dupedist") || a.equals("duplicatedistance") || a.equals("ddist") || a.equals("dist") || a.equals("opticaldist") || a.equals("distance")){ maxOpticalDistance=Float.parseFloat(b); opticalOnly=maxOpticalDistance>=0; }else if(a.equals("scanlimit") || a.equals("scan")){ scanLimit=Integer.parseInt(b); }else if(a.equals("allowns")){ allowNs=Parse.parseBoolean(b); }else if(a.equals("containment") || a.equals("absorbcontainment") || a.equals("ac") || a.equals("contains")){ containment=Parse.parseBoolean(b); }else if(a.equalsIgnoreCase("prefixOrSuffix") || a.equalsIgnoreCase("suffixOrPrefix") || a.equals("affix") || a.equals("pos")){ affix=Parse.parseBoolean(b); }else if(a.equals("printduplicates")){ printDuplicates=Parse.parseBoolean(b); }else if(a.equals("dupeidentity")){ float dupeIdentity=Float.parseFloat(b); if(dupeIdentity>1){dupeIdentity=dupeIdentity/100;} assert(dupeIdentity<=1); dupeSubRate=1-dupeIdentity; }else if(a.equals("dupesubrate") || a.equals("dsr") || a.equals("subrate")){ dupeSubRate=Float.parseFloat(b); if(dupeSubRate>1){dupeSubRate=dupeSubRate/100;} assert(dupeSubRate<=1); } else if(a.equals("minsizesplit")){ Splitter.minSizeSplit=Integer.parseInt(b); }else if(a.equals("minsizefractionsplit")){ Splitter.minSizeFractionSplit=Float.parseFloat(b); }else{ return false; } return true; } static void setConservative(boolean newState){ if(aggressiveFlag){return;} if(newState==conservativeMode){return;} conservativeMode=newState; Splitter.conservative=conservativeMode; if(conservativeMode){ minCountCorrect++; minSizeFractionCorrect*=1.5f; minRatio*=1.25f; minQRatio*=1.5f; minAQRatio*=1.4f; minRatioOffset*=1.25f; minRatioQMult*=1.50f; minRatioMult*=1.4f; minIdentity=1-((1-minIdentity)*0.7f); if(maxQIncorrect==Integer.MAX_VALUE){maxQIncorrect=35;} }else{ minCountCorrect--; minSizeFractionCorrect/=1.5f; minRatio/=1.25f; minQRatio/=1.5f; minAQRatio/=1.4f; minRatioOffset/=1.25f; minRatioQMult/=1.50f; minRatioMult/=1.4f; minIdentity=1-((1-minIdentity)/0.7f); if(maxQIncorrect==35){maxQIncorrect=Integer.MAX_VALUE;} } } /*--------------------------------------------------------------*/ public static void setXY() { if(ReadKey.spanTilesX){sortX=true;} if(ReadKey.spanTilesY){sortY=true;} } static boolean allowNs=true; static boolean markAll=false; static boolean markOnly=false; static boolean opticalOnly=false; static boolean containment=false; static boolean affix=false; static boolean printDuplicates=false; //For debugging private static boolean renameByCount=false; private static int scanLimit=5; private static int maxSubstitutions=2; private static float dupeSubRate=0; private static float maxOpticalDistance=40; static boolean forceProcess=false; static boolean conservativeFlag=false; static boolean aggressiveFlag=false; static boolean conservativeMode=false; static boolean renameConsensus=false; static int minCountCorrect=4; //mcc=4 was slightly better than 3 static float minSizeFractionCorrect=0.20f; //0.11 is very slightly better than 0.14 static float minRatio=30.0f; static float minQRatio=2.8f; //Does nothing? static float minAQRatio=0.7f; static float minRatioOffset=1.9f; //3 is far worse than 2; 1 is better static float minRatioQMult=0.08f; static float minRatioMult=1.80f; //2.5 is far worse than 2; 1.5 is better static float minIdentity=0.97f; //0.98 is slightly better than 0.96; 0.94 is substantially worse static byte maxQAdjust=0; static int maxQIncorrect=Integer.MAX_VALUE; static int maxCIncorrect=Integer.MAX_VALUE; static boolean sortX=false; //Not needed for NextSeq static boolean sortY=true; static boolean forceSortXY=false; //Mainly for testing static int sortXYSize=6; /** May slightly increase speed for optical dedupe. Can be safely disabled. */ static boolean sortYEarly(){return sortY && (forceSortXY || opticalOnly);} // private static final boolean countQuality=false; public static int k=31; private static final long serialVersionUID = 1L; }