Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/Clump.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/Clump.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,1143 @@ +package clump; + +import java.util.ArrayList; + +import dna.AminoAcid; +import shared.KillSwitch; +import shared.Parse; +import shared.Tools; +import stream.Read; +import structures.ByteBuilder; + +/** + * A list of reads sharing a kmer. + * @author Brian Bushnell + * @date Nov 7, 2015 + * + */ +public class Clump extends ArrayList<Read> implements Comparable<Clump> { + + public static Clump makeClump(long kmer){ + try { + return new Clump(kmer); + } catch (OutOfMemoryError e) { + KillSwitch.memKill(e); + throw new RuntimeException(); + } + } + + private Clump(long kmer_){ + this(kmer_, 4); + } + + private Clump(long kmer_, int size){ + super(size); + kmer=kmer_; + } + + @Override + public boolean add(Read r){ + ReadKey key=(ReadKey) r.obj; + assert(key.kmer==kmer); + key.clump=this; + return super.add(r); + } + + private void setMaxima(){ + maxLeft=-1; + maxRight=-1; + width=-1; + for(Read r : this){ + ReadKey key=(ReadKey) r.obj; + final int pos=key.position; + maxLeft=Tools.max(maxLeft, pos); + maxRight=Tools.max(maxRight, r.length()-pos); + } + width=maxLeft+maxRight; + } + + /** This will create counts of bases, or sums of qualities, at each position in the cluster. */ + private int[][] count(final boolean qualityScores){ + if(width<0){setMaxima();} + +// System.err.println("\n\n"); + final int[][] counts=new int[4][width]; + for(Read r : this){ + ReadKey key=(ReadKey) r.obj; + final int pos=key.position; + byte[] bases=r.bases, quals=r.quality; + if(quals==null){useQuality=false;} + +// System.err.println("pos="+pos+", maxLeft="+maxLeft); + for(int cloc=0, rloc=maxLeft-pos; cloc<bases.length; cloc++, rloc++){ +// System.err.println("cloc="+cloc+"/"+bases.length+", rloc="+rloc+"/"+width); + int x=AminoAcid.baseToNumber[bases[cloc]]; + if(x>-1){ + final int q; + if(qualityScores){q=(quals==null ? 20 : quals[cloc]);} + else{q=1;} + counts[x][rloc]+=q; + } + } + } + return counts; + } + + /*--------------------------------------------------------------*/ + + public Read makeSimpleConsensus(){ +// assert(Splitter.findBestPivot(this)<0) : Splitter.findBestPivot(this); //TODO: Slow + if(size()==1){ + Read r=get(0); + if(renameConsensus){r.id=r.numericID+" size=1";} + return r; + } + final int[][] bcounts=baseCounts(); + final int[][] qcounts=qualityCounts(); + + final byte[] bases=new byte[width], quals=new byte[width]; + for(int i=0; i<width; i++){ + int x=getConsensusAtPosition(qcounts, i); + int y=getSecondHighest(qcounts, i); + if(x>=0 && qcounts[x][i]==qcounts[y][i]){//Tie-breaker + x=getConsensusAtPosition(bcounts, i); + y=getSecondHighest(bcounts, i); + } + + + if(x<0){ +// System.err.println("q="+0+", x="+x+"; A="+counts[0][i]+", C="+counts[1][i]+", G="+counts[2][i]+", T="+counts[3][i]); + bases[i]='N'; + quals[i]=0; + assert(getSumAtPosition(qcounts, i)==0) : "\n"+bcounts[0][i]+", "+bcounts[1][i]+", "+bcounts[2][i]+", "+bcounts[3][i]+ + "\n"+qcounts[0][i]+", "+qcounts[1][i]+", "+qcounts[2][i]+", "+qcounts[3][i]+ + "\nwidth="+width+", i="+i+", size="+size()+"\n"+new String(bases, 0, i)+"\n"+get(0)+"\n"+get(1)+"\n"; +// assert(getSumAtPosition(bcounts, i)==0) : "\n"+bcounts[0][i]+", "+bcounts[1][i]+", "+bcounts[2][i]+", "+bcounts[3][i]+ +// "\n"+qcounts[0][i]+", "+qcounts[1][i]+", "+qcounts[2][i]+", "+qcounts[3][i]+ +// "\nwidth="+width+", i="+i+", size="+size()+"\n"+new String(bases, 0, i)+"\n"+get(0)+"\n"+get(1)+"\n"; + }else{ + final long bsum=getSumAtPosition(bcounts, i); + final long bmajor=bcounts[x][i]; + final long bminor=bsum-bcounts[x][i]; + final long bsecond=bcounts[y][i]; + + final long qsum=getSumAtPosition(qcounts, i); + final long qmajor=qcounts[x][i]; + final long qminor=qsum-qcounts[x][i]; + final long qsecond=qcounts[y][i]; + + float bratio=bminor/(float)(bmajor+bminor); + double phred=(bminor==0 ? Read.MAX_CALLED_QUALITY() : -10*Math.log10(bratio)); + phred=Tools.min(phred, qmajor-qminor); +// if(phred<0 || phred>127){ +// assert(false) : i+","+x+","+bsum+","+qsum+","+bmajor+","+bminor+","+bratio+","+phred+"\n"+ +// bcounts[0][i]+","+bcounts[1][i]+","+bcounts[2][i]+","+bcounts[3][i]+"\n"+ +// qcounts[0][i]+","+qcounts[1][i]+","+qcounts[2][i]+","+qcounts[3][i]+"\n"+ +// this.toStringStaggered()+"\n"; +// } +// assert(phred>=0 && phred<=127) : phred+","+x+","+i+","+bratio+","+bcounts[x][i]+","+bcounts[0][i]+ +// ","+bcounts[1][i]+","+bcounts[2][i]+","+bcounts[3][i]; +// assert(phred>=0 && phred<=127) : bmajor+", "+bminor+", "+phred+", "+Read.MAX_CALLED_QUALITY; + byte q=Read.capQuality((long)Math.round(phred)); + bases[i]=AminoAcid.numberToBase[x]; + quals[i]=q; + assert(q>0); + assert(x>-1); + assert(bases[i]!='N'); + } + } + Read leftmost=this.get(0);//TODO: Actually rightmost! + Read r=new Read(bases, quals, leftmost.numericID+" size="+size(), 0); + //TODO: Attach the long pair, and make sure the kmer location is correct. +// assert(false) : "\n"+r.toFastq()+"\nCheck kmer location."; +// assert(size()==1) : "\n"+r.toFastq()+"\n"+get(0).toFastq()+"\n"+get(size()-1).toFastq()+"\n"; + return r; + } + + /*--------------------------------------------------------------*/ + + public int removeDuplicates(){ + assert(KmerComparator.compareSequence); + if(size()<2){return 0;} + + int removedTotal=0, removed=0; + + final boolean sortXY=(forceSortXY || sortYEarly() || (opticalOnly && (sortX || sortY) && size()>=sortXYSize)); + + final int maxDiscarded; + final int scan; + + if(opticalOnly && sortXY){ + scan=Tools.max(scanLimit, 200); + maxDiscarded=scan+10; + }else if(!sortXY && ((maxSubstitutions<1 && dupeSubRate<=0) || scanLimit<0)){ + scan=0; + maxDiscarded=0; + }else{ + scan=scanLimit; + maxDiscarded=scan+10; + } + + if(sortXY){ + assert(sortX || sortY); + + if(sortY){ + if(!sortYEarly()){ + this.sort(KmerComparatorY.comparator); + }else{ +// for(int i=1; i<this.size(); i++){ +// Read a=get(i-1); +// Read b=get(i); +// assert(KmerComparatorY.comparator.compare(a, b)<=0) : a.obj+" vs "+b.obj; +// } +// //Otherwise, it was already Y-sorted. + } +// assert(false) : sortY(); + removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance); + removedTotal+=removed; +// System.err.println("RemovedY: "+removed); + while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){ + removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance); + removedTotal+=removed; +// System.err.println("RemovedY: "+removed); + } + } + if(sortX && (ReadKey.spanTilesX || !sortY)){ + this.sort(KmerComparatorX.comparator); + removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance); + removedTotal+=removed; +// System.err.println("RemovedX: "+removed); + while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){ + removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance); + removedTotal+=removed; +// System.err.println("RemovedX: "+removed); + } + } + }else{ + removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, false, markOnly, markAll, renameByCount, maxOpticalDistance); + removedTotal+=removed; + while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){ + removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, false, markOnly, markAll, renameByCount, maxOpticalDistance); + removedTotal+=removed; + } + } + + return removedTotal; + } + + private int removeDuplicates_inner(final int maxSubs, final float subRate, final int scanLimit, final int maxDiscarded, + final boolean optical, final boolean xySorted, final boolean mark, final boolean markAll, final boolean rename, final float dist){ + final int size=size(); + if(size<2){return 0;} + int dupePairs=0; + int dupeReads=0; + +// final boolean breakOnTile=(optical && !FlowcellCoordinate.spanTiles); + +// final long start=System.nanoTime(); + + for(int i=0, lim=size-1; i<lim; i++){ + final Read a=get(i); + if(!a.discarded()){ + final ReadKey keyA=(ReadKey) a.obj; + final int aLen=a.length(); + int unequals=0; + int discarded=0; + for(int j=i+1; j<size && unequals<=scanLimit && discarded<=maxDiscarded && (!a.discarded() || markAll); j++){ + final Read b=get(j); + if(b.discarded()){ + discarded++; + }else{ + final int bLen=b.length(); + final ReadKey keyB=(ReadKey) b.obj; + if(!containment && !keyA.equals(keyB)){break;} +// if(containment && affix && !keyA.physicalAffix(keyB, aLen, bLen)){break;} +// if(optical && keyA.lane!=keyB.lane){break;} //Already in equals method +// if(breakOnTile && keyA.tile!=keyB.tile){break;} //Already in equals method + if(optical && xySorted && !keyA.nearXY(keyB, dist)){break;} +// if(System.nanoTime()-start>200000000000L){ +// TextStreamWriter tsw=new TextStreamWriter("foo.fq", true, false, false); +// tsw.start(); +// for(Read x : this){ +// tsw.println(x.toFastq()); +// } +// tsw.poisonAndWait(); +// assert(false) : "kmer="+kmer+", size="+size(); +// } + if(equals(a, b, maxSubs, subRate)){ + if(!optical || keyA.near(keyB, dist)){ + if(printDuplicates){ + System.out.println(a.toFasta()); + System.out.println(b.toFasta()); + } + float errA=a.expectedErrorsIncludingMate(true); + float errB=b.expectedErrorsIncludingMate(true); + if(markAll){ + b.setDiscarded(true); + assert(!a.discarded() || markAll); + dupePairs++; + dupeReads+=1+b.mateCount(); + unequals=0; + if(!a.discarded()){ + a.setDiscarded(true); + dupePairs++; + dupeReads+=1+a.mateCount(); + } + }else if(containment || errB>=errA){ + b.setDiscarded(true); + assert(!a.discarded() || markAll); + a.copies+=b.copies+parseExtraCopies(b); + dupePairs++; + dupeReads+=1+b.mateCount(); + unequals=0; + }else{ + a.setDiscarded(true); + assert(!b.discarded() || markAll); + b.copies+=a.copies+parseExtraCopies(a); + dupePairs++; + dupeReads+=1+a.mateCount(); + } + } + }else{ + unequals++; + } + } + } + } + } + if(dupeReads>0){ + for(int i=0; i<size; i++){ + Read a=get(i); + if(a.discarded()){ + if(mark){ + if(!a.id.endsWith(" duplicate")){ + a.id+=" duplicate"; + if(a.mate!=null){a.mate.id+=" duplicate";} + } + }else{ + set(i, null); + } + }else if(rename && a.copies>1){ + renameFromCount(a); + } + a.copies=1; + } + if(!mark){ + int x=Tools.condenseStrict(this); + assert(x==dupePairs) : size()+", "+size+", "+dupePairs+", "+x; + assert((size()>0 || markAll) && size()==size-dupePairs) : size()+", "+size+", "+dupePairs; + } + } + + if(containment){ + dupeReads+=removeDuplicates_backwards(maxSubs, subRate, scanLimit, maxDiscarded, optical, xySorted, mark, markAll, rename, dist); + } + + return dupeReads; + } + + /** Only for containments */ + private int removeDuplicates_backwards(final int maxSubs, final float subRate, final int scanLimit, final int maxDiscarded, + final boolean optical, final boolean xySorted, final boolean mark, final boolean markAll, final boolean rename, final float dist){ + final int size=size(); + if(size<2){return 0;} + int dupePairs=0; + int dupeReads=0; + +// final boolean breakOnTile=(optical && !FlowcellCoordinate.spanTiles); + +// final long start=System.nanoTime(); + + for(int i=size-1; i>0; i--){ + final Read a=get(i); + if(!a.discarded()){ + final ReadKey keyA=(ReadKey) a.obj; + final int aLen=a.length(); + int unequals=0; + int discarded=0; + for(int j=i-1; j>=0 && unequals<=scanLimit && discarded<=maxDiscarded && (!a.discarded() || markAll); j--){ + final Read b=get(j); + if(b.discarded()){ + discarded++; + }else{ + final int bLen=b.length(); + final ReadKey keyB=(ReadKey) b.obj; + if(!containment && !keyA.equals(keyB)){break;} +// if(containment && affix && !keyA.physicalAffix(keyB, aLen, bLen)){break;} +// if(optical && keyA.lane!=keyB.lane){break;} //Already in equals method +// if(breakOnTile && keyA.tile!=keyB.tile){break;} //Already in equals method + if(optical && xySorted && !keyA.nearXY(keyB, dist)){break;} +// if(System.nanoTime()-start>200000000000L){ +// TextStreamWriter tsw=new TextStreamWriter("foo.fq", true, false, false); +// tsw.start(); +// for(Read x : this){ +// tsw.println(x.toFastq()); +// } +// tsw.poisonAndWait(); +// assert(false) : "kmer="+kmer+", size="+size(); +// } + if(equals(a, b, maxSubs, subRate)){ + if(!optical || keyA.near(keyB, dist)){ + if(printDuplicates){ + System.out.println(a.toFasta()); + System.out.println(b.toFasta()); + } + float errA=a.expectedErrorsIncludingMate(true); + float errB=b.expectedErrorsIncludingMate(true); + if(markAll){ + b.setDiscarded(true); + assert(!a.discarded() || markAll); + dupePairs++; + dupeReads+=1+b.mateCount(); + unequals=0; + if(!a.discarded()){ + a.setDiscarded(true); + dupePairs++; + dupeReads+=1+a.mateCount(); + } + }else if(containment || errB>=errA){ + b.setDiscarded(true); + assert(!a.discarded() || markAll); + a.copies+=b.copies+parseExtraCopies(b); + dupePairs++; + dupeReads+=1+b.mateCount(); + unequals=0; + }else{ + a.setDiscarded(true); + assert(!b.discarded() || markAll); + b.copies+=a.copies+parseExtraCopies(a); + dupePairs++; + dupeReads+=1+a.mateCount(); + } + } + }else{ + unequals++; + } + } + } + } + } + if(dupeReads>0){ + for(int i=0; i<size; i++){ + Read a=get(i); + if(a.discarded()){ + if(mark){ + if(!a.id.endsWith(" duplicate")){ + a.id+=" duplicate"; + if(a.mate!=null){a.mate.id+=" duplicate";} + } + }else{ + set(i, null); + } + }else if(rename && a.copies>1){ + renameFromCount(a); + } + a.copies=1; + } + if(!mark){ + int x=Tools.condenseStrict(this); + assert(x==dupePairs) : size()+", "+size+", "+dupePairs+", "+x; + assert((size()>0 || markAll) && size()==size-dupePairs) : size()+", "+size+", "+dupePairs; + } + } + return dupeReads; + } + + public int parseExtraCopies(final Read a){ + final String id=a.id; + final int space=id.lastIndexOf(' '); + int extraCopies=0; + if(space<0){return 0;} + if(space>=0 && Tools.contains(id, "copies=", space+1)){ + extraCopies=Integer.parseInt(id.substring(space+8))-1; + } + return extraCopies; + } + + private void renameFromCount(final Read a){ + final int newExtraCopies=a.copies-1; + assert(newExtraCopies>0) : newExtraCopies; + final int oldExtraCopies=parseExtraCopies(a); + final int total=1+newExtraCopies+oldExtraCopies; + renameToTotal(a, total); + if(a.pairnum()==0 && a.mate!=null){ + assert(a.mate.pairnum()==1); + renameToTotal(a.mate, total); + } + } + + private static void renameToTotal(final Read a, final int total){ + final String id=a.id; + final int space=id.lastIndexOf(' '); + if(space>=0 && Tools.contains(id, "copies=", space+1)){ + a.id=a.id.substring(0, space); + } + a.id=a.id+" copies="+total; + } + +// public boolean nearby(Read a, Read b, FlowcellCoordinate fca, FlowcellCoordinate fcb, float maxDist){ +//// fca.setFrom(a.id); +// fcb.setFrom(b.id); +// float dist=fca.distance(fcb); +// return dist<=maxDist; +// } + +// public boolean nearby(ReadKey ka, ReadKey kb, float maxDist){ +// float dist=ka.distance(kb); +// return dist<=maxDist; +// } + + public static boolean equals(Read a, Read b, int maxSubs, float dupeSubRate){ + if(a.numericID==b.numericID){return false;} + if(dupeSubRate>0){maxSubs=Tools.max(maxSubs, (int)(dupeSubRate*Tools.min(a.length(), b.length())));} + if(containment){ + return contains(a, b, maxSubs); + } + if(!equals(a.bases, b.bases, maxSubs)){return false;} + if(a.mate!=null && !equals(a.mate.bases, b.mate.bases, maxSubs)){return false;} + return true; + } + + public static boolean equals(byte[] a, byte[] b, int maxSubs){ + if(a==b){return false;}//Nothing should subsume itself + if(a==null || b==null){return false;} + if(a.length!=b.length){return false;} + int subs=0; + if(allowNs){ + for(int i=0; i<a.length; i++){ + if(a[i]!=b[i] && (a[i]!='N' && b[i]!='N')){ + subs++; + if(subs>maxSubs){return false;} + } + } + }else{ + for(int i=0; i<a.length; i++){ + if(a[i]!=b[i]){ + subs++; + if(subs>maxSubs){return false;} + } + } + } + return true; + } + + public static boolean contains(Read a, Read b, int maxSubs){ + if(a.numericID==b.numericID){return false;} + boolean ok=contains_inner(a, b, maxSubs); + if(!ok || a.mate==null){return ok;} + ok=contains_inner(a.mate, b.mate, maxSubs); + if(!ok){return false;} + ReadKey rka1=(ReadKey)a.obj; + ReadKey rkb1=(ReadKey)b.obj; + ReadKey rka2=(ReadKey)a.mate.obj; //TODO: In containment mode, mates need to always get keys. + ReadKey rkb2=(ReadKey)b.mate.obj; + return ((rka1.kmerMinusStrand==rkb1.kmerMinusStrand) && (rka2.kmerMinusStrand==rkb2.kmerMinusStrand)); //Ensures that both reads have the same directionality. + } + + public static boolean contains_inner(Read a, Read b, int maxSubs){ +// if(a.length()==b.length()){return equals(a.bases, b.bases, maxSubs);} + ReadKey rka=(ReadKey)a.obj; + ReadKey rkb=(ReadKey)b.obj; + if(affix ? !rka.physicalAffix(rkb, a.length(), b.length()) : !rka.physicallyContains(rkb, a.length(), b.length())){return false;} + boolean flipped=false; +// if(a.mate!=null){//In paired mode, need synchronization if the reads are in difference clumps. But... just don't do that. +// Read max, min; +// if(a.numericID>b.numericID){max=a; min=b;}//Protects against deadlocks. +// else{max=b; min=a;} +// synchronized(min){ +// synchronized(max){ +// if(rka.kmerMinusStrand!=rkb.kmerMinusStrand){ +// rkb.flip(b, k); +// flipped=true; +// } +// boolean ok=contains(a.bases, b.bases, rka.position, rkb.position, maxSubs); +// if(flipped){rkb.flip(b, k);} +// return ok; +// } +// } +// } + if(rka.kmerMinusStrand!=rkb.kmerMinusStrand){ + rkb.flip(b, k); + flipped=true; + } + boolean ok=contains(a.bases, b.bases, rka.position, rkb.position, maxSubs); + if(flipped){rkb.flip(b, k);} + return ok; + } + + public static boolean contains(byte[] a, byte[] b, int posA, int posB, int maxSubs){ + if(a==null || b==null){return false;} + assert(a.length>=b.length); + if(a==b){return false;} //Nothing should contain itself + int subs=0; + + int ai, bi; + final int dif=posA-posB; + if(dif>0){ + ai=0; + bi=-dif; + }else{ + ai=dif; + bi=0; + } + + if(allowNs){ + for(; ai<a.length && bi<b.length; ai++, bi++){ + if(ai>=0 && bi>=0){ + final byte na=a[ai], nb=b[bi]; + if(na!=nb && na!='N' && nb!='N'){ + subs++; + if(subs>maxSubs){return false;} + } + } + } + }else{ + for(; ai<a.length && bi<b.length; ai++, bi++){ + if(ai>=0 && bi>=0 && a[ai]!=b[bi]){ + subs++; + if(subs>maxSubs){return false;} + } + } + } + return true; + } + + /*--------------------------------------------------------------*/ + + public long splitAndErrorCorrect(){ + if(size()<Splitter.minSizeSplit){ + return errorCorrect(); + } + long sum=0; + ArrayList<Clump> list=Splitter.splitOnPivot(this); + for(Clump c : list){ + sum+=c.errorCorrect(); + } + return sum; + } + + public long errorCorrect(){ + if(size()<=minCountCorrect){return 0;} +// assert(Splitter.findBestPivot(this)<0); //TODO: Slow + Read consen=makeSimpleConsensus(); + long sum=0; + final int[] rvector=KillSwitch.allocInt1D(2); + for(Read r : this){ + sum+=errorCorrect(r, consen, rvector); + } + return sum; + } + + private int errorCorrect(Read call, Read ref, int[] rvector){ + +// assert(call.validated()); +// assert(call.checkQuality()); +// assert(ref.checkQuality()); + + final float identity=identity(call, ref.bases, rvector); + if((identity<minIdentity && (rvector[1]>0 || rvector[0]<50)) || (identity==1 && !call.containsNocalls()/* && !ref.containsNocalls()*/)){ + //TODO: Strange, the identity==1 clause causes different behavior, though it does give a speedup. + return 0; + } + final byte[] cbases=call.bases, cquals=call.quality; + final byte[] rbases=ref.bases, rquals=ref.quality; + + ReadKey key=(ReadKey) call.obj; + final int pos=key.position; + + final int[][] bcounts=baseCounts(); + final int[][] qcounts=qualityCounts(); + final float[][] qAvgs=qualityAverages(); + + final int minDepth=(int)Tools.max(minCountCorrect, minSizeFractionCorrect*size()); + + int corrections=0; + + final int cStart=0, rStart=maxLeft-pos, max=cbases.length; + for(int cloc=cStart, rloc=rStart; cloc<max; cloc++, rloc++){ + //Called base, ref base + final byte cb=cbases[cloc], rb=rbases[rloc]; + //Called quality, ref quality + final byte cq=(cquals==null ? 20 : cquals[cloc]), rq=rquals[rloc]; + //Called number + final byte cx=AminoAcid.baseToNumber[cb]; + //Ref number + final byte rx=AminoAcid.baseToNumber[rb]; + +// assert((cb=='N') == (cquals[cloc]==0)); + + final byte b, q; + if(cx<0){ + b=rb; + q=(byte)Tools.min(20, rq); + }else if(cb==rb){ + b=cb; + q=(byte)Tools.mid(cq, cq+1, rq);//Adjust upward + assert(q>=cq && (q<=rq || q<=cq)); + }else{ + final int cCount=bcounts[cx][rloc]; + final int rCount=bcounts[rx][rloc]; + final int altQSum=qcounts[cx][rloc]; + final int rQSum=qcounts[rx][rloc]; + final float rQAvg=qAvgs[rx][rloc]; + if(cCount<=maxCIncorrect && cq<=maxQIncorrect && cq*minQRatio<rQSum && cq*minAQRatio<8+rQAvg){ + final byte pminor=getSecondHighest(bcounts, rloc); + + assert(rx>=0 && rx<bcounts.length) : rx+", "+rloc+", "+bcounts.length+"\n"+call.toFastq()+"\n"+ref.toFastq(); + assert(rloc>=0 && rloc<bcounts[rx].length) : rx+", "+rloc+", "+bcounts[rloc].length+"\n"+call.toFastq()+"\n"+ref.toFastq(); + final int minorCount=bcounts[pminor][rloc]; + + final long sum=getSumAtPosition(bcounts, rloc); + // final long altCount=sum-rCount; + + boolean change=false; + if(rCount>=minCountCorrect && sum>=minDepth){ + final float ratioNeeded=Tools.min(minRatio, minRatioMult*minorCount+minRatioOffset+minRatioQMult*cq); +// final float qratioNeeded=Tools.min(minRatio, minRatioMult*altQSum+minRatioOffset+minRatioQMult*cq); //altQSum is slightly different than minorQCount + if(minorCount*ratioNeeded<=rCount && altQSum*ratioNeeded<=rQSum){ + change=true; + } + +// else if(minorCount*ratioNeeded<=rCount){ +// assert(false) : "\n"+altQSum+", "+rQSum+", "+qratioNeeded+"\n"+cCount+", "+rCount+", "+sum+", "+ratioNeeded+"\n"+(altQSum*qratioNeeded); +// } + } + if(change){ + corrections++; + b=rb; + q=(byte)Tools.min(cq+1, 25, rq); + // assert(q==25 || (q<=rq && q>=cq)) : q+", "+cq+", "+rq; + }else{ + b=cb; + q=(byte)Tools.mid(cq, cq-1, 6);//Adjust downward + assert(q<=cq || q>=6) : q+","+cq; + } + }else{ + b=cb; + q=cq; + } + } + cbases[cloc]=b; + if(cquals!=null){ + byte q2=(byte)Tools.mid(q, cq+maxQAdjust, cq-maxQAdjust); + if(q2==0 && AminoAcid.isFullyDefined(b)){ + assert(!AminoAcid.isFullyDefined(cb)); + q2=(byte)Tools.mid(2, 25, (rq+25)/2); + }else if(!AminoAcid.isFullyDefined(b)){ + q2=0; + } + cquals[cloc]=q2; + assert((b=='N') == (cquals[cloc]==0)) : "b="+(char)b+", cb="+(char)cb+", rb="+(char)rb+", cx="+cx+", " + + "new="+cquals[cloc]+", q="+q+", old="+cq+", rq="+rq+", loc="+rloc+"\n"+call.toFastq()+"\n"+ref.toFastq(); + } + } + return corrections; + } + + /*--------------------------------------------------------------*/ + + //Only used by condense mode. + public ArrayList<Read> makeConsensus(){ + if(size()==1){ + Read r=get(0); + r.id=r.numericID+" size=1"; + return this; + } + + ArrayList<Clump> clumps=Splitter.splitOnPivot(this);//TODO: Really, this should return null if there is no pivot + + ArrayList<Read> list=new ArrayList<Read>(clumps.size()); + for(Clump c : clumps){ + Read temp=c.makeSimpleConsensus(); + list.add(temp); + } + return list; + } + + /*--------------------------------------------------------------*/ + + private float identity(Read call, byte[] rbases, int[] rvector){ + ReadKey key=(ReadKey) call.obj; + final int pos=key.position; + byte[] cbases=call.bases, quals=call.quality; + int good=0, bad=0; + + final int cStart=0, rStart=maxLeft-pos, max=cbases.length; + for(int cloc=cStart, rloc=rStart; cloc<max; cloc++, rloc++){ + final byte cb=cbases[cloc], rb=rbases[rloc]; + if(AminoAcid.isFullyDefined(cb) && AminoAcid.isFullyDefined(rb)){ + if(cb==rb){good++;} + else{bad++;} + } + } + rvector[0]=good; + rvector[1]=bad; + return good==0 ? 0 : good/(float)(good+bad); + } + + /*--------------------------------------------------------------*/ + + long getSumAtPosition(int[][] counts, int pos){ + long sum=0; + for(int x=0; x<4; x++){ + sum+=counts[x][pos]; + } + return sum; + } + + byte getConsensusAtPosition(int[][] counts, int pos){ + byte xMax=0; + for(byte x=1; x<4; x++){ +// System.err.println("x="+x+", max="+max+", Checking "+counts[x][pos]+" vs "+counts[x][max]); + if(counts[x][pos]>counts[xMax][pos]){xMax=x;} + } +// assert(counts[max][pos]>=counts[0][pos]); +// assert(counts[max][pos]>=counts[1][pos]); +// assert(counts[max][pos]>=counts[2][pos]) : max+", "+counts[max][pos]+", ["+counts[0][pos]+", "+counts[1][pos]+", "+counts[2][pos]+", "+counts[3][pos]+"]"; +// assert(counts[max][pos]>=counts[3][pos]); + return (counts[xMax][pos]>0 ? xMax : -1); + } + + byte getSecondHighest(int[][] counts, int pos){ + byte first=0; + byte second=1; + if(counts[first][pos]<counts[second][pos]){ + first=1; second=0; + } + for(byte x=2; x<4; x++){ +// System.err.println("x="+x+", max="+max+", Checking "+counts[x][pos]+" vs "+counts[x][max]); + if(counts[x][pos]>counts[first][pos]){ + second=first; + first=x; + }else if(counts[x][pos]>counts[second][pos]){ + second=x; + } + } +// assert(counts[max][pos]>=counts[0][pos]); +// assert(counts[max][pos]>=counts[1][pos]); +// assert(counts[max][pos]>=counts[2][pos]) : max+", "+counts[max][pos]+", ["+counts[0][pos]+", "+counts[1][pos]+", "+counts[2][pos]+", "+counts[3][pos]+"]"; +// assert(counts[max][pos]>=counts[3][pos]); + + return second; //may be actually 0. + //return (counts[second][pos]>0 ? second : -1); + } + + /*--------------------------------------------------------------*/ + + public String toStringStaggered(){ + ByteBuilder sb=new ByteBuilder(); + for(Read r : this){ + ReadKey key=(ReadKey) r.obj; + final int pos=key.position; + byte[] bases=r.bases, quals=r.quality; + int rloc=0, cloc=rloc+pos-maxLeft; + while(cloc<0){sb.append(' '); cloc++;} + sb.append(bases); + sb.append('\n'); + } + return sb.toString(); + } + + public Read consensusRead(){ + if(consensusRead==null){ + consensusRead=makeSimpleConsensus(); + } + return consensusRead; + } + + public int width(){ + assert(width>=0) : width; + return width; + } + + public int maxLeft(){ + assert(maxLeft>=0); + return maxLeft; + } + + public int maxRight(){ + assert(maxRight>=0); + return maxRight; + } + + /*--------------------------------------------------------------*/ + + int[][] baseCounts(){ + if(baseCounts==null){ + baseCounts=count(false); + int len=baseCounts[0].length; + assert(width==-1 || width==len); + } + return baseCounts; + } + + int[][] qualityCounts(){ + if(qualityCounts==null){ + qualityCounts=count(true); + int len=qualityCounts[0].length; + assert(width==-1 || width==len); + } + return qualityCounts; + } + + float[][] qualityAverages(){ + if(qualityAverages==null){ + qualityAverages=new float[4][width]; + for(int i=0; i<4; i++){ + for(int j=0; j<width; j++){ + int b=baseCounts[i][j]; + int q=qualityCounts[i][j]; + qualityAverages[i][j]=(b==0 ? 0 : q/(float)b); + } + } + } + return qualityAverages; + } + + void clearCounts(){ + baseCounts=qualityCounts=null; + qualityAverages=null; + } + + private void clearConsensus(){ + consensusRead=null; + } + + /*--------------------------------------------------------------*/ + + @Override + public boolean equals(Object b){ + return this==b; + } + + @Override + public int hashCode(){ + return Long.hashCode(kmer); + } + + @Override + public int compareTo(Clump o) { + int x=Long.compare(kmer, o.kmer); + return x!=0 ? x : o.size()-size(); + } + + /*--------------------------------------------------------------*/ + + public final long kmer; + + private int width=-1; + private int maxLeft=-1; + private int maxRight=-1; + + private int[][] baseCounts; + private int[][] qualityCounts; + private float[][] qualityAverages; + + private Read consensusRead; + + boolean useQuality(){return useQuality;} + private boolean useQuality=true; + + boolean added=false; + + public static final int overhead=overhead(); + private static int overhead(){ + return 16+ //self + 16+ //Backing array + 4+ //Backing array size + 4*(8)+ //Backing array initial capacity + 1*(8)+ //kmer + 3*(4)+ //int fields + 4*(8)+ //pointers + 2*(4); //booleans + } + + /*--------------------------------------------------------------*/ + + public static boolean parseStatic(String arg, String a, String b){ + if(a.equals("mincountcorrect") || a.equals("mincc")){ + minCountCorrect=Integer.parseInt(b); + }else if(a.equals("minsizesplit") || a.equals("minss")){ + Splitter.minSizeSplit=Integer.parseInt(b); + }else if(a.equals("minsizefractionsplit") || a.equals("minsfs")){ + Splitter.minSizeFractionSplit=Float.parseFloat(b); + }else if(a.equals("minsizefractioncorrect") || a.equals("minsfc")){ + minSizeFractionCorrect=Float.parseFloat(b); + }else if(a.equals("minratio") || a.equals("minr")){ + minRatio=Float.parseFloat(b); + }else if(a.equals("minqratio") || a.equals("minqr")){ + minQRatio=Float.parseFloat(b); + }else if(a.equals("minaqratio") || a.equals("minaqr")){ + minAQRatio=Float.parseFloat(b); + }else if(a.equals("minratiooffset") || a.equals("minro")){ + minRatioOffset=Float.parseFloat(b); + }else if(a.equals("minratiomult") || a.equals("minrm")){ + minRatioMult=Float.parseFloat(b); + }else if(a.equals("minratioqmult") || a.equals("minrqm")){ + minRatioQMult=Float.parseFloat(b); + }else if(a.equals("minidentity") || a.equals("minid")){ + minIdentity=Float.parseFloat(b); + }else if(a.equals("maxqadjust")){ + maxQAdjust=(byte)Integer.parseInt(b); + }else if(a.equals("maxqi") || a.equals("maxqincorrect") || a.equals("maxqualityincorrect")){ + maxQIncorrect=Integer.parseInt(b); + if(maxCIncorrect<0){maxQIncorrect=Integer.MAX_VALUE;} + }else if(a.equals("maxci") || a.equals("maxcincorrect") || a.equals("maxcountincorrect")){ + maxCIncorrect=Integer.parseInt(b); + if(maxCIncorrect<0){maxCIncorrect=Integer.MAX_VALUE;} + }else if(a.equals("border")){ + KmerComparator.defaultBorder=Integer.parseInt(b); + }else if(a.equals("conservative")){ + conservativeFlag=Parse.parseBoolean(b); + }else if(a.equals("aggressive")){ + aggressiveFlag=Parse.parseBoolean(b); + }else if(a.equals("forceprocess")){ + forceProcess=Parse.parseBoolean(b); + }else if(a.equals("mergefirst")){ + KmerComparator.mergeFirst=Parse.parseBoolean(b); + }else if(a.equals("findcorrelations")){ + Splitter.FIND_CORRELATIONS=Parse.parseBoolean(b); + }else if(a.equals("maxcorrelations")){ + Splitter.MAX_CORRELATIONS=Integer.parseInt(b); + } + + else if(a.equals("sortx")){ + sortX=Parse.parseBoolean(b); + }else if(a.equals("sorty")){ + sortY=Parse.parseBoolean(b); + }else if(a.equals("sortxy")){ + sortX=sortY=Parse.parseBoolean(b); + }else if(a.equals("forcesortxy") || a.equals("forcexy")){ + forceSortXY=Parse.parseBoolean(b); + }else if(a.equals("sortxysize") || a.equals("xysize")){ + sortXYSize=Integer.parseInt(b); + } + + else if(a.equals("removeallduplicates") || a.equals("allduplicates")){ + markAll=Parse.parseBoolean(b); + }else if(a.equals("addcount") || a.equals("renamebycount")){ + renameByCount=Parse.parseBoolean(b); + }else if(a.equals("optical") || a.equals("opticalonly")){ + opticalOnly=Parse.parseBoolean(b); + }else if(a.equals("dupesubs") || a.equals("duplicatesubs") || a.equals("dsubs") || a.equals("subs") || a.equals("s")){ + maxSubstitutions=Integer.parseInt(b); + }else if(a.equals("dupedist") || a.equals("duplicatedistance") || a.equals("ddist") || a.equals("dist") || a.equals("opticaldist") || a.equals("distance")){ + maxOpticalDistance=Float.parseFloat(b); + opticalOnly=maxOpticalDistance>=0; + }else if(a.equals("scanlimit") || a.equals("scan")){ + scanLimit=Integer.parseInt(b); + }else if(a.equals("allowns")){ + allowNs=Parse.parseBoolean(b); + }else if(a.equals("containment") || a.equals("absorbcontainment") || a.equals("ac") || a.equals("contains")){ + containment=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("prefixOrSuffix") || a.equalsIgnoreCase("suffixOrPrefix") || a.equals("affix") || a.equals("pos")){ + affix=Parse.parseBoolean(b); + }else if(a.equals("printduplicates")){ + printDuplicates=Parse.parseBoolean(b); + }else if(a.equals("dupeidentity")){ + float dupeIdentity=Float.parseFloat(b); + if(dupeIdentity>1){dupeIdentity=dupeIdentity/100;} + assert(dupeIdentity<=1); + dupeSubRate=1-dupeIdentity; + }else if(a.equals("dupesubrate") || a.equals("dsr") || a.equals("subrate")){ + dupeSubRate=Float.parseFloat(b); + if(dupeSubRate>1){dupeSubRate=dupeSubRate/100;} + assert(dupeSubRate<=1); + } + + else if(a.equals("minsizesplit")){ + Splitter.minSizeSplit=Integer.parseInt(b); + }else if(a.equals("minsizefractionsplit")){ + Splitter.minSizeFractionSplit=Float.parseFloat(b); + }else{ + return false; + } + + return true; + } + + static void setConservative(boolean newState){ + + if(aggressiveFlag){return;} + if(newState==conservativeMode){return;} + conservativeMode=newState; + + Splitter.conservative=conservativeMode; + + if(conservativeMode){ + minCountCorrect++; + minSizeFractionCorrect*=1.5f; + minRatio*=1.25f; + minQRatio*=1.5f; + minAQRatio*=1.4f; + minRatioOffset*=1.25f; + minRatioQMult*=1.50f; + minRatioMult*=1.4f; + minIdentity=1-((1-minIdentity)*0.7f); + if(maxQIncorrect==Integer.MAX_VALUE){maxQIncorrect=35;} + }else{ + minCountCorrect--; + minSizeFractionCorrect/=1.5f; + minRatio/=1.25f; + minQRatio/=1.5f; + minAQRatio/=1.4f; + minRatioOffset/=1.25f; + minRatioQMult/=1.50f; + minRatioMult/=1.4f; + minIdentity=1-((1-minIdentity)/0.7f); + if(maxQIncorrect==35){maxQIncorrect=Integer.MAX_VALUE;} + } + } + + /*--------------------------------------------------------------*/ + + public static void setXY() { + if(ReadKey.spanTilesX){sortX=true;} + if(ReadKey.spanTilesY){sortY=true;} + } + + static boolean allowNs=true; + static boolean markAll=false; + static boolean markOnly=false; + static boolean opticalOnly=false; + static boolean containment=false; + static boolean affix=false; + static boolean printDuplicates=false; //For debugging + + private static boolean renameByCount=false; + private static int scanLimit=5; + private static int maxSubstitutions=2; + private static float dupeSubRate=0; + private static float maxOpticalDistance=40; + + static boolean forceProcess=false; + static boolean conservativeFlag=false; + static boolean aggressiveFlag=false; + static boolean conservativeMode=false; + static boolean renameConsensus=false; + static int minCountCorrect=4; //mcc=4 was slightly better than 3 + static float minSizeFractionCorrect=0.20f; //0.11 is very slightly better than 0.14 + static float minRatio=30.0f; + static float minQRatio=2.8f; //Does nothing? + static float minAQRatio=0.7f; + static float minRatioOffset=1.9f; //3 is far worse than 2; 1 is better + static float minRatioQMult=0.08f; + static float minRatioMult=1.80f; //2.5 is far worse than 2; 1.5 is better + static float minIdentity=0.97f; //0.98 is slightly better than 0.96; 0.94 is substantially worse + static byte maxQAdjust=0; + static int maxQIncorrect=Integer.MAX_VALUE; + static int maxCIncorrect=Integer.MAX_VALUE; + + static boolean sortX=false; //Not needed for NextSeq + static boolean sortY=true; + static boolean forceSortXY=false; //Mainly for testing + static int sortXYSize=6; + + /** May slightly increase speed for optical dedupe. Can be safely disabled. */ + static boolean sortYEarly(){return sortY && (forceSortXY || opticalOnly);} + +// private static final boolean countQuality=false; + public static int k=31; + private static final long serialVersionUID = 1L; + +}