jpayne@68: package clump; jpayne@68: jpayne@68: import java.util.ArrayList; jpayne@68: jpayne@68: import dna.AminoAcid; jpayne@68: import shared.KillSwitch; jpayne@68: import shared.Parse; jpayne@68: import shared.Tools; jpayne@68: import stream.Read; jpayne@68: import structures.ByteBuilder; jpayne@68: jpayne@68: /** jpayne@68: * A list of reads sharing a kmer. jpayne@68: * @author Brian Bushnell jpayne@68: * @date Nov 7, 2015 jpayne@68: * jpayne@68: */ jpayne@68: public class Clump extends ArrayList implements Comparable { jpayne@68: jpayne@68: public static Clump makeClump(long kmer){ jpayne@68: try { jpayne@68: return new Clump(kmer); jpayne@68: } catch (OutOfMemoryError e) { jpayne@68: KillSwitch.memKill(e); jpayne@68: throw new RuntimeException(); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: private Clump(long kmer_){ jpayne@68: this(kmer_, 4); jpayne@68: } jpayne@68: jpayne@68: private Clump(long kmer_, int size){ jpayne@68: super(size); jpayne@68: kmer=kmer_; jpayne@68: } jpayne@68: jpayne@68: @Override jpayne@68: public boolean add(Read r){ jpayne@68: ReadKey key=(ReadKey) r.obj; jpayne@68: assert(key.kmer==kmer); jpayne@68: key.clump=this; jpayne@68: return super.add(r); jpayne@68: } jpayne@68: jpayne@68: private void setMaxima(){ jpayne@68: maxLeft=-1; jpayne@68: maxRight=-1; jpayne@68: width=-1; jpayne@68: for(Read r : this){ jpayne@68: ReadKey key=(ReadKey) r.obj; jpayne@68: final int pos=key.position; jpayne@68: maxLeft=Tools.max(maxLeft, pos); jpayne@68: maxRight=Tools.max(maxRight, r.length()-pos); jpayne@68: } jpayne@68: width=maxLeft+maxRight; jpayne@68: } jpayne@68: jpayne@68: /** This will create counts of bases, or sums of qualities, at each position in the cluster. */ jpayne@68: private int[][] count(final boolean qualityScores){ jpayne@68: if(width<0){setMaxima();} jpayne@68: jpayne@68: // System.err.println("\n\n"); jpayne@68: final int[][] counts=new int[4][width]; jpayne@68: for(Read r : this){ jpayne@68: ReadKey key=(ReadKey) r.obj; jpayne@68: final int pos=key.position; jpayne@68: byte[] bases=r.bases, quals=r.quality; jpayne@68: if(quals==null){useQuality=false;} jpayne@68: jpayne@68: // System.err.println("pos="+pos+", maxLeft="+maxLeft); jpayne@68: for(int cloc=0, rloc=maxLeft-pos; cloc-1){ jpayne@68: final int q; jpayne@68: if(qualityScores){q=(quals==null ? 20 : quals[cloc]);} jpayne@68: else{q=1;} jpayne@68: counts[x][rloc]+=q; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: return counts; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public Read makeSimpleConsensus(){ jpayne@68: // assert(Splitter.findBestPivot(this)<0) : Splitter.findBestPivot(this); //TODO: Slow jpayne@68: if(size()==1){ jpayne@68: Read r=get(0); jpayne@68: if(renameConsensus){r.id=r.numericID+" size=1";} jpayne@68: return r; jpayne@68: } jpayne@68: final int[][] bcounts=baseCounts(); jpayne@68: final int[][] qcounts=qualityCounts(); jpayne@68: jpayne@68: final byte[] bases=new byte[width], quals=new byte[width]; jpayne@68: for(int i=0; i=0 && qcounts[x][i]==qcounts[y][i]){//Tie-breaker jpayne@68: x=getConsensusAtPosition(bcounts, i); jpayne@68: y=getSecondHighest(bcounts, i); jpayne@68: } jpayne@68: jpayne@68: jpayne@68: if(x<0){ jpayne@68: // System.err.println("q="+0+", x="+x+"; A="+counts[0][i]+", C="+counts[1][i]+", G="+counts[2][i]+", T="+counts[3][i]); jpayne@68: bases[i]='N'; jpayne@68: quals[i]=0; jpayne@68: assert(getSumAtPosition(qcounts, i)==0) : "\n"+bcounts[0][i]+", "+bcounts[1][i]+", "+bcounts[2][i]+", "+bcounts[3][i]+ jpayne@68: "\n"+qcounts[0][i]+", "+qcounts[1][i]+", "+qcounts[2][i]+", "+qcounts[3][i]+ jpayne@68: "\nwidth="+width+", i="+i+", size="+size()+"\n"+new String(bases, 0, i)+"\n"+get(0)+"\n"+get(1)+"\n"; jpayne@68: // assert(getSumAtPosition(bcounts, i)==0) : "\n"+bcounts[0][i]+", "+bcounts[1][i]+", "+bcounts[2][i]+", "+bcounts[3][i]+ jpayne@68: // "\n"+qcounts[0][i]+", "+qcounts[1][i]+", "+qcounts[2][i]+", "+qcounts[3][i]+ jpayne@68: // "\nwidth="+width+", i="+i+", size="+size()+"\n"+new String(bases, 0, i)+"\n"+get(0)+"\n"+get(1)+"\n"; jpayne@68: }else{ jpayne@68: final long bsum=getSumAtPosition(bcounts, i); jpayne@68: final long bmajor=bcounts[x][i]; jpayne@68: final long bminor=bsum-bcounts[x][i]; jpayne@68: final long bsecond=bcounts[y][i]; jpayne@68: jpayne@68: final long qsum=getSumAtPosition(qcounts, i); jpayne@68: final long qmajor=qcounts[x][i]; jpayne@68: final long qminor=qsum-qcounts[x][i]; jpayne@68: final long qsecond=qcounts[y][i]; jpayne@68: jpayne@68: float bratio=bminor/(float)(bmajor+bminor); jpayne@68: double phred=(bminor==0 ? Read.MAX_CALLED_QUALITY() : -10*Math.log10(bratio)); jpayne@68: phred=Tools.min(phred, qmajor-qminor); jpayne@68: // if(phred<0 || phred>127){ jpayne@68: // assert(false) : i+","+x+","+bsum+","+qsum+","+bmajor+","+bminor+","+bratio+","+phred+"\n"+ jpayne@68: // bcounts[0][i]+","+bcounts[1][i]+","+bcounts[2][i]+","+bcounts[3][i]+"\n"+ jpayne@68: // qcounts[0][i]+","+qcounts[1][i]+","+qcounts[2][i]+","+qcounts[3][i]+"\n"+ jpayne@68: // this.toStringStaggered()+"\n"; jpayne@68: // } jpayne@68: // assert(phred>=0 && phred<=127) : phred+","+x+","+i+","+bratio+","+bcounts[x][i]+","+bcounts[0][i]+ jpayne@68: // ","+bcounts[1][i]+","+bcounts[2][i]+","+bcounts[3][i]; jpayne@68: // assert(phred>=0 && phred<=127) : bmajor+", "+bminor+", "+phred+", "+Read.MAX_CALLED_QUALITY; jpayne@68: byte q=Read.capQuality((long)Math.round(phred)); jpayne@68: bases[i]=AminoAcid.numberToBase[x]; jpayne@68: quals[i]=q; jpayne@68: assert(q>0); jpayne@68: assert(x>-1); jpayne@68: assert(bases[i]!='N'); jpayne@68: } jpayne@68: } jpayne@68: Read leftmost=this.get(0);//TODO: Actually rightmost! jpayne@68: Read r=new Read(bases, quals, leftmost.numericID+" size="+size(), 0); jpayne@68: //TODO: Attach the long pair, and make sure the kmer location is correct. jpayne@68: // assert(false) : "\n"+r.toFastq()+"\nCheck kmer location."; jpayne@68: // assert(size()==1) : "\n"+r.toFastq()+"\n"+get(0).toFastq()+"\n"+get(size()-1).toFastq()+"\n"; jpayne@68: return r; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public int removeDuplicates(){ jpayne@68: assert(KmerComparator.compareSequence); jpayne@68: if(size()<2){return 0;} jpayne@68: jpayne@68: int removedTotal=0, removed=0; jpayne@68: jpayne@68: final boolean sortXY=(forceSortXY || sortYEarly() || (opticalOnly && (sortX || sortY) && size()>=sortXYSize)); jpayne@68: jpayne@68: final int maxDiscarded; jpayne@68: final int scan; jpayne@68: jpayne@68: if(opticalOnly && sortXY){ jpayne@68: scan=Tools.max(scanLimit, 200); jpayne@68: maxDiscarded=scan+10; jpayne@68: }else if(!sortXY && ((maxSubstitutions<1 && dupeSubRate<=0) || scanLimit<0)){ jpayne@68: scan=0; jpayne@68: maxDiscarded=0; jpayne@68: }else{ jpayne@68: scan=scanLimit; jpayne@68: maxDiscarded=scan+10; jpayne@68: } jpayne@68: jpayne@68: if(sortXY){ jpayne@68: assert(sortX || sortY); jpayne@68: jpayne@68: if(sortY){ jpayne@68: if(!sortYEarly()){ jpayne@68: this.sort(KmerComparatorY.comparator); jpayne@68: }else{ jpayne@68: // for(int i=1; i0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){ jpayne@68: removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance); jpayne@68: removedTotal+=removed; jpayne@68: // System.err.println("RemovedY: "+removed); jpayne@68: } jpayne@68: } jpayne@68: if(sortX && (ReadKey.spanTilesX || !sortY)){ jpayne@68: this.sort(KmerComparatorX.comparator); jpayne@68: removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance); jpayne@68: removedTotal+=removed; jpayne@68: // System.err.println("RemovedX: "+removed); jpayne@68: while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){ jpayne@68: removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, true, markOnly, markAll, renameByCount, maxOpticalDistance); jpayne@68: removedTotal+=removed; jpayne@68: // System.err.println("RemovedX: "+removed); jpayne@68: } jpayne@68: } jpayne@68: }else{ jpayne@68: removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan, maxDiscarded, opticalOnly, false, markOnly, markAll, renameByCount, maxOpticalDistance); jpayne@68: removedTotal+=removed; jpayne@68: while((maxSubstitutions>0 || dupeSubRate>0) && scanLimit>0 && removed>maxDiscarded){ jpayne@68: removed=removeDuplicates_inner(maxSubstitutions, dupeSubRate, scan+10, maxDiscarded*2+20, opticalOnly, false, markOnly, markAll, renameByCount, maxOpticalDistance); jpayne@68: removedTotal+=removed; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: return removedTotal; jpayne@68: } jpayne@68: jpayne@68: private int removeDuplicates_inner(final int maxSubs, final float subRate, final int scanLimit, final int maxDiscarded, jpayne@68: final boolean optical, final boolean xySorted, final boolean mark, final boolean markAll, final boolean rename, final float dist){ jpayne@68: final int size=size(); jpayne@68: if(size<2){return 0;} jpayne@68: int dupePairs=0; jpayne@68: int dupeReads=0; jpayne@68: jpayne@68: // final boolean breakOnTile=(optical && !FlowcellCoordinate.spanTiles); jpayne@68: jpayne@68: // final long start=System.nanoTime(); jpayne@68: jpayne@68: for(int i=0, lim=size-1; i200000000000L){ jpayne@68: // TextStreamWriter tsw=new TextStreamWriter("foo.fq", true, false, false); jpayne@68: // tsw.start(); jpayne@68: // for(Read x : this){ jpayne@68: // tsw.println(x.toFastq()); jpayne@68: // } jpayne@68: // tsw.poisonAndWait(); jpayne@68: // assert(false) : "kmer="+kmer+", size="+size(); jpayne@68: // } jpayne@68: if(equals(a, b, maxSubs, subRate)){ jpayne@68: if(!optical || keyA.near(keyB, dist)){ jpayne@68: if(printDuplicates){ jpayne@68: System.out.println(a.toFasta()); jpayne@68: System.out.println(b.toFasta()); jpayne@68: } jpayne@68: float errA=a.expectedErrorsIncludingMate(true); jpayne@68: float errB=b.expectedErrorsIncludingMate(true); jpayne@68: if(markAll){ jpayne@68: b.setDiscarded(true); jpayne@68: assert(!a.discarded() || markAll); jpayne@68: dupePairs++; jpayne@68: dupeReads+=1+b.mateCount(); jpayne@68: unequals=0; jpayne@68: if(!a.discarded()){ jpayne@68: a.setDiscarded(true); jpayne@68: dupePairs++; jpayne@68: dupeReads+=1+a.mateCount(); jpayne@68: } jpayne@68: }else if(containment || errB>=errA){ jpayne@68: b.setDiscarded(true); jpayne@68: assert(!a.discarded() || markAll); jpayne@68: a.copies+=b.copies+parseExtraCopies(b); jpayne@68: dupePairs++; jpayne@68: dupeReads+=1+b.mateCount(); jpayne@68: unequals=0; jpayne@68: }else{ jpayne@68: a.setDiscarded(true); jpayne@68: assert(!b.discarded() || markAll); jpayne@68: b.copies+=a.copies+parseExtraCopies(a); jpayne@68: dupePairs++; jpayne@68: dupeReads+=1+a.mateCount(); jpayne@68: } jpayne@68: } jpayne@68: }else{ jpayne@68: unequals++; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: if(dupeReads>0){ jpayne@68: for(int i=0; i1){ jpayne@68: renameFromCount(a); jpayne@68: } jpayne@68: a.copies=1; jpayne@68: } jpayne@68: if(!mark){ jpayne@68: int x=Tools.condenseStrict(this); jpayne@68: assert(x==dupePairs) : size()+", "+size+", "+dupePairs+", "+x; jpayne@68: assert((size()>0 || markAll) && size()==size-dupePairs) : size()+", "+size+", "+dupePairs; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(containment){ jpayne@68: dupeReads+=removeDuplicates_backwards(maxSubs, subRate, scanLimit, maxDiscarded, optical, xySorted, mark, markAll, rename, dist); jpayne@68: } jpayne@68: jpayne@68: return dupeReads; jpayne@68: } jpayne@68: jpayne@68: /** Only for containments */ jpayne@68: private int removeDuplicates_backwards(final int maxSubs, final float subRate, final int scanLimit, final int maxDiscarded, jpayne@68: final boolean optical, final boolean xySorted, final boolean mark, final boolean markAll, final boolean rename, final float dist){ jpayne@68: final int size=size(); jpayne@68: if(size<2){return 0;} jpayne@68: int dupePairs=0; jpayne@68: int dupeReads=0; jpayne@68: jpayne@68: // final boolean breakOnTile=(optical && !FlowcellCoordinate.spanTiles); jpayne@68: jpayne@68: // final long start=System.nanoTime(); jpayne@68: jpayne@68: for(int i=size-1; i>0; i--){ jpayne@68: final Read a=get(i); jpayne@68: if(!a.discarded()){ jpayne@68: final ReadKey keyA=(ReadKey) a.obj; jpayne@68: final int aLen=a.length(); jpayne@68: int unequals=0; jpayne@68: int discarded=0; jpayne@68: for(int j=i-1; j>=0 && unequals<=scanLimit && discarded<=maxDiscarded && (!a.discarded() || markAll); j--){ jpayne@68: final Read b=get(j); jpayne@68: if(b.discarded()){ jpayne@68: discarded++; jpayne@68: }else{ jpayne@68: final int bLen=b.length(); jpayne@68: final ReadKey keyB=(ReadKey) b.obj; jpayne@68: if(!containment && !keyA.equals(keyB)){break;} jpayne@68: // if(containment && affix && !keyA.physicalAffix(keyB, aLen, bLen)){break;} jpayne@68: // if(optical && keyA.lane!=keyB.lane){break;} //Already in equals method jpayne@68: // if(breakOnTile && keyA.tile!=keyB.tile){break;} //Already in equals method jpayne@68: if(optical && xySorted && !keyA.nearXY(keyB, dist)){break;} jpayne@68: // if(System.nanoTime()-start>200000000000L){ jpayne@68: // TextStreamWriter tsw=new TextStreamWriter("foo.fq", true, false, false); jpayne@68: // tsw.start(); jpayne@68: // for(Read x : this){ jpayne@68: // tsw.println(x.toFastq()); jpayne@68: // } jpayne@68: // tsw.poisonAndWait(); jpayne@68: // assert(false) : "kmer="+kmer+", size="+size(); jpayne@68: // } jpayne@68: if(equals(a, b, maxSubs, subRate)){ jpayne@68: if(!optical || keyA.near(keyB, dist)){ jpayne@68: if(printDuplicates){ jpayne@68: System.out.println(a.toFasta()); jpayne@68: System.out.println(b.toFasta()); jpayne@68: } jpayne@68: float errA=a.expectedErrorsIncludingMate(true); jpayne@68: float errB=b.expectedErrorsIncludingMate(true); jpayne@68: if(markAll){ jpayne@68: b.setDiscarded(true); jpayne@68: assert(!a.discarded() || markAll); jpayne@68: dupePairs++; jpayne@68: dupeReads+=1+b.mateCount(); jpayne@68: unequals=0; jpayne@68: if(!a.discarded()){ jpayne@68: a.setDiscarded(true); jpayne@68: dupePairs++; jpayne@68: dupeReads+=1+a.mateCount(); jpayne@68: } jpayne@68: }else if(containment || errB>=errA){ jpayne@68: b.setDiscarded(true); jpayne@68: assert(!a.discarded() || markAll); jpayne@68: a.copies+=b.copies+parseExtraCopies(b); jpayne@68: dupePairs++; jpayne@68: dupeReads+=1+b.mateCount(); jpayne@68: unequals=0; jpayne@68: }else{ jpayne@68: a.setDiscarded(true); jpayne@68: assert(!b.discarded() || markAll); jpayne@68: b.copies+=a.copies+parseExtraCopies(a); jpayne@68: dupePairs++; jpayne@68: dupeReads+=1+a.mateCount(); jpayne@68: } jpayne@68: } jpayne@68: }else{ jpayne@68: unequals++; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: if(dupeReads>0){ jpayne@68: for(int i=0; i1){ jpayne@68: renameFromCount(a); jpayne@68: } jpayne@68: a.copies=1; jpayne@68: } jpayne@68: if(!mark){ jpayne@68: int x=Tools.condenseStrict(this); jpayne@68: assert(x==dupePairs) : size()+", "+size+", "+dupePairs+", "+x; jpayne@68: assert((size()>0 || markAll) && size()==size-dupePairs) : size()+", "+size+", "+dupePairs; jpayne@68: } jpayne@68: } jpayne@68: return dupeReads; jpayne@68: } jpayne@68: jpayne@68: public int parseExtraCopies(final Read a){ jpayne@68: final String id=a.id; jpayne@68: final int space=id.lastIndexOf(' '); jpayne@68: int extraCopies=0; jpayne@68: if(space<0){return 0;} jpayne@68: if(space>=0 && Tools.contains(id, "copies=", space+1)){ jpayne@68: extraCopies=Integer.parseInt(id.substring(space+8))-1; jpayne@68: } jpayne@68: return extraCopies; jpayne@68: } jpayne@68: jpayne@68: private void renameFromCount(final Read a){ jpayne@68: final int newExtraCopies=a.copies-1; jpayne@68: assert(newExtraCopies>0) : newExtraCopies; jpayne@68: final int oldExtraCopies=parseExtraCopies(a); jpayne@68: final int total=1+newExtraCopies+oldExtraCopies; jpayne@68: renameToTotal(a, total); jpayne@68: if(a.pairnum()==0 && a.mate!=null){ jpayne@68: assert(a.mate.pairnum()==1); jpayne@68: renameToTotal(a.mate, total); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: private static void renameToTotal(final Read a, final int total){ jpayne@68: final String id=a.id; jpayne@68: final int space=id.lastIndexOf(' '); jpayne@68: if(space>=0 && Tools.contains(id, "copies=", space+1)){ jpayne@68: a.id=a.id.substring(0, space); jpayne@68: } jpayne@68: a.id=a.id+" copies="+total; jpayne@68: } jpayne@68: jpayne@68: // public boolean nearby(Read a, Read b, FlowcellCoordinate fca, FlowcellCoordinate fcb, float maxDist){ jpayne@68: //// fca.setFrom(a.id); jpayne@68: // fcb.setFrom(b.id); jpayne@68: // float dist=fca.distance(fcb); jpayne@68: // return dist<=maxDist; jpayne@68: // } jpayne@68: jpayne@68: // public boolean nearby(ReadKey ka, ReadKey kb, float maxDist){ jpayne@68: // float dist=ka.distance(kb); jpayne@68: // return dist<=maxDist; jpayne@68: // } jpayne@68: jpayne@68: public static boolean equals(Read a, Read b, int maxSubs, float dupeSubRate){ jpayne@68: if(a.numericID==b.numericID){return false;} jpayne@68: if(dupeSubRate>0){maxSubs=Tools.max(maxSubs, (int)(dupeSubRate*Tools.min(a.length(), b.length())));} jpayne@68: if(containment){ jpayne@68: return contains(a, b, maxSubs); jpayne@68: } jpayne@68: if(!equals(a.bases, b.bases, maxSubs)){return false;} jpayne@68: if(a.mate!=null && !equals(a.mate.bases, b.mate.bases, maxSubs)){return false;} jpayne@68: return true; jpayne@68: } jpayne@68: jpayne@68: public static boolean equals(byte[] a, byte[] b, int maxSubs){ jpayne@68: if(a==b){return false;}//Nothing should subsume itself jpayne@68: if(a==null || b==null){return false;} jpayne@68: if(a.length!=b.length){return false;} jpayne@68: int subs=0; jpayne@68: if(allowNs){ jpayne@68: for(int i=0; imaxSubs){return false;} jpayne@68: } jpayne@68: } jpayne@68: }else{ jpayne@68: for(int i=0; imaxSubs){return false;} jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: return true; jpayne@68: } jpayne@68: jpayne@68: public static boolean contains(Read a, Read b, int maxSubs){ jpayne@68: if(a.numericID==b.numericID){return false;} jpayne@68: boolean ok=contains_inner(a, b, maxSubs); jpayne@68: if(!ok || a.mate==null){return ok;} jpayne@68: ok=contains_inner(a.mate, b.mate, maxSubs); jpayne@68: if(!ok){return false;} jpayne@68: ReadKey rka1=(ReadKey)a.obj; jpayne@68: ReadKey rkb1=(ReadKey)b.obj; jpayne@68: ReadKey rka2=(ReadKey)a.mate.obj; //TODO: In containment mode, mates need to always get keys. jpayne@68: ReadKey rkb2=(ReadKey)b.mate.obj; jpayne@68: return ((rka1.kmerMinusStrand==rkb1.kmerMinusStrand) && (rka2.kmerMinusStrand==rkb2.kmerMinusStrand)); //Ensures that both reads have the same directionality. jpayne@68: } jpayne@68: jpayne@68: public static boolean contains_inner(Read a, Read b, int maxSubs){ jpayne@68: // if(a.length()==b.length()){return equals(a.bases, b.bases, maxSubs);} jpayne@68: ReadKey rka=(ReadKey)a.obj; jpayne@68: ReadKey rkb=(ReadKey)b.obj; jpayne@68: if(affix ? !rka.physicalAffix(rkb, a.length(), b.length()) : !rka.physicallyContains(rkb, a.length(), b.length())){return false;} jpayne@68: boolean flipped=false; jpayne@68: // if(a.mate!=null){//In paired mode, need synchronization if the reads are in difference clumps. But... just don't do that. jpayne@68: // Read max, min; jpayne@68: // if(a.numericID>b.numericID){max=a; min=b;}//Protects against deadlocks. jpayne@68: // else{max=b; min=a;} jpayne@68: // synchronized(min){ jpayne@68: // synchronized(max){ jpayne@68: // if(rka.kmerMinusStrand!=rkb.kmerMinusStrand){ jpayne@68: // rkb.flip(b, k); jpayne@68: // flipped=true; jpayne@68: // } jpayne@68: // boolean ok=contains(a.bases, b.bases, rka.position, rkb.position, maxSubs); jpayne@68: // if(flipped){rkb.flip(b, k);} jpayne@68: // return ok; jpayne@68: // } jpayne@68: // } jpayne@68: // } jpayne@68: if(rka.kmerMinusStrand!=rkb.kmerMinusStrand){ jpayne@68: rkb.flip(b, k); jpayne@68: flipped=true; jpayne@68: } jpayne@68: boolean ok=contains(a.bases, b.bases, rka.position, rkb.position, maxSubs); jpayne@68: if(flipped){rkb.flip(b, k);} jpayne@68: return ok; jpayne@68: } jpayne@68: jpayne@68: public static boolean contains(byte[] a, byte[] b, int posA, int posB, int maxSubs){ jpayne@68: if(a==null || b==null){return false;} jpayne@68: assert(a.length>=b.length); jpayne@68: if(a==b){return false;} //Nothing should contain itself jpayne@68: int subs=0; jpayne@68: jpayne@68: int ai, bi; jpayne@68: final int dif=posA-posB; jpayne@68: if(dif>0){ jpayne@68: ai=0; jpayne@68: bi=-dif; jpayne@68: }else{ jpayne@68: ai=dif; jpayne@68: bi=0; jpayne@68: } jpayne@68: jpayne@68: if(allowNs){ jpayne@68: for(; ai=0 && bi>=0){ jpayne@68: final byte na=a[ai], nb=b[bi]; jpayne@68: if(na!=nb && na!='N' && nb!='N'){ jpayne@68: subs++; jpayne@68: if(subs>maxSubs){return false;} jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: }else{ jpayne@68: for(; ai=0 && bi>=0 && a[ai]!=b[bi]){ jpayne@68: subs++; jpayne@68: if(subs>maxSubs){return false;} jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: return true; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public long splitAndErrorCorrect(){ jpayne@68: if(size() list=Splitter.splitOnPivot(this); jpayne@68: for(Clump c : list){ jpayne@68: sum+=c.errorCorrect(); jpayne@68: } jpayne@68: return sum; jpayne@68: } jpayne@68: jpayne@68: public long errorCorrect(){ jpayne@68: if(size()<=minCountCorrect){return 0;} jpayne@68: // assert(Splitter.findBestPivot(this)<0); //TODO: Slow jpayne@68: Read consen=makeSimpleConsensus(); jpayne@68: long sum=0; jpayne@68: final int[] rvector=KillSwitch.allocInt1D(2); jpayne@68: for(Read r : this){ jpayne@68: sum+=errorCorrect(r, consen, rvector); jpayne@68: } jpayne@68: return sum; jpayne@68: } jpayne@68: jpayne@68: private int errorCorrect(Read call, Read ref, int[] rvector){ jpayne@68: jpayne@68: // assert(call.validated()); jpayne@68: // assert(call.checkQuality()); jpayne@68: // assert(ref.checkQuality()); jpayne@68: jpayne@68: final float identity=identity(call, ref.bases, rvector); jpayne@68: if((identity0 || rvector[0]<50)) || (identity==1 && !call.containsNocalls()/* && !ref.containsNocalls()*/)){ jpayne@68: //TODO: Strange, the identity==1 clause causes different behavior, though it does give a speedup. jpayne@68: return 0; jpayne@68: } jpayne@68: final byte[] cbases=call.bases, cquals=call.quality; jpayne@68: final byte[] rbases=ref.bases, rquals=ref.quality; jpayne@68: jpayne@68: ReadKey key=(ReadKey) call.obj; jpayne@68: final int pos=key.position; jpayne@68: jpayne@68: final int[][] bcounts=baseCounts(); jpayne@68: final int[][] qcounts=qualityCounts(); jpayne@68: final float[][] qAvgs=qualityAverages(); jpayne@68: jpayne@68: final int minDepth=(int)Tools.max(minCountCorrect, minSizeFractionCorrect*size()); jpayne@68: jpayne@68: int corrections=0; jpayne@68: jpayne@68: final int cStart=0, rStart=maxLeft-pos, max=cbases.length; jpayne@68: for(int cloc=cStart, rloc=rStart; cloc=cq && (q<=rq || q<=cq)); jpayne@68: }else{ jpayne@68: final int cCount=bcounts[cx][rloc]; jpayne@68: final int rCount=bcounts[rx][rloc]; jpayne@68: final int altQSum=qcounts[cx][rloc]; jpayne@68: final int rQSum=qcounts[rx][rloc]; jpayne@68: final float rQAvg=qAvgs[rx][rloc]; jpayne@68: if(cCount<=maxCIncorrect && cq<=maxQIncorrect && cq*minQRatio=0 && rx=0 && rloc=minCountCorrect && sum>=minDepth){ jpayne@68: final float ratioNeeded=Tools.min(minRatio, minRatioMult*minorCount+minRatioOffset+minRatioQMult*cq); jpayne@68: // final float qratioNeeded=Tools.min(minRatio, minRatioMult*altQSum+minRatioOffset+minRatioQMult*cq); //altQSum is slightly different than minorQCount jpayne@68: if(minorCount*ratioNeeded<=rCount && altQSum*ratioNeeded<=rQSum){ jpayne@68: change=true; jpayne@68: } jpayne@68: jpayne@68: // else if(minorCount*ratioNeeded<=rCount){ jpayne@68: // assert(false) : "\n"+altQSum+", "+rQSum+", "+qratioNeeded+"\n"+cCount+", "+rCount+", "+sum+", "+ratioNeeded+"\n"+(altQSum*qratioNeeded); jpayne@68: // } jpayne@68: } jpayne@68: if(change){ jpayne@68: corrections++; jpayne@68: b=rb; jpayne@68: q=(byte)Tools.min(cq+1, 25, rq); jpayne@68: // assert(q==25 || (q<=rq && q>=cq)) : q+", "+cq+", "+rq; jpayne@68: }else{ jpayne@68: b=cb; jpayne@68: q=(byte)Tools.mid(cq, cq-1, 6);//Adjust downward jpayne@68: assert(q<=cq || q>=6) : q+","+cq; jpayne@68: } jpayne@68: }else{ jpayne@68: b=cb; jpayne@68: q=cq; jpayne@68: } jpayne@68: } jpayne@68: cbases[cloc]=b; jpayne@68: if(cquals!=null){ jpayne@68: byte q2=(byte)Tools.mid(q, cq+maxQAdjust, cq-maxQAdjust); jpayne@68: if(q2==0 && AminoAcid.isFullyDefined(b)){ jpayne@68: assert(!AminoAcid.isFullyDefined(cb)); jpayne@68: q2=(byte)Tools.mid(2, 25, (rq+25)/2); jpayne@68: }else if(!AminoAcid.isFullyDefined(b)){ jpayne@68: q2=0; jpayne@68: } jpayne@68: cquals[cloc]=q2; jpayne@68: assert((b=='N') == (cquals[cloc]==0)) : "b="+(char)b+", cb="+(char)cb+", rb="+(char)rb+", cx="+cx+", " jpayne@68: + "new="+cquals[cloc]+", q="+q+", old="+cq+", rq="+rq+", loc="+rloc+"\n"+call.toFastq()+"\n"+ref.toFastq(); jpayne@68: } jpayne@68: } jpayne@68: return corrections; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: //Only used by condense mode. jpayne@68: public ArrayList makeConsensus(){ jpayne@68: if(size()==1){ jpayne@68: Read r=get(0); jpayne@68: r.id=r.numericID+" size=1"; jpayne@68: return this; jpayne@68: } jpayne@68: jpayne@68: ArrayList clumps=Splitter.splitOnPivot(this);//TODO: Really, this should return null if there is no pivot jpayne@68: jpayne@68: ArrayList list=new ArrayList(clumps.size()); jpayne@68: for(Clump c : clumps){ jpayne@68: Read temp=c.makeSimpleConsensus(); jpayne@68: list.add(temp); jpayne@68: } jpayne@68: return list; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: private float identity(Read call, byte[] rbases, int[] rvector){ jpayne@68: ReadKey key=(ReadKey) call.obj; jpayne@68: final int pos=key.position; jpayne@68: byte[] cbases=call.bases, quals=call.quality; jpayne@68: int good=0, bad=0; jpayne@68: jpayne@68: final int cStart=0, rStart=maxLeft-pos, max=cbases.length; jpayne@68: for(int cloc=cStart, rloc=rStart; cloccounts[xMax][pos]){xMax=x;} jpayne@68: } jpayne@68: // assert(counts[max][pos]>=counts[0][pos]); jpayne@68: // assert(counts[max][pos]>=counts[1][pos]); jpayne@68: // assert(counts[max][pos]>=counts[2][pos]) : max+", "+counts[max][pos]+", ["+counts[0][pos]+", "+counts[1][pos]+", "+counts[2][pos]+", "+counts[3][pos]+"]"; jpayne@68: // assert(counts[max][pos]>=counts[3][pos]); jpayne@68: return (counts[xMax][pos]>0 ? xMax : -1); jpayne@68: } jpayne@68: jpayne@68: byte getSecondHighest(int[][] counts, int pos){ jpayne@68: byte first=0; jpayne@68: byte second=1; jpayne@68: if(counts[first][pos]counts[first][pos]){ jpayne@68: second=first; jpayne@68: first=x; jpayne@68: }else if(counts[x][pos]>counts[second][pos]){ jpayne@68: second=x; jpayne@68: } jpayne@68: } jpayne@68: // assert(counts[max][pos]>=counts[0][pos]); jpayne@68: // assert(counts[max][pos]>=counts[1][pos]); jpayne@68: // assert(counts[max][pos]>=counts[2][pos]) : max+", "+counts[max][pos]+", ["+counts[0][pos]+", "+counts[1][pos]+", "+counts[2][pos]+", "+counts[3][pos]+"]"; jpayne@68: // assert(counts[max][pos]>=counts[3][pos]); jpayne@68: jpayne@68: return second; //may be actually 0. jpayne@68: //return (counts[second][pos]>0 ? second : -1); jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public String toStringStaggered(){ jpayne@68: ByteBuilder sb=new ByteBuilder(); jpayne@68: for(Read r : this){ jpayne@68: ReadKey key=(ReadKey) r.obj; jpayne@68: final int pos=key.position; jpayne@68: byte[] bases=r.bases, quals=r.quality; jpayne@68: int rloc=0, cloc=rloc+pos-maxLeft; jpayne@68: while(cloc<0){sb.append(' '); cloc++;} jpayne@68: sb.append(bases); jpayne@68: sb.append('\n'); jpayne@68: } jpayne@68: return sb.toString(); jpayne@68: } jpayne@68: jpayne@68: public Read consensusRead(){ jpayne@68: if(consensusRead==null){ jpayne@68: consensusRead=makeSimpleConsensus(); jpayne@68: } jpayne@68: return consensusRead; jpayne@68: } jpayne@68: jpayne@68: public int width(){ jpayne@68: assert(width>=0) : width; jpayne@68: return width; jpayne@68: } jpayne@68: jpayne@68: public int maxLeft(){ jpayne@68: assert(maxLeft>=0); jpayne@68: return maxLeft; jpayne@68: } jpayne@68: jpayne@68: public int maxRight(){ jpayne@68: assert(maxRight>=0); jpayne@68: return maxRight; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: int[][] baseCounts(){ jpayne@68: if(baseCounts==null){ jpayne@68: baseCounts=count(false); jpayne@68: int len=baseCounts[0].length; jpayne@68: assert(width==-1 || width==len); jpayne@68: } jpayne@68: return baseCounts; jpayne@68: } jpayne@68: jpayne@68: int[][] qualityCounts(){ jpayne@68: if(qualityCounts==null){ jpayne@68: qualityCounts=count(true); jpayne@68: int len=qualityCounts[0].length; jpayne@68: assert(width==-1 || width==len); jpayne@68: } jpayne@68: return qualityCounts; jpayne@68: } jpayne@68: jpayne@68: float[][] qualityAverages(){ jpayne@68: if(qualityAverages==null){ jpayne@68: qualityAverages=new float[4][width]; jpayne@68: for(int i=0; i<4; i++){ jpayne@68: for(int j=0; j=0; jpayne@68: }else if(a.equals("scanlimit") || a.equals("scan")){ jpayne@68: scanLimit=Integer.parseInt(b); jpayne@68: }else if(a.equals("allowns")){ jpayne@68: allowNs=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("containment") || a.equals("absorbcontainment") || a.equals("ac") || a.equals("contains")){ jpayne@68: containment=Parse.parseBoolean(b); jpayne@68: }else if(a.equalsIgnoreCase("prefixOrSuffix") || a.equalsIgnoreCase("suffixOrPrefix") || a.equals("affix") || a.equals("pos")){ jpayne@68: affix=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("printduplicates")){ jpayne@68: printDuplicates=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("dupeidentity")){ jpayne@68: float dupeIdentity=Float.parseFloat(b); jpayne@68: if(dupeIdentity>1){dupeIdentity=dupeIdentity/100;} jpayne@68: assert(dupeIdentity<=1); jpayne@68: dupeSubRate=1-dupeIdentity; jpayne@68: }else if(a.equals("dupesubrate") || a.equals("dsr") || a.equals("subrate")){ jpayne@68: dupeSubRate=Float.parseFloat(b); jpayne@68: if(dupeSubRate>1){dupeSubRate=dupeSubRate/100;} jpayne@68: assert(dupeSubRate<=1); jpayne@68: } jpayne@68: jpayne@68: else if(a.equals("minsizesplit")){ jpayne@68: Splitter.minSizeSplit=Integer.parseInt(b); jpayne@68: }else if(a.equals("minsizefractionsplit")){ jpayne@68: Splitter.minSizeFractionSplit=Float.parseFloat(b); jpayne@68: }else{ jpayne@68: return false; jpayne@68: } jpayne@68: jpayne@68: return true; jpayne@68: } jpayne@68: jpayne@68: static void setConservative(boolean newState){ jpayne@68: jpayne@68: if(aggressiveFlag){return;} jpayne@68: if(newState==conservativeMode){return;} jpayne@68: conservativeMode=newState; jpayne@68: jpayne@68: Splitter.conservative=conservativeMode; jpayne@68: jpayne@68: if(conservativeMode){ jpayne@68: minCountCorrect++; jpayne@68: minSizeFractionCorrect*=1.5f; jpayne@68: minRatio*=1.25f; jpayne@68: minQRatio*=1.5f; jpayne@68: minAQRatio*=1.4f; jpayne@68: minRatioOffset*=1.25f; jpayne@68: minRatioQMult*=1.50f; jpayne@68: minRatioMult*=1.4f; jpayne@68: minIdentity=1-((1-minIdentity)*0.7f); jpayne@68: if(maxQIncorrect==Integer.MAX_VALUE){maxQIncorrect=35;} jpayne@68: }else{ jpayne@68: minCountCorrect--; jpayne@68: minSizeFractionCorrect/=1.5f; jpayne@68: minRatio/=1.25f; jpayne@68: minQRatio/=1.5f; jpayne@68: minAQRatio/=1.4f; jpayne@68: minRatioOffset/=1.25f; jpayne@68: minRatioQMult/=1.50f; jpayne@68: minRatioMult/=1.4f; jpayne@68: minIdentity=1-((1-minIdentity)/0.7f); jpayne@68: if(maxQIncorrect==35){maxQIncorrect=Integer.MAX_VALUE;} jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public static void setXY() { jpayne@68: if(ReadKey.spanTilesX){sortX=true;} jpayne@68: if(ReadKey.spanTilesY){sortY=true;} jpayne@68: } jpayne@68: jpayne@68: static boolean allowNs=true; jpayne@68: static boolean markAll=false; jpayne@68: static boolean markOnly=false; jpayne@68: static boolean opticalOnly=false; jpayne@68: static boolean containment=false; jpayne@68: static boolean affix=false; jpayne@68: static boolean printDuplicates=false; //For debugging jpayne@68: jpayne@68: private static boolean renameByCount=false; jpayne@68: private static int scanLimit=5; jpayne@68: private static int maxSubstitutions=2; jpayne@68: private static float dupeSubRate=0; jpayne@68: private static float maxOpticalDistance=40; jpayne@68: jpayne@68: static boolean forceProcess=false; jpayne@68: static boolean conservativeFlag=false; jpayne@68: static boolean aggressiveFlag=false; jpayne@68: static boolean conservativeMode=false; jpayne@68: static boolean renameConsensus=false; jpayne@68: static int minCountCorrect=4; //mcc=4 was slightly better than 3 jpayne@68: static float minSizeFractionCorrect=0.20f; //0.11 is very slightly better than 0.14 jpayne@68: static float minRatio=30.0f; jpayne@68: static float minQRatio=2.8f; //Does nothing? jpayne@68: static float minAQRatio=0.7f; jpayne@68: static float minRatioOffset=1.9f; //3 is far worse than 2; 1 is better jpayne@68: static float minRatioQMult=0.08f; jpayne@68: static float minRatioMult=1.80f; //2.5 is far worse than 2; 1.5 is better jpayne@68: static float minIdentity=0.97f; //0.98 is slightly better than 0.96; 0.94 is substantially worse jpayne@68: static byte maxQAdjust=0; jpayne@68: static int maxQIncorrect=Integer.MAX_VALUE; jpayne@68: static int maxCIncorrect=Integer.MAX_VALUE; jpayne@68: jpayne@68: static boolean sortX=false; //Not needed for NextSeq jpayne@68: static boolean sortY=true; jpayne@68: static boolean forceSortXY=false; //Mainly for testing jpayne@68: static int sortXYSize=6; jpayne@68: jpayne@68: /** May slightly increase speed for optical dedupe. Can be safely disabled. */ jpayne@68: static boolean sortYEarly(){return sortY && (forceSortXY || opticalOnly);} jpayne@68: jpayne@68: // private static final boolean countQuality=false; jpayne@68: public static int k=31; jpayne@68: private static final long serialVersionUID = 1L; jpayne@68: jpayne@68: }