jpayne@68: package clump; jpayne@68: jpayne@68: import java.util.ArrayList; jpayne@68: import java.util.LinkedHashMap; jpayne@68: import java.util.Map.Entry; jpayne@68: jpayne@68: import dna.AminoAcid; jpayne@68: import shared.KillSwitch; jpayne@68: import shared.Tools; jpayne@68: import stream.Read; jpayne@68: import structures.IntList; jpayne@68: import structures.LongList; jpayne@68: jpayne@68: /** jpayne@68: * A tool for splitting clumps by allele. jpayne@68: * @author Brian Bushnell jpayne@68: * @date September 26, 2016 jpayne@68: * jpayne@68: */ jpayne@68: class Splitter { jpayne@68: jpayne@68: static ArrayList splitOnPivot(Clump c){ jpayne@68: ArrayList list=new ArrayList(3); jpayne@68: list.add(c); jpayne@68: if(c.size() splitOnPivot(ArrayList list){ jpayne@68: ArrayList out=new ArrayList(); jpayne@68: jpayne@68: final IntList pivots=new IntList(2); jpayne@68: for(int i=0; i1 ? pivots.get(1) : -1), list); jpayne@68: } jpayne@68: } jpayne@68: return out; jpayne@68: } jpayne@68: jpayne@68: static void splitAndAdd(Clump c, final int var1, final int var2, ArrayList out) { jpayne@68: final int maxLeft=c.maxLeft(); jpayne@68: jpayne@68: final Clump major=Clump.makeClump(c.kmer), minor=Clump.makeClump(c.kmer); jpayne@68: jpayne@68: for(Read r : c){ jpayne@68: if(containsVar(var1, r, maxLeft) || (var2>=0 && containsVar(var2, r, maxLeft))){ jpayne@68: minor.add(r); jpayne@68: }else{ jpayne@68: major.add(r); jpayne@68: } jpayne@68: } jpayne@68: // System.err.println(major.size()+"\t"+minor.size()); jpayne@68: out.add(major); jpayne@68: out.add(minor); jpayne@68: } jpayne@68: jpayne@68: //Returns the c jpayne@68: static int countVariants(Clump c, LongList varCounts){ jpayne@68: varCounts.clear(); jpayne@68: final int[][] bcounts=c.baseCounts(); jpayne@68: final int[][] qcounts=c.qualityCounts(); jpayne@68: final int len=bcounts[0].length; jpayne@68: for(int i=0; i1 && x!=major){ jpayne@68: final long var=(((long)bcount)<<32)|((i<>>32); jpayne@68: } jpayne@68: jpayne@68: static LinkedHashMap> findReadVariants(Clump c, boolean makeMap){ jpayne@68: if(c.size()<5){return null;} //Not really needed with tiny clumps jpayne@68: // assert(c.size()>3); //TODO jpayne@68: LinkedHashMap> map=null; jpayne@68: map=(makeMap ? new LinkedHashMap>() : null); jpayne@68: // if(makeMap){ jpayne@68: // map=localMap.get(); jpayne@68: // if(map==null){ jpayne@68: // map=new LinkedHashMap>(); jpayne@68: // localMap.set(map); jpayne@68: // } jpayne@68: // map.clear(); jpayne@68: // } jpayne@68: jpayne@68: final int[][] bcounts=c.baseCounts(); jpayne@68: final Read ref=c.consensusRead(); jpayne@68: final byte[] rbases=ref.bases; jpayne@68: for(Read r : c){ jpayne@68: final byte[] bases=r.bases; jpayne@68: final ReadKey key=(ReadKey) r.obj; jpayne@68: IntList list=key.vars; jpayne@68: if(list!=null){list.clear();} jpayne@68: jpayne@68: jpayne@68: final int cStart=0, rStart=c.maxLeft()-key.position; jpayne@68: jpayne@68: for(int i=cStart, j=rStart; i=0){ jpayne@68: int count=bcounts[x][j]; jpayne@68: if(count>1){ jpayne@68: int var=(j<<2)|x; jpayne@68: if(list==null){list=key.vars=new IntList(4);} jpayne@68: list.add(var); jpayne@68: jpayne@68: if(map!=null){ jpayne@68: Integer mapkey=var;//mapKeys[var]; jpayne@68: ArrayList alr=map.get(mapkey); jpayne@68: if(alr==null){ jpayne@68: alr=new ArrayList(4); jpayne@68: map.put(mapkey, alr); jpayne@68: } jpayne@68: alr.add(r); jpayne@68: } jpayne@68: jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: return map==null || map.isEmpty() ? null : map; jpayne@68: } jpayne@68: jpayne@68: static int findBestPivot_Correlated(Clump c, IntList pivots){ jpayne@68: assert(pivots.size==0); jpayne@68: LinkedHashMap> map=findReadVariants(c, true); jpayne@68: if(map==null){return -1;} jpayne@68: jpayne@68: IntList collection=new IntList(32); jpayne@68: int[] rvector=KillSwitch.allocInt1D(5); jpayne@68: jpayne@68: int bestVar=-1; jpayne@68: int bestVarCount=-1; jpayne@68: int bestVar2=-1; jpayne@68: int bestVar2Count=-1; jpayne@68: int bestDifferent=-1; jpayne@68: float bestCorrelation=-1; jpayne@68: float bestScore=-1; jpayne@68: jpayne@68: final float minCorrelation=0.75f; jpayne@68: final int minVar2Count=2; jpayne@68: jpayne@68: int max=0; jpayne@68: for(Entry> entry : map.entrySet()){ jpayne@68: max=Tools.max(max, entry.getValue().size()); jpayne@68: } jpayne@68: if(max<2){return -1;} jpayne@68: final int thresh=Tools.max(2, max/2); jpayne@68: final int thresh2=max/4; jpayne@68: int numOverThresh=0; jpayne@68: ArrayList remove=new ArrayList(); jpayne@68: for(Entry> entry : map.entrySet()){ jpayne@68: int x=entry.getValue().size(); jpayne@68: if(x>=thresh){numOverThresh++;} jpayne@68: else if(xMAX_CORRELATIONS){return -1;} jpayne@68: jpayne@68: for(Entry> entry : map.entrySet()){ jpayne@68: final ArrayList rlist=entry.getValue(); jpayne@68: final Integer key=entry.getKey(); jpayne@68: final int var=key; jpayne@68: if(rlist.size()>=thresh){ jpayne@68: jpayne@68: final int var2=examineVar(var, rlist, collection, rvector, map); jpayne@68: jpayne@68: if(var2>=0){ jpayne@68: jpayne@68: final int varCount=rvector[1]; jpayne@68: final int var2Count=rvector[3]; jpayne@68: final int different=rvector[4]; jpayne@68: jpayne@68: final int var2reads; jpayne@68: final int var2ReadsWithoutVar; jpayne@68: { jpayne@68: ArrayList temp=map.get(var2); jpayne@68: var2reads=(temp==null ? 0 : temp.size()); jpayne@68: jpayne@68: // if(var2reads==var2Count){ jpayne@68: // var2ReadsWithoutVar=0; jpayne@68: // }else{ jpayne@68: // var2ReadsWithoutVar=countDifferentAlleles(var, temp); jpayne@68: // } jpayne@68: jpayne@68: var2ReadsWithoutVar=var2reads-varCount; jpayne@68: jpayne@68: } jpayne@68: jpayne@68: // final float correlation=(var2Count-0.05f)/(float)varCount; jpayne@68: // final float correlation=(varCount-different)/(float)varCount; jpayne@68: final float correlation=(Tools.max(varCount, var2reads)-different)/(float)Tools.max(varCount, var2reads); jpayne@68: final int distance=Tools.absdif(var>>2, var2>>2); jpayne@68: jpayne@68: // final float score=correlation*((var2Count-1)-0.5f*var2ReadsWithoutVar-different+1.0f*(varCount)); jpayne@68: jpayne@68: final float score=correlation*(var2reads/*var2Count*/+varCount-different+0.5f*var2ReadsWithoutVar)*(distance+250); jpayne@68: jpayne@68: // final float score=correlation*((var2Count-1)-1.0f*var2ReadsWithoutVar+1.0f*(varCount)); jpayne@68: if(correlation>=minCorrelation && var2Count>=minVar2Count){ jpayne@68: if(score>bestScore || (score==bestScore && varCount>bestVarCount)){ jpayne@68: bestVar=var; jpayne@68: bestVarCount=varCount; jpayne@68: bestVar2=var2; jpayne@68: bestVar2Count=var2Count; jpayne@68: bestCorrelation=correlation; jpayne@68: bestScore=score; jpayne@68: bestDifferent=different; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(bestVar2Count>=minVar2Count && bestCorrelation>=minCorrelation){ jpayne@68: pivots.add(bestVar); jpayne@68: pivots.add(bestVar2); jpayne@68: return bestVar; jpayne@68: } jpayne@68: return -1; jpayne@68: } jpayne@68: jpayne@68: static boolean containsVar(final int var, final Read r, final int maxLeft){ jpayne@68: final int varPos=var>>2; jpayne@68: final int varAllele=var&alleleMask; jpayne@68: final ReadKey rk=(ReadKey) r.obj; jpayne@68: final int rloc=toReadLocation(varPos, maxLeft, rk.position); jpayne@68: if(rloc<0 || rloc>=r.length()){ jpayne@68: return false; jpayne@68: } jpayne@68: final int readAllele=AminoAcid.baseToNumber[r.bases[rloc]]; jpayne@68: return readAllele==varAllele; jpayne@68: } jpayne@68: jpayne@68: static boolean hasDifferentAllele(final int var, final Read r/*, final Clump c*/){ jpayne@68: final int varPos=var>>2; jpayne@68: final int varAllele=var&alleleMask; jpayne@68: final ReadKey rk=(ReadKey) r.obj; jpayne@68: final IntList vars=rk.vars; jpayne@68: final Clump c=rk.clump; jpayne@68: assert(c==rk.clump); jpayne@68: final int maxLeft=c.maxLeft(); jpayne@68: final int rloc=toReadLocation(varPos, maxLeft, rk.position); jpayne@68: if(rloc<0 || rloc>=r.length()){ jpayne@68: assert(!vars.contains(var)); jpayne@68: return false; jpayne@68: } jpayne@68: final int readAllele=AminoAcid.baseToNumber[r.bases[rloc]]; jpayne@68: assert((readAllele==varAllele)==vars.contains(var)); jpayne@68: jpayne@68: return readAllele!=varAllele; jpayne@68: } jpayne@68: jpayne@68: static int countDifferentAlleles(final int var, ArrayList list){ jpayne@68: if(list==null){return 0;} jpayne@68: int sum=0; jpayne@68: for(Read r : list){ jpayne@68: if(hasDifferentAllele(var, r)){sum++;} jpayne@68: } jpayne@68: return sum; jpayne@68: } jpayne@68: jpayne@68: static int examineVar(final int var, final ArrayList list, final IntList collection, final int[] rvector, LinkedHashMap> map){ jpayne@68: collection.clear(); jpayne@68: jpayne@68: for(Read r : list){ jpayne@68: final ReadKey rk=(ReadKey) r.obj; jpayne@68: final IntList vars=rk.vars; jpayne@68: jpayne@68: for(int i=0; ibestSharedCount){ jpayne@68: final int different1=(sharedCount==varCount ? 0 : countDifferentAlleles(lastVar2, list)); jpayne@68: if(different1*8 list2=map.get(lastVar2); jpayne@68: final int varCount2=(list2==null ? 0 : list2.size()); jpayne@68: final int different2=(sharedCount==varCount2 ? 0 : countDifferentAlleles(var, list2)); jpayne@68: if(different2*8bestSharedCount){ jpayne@68: final int different1=(sharedCount==varCount ? 0 : countDifferentAlleles(lastVar2, list)); jpayne@68: if(different1*8 list2=map.get(lastVar2); jpayne@68: final int varCount2=(list2==null ? 0 : list2.size()); jpayne@68: final int different2=(sharedCount==varCount2 ? 0 : countDifferentAlleles(var, list2)); jpayne@68: if(different2*8-1){return x;} jpayne@68: } jpayne@68: jpayne@68: final int[][] bcounts=c.baseCounts(); jpayne@68: final int[][] qcounts=c.qualityCounts(); jpayne@68: final float[][] qAvgs=c.qualityAverages(); jpayne@68: final int width=c.width(); jpayne@68: jpayne@68: int bestPosition=-1; jpayne@68: int bestVar=-1; jpayne@68: long bestBsecond=0; jpayne@68: long bestQsecond=0; jpayne@68: jpayne@68: // final float bmult=8f, bmult2=15f, qmult=(c.useQuality() ? 1.5f : 100f); jpayne@68: final boolean useQuality=c.useQuality(); jpayne@68: final float bmult=20f, bmult2=20f; jpayne@68: final float qmult=20f, qamult=1.5f; jpayne@68: jpayne@68: final int minPivotDepth=Tools.max(4, (int)(minSizeFractionSplit*size)); jpayne@68: // final int minMinorAllele=Tools.max((conservative ? 1 : 2), (int)(0.25f+size/bmult2)); jpayne@68: final int minMinorAllele=Tools.max(2, (int)(size/bmult2)); jpayne@68: final int minMinorAlleleQ=minMinorAllele*10; jpayne@68: jpayne@68: // assert(false) : size+", "+minSizeSplit+", "+minPivotDepth+", "+minMinorAllele; jpayne@68: jpayne@68: for(int i=0; i=minPivotDepth){ jpayne@68: final int pmajor=c.getConsensusAtPosition(bcounts, i); jpayne@68: final int pminor=c.getSecondHighest(bcounts, i); jpayne@68: if(pmajor>=0){ jpayne@68: final long bmajor=bcounts[pmajor][i]; jpayne@68: final long bsecond=bcounts[pminor][i]; jpayne@68: jpayne@68: final long qmajor=qcounts[pmajor][i]; jpayne@68: final long qsecond=qcounts[pminor][i]; jpayne@68: jpayne@68: final float qamajor=qAvgs[pmajor][i]; jpayne@68: final float qasecond=qAvgs[pminor][i]; jpayne@68: jpayne@68: if(bsecond*bmult>=bmajor && bsecond>=minMinorAllele){ jpayne@68: if(!useQuality || (qsecond*qmult>=qmajor && qasecond*qamult>=qamajor && qsecond>=minMinorAlleleQ)){//candidate jpayne@68: if(bsecond>bestBsecond || (bsecond==bestBsecond && qsecond>bestQsecond)){//Perhaps Qsecond should be more important...? jpayne@68: jpayne@68: // assert(false) : size+", "+minSizeSplit+", "+minPivotDepth+", "+minMinorAllele+", "+bmajor+", "+bsecond+", "+qmajor+", "+qsecond+", "+minMinorAlleleQ; jpayne@68: jpayne@68: bestBsecond=bsecond; jpayne@68: bestQsecond=qsecond; jpayne@68: bestPosition=i; jpayne@68: bestVar=(bestPosition<<2)|pminor; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: // if(bestVar<0 && findCorrelations){ jpayne@68: // bestVar=findBestPivot_Correlated(c); jpayne@68: // } jpayne@68: jpayne@68: if(bestVar>=0){pivots.add(bestVar);} jpayne@68: return bestVar; jpayne@68: } jpayne@68: jpayne@68: static int minSizeSplit=4; //5 is actually better than 4 in allele separation tests...? jpayne@68: static float minSizeFractionSplit=0.17f; //0.2 is substantially worse, 0.14 is a tiny bit better than 0.17 jpayne@68: static boolean conservative=false; jpayne@68: jpayne@68: private static final int alleleMask=0x3; jpayne@68: private static final int posMask=~alleleMask; jpayne@68: private static final int shift=2; jpayne@68: jpayne@68: static boolean FIND_CORRELATIONS=true; jpayne@68: static int MAX_CORRELATIONS=12; jpayne@68: jpayne@68: // private static final ThreadLocal>> localMap=new ThreadLocal>>(); jpayne@68: // private static final Integer[] mapKeys=new Integer[2000]; jpayne@68: // static{ jpayne@68: // for(int i=0; i