jpayne@68: package sketch; jpayne@68: jpayne@68: import java.util.Arrays; jpayne@68: import java.util.Locale; jpayne@68: jpayne@68: import dna.AminoAcid; jpayne@68: import shared.KillSwitch; jpayne@68: import shared.Tools; jpayne@68: import structures.LongHashMap; jpayne@68: import structures.LongHeap; jpayne@68: import structures.LongHeapMap; jpayne@68: import structures.LongHeapSet; jpayne@68: import structures.LongHeapSetInterface; jpayne@68: jpayne@68: public class SketchHeap { jpayne@68: jpayne@68: SketchHeap(int limit, int minKeyOccuranceCount_, boolean trackCounts){ jpayne@68: minKeyOccuranceCount=minKeyOccuranceCount_; jpayne@68: setMode=minKeyOccuranceCount<2 && !trackCounts; jpayne@68: if(setMode){ jpayne@68: setOrMap=set=new LongHeapSet(limit); jpayne@68: map=null; jpayne@68: heap=set.heap; jpayne@68: }else{ jpayne@68: if(minKeyOccuranceCount>1){limit=(int)Tools.min(10000000, limit*SketchObject.sketchHeapFactor);} jpayne@68: setOrMap=map=new LongHeapMap(limit); jpayne@68: set=null; jpayne@68: heap=map.heap; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: public void clear(boolean clearFname){ jpayne@68: taxID=-1; jpayne@68: imgID=-1; jpayne@68: genomeSizeBases=0; jpayne@68: genomeSizeKmers=0; jpayne@68: genomeSequences=0; jpayne@68: if(baseCounts!=null){Arrays.fill(baseCounts, 0);} jpayne@68: r16S=null; jpayne@68: r18S=null; jpayne@68: probSum=0; jpayne@68: taxName=null; jpayne@68: name0=null; jpayne@68: if(clearFname){fname=null;} jpayne@68: setOrMap.clear(); jpayne@68: } jpayne@68: jpayne@68: public void add(SketchHeap b){ jpayne@68: if(taxID<0){taxID=b.taxID;} jpayne@68: if(imgID<0){imgID=b.imgID;} jpayne@68: if(taxName==null){taxName=b.taxName;} jpayne@68: if(name0==null){name0=b.name0;} jpayne@68: if(fname==null){fname=b.fname;} jpayne@68: genomeSizeBases+=b.genomeSizeBases; jpayne@68: genomeSizeKmers+=b.genomeSizeKmers; jpayne@68: genomeSequences+=b.genomeSequences; jpayne@68: if(baseCounts!=null){Tools.add(baseCounts, b.baseCounts);} jpayne@68: set16S(b.r16S); jpayne@68: set18S(b.r18S); jpayne@68: probSum+=b.probSum; jpayne@68: if(setMode){ jpayne@68: set.add(b.set); jpayne@68: }else{ jpayne@68: map.add(b.map); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: public void add(Sketch b){ jpayne@68: if(taxID<0){taxID=b.taxID;} jpayne@68: if(imgID<0){imgID=b.imgID;} jpayne@68: if(taxName==null){taxName=b.taxName();} jpayne@68: if(name0==null){name0=b.name0();} jpayne@68: if(fname==null){fname=b.fname();} jpayne@68: genomeSizeBases+=b.genomeSizeBases; jpayne@68: genomeSizeKmers+=b.genomeSizeKmers; jpayne@68: genomeSequences+=b.genomeSequences; jpayne@68: if(baseCounts!=null && b.baseCounts!=null){Tools.add(baseCounts, b.baseCounts);} jpayne@68: set16S(b.r16S()); jpayne@68: set18S(b.r18S()); jpayne@68: jpayne@68: long[] keys=b.keys; jpayne@68: int[] counts=b.keyCounts; jpayne@68: assert(keys.length==b.length()) : keys.length+", "+b.length(); //Otherwise, change to loop through the size jpayne@68: for(int i=0; i=SketchObject.minHashValue)==(count>0)); jpayne@68: increment(key, count); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: public StringBuilder toHeader(){ jpayne@68: StringBuilder sb=new StringBuilder(); jpayne@68: sb.append("#SZ:"+setOrMap.size()); jpayne@68: jpayne@68: sb.append("\tCD:"); jpayne@68: sb.append(SketchObject.codingArray[SketchObject.CODING]); jpayne@68: if(SketchObject.deltaOut){sb.append('D');} jpayne@68: if(SketchObject.aminoOrTranslate()){sb.append('M');} jpayne@68: if(SketchObject.amino8){sb.append('8');} jpayne@68: jpayne@68: sb.append("\tK:").append(SketchObject.k); jpayne@68: if(SketchObject.k2>0){sb.append(",").append(SketchObject.k2);} jpayne@68: if(SketchObject.HASH_VERSION>1){sb.append("\tH:").append(SketchObject.HASH_VERSION);} jpayne@68: jpayne@68: if(genomeSizeBases>0){sb.append("\tGS:"+genomeSizeBases);} jpayne@68: if(genomeSizeKmers>0){sb.append("\tGK:"+genomeSizeKmers);} jpayne@68: final long ge=genomeSizeEstimate(); jpayne@68: if(ge>0){sb.append("\tGE:").append(ge);} jpayne@68: if(genomeSequences>0){sb.append("\tGQ:"+genomeSequences);} jpayne@68: if(baseCounts!=null && !SketchObject.aminoOrTranslate()){ jpayne@68: sb.append("\tBC:").append(baseCounts[0]).append(',').append(baseCounts[1]).append(','); jpayne@68: sb.append(baseCounts[2]).append(',').append(baseCounts[3]); jpayne@68: } jpayne@68: if(probSum>0){sb.append("\tPC:"+String.format(Locale.ROOT, "%.4f",probSum/genomeSizeKmers));} jpayne@68: if(taxID>=0){sb.append("\tID:"+taxID);} jpayne@68: if(imgID>=0){sb.append("\tIMG:"+imgID);} jpayne@68: if(taxName!=null){sb.append("\tNM:"+taxName);} jpayne@68: if(name0!=null){sb.append("\tNM0:"+name0);} jpayne@68: if(fname!=null){sb.append("\tFN:"+fname);} jpayne@68: jpayne@68: if(r16S!=null){sb.append("\t16S:"+r16S.length);} jpayne@68: if(r18S!=null){sb.append("\t18S:"+r18S.length);} jpayne@68: if(r16S!=null){ jpayne@68: sb.append('\n').append("#16S:"); jpayne@68: for(byte b : r16S){sb.append((char)b);} jpayne@68: } jpayne@68: if(r18S!=null){ jpayne@68: sb.append('\n').append("#18S:"); jpayne@68: for(byte b : r18S){sb.append((char)b);} jpayne@68: } jpayne@68: return sb; jpayne@68: } jpayne@68: jpayne@68: public boolean checkAndAdd(long value){ jpayne@68: assert(value>=SketchObject.minHashValue); jpayne@68: jpayne@68: // if(!heap.hasRoom() && value<=heap.peek()){return false;} jpayne@68: // if(Blacklist.contains(value)){return false;} jpayne@68: // if(!Whitelist.contains(value)){return false;} jpayne@68: jpayne@68: if(Blacklist.exists() || Whitelist.exists()){ jpayne@68: if(!heap.hasRoom() && value<=heap.peek()){return false;} jpayne@68: if(Blacklist.contains(value)){return false;} jpayne@68: if(!Whitelist.containsRaw(value)){return false;} jpayne@68: } jpayne@68: jpayne@68: return add(value); jpayne@68: } jpayne@68: jpayne@68: public final int maxLen(){ jpayne@68: return SketchObject.toSketchSize(genomeSizeBases, genomeSizeKmers, genomeSizeEstimate(), SketchObject.targetSketchSize); jpayne@68: } jpayne@68: jpayne@68: public final long[] toSketchArray(){ jpayne@68: int maxLen=maxLen(); jpayne@68: return toSketchArray(maxLen, minKeyOccuranceCount); jpayne@68: } jpayne@68: jpayne@68: public final long[] toSketchArray_minCount(int minKeyOccuranceCount_){ jpayne@68: int maxLen=maxLen(); jpayne@68: return toSketchArray(maxLen, minKeyOccuranceCount_); jpayne@68: } jpayne@68: jpayne@68: final long[] toSketchArray_maxLen(int maxLen){ jpayne@68: return toSketchArray(maxLen, minKeyOccuranceCount); jpayne@68: } jpayne@68: jpayne@68: private final long[] toSketchArrayOld(int maxLen){//Destructive jpayne@68: final int initial=heap.size(); jpayne@68: final int len=Tools.min(maxLen, initial); jpayne@68: final long[] array=KillSwitch.allocLong1D(len); jpayne@68: jpayne@68: int toSkip=heap.size()-len; jpayne@68: for(int i=0; i0) : Arrays.toString(keys); jpayne@68: keys[i]=Long.MAX_VALUE-keys[i]; jpayne@68: // assert(keys[i]>0) : Arrays.toString(keys); jpayne@68: } jpayne@68: Arrays.sort(keys); jpayne@68: if(keys.length>maxLen){ jpayne@68: keys=Arrays.copyOf(keys, maxLen); jpayne@68: } jpayne@68: jpayne@68: // final LongHeap heap=heap; jpayne@68: // heap.clear(); jpayne@68: // assert(heap.size()==0) : heap.size()+", "+maxLen; jpayne@68: return keys; jpayne@68: } jpayne@68: jpayne@68: @Override jpayne@68: public int hashCode(){ jpayne@68: long gSize=genomeSizeKmers>0 ? genomeSizeKmers : genomeSizeBases; jpayne@68: int code=(int) ((gSize^taxID^imgID^(name0==null ? 0 : name0.hashCode()))&Integer.MAX_VALUE); jpayne@68: return code; jpayne@68: } jpayne@68: jpayne@68: public long genomeSizeEstimate() { jpayne@68: int size=size(); jpayne@68: if(size==0){return 0;} jpayne@68: long min=peek(); jpayne@68: long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min, size)); jpayne@68: // assert(est<30000000) : min+", "+(Long.MAX_VALUE-min)+", "+size+", "+genomeSizeKmers+", "+Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min, size)); jpayne@68: return est; jpayne@68: } jpayne@68: jpayne@68: public long genomeSizeEstimate(int minCount) { jpayne@68: if(minCount<2){return genomeSizeEstimate();} jpayne@68: if(size()==0){return 0;} jpayne@68: long[] min=map.map.getMin(minCount); jpayne@68: if(min[1]==0){return 0;} jpayne@68: long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min[0], (int)min[1])); jpayne@68: return est; jpayne@68: } jpayne@68: jpayne@68: public long sketchSizeEstimate(){ jpayne@68: return SketchObject.toSketchSize(genomeSizeBases, genomeSizeKmers, genomeSizeEstimate(), SketchObject.targetSketchSize); jpayne@68: } jpayne@68: jpayne@68: public boolean contains(long key) { jpayne@68: return setOrMap.contains(key); jpayne@68: } jpayne@68: jpayne@68: @Override jpayne@68: public String toString(){return toHeader().toString();} jpayne@68: jpayne@68: public String name(){return taxName==null ? name0 : taxName;} jpayne@68: public String taxName(){return taxName;} jpayne@68: public String name0(){return name0;} jpayne@68: public String fname(){return fname;} jpayne@68: public long[] baseCounts(boolean original){return baseCounts==null ? null : original ? baseCounts : baseCounts.clone();} jpayne@68: public void setTaxName(String s){taxName=s;} jpayne@68: public void setName0(String s){name0=s;} jpayne@68: public void setFname(String s){fname=s;} jpayne@68: jpayne@68: public byte[] r16S(){return r16S;} jpayne@68: public int r16SLen(){return r16S==null ? 0 : r16S.length;} jpayne@68: public void set16S(byte[] b){ jpayne@68: if(b==null || b.lengthscore16S(r16S)){ jpayne@68: r16S=b; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: private float score16S(byte[] seq){return scoreSSU(seq, 1533);} jpayne@68: private float score18S(byte[] seq){return scoreSSU(seq, 1858);} jpayne@68: jpayne@68: private float scoreSSU(byte[] seq, int idealLen){ jpayne@68: float lengthScore=lengthScore(seq.length, idealLen); jpayne@68: float definedScore=(seq.length-AminoAcid.countUndefined(seq))/(float)seq.length; jpayne@68: return lengthScore*definedScore; jpayne@68: } jpayne@68: private float lengthScore(int len, int ideal){return Tools.min(len, ideal)/(float)Tools.max(len, ideal);} jpayne@68: jpayne@68: public byte[] r18S(){return r18S;} jpayne@68: public int r18SLen(){return r18S==null ? 0 : r18S.length;} jpayne@68: public void set18S(byte[] b){ jpayne@68: if(b==null || b.lengthscore16S(r18S)){ jpayne@68: r18S=b; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: boolean isEukaryote(){ jpayne@68: if(taxID<1 || taxID>=SketchObject.minFakeID){return false;} jpayne@68: if(SketchObject.taxtree==null){return false;} jpayne@68: return SketchObject.taxtree.isEukaryote((int)taxID); jpayne@68: } jpayne@68: jpayne@68: private String taxName; jpayne@68: private String name0; jpayne@68: private String fname; jpayne@68: public long taxID=-1; jpayne@68: public long imgID=-1; jpayne@68: public long genomeSizeBases=0; jpayne@68: public long genomeSizeKmers=0; jpayne@68: public long genomeSequences=0; jpayne@68: public final long[] baseCounts=(SketchObject.aminoOrTranslate() ? null : new long[4]); jpayne@68: private byte[] r16S; jpayne@68: private byte[] r18S; jpayne@68: double probSum=0; jpayne@68: jpayne@68: public float probCorrect(){return probSum<=0 ? 0f : (float)(probSum/Tools.max(genomeSizeKmers, 1f));} jpayne@68: public int capacity(){return heap.capacity();} jpayne@68: public boolean hasRoom(){return heap.hasRoom();} jpayne@68: public long peek(){return heap.peek();} jpayne@68: public int size(){return heap.size();} jpayne@68: public LongHashMap map(){return map.map;} jpayne@68: jpayne@68: public void clear(){ jpayne@68: setOrMap.clear(); jpayne@68: } jpayne@68: public void clearSet(){ jpayne@68: if(set==null){map.map.clear();} jpayne@68: else{set.set.clear();} jpayne@68: } jpayne@68: public boolean add(long key){return setOrMap.add(key);} jpayne@68: public int increment(long key, int incr){return setOrMap.increment(key, incr);} jpayne@68: jpayne@68: private final LongHeapSet set; jpayne@68: private final LongHeapMap map; jpayne@68: private final LongHeapSetInterface setOrMap; jpayne@68: public final LongHeap heap; jpayne@68: public final int minKeyOccuranceCount; jpayne@68: /** Determines whether to use LongHeapSet or LongHeapMap */ jpayne@68: public final boolean setMode; jpayne@68: }