Mercurial > repos > rliterman > csp2
view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchHeap.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line source
package sketch; import java.util.Arrays; import java.util.Locale; import dna.AminoAcid; import shared.KillSwitch; import shared.Tools; import structures.LongHashMap; import structures.LongHeap; import structures.LongHeapMap; import structures.LongHeapSet; import structures.LongHeapSetInterface; public class SketchHeap { SketchHeap(int limit, int minKeyOccuranceCount_, boolean trackCounts){ minKeyOccuranceCount=minKeyOccuranceCount_; setMode=minKeyOccuranceCount<2 && !trackCounts; if(setMode){ setOrMap=set=new LongHeapSet(limit); map=null; heap=set.heap; }else{ if(minKeyOccuranceCount>1){limit=(int)Tools.min(10000000, limit*SketchObject.sketchHeapFactor);} setOrMap=map=new LongHeapMap(limit); set=null; heap=map.heap; } } public void clear(boolean clearFname){ taxID=-1; imgID=-1; genomeSizeBases=0; genomeSizeKmers=0; genomeSequences=0; if(baseCounts!=null){Arrays.fill(baseCounts, 0);} r16S=null; r18S=null; probSum=0; taxName=null; name0=null; if(clearFname){fname=null;} setOrMap.clear(); } public void add(SketchHeap b){ if(taxID<0){taxID=b.taxID;} if(imgID<0){imgID=b.imgID;} if(taxName==null){taxName=b.taxName;} if(name0==null){name0=b.name0;} if(fname==null){fname=b.fname;} genomeSizeBases+=b.genomeSizeBases; genomeSizeKmers+=b.genomeSizeKmers; genomeSequences+=b.genomeSequences; if(baseCounts!=null){Tools.add(baseCounts, b.baseCounts);} set16S(b.r16S); set18S(b.r18S); probSum+=b.probSum; if(setMode){ set.add(b.set); }else{ map.add(b.map); } } public void add(Sketch b){ if(taxID<0){taxID=b.taxID;} if(imgID<0){imgID=b.imgID;} if(taxName==null){taxName=b.taxName();} if(name0==null){name0=b.name0();} if(fname==null){fname=b.fname();} genomeSizeBases+=b.genomeSizeBases; genomeSizeKmers+=b.genomeSizeKmers; genomeSequences+=b.genomeSequences; if(baseCounts!=null && b.baseCounts!=null){Tools.add(baseCounts, b.baseCounts);} set16S(b.r16S()); set18S(b.r18S()); long[] keys=b.keys; int[] counts=b.keyCounts; assert(keys.length==b.length()) : keys.length+", "+b.length(); //Otherwise, change to loop through the size for(int i=0; i<keys.length; i++){ long key=Long.MAX_VALUE-keys[i]; int count=(counts==null ? 1 : counts[i]); assert((key>=SketchObject.minHashValue)==(count>0)); increment(key, count); } } public StringBuilder toHeader(){ StringBuilder sb=new StringBuilder(); sb.append("#SZ:"+setOrMap.size()); sb.append("\tCD:"); sb.append(SketchObject.codingArray[SketchObject.CODING]); if(SketchObject.deltaOut){sb.append('D');} if(SketchObject.aminoOrTranslate()){sb.append('M');} if(SketchObject.amino8){sb.append('8');} sb.append("\tK:").append(SketchObject.k); if(SketchObject.k2>0){sb.append(",").append(SketchObject.k2);} if(SketchObject.HASH_VERSION>1){sb.append("\tH:").append(SketchObject.HASH_VERSION);} if(genomeSizeBases>0){sb.append("\tGS:"+genomeSizeBases);} if(genomeSizeKmers>0){sb.append("\tGK:"+genomeSizeKmers);} final long ge=genomeSizeEstimate(); if(ge>0){sb.append("\tGE:").append(ge);} if(genomeSequences>0){sb.append("\tGQ:"+genomeSequences);} if(baseCounts!=null && !SketchObject.aminoOrTranslate()){ sb.append("\tBC:").append(baseCounts[0]).append(',').append(baseCounts[1]).append(','); sb.append(baseCounts[2]).append(',').append(baseCounts[3]); } if(probSum>0){sb.append("\tPC:"+String.format(Locale.ROOT, "%.4f",probSum/genomeSizeKmers));} if(taxID>=0){sb.append("\tID:"+taxID);} if(imgID>=0){sb.append("\tIMG:"+imgID);} if(taxName!=null){sb.append("\tNM:"+taxName);} if(name0!=null){sb.append("\tNM0:"+name0);} if(fname!=null){sb.append("\tFN:"+fname);} if(r16S!=null){sb.append("\t16S:"+r16S.length);} if(r18S!=null){sb.append("\t18S:"+r18S.length);} if(r16S!=null){ sb.append('\n').append("#16S:"); for(byte b : r16S){sb.append((char)b);} } if(r18S!=null){ sb.append('\n').append("#18S:"); for(byte b : r18S){sb.append((char)b);} } return sb; } public boolean checkAndAdd(long value){ assert(value>=SketchObject.minHashValue); // if(!heap.hasRoom() && value<=heap.peek()){return false;} // if(Blacklist.contains(value)){return false;} // if(!Whitelist.contains(value)){return false;} if(Blacklist.exists() || Whitelist.exists()){ if(!heap.hasRoom() && value<=heap.peek()){return false;} if(Blacklist.contains(value)){return false;} if(!Whitelist.containsRaw(value)){return false;} } return add(value); } public final int maxLen(){ return SketchObject.toSketchSize(genomeSizeBases, genomeSizeKmers, genomeSizeEstimate(), SketchObject.targetSketchSize); } public final long[] toSketchArray(){ int maxLen=maxLen(); return toSketchArray(maxLen, minKeyOccuranceCount); } public final long[] toSketchArray_minCount(int minKeyOccuranceCount_){ int maxLen=maxLen(); return toSketchArray(maxLen, minKeyOccuranceCount_); } final long[] toSketchArray_maxLen(int maxLen){ return toSketchArray(maxLen, minKeyOccuranceCount); } private final long[] toSketchArrayOld(int maxLen){//Destructive final int initial=heap.size(); final int len=Tools.min(maxLen, initial); final long[] array=KillSwitch.allocLong1D(len); int toSkip=heap.size()-len; for(int i=0; i<toSkip; i++){heap.poll();} for(int i=0; i<len; i++){ array[i]=Long.MAX_VALUE-heap.poll(); } Tools.reverseInPlace(array); assert(heap.size()==0) : heap.size()+", "+len+", "+maxLen+", "+initial; return array; } private final long[] toSketchArray(int maxLen, int minKeyOccuranceCount_){//Non-destructive if(minKeyOccuranceCount_<0){minKeyOccuranceCount_=minKeyOccuranceCount;} if(setMode){return toSketchArrayOld(maxLen);} long[] keys=map().toArray(minKeyOccuranceCount_); for(int i=0; i<keys.length; i++){ // assert(keys[i]>0) : Arrays.toString(keys); keys[i]=Long.MAX_VALUE-keys[i]; // assert(keys[i]>0) : Arrays.toString(keys); } Arrays.sort(keys); if(keys.length>maxLen){ keys=Arrays.copyOf(keys, maxLen); } // final LongHeap heap=heap; // heap.clear(); // assert(heap.size()==0) : heap.size()+", "+maxLen; return keys; } @Override public int hashCode(){ long gSize=genomeSizeKmers>0 ? genomeSizeKmers : genomeSizeBases; int code=(int) ((gSize^taxID^imgID^(name0==null ? 0 : name0.hashCode()))&Integer.MAX_VALUE); return code; } public long genomeSizeEstimate() { int size=size(); if(size==0){return 0;} long min=peek(); long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min, size)); // assert(est<30000000) : min+", "+(Long.MAX_VALUE-min)+", "+size+", "+genomeSizeKmers+", "+Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min, size)); return est; } public long genomeSizeEstimate(int minCount) { if(minCount<2){return genomeSizeEstimate();} if(size()==0){return 0;} long[] min=map.map.getMin(minCount); if(min[1]==0){return 0;} long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min[0], (int)min[1])); return est; } public long sketchSizeEstimate(){ return SketchObject.toSketchSize(genomeSizeBases, genomeSizeKmers, genomeSizeEstimate(), SketchObject.targetSketchSize); } public boolean contains(long key) { return setOrMap.contains(key); } @Override public String toString(){return toHeader().toString();} public String name(){return taxName==null ? name0 : taxName;} public String taxName(){return taxName;} public String name0(){return name0;} public String fname(){return fname;} public long[] baseCounts(boolean original){return baseCounts==null ? null : original ? baseCounts : baseCounts.clone();} public void setTaxName(String s){taxName=s;} public void setName0(String s){name0=s;} public void setFname(String s){fname=s;} public byte[] r16S(){return r16S;} public int r16SLen(){return r16S==null ? 0 : r16S.length;} public void set16S(byte[] b){ if(b==null || b.length<SketchObject.min_SSU_len){return;} if(r16S==null || score16S(b)>score16S(r16S)){ r16S=b; } } private float score16S(byte[] seq){return scoreSSU(seq, 1533);} private float score18S(byte[] seq){return scoreSSU(seq, 1858);} private float scoreSSU(byte[] seq, int idealLen){ float lengthScore=lengthScore(seq.length, idealLen); float definedScore=(seq.length-AminoAcid.countUndefined(seq))/(float)seq.length; return lengthScore*definedScore; } private float lengthScore(int len, int ideal){return Tools.min(len, ideal)/(float)Tools.max(len, ideal);} public byte[] r18S(){return r18S;} public int r18SLen(){return r18S==null ? 0 : r18S.length;} public void set18S(byte[] b){ if(b==null || b.length<SketchObject.min_SSU_len){return;} if(r18S==null || score18S(b)>score16S(r18S)){ r18S=b; } } boolean isEukaryote(){ if(taxID<1 || taxID>=SketchObject.minFakeID){return false;} if(SketchObject.taxtree==null){return false;} return SketchObject.taxtree.isEukaryote((int)taxID); } private String taxName; private String name0; private String fname; public long taxID=-1; public long imgID=-1; public long genomeSizeBases=0; public long genomeSizeKmers=0; public long genomeSequences=0; public final long[] baseCounts=(SketchObject.aminoOrTranslate() ? null : new long[4]); private byte[] r16S; private byte[] r18S; double probSum=0; public float probCorrect(){return probSum<=0 ? 0f : (float)(probSum/Tools.max(genomeSizeKmers, 1f));} public int capacity(){return heap.capacity();} public boolean hasRoom(){return heap.hasRoom();} public long peek(){return heap.peek();} public int size(){return heap.size();} public LongHashMap map(){return map.map;} public void clear(){ setOrMap.clear(); } public void clearSet(){ if(set==null){map.map.clear();} else{set.set.clear();} } public boolean add(long key){return setOrMap.add(key);} public int increment(long key, int incr){return setOrMap.increment(key, incr);} private final LongHeapSet set; private final LongHeapMap map; private final LongHeapSetInterface setOrMap; public final LongHeap heap; public final int minKeyOccuranceCount; /** Determines whether to use LongHeapSet or LongHeapMap */ public final boolean setMode; }