annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchHeap.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package sketch;
jpayne@68 2
jpayne@68 3 import java.util.Arrays;
jpayne@68 4 import java.util.Locale;
jpayne@68 5
jpayne@68 6 import dna.AminoAcid;
jpayne@68 7 import shared.KillSwitch;
jpayne@68 8 import shared.Tools;
jpayne@68 9 import structures.LongHashMap;
jpayne@68 10 import structures.LongHeap;
jpayne@68 11 import structures.LongHeapMap;
jpayne@68 12 import structures.LongHeapSet;
jpayne@68 13 import structures.LongHeapSetInterface;
jpayne@68 14
jpayne@68 15 public class SketchHeap {
jpayne@68 16
jpayne@68 17 SketchHeap(int limit, int minKeyOccuranceCount_, boolean trackCounts){
jpayne@68 18 minKeyOccuranceCount=minKeyOccuranceCount_;
jpayne@68 19 setMode=minKeyOccuranceCount<2 && !trackCounts;
jpayne@68 20 if(setMode){
jpayne@68 21 setOrMap=set=new LongHeapSet(limit);
jpayne@68 22 map=null;
jpayne@68 23 heap=set.heap;
jpayne@68 24 }else{
jpayne@68 25 if(minKeyOccuranceCount>1){limit=(int)Tools.min(10000000, limit*SketchObject.sketchHeapFactor);}
jpayne@68 26 setOrMap=map=new LongHeapMap(limit);
jpayne@68 27 set=null;
jpayne@68 28 heap=map.heap;
jpayne@68 29 }
jpayne@68 30 }
jpayne@68 31
jpayne@68 32 public void clear(boolean clearFname){
jpayne@68 33 taxID=-1;
jpayne@68 34 imgID=-1;
jpayne@68 35 genomeSizeBases=0;
jpayne@68 36 genomeSizeKmers=0;
jpayne@68 37 genomeSequences=0;
jpayne@68 38 if(baseCounts!=null){Arrays.fill(baseCounts, 0);}
jpayne@68 39 r16S=null;
jpayne@68 40 r18S=null;
jpayne@68 41 probSum=0;
jpayne@68 42 taxName=null;
jpayne@68 43 name0=null;
jpayne@68 44 if(clearFname){fname=null;}
jpayne@68 45 setOrMap.clear();
jpayne@68 46 }
jpayne@68 47
jpayne@68 48 public void add(SketchHeap b){
jpayne@68 49 if(taxID<0){taxID=b.taxID;}
jpayne@68 50 if(imgID<0){imgID=b.imgID;}
jpayne@68 51 if(taxName==null){taxName=b.taxName;}
jpayne@68 52 if(name0==null){name0=b.name0;}
jpayne@68 53 if(fname==null){fname=b.fname;}
jpayne@68 54 genomeSizeBases+=b.genomeSizeBases;
jpayne@68 55 genomeSizeKmers+=b.genomeSizeKmers;
jpayne@68 56 genomeSequences+=b.genomeSequences;
jpayne@68 57 if(baseCounts!=null){Tools.add(baseCounts, b.baseCounts);}
jpayne@68 58 set16S(b.r16S);
jpayne@68 59 set18S(b.r18S);
jpayne@68 60 probSum+=b.probSum;
jpayne@68 61 if(setMode){
jpayne@68 62 set.add(b.set);
jpayne@68 63 }else{
jpayne@68 64 map.add(b.map);
jpayne@68 65 }
jpayne@68 66 }
jpayne@68 67
jpayne@68 68 public void add(Sketch b){
jpayne@68 69 if(taxID<0){taxID=b.taxID;}
jpayne@68 70 if(imgID<0){imgID=b.imgID;}
jpayne@68 71 if(taxName==null){taxName=b.taxName();}
jpayne@68 72 if(name0==null){name0=b.name0();}
jpayne@68 73 if(fname==null){fname=b.fname();}
jpayne@68 74 genomeSizeBases+=b.genomeSizeBases;
jpayne@68 75 genomeSizeKmers+=b.genomeSizeKmers;
jpayne@68 76 genomeSequences+=b.genomeSequences;
jpayne@68 77 if(baseCounts!=null && b.baseCounts!=null){Tools.add(baseCounts, b.baseCounts);}
jpayne@68 78 set16S(b.r16S());
jpayne@68 79 set18S(b.r18S());
jpayne@68 80
jpayne@68 81 long[] keys=b.keys;
jpayne@68 82 int[] counts=b.keyCounts;
jpayne@68 83 assert(keys.length==b.length()) : keys.length+", "+b.length(); //Otherwise, change to loop through the size
jpayne@68 84 for(int i=0; i<keys.length; i++){
jpayne@68 85 long key=Long.MAX_VALUE-keys[i];
jpayne@68 86 int count=(counts==null ? 1 : counts[i]);
jpayne@68 87 assert((key>=SketchObject.minHashValue)==(count>0));
jpayne@68 88 increment(key, count);
jpayne@68 89 }
jpayne@68 90 }
jpayne@68 91
jpayne@68 92 public StringBuilder toHeader(){
jpayne@68 93 StringBuilder sb=new StringBuilder();
jpayne@68 94 sb.append("#SZ:"+setOrMap.size());
jpayne@68 95
jpayne@68 96 sb.append("\tCD:");
jpayne@68 97 sb.append(SketchObject.codingArray[SketchObject.CODING]);
jpayne@68 98 if(SketchObject.deltaOut){sb.append('D');}
jpayne@68 99 if(SketchObject.aminoOrTranslate()){sb.append('M');}
jpayne@68 100 if(SketchObject.amino8){sb.append('8');}
jpayne@68 101
jpayne@68 102 sb.append("\tK:").append(SketchObject.k);
jpayne@68 103 if(SketchObject.k2>0){sb.append(",").append(SketchObject.k2);}
jpayne@68 104 if(SketchObject.HASH_VERSION>1){sb.append("\tH:").append(SketchObject.HASH_VERSION);}
jpayne@68 105
jpayne@68 106 if(genomeSizeBases>0){sb.append("\tGS:"+genomeSizeBases);}
jpayne@68 107 if(genomeSizeKmers>0){sb.append("\tGK:"+genomeSizeKmers);}
jpayne@68 108 final long ge=genomeSizeEstimate();
jpayne@68 109 if(ge>0){sb.append("\tGE:").append(ge);}
jpayne@68 110 if(genomeSequences>0){sb.append("\tGQ:"+genomeSequences);}
jpayne@68 111 if(baseCounts!=null && !SketchObject.aminoOrTranslate()){
jpayne@68 112 sb.append("\tBC:").append(baseCounts[0]).append(',').append(baseCounts[1]).append(',');
jpayne@68 113 sb.append(baseCounts[2]).append(',').append(baseCounts[3]);
jpayne@68 114 }
jpayne@68 115 if(probSum>0){sb.append("\tPC:"+String.format(Locale.ROOT, "%.4f",probSum/genomeSizeKmers));}
jpayne@68 116 if(taxID>=0){sb.append("\tID:"+taxID);}
jpayne@68 117 if(imgID>=0){sb.append("\tIMG:"+imgID);}
jpayne@68 118 if(taxName!=null){sb.append("\tNM:"+taxName);}
jpayne@68 119 if(name0!=null){sb.append("\tNM0:"+name0);}
jpayne@68 120 if(fname!=null){sb.append("\tFN:"+fname);}
jpayne@68 121
jpayne@68 122 if(r16S!=null){sb.append("\t16S:"+r16S.length);}
jpayne@68 123 if(r18S!=null){sb.append("\t18S:"+r18S.length);}
jpayne@68 124 if(r16S!=null){
jpayne@68 125 sb.append('\n').append("#16S:");
jpayne@68 126 for(byte b : r16S){sb.append((char)b);}
jpayne@68 127 }
jpayne@68 128 if(r18S!=null){
jpayne@68 129 sb.append('\n').append("#18S:");
jpayne@68 130 for(byte b : r18S){sb.append((char)b);}
jpayne@68 131 }
jpayne@68 132 return sb;
jpayne@68 133 }
jpayne@68 134
jpayne@68 135 public boolean checkAndAdd(long value){
jpayne@68 136 assert(value>=SketchObject.minHashValue);
jpayne@68 137
jpayne@68 138 // if(!heap.hasRoom() && value<=heap.peek()){return false;}
jpayne@68 139 // if(Blacklist.contains(value)){return false;}
jpayne@68 140 // if(!Whitelist.contains(value)){return false;}
jpayne@68 141
jpayne@68 142 if(Blacklist.exists() || Whitelist.exists()){
jpayne@68 143 if(!heap.hasRoom() && value<=heap.peek()){return false;}
jpayne@68 144 if(Blacklist.contains(value)){return false;}
jpayne@68 145 if(!Whitelist.containsRaw(value)){return false;}
jpayne@68 146 }
jpayne@68 147
jpayne@68 148 return add(value);
jpayne@68 149 }
jpayne@68 150
jpayne@68 151 public final int maxLen(){
jpayne@68 152 return SketchObject.toSketchSize(genomeSizeBases, genomeSizeKmers, genomeSizeEstimate(), SketchObject.targetSketchSize);
jpayne@68 153 }
jpayne@68 154
jpayne@68 155 public final long[] toSketchArray(){
jpayne@68 156 int maxLen=maxLen();
jpayne@68 157 return toSketchArray(maxLen, minKeyOccuranceCount);
jpayne@68 158 }
jpayne@68 159
jpayne@68 160 public final long[] toSketchArray_minCount(int minKeyOccuranceCount_){
jpayne@68 161 int maxLen=maxLen();
jpayne@68 162 return toSketchArray(maxLen, minKeyOccuranceCount_);
jpayne@68 163 }
jpayne@68 164
jpayne@68 165 final long[] toSketchArray_maxLen(int maxLen){
jpayne@68 166 return toSketchArray(maxLen, minKeyOccuranceCount);
jpayne@68 167 }
jpayne@68 168
jpayne@68 169 private final long[] toSketchArrayOld(int maxLen){//Destructive
jpayne@68 170 final int initial=heap.size();
jpayne@68 171 final int len=Tools.min(maxLen, initial);
jpayne@68 172 final long[] array=KillSwitch.allocLong1D(len);
jpayne@68 173
jpayne@68 174 int toSkip=heap.size()-len;
jpayne@68 175 for(int i=0; i<toSkip; i++){heap.poll();}
jpayne@68 176 for(int i=0; i<len; i++){
jpayne@68 177 array[i]=Long.MAX_VALUE-heap.poll();
jpayne@68 178 }
jpayne@68 179 Tools.reverseInPlace(array);
jpayne@68 180 assert(heap.size()==0) : heap.size()+", "+len+", "+maxLen+", "+initial;
jpayne@68 181 return array;
jpayne@68 182 }
jpayne@68 183
jpayne@68 184 private final long[] toSketchArray(int maxLen, int minKeyOccuranceCount_){//Non-destructive
jpayne@68 185 if(minKeyOccuranceCount_<0){minKeyOccuranceCount_=minKeyOccuranceCount;}
jpayne@68 186 if(setMode){return toSketchArrayOld(maxLen);}
jpayne@68 187 long[] keys=map().toArray(minKeyOccuranceCount_);
jpayne@68 188 for(int i=0; i<keys.length; i++){
jpayne@68 189 // assert(keys[i]>0) : Arrays.toString(keys);
jpayne@68 190 keys[i]=Long.MAX_VALUE-keys[i];
jpayne@68 191 // assert(keys[i]>0) : Arrays.toString(keys);
jpayne@68 192 }
jpayne@68 193 Arrays.sort(keys);
jpayne@68 194 if(keys.length>maxLen){
jpayne@68 195 keys=Arrays.copyOf(keys, maxLen);
jpayne@68 196 }
jpayne@68 197
jpayne@68 198 // final LongHeap heap=heap;
jpayne@68 199 // heap.clear();
jpayne@68 200 // assert(heap.size()==0) : heap.size()+", "+maxLen;
jpayne@68 201 return keys;
jpayne@68 202 }
jpayne@68 203
jpayne@68 204 @Override
jpayne@68 205 public int hashCode(){
jpayne@68 206 long gSize=genomeSizeKmers>0 ? genomeSizeKmers : genomeSizeBases;
jpayne@68 207 int code=(int) ((gSize^taxID^imgID^(name0==null ? 0 : name0.hashCode()))&Integer.MAX_VALUE);
jpayne@68 208 return code;
jpayne@68 209 }
jpayne@68 210
jpayne@68 211 public long genomeSizeEstimate() {
jpayne@68 212 int size=size();
jpayne@68 213 if(size==0){return 0;}
jpayne@68 214 long min=peek();
jpayne@68 215 long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min, size));
jpayne@68 216 // assert(est<30000000) : min+", "+(Long.MAX_VALUE-min)+", "+size+", "+genomeSizeKmers+", "+Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min, size));
jpayne@68 217 return est;
jpayne@68 218 }
jpayne@68 219
jpayne@68 220 public long genomeSizeEstimate(int minCount) {
jpayne@68 221 if(minCount<2){return genomeSizeEstimate();}
jpayne@68 222 if(size()==0){return 0;}
jpayne@68 223 long[] min=map.map.getMin(minCount);
jpayne@68 224 if(min[1]==0){return 0;}
jpayne@68 225 long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min[0], (int)min[1]));
jpayne@68 226 return est;
jpayne@68 227 }
jpayne@68 228
jpayne@68 229 public long sketchSizeEstimate(){
jpayne@68 230 return SketchObject.toSketchSize(genomeSizeBases, genomeSizeKmers, genomeSizeEstimate(), SketchObject.targetSketchSize);
jpayne@68 231 }
jpayne@68 232
jpayne@68 233 public boolean contains(long key) {
jpayne@68 234 return setOrMap.contains(key);
jpayne@68 235 }
jpayne@68 236
jpayne@68 237 @Override
jpayne@68 238 public String toString(){return toHeader().toString();}
jpayne@68 239
jpayne@68 240 public String name(){return taxName==null ? name0 : taxName;}
jpayne@68 241 public String taxName(){return taxName;}
jpayne@68 242 public String name0(){return name0;}
jpayne@68 243 public String fname(){return fname;}
jpayne@68 244 public long[] baseCounts(boolean original){return baseCounts==null ? null : original ? baseCounts : baseCounts.clone();}
jpayne@68 245 public void setTaxName(String s){taxName=s;}
jpayne@68 246 public void setName0(String s){name0=s;}
jpayne@68 247 public void setFname(String s){fname=s;}
jpayne@68 248
jpayne@68 249 public byte[] r16S(){return r16S;}
jpayne@68 250 public int r16SLen(){return r16S==null ? 0 : r16S.length;}
jpayne@68 251 public void set16S(byte[] b){
jpayne@68 252 if(b==null || b.length<SketchObject.min_SSU_len){return;}
jpayne@68 253
jpayne@68 254 if(r16S==null || score16S(b)>score16S(r16S)){
jpayne@68 255 r16S=b;
jpayne@68 256 }
jpayne@68 257 }
jpayne@68 258
jpayne@68 259 private float score16S(byte[] seq){return scoreSSU(seq, 1533);}
jpayne@68 260 private float score18S(byte[] seq){return scoreSSU(seq, 1858);}
jpayne@68 261
jpayne@68 262 private float scoreSSU(byte[] seq, int idealLen){
jpayne@68 263 float lengthScore=lengthScore(seq.length, idealLen);
jpayne@68 264 float definedScore=(seq.length-AminoAcid.countUndefined(seq))/(float)seq.length;
jpayne@68 265 return lengthScore*definedScore;
jpayne@68 266 }
jpayne@68 267 private float lengthScore(int len, int ideal){return Tools.min(len, ideal)/(float)Tools.max(len, ideal);}
jpayne@68 268
jpayne@68 269 public byte[] r18S(){return r18S;}
jpayne@68 270 public int r18SLen(){return r18S==null ? 0 : r18S.length;}
jpayne@68 271 public void set18S(byte[] b){
jpayne@68 272 if(b==null || b.length<SketchObject.min_SSU_len){return;}
jpayne@68 273 if(r18S==null || score18S(b)>score16S(r18S)){
jpayne@68 274 r18S=b;
jpayne@68 275 }
jpayne@68 276 }
jpayne@68 277
jpayne@68 278 boolean isEukaryote(){
jpayne@68 279 if(taxID<1 || taxID>=SketchObject.minFakeID){return false;}
jpayne@68 280 if(SketchObject.taxtree==null){return false;}
jpayne@68 281 return SketchObject.taxtree.isEukaryote((int)taxID);
jpayne@68 282 }
jpayne@68 283
jpayne@68 284 private String taxName;
jpayne@68 285 private String name0;
jpayne@68 286 private String fname;
jpayne@68 287 public long taxID=-1;
jpayne@68 288 public long imgID=-1;
jpayne@68 289 public long genomeSizeBases=0;
jpayne@68 290 public long genomeSizeKmers=0;
jpayne@68 291 public long genomeSequences=0;
jpayne@68 292 public final long[] baseCounts=(SketchObject.aminoOrTranslate() ? null : new long[4]);
jpayne@68 293 private byte[] r16S;
jpayne@68 294 private byte[] r18S;
jpayne@68 295 double probSum=0;
jpayne@68 296
jpayne@68 297 public float probCorrect(){return probSum<=0 ? 0f : (float)(probSum/Tools.max(genomeSizeKmers, 1f));}
jpayne@68 298 public int capacity(){return heap.capacity();}
jpayne@68 299 public boolean hasRoom(){return heap.hasRoom();}
jpayne@68 300 public long peek(){return heap.peek();}
jpayne@68 301 public int size(){return heap.size();}
jpayne@68 302 public LongHashMap map(){return map.map;}
jpayne@68 303
jpayne@68 304 public void clear(){
jpayne@68 305 setOrMap.clear();
jpayne@68 306 }
jpayne@68 307 public void clearSet(){
jpayne@68 308 if(set==null){map.map.clear();}
jpayne@68 309 else{set.set.clear();}
jpayne@68 310 }
jpayne@68 311 public boolean add(long key){return setOrMap.add(key);}
jpayne@68 312 public int increment(long key, int incr){return setOrMap.increment(key, incr);}
jpayne@68 313
jpayne@68 314 private final LongHeapSet set;
jpayne@68 315 private final LongHeapMap map;
jpayne@68 316 private final LongHeapSetInterface setOrMap;
jpayne@68 317 public final LongHeap heap;
jpayne@68 318 public final int minKeyOccuranceCount;
jpayne@68 319 /** Determines whether to use LongHeapSet or LongHeapMap */
jpayne@68 320 public final boolean setMode;
jpayne@68 321 }