comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/kmer/HashArray.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package kmer;
2
3 import java.util.Arrays;
4 import java.util.concurrent.atomic.AtomicIntegerArray;
5 import java.util.concurrent.atomic.AtomicLong;
6 import java.util.concurrent.locks.Lock;
7 import java.util.concurrent.locks.ReentrantLock;
8
9 import fileIO.ByteStreamWriter;
10 import fileIO.TextStreamWriter;
11 import shared.Primes;
12 import shared.Tools;
13 import structures.ByteBuilder;
14 import structures.SuperLongList;
15
16 /**
17 * Stores kmers in a long[] and values in an int[][], with a victim cache.
18 * @author Brian Bushnell
19 * @date Nov 7, 2014
20 *
21 */
22 public abstract class HashArray extends AbstractKmerTable {
23
24 /*--------------------------------------------------------------*/
25 /*---------------- Initialization ----------------*/
26 /*--------------------------------------------------------------*/
27
28 HashArray(int[] schedule_, long coreMask_, boolean twod_){
29 schedule=schedule_;
30 autoResize=schedule.length>1;
31 prime=schedule[0];
32
33 sizeLimit=(long)((schedule.length==1 ? maxLoadFactorFinal : maxLoadFactor)*prime);
34 array=allocLong1D(prime+extra);
35 victims=new HashForest(Tools.max(10, prime/victimRatio), autoResize, twod_);
36 Arrays.fill(array, NOT_PRESENT);
37 twoD=twod_;
38 coreMask=coreMask_;
39 // coreMask2=coreMask_|3;
40 coreMask2=coreMask_; //Simplifies fast fill
41 }
42
43 HashArray(int initialSize, long coreMask_, boolean autoResize_, boolean twod_){
44 schedule=null;
45 prime=initialSize;
46 sizeLimit=(long)(maxLoadFactor*prime);
47 array=allocLong1D(prime+extra);
48 victims=new HashForest(Tools.max(10, initialSize/victimRatio), autoResize_, twod_);
49 Arrays.fill(array, NOT_PRESENT);
50 autoResize=autoResize_;
51 twoD=twod_;
52 coreMask=coreMask_;
53 // coreMask2=coreMask_|3;
54 coreMask2=coreMask_; //Simplifies fast fill
55 }
56
57 /*--------------------------------------------------------------*/
58 /*---------------- Public Methods ----------------*/
59 /*--------------------------------------------------------------*/
60
61 /*--------------------------------------------------------------*/
62 /*---------------- Test Methods ----------------*/
63 /*--------------------------------------------------------------*/
64
65 // public final int set_Test(final long kmer, final int v){
66 // assert(TESTMODE);
67 // final int x;
68 // if(TWOD){
69 // int[] old=getValues(kmer, new int[1]);
70 // assert(old==null || contains(kmer, old));
71 // if(verbose){System.err.println("Fetched "+Arrays.toString(old));}
72 // x=set0(kmer, v);
73 // assert(old==null || contains(kmer, old)) : "old="+Arrays.toString(old)+", v="+v+", kmer="+kmer+
74 // ", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
75 // assert(contains(kmer, v));
76 // }else{
77 // int old=getValue(kmer);
78 // assert(old==0 || old==-1 || contains(kmer, old));
79 // x=set0(kmer, v);
80 // assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
81 // assert(v==old || !contains(kmer, old));
82 // }
83 // return x;
84 // }
85 //
86 // public final int set_Test(final long kmer, final int v[]){
87 // assert(TESTMODE);
88 // final int x;
89 // if(TWOD){
90 // final int[] singleton=new int[1];
91 // int[] old=getValues(kmer, singleton);
92 // assert(old==null || contains(kmer, old));
93 // if(verbose){System.err.println("Before: old="+Arrays.toString(old)+", v="+Arrays.toString(v));}
94 // x=set0(kmer, v);
95 // if(verbose){System.err.println("After: old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", get()="+Arrays.toString(getValues(kmer, singleton)));}
96 // assert(old==null || contains(kmer, old)) : "old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", kmer="+kmer+
97 // ", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
98 // assert(contains(kmer, v)) : "old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", kmer="+kmer+
99 // ", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
100 // }else{
101 // int old=getValue(kmer);
102 // assert(old==0 || old==-1 || contains(kmer, old));
103 // x=set0(kmer, v);
104 // assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
105 // assert(v[0]==old || !contains(kmer, old));
106 // }
107 // return x;
108 // }
109 //
110 // public final int setIfNotPresent_Test(long kmer, int v){
111 // assert(TESTMODE);
112 // final int x;
113 // if(TWOD){
114 //// int[] vals=getValues(kmer, null);
115 //// assert(vals==null || contains(kmer, vals));
116 //// x=setIfNotPresent(kmer, v);
117 //// assert(contains(kmer, vals));
118 //// assert(contains(kmer, v));
119 // x=0;
120 // assert(false);
121 // }else{
122 // int old=getValue(kmer);
123 // assert(old==0 || old==-1 || contains(kmer, old));
124 // x=setIfNotPresent0(kmer, v);
125 // assert((old<1 && contains(kmer, v)) || (old>0 && contains(kmer, old))) : kmer+", "+old+", "+v;
126 // }
127 // return x;
128 // }
129
130 /*--------------------------------------------------------------*/
131
132 public final int kmerToCell(long kmer){
133 final int cell=(int)((kmer&coreMask)%prime);
134 return cell;
135 }
136
137 @Override
138 public final int set(final long kmer, final int[] v, final int vlen){
139 int cell=kmerToCell(kmer);
140
141 for(final int max=cell+extra; cell<max; cell++){
142 long n=array[cell];
143 if(n==kmer){
144 if(verbose){System.err.println("A2: Adding "+kmer+", "+Arrays.toString(v)+", "+cell);}
145 insertValue(kmer, v, cell, vlen);
146 if(verbose){System.err.println("A2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
147 return 0;
148 }else if(n==NOT_PRESENT){
149 if(verbose){System.err.println("B2: Adding "+kmer+", "+Arrays.toString(v)+", "+cell);}
150 array[cell]=kmer;
151 insertValue(kmer, v, cell, vlen);
152 if(verbose){System.err.println("B2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
153 size++;
154 if(autoResize && size+victims.size>sizeLimit){resize();}
155 return 1;
156 }
157 }
158 if(verbose){System.err.println("C2: Adding "+kmer+", "+v+", "+cell);}
159 final int x=victims.set(kmer, v, vlen);
160 if(autoResize && size+victims.size>sizeLimit){resize();}
161 if(verbose){System.err.println("C2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
162 return x;
163 }
164
165 @Override
166 public final int set(final long kmer, final int v){
167 int cell=kmerToCell(kmer);
168
169 // assert(TESTMODE);
170 // ll.add(kmer);
171 // il.add(v);
172
173 for(final int max=cell+extra; cell<max; cell++){
174 long n=array[cell];
175 if(n==kmer){
176 if(verbose){System.err.println("A1: Adding "+kmer+", "+v+", "+cell);}
177 insertValue(kmer, v, cell);
178 if(verbose){System.err.println("A1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
179 return 0;
180 }else if(n==NOT_PRESENT){
181 if(verbose){System.err.println("B1: Adding "+kmer+", "+v+", "+cell);}
182 array[cell]=kmer;
183 insertValue(kmer, v, cell);
184 if(verbose){System.err.println("B1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
185 size++;
186 if(autoResize && size+victims.size>sizeLimit){resize();}
187 return 1;
188 }
189 }
190 if(verbose){System.err.println("C1: Adding "+kmer+", "+v+", "+cell+
191 "; victims.get(kmer)="+Arrays.toString(victims.getValues(kmer, new int[1])));}
192 final int x=victims.set(kmer, v);
193 if(autoResize && size+victims.size>sizeLimit){resize();}
194 if(verbose){System.err.println("C1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1]))+
195 "; victims.get(kmer)="+Arrays.toString(victims.getValues(kmer, new int[1])));}
196 return x;
197 }
198
199 @Override
200 public final int setIfNotPresent(long kmer, int value){
201 int cell=kmerToCell(kmer);
202 for(final int max=cell+extra; cell<max; cell++){
203 long n=array[cell];
204 if(n==kmer){
205 return 0;
206 }else if(n==NOT_PRESENT){
207 array[cell]=kmer;
208 insertValue(kmer, value, cell);
209 size++;
210 if(autoResize && size+victims.size>sizeLimit){resize();}
211 return 1;
212 }
213 }
214 // System.err.println("size="+size+", prime="+prime+", limit="+sizeLimit);
215 int x=victims.setIfNotPresent(kmer, value);
216 if(autoResize && size+victims.size>sizeLimit){resize();}
217 return x;
218 }
219
220 @Override
221 public final int getValue(long kmer){
222 int cell=findKmer(kmer);
223 if(cell==NOT_PRESENT){return NOT_PRESENT;}
224 if(cell==HASH_COLLISION){return victims.getValue(kmer);}
225 return readCellValue(cell);
226 }
227
228 public final int getValue(long kmer, int startCell){
229 int cell=findKmer(kmer, startCell);
230 if(cell==NOT_PRESENT){return NOT_PRESENT;}
231 if(cell==HASH_COLLISION){return victims.getValue(kmer);}
232 return readCellValue(cell);
233 }
234
235 @Override
236 public final int[] getValues(long kmer, int[] singleton){
237 int cell=findKmer(kmer);
238 if(cell==NOT_PRESENT){
239 singleton[0]=NOT_PRESENT;
240 return singleton;
241 }
242 if(cell==HASH_COLLISION){return victims.getValues(kmer, singleton);}
243 return readCellValues(cell, singleton);
244 }
245
246 @Override
247 public final boolean contains(long kmer){
248 int cell=findKmer(kmer);
249 if(cell==NOT_PRESENT){return false;}
250 if(cell==HASH_COLLISION){return victims.contains(kmer);}
251 return true;
252 }
253
254 public final long getKmer(int cell) {
255 return array[cell];
256 }
257
258 /*--------------------------------------------------------------*/
259 /*---------------- Ownership ----------------*/
260 /*--------------------------------------------------------------*/
261
262 @Override
263 public final void initializeOwnership(){
264 assert(owners==null);
265 owners=allocAtomicInt(array.length);
266 for(int i=0; i<array.length; i++){
267 owners.set(i, NO_OWNER);
268 }
269 victims.initializeOwnership();
270 }
271
272 @Override
273 public final void clearOwnership(){
274 owners=null;
275 victims.clearOwnership();
276 }
277
278 @Override
279 public final int setOwner(final long kmer, final int newOwner){
280 final int cell=findKmer(kmer);
281 assert(cell!=NOT_PRESENT);
282 if(cell==HASH_COLLISION){return victims.setOwner(kmer, newOwner);}
283 return setOwner(kmer, newOwner, cell);
284 }
285
286 public final int setOwner(final long kmer, final int newOwner, final int cell){
287 assert(array[cell]==kmer);
288 final int original=owners.get(cell);
289 int current=original;
290 while(current<newOwner){
291 boolean success=owners.compareAndSet(cell, current, newOwner);
292 if(!success){current=owners.get(cell);}
293 else{current=newOwner;}
294 }
295 assert(current>=original) : "original="+original+", current="+current+", newOwner="+newOwner+", re-read="+owners.get(cell);
296 return current;
297 }
298
299 @Override
300 public final boolean clearOwner(final long kmer, final int owner){
301 final int cell=findKmer(kmer);
302 assert(cell!=NOT_PRESENT);
303 if(cell==HASH_COLLISION){return victims.clearOwner(kmer, owner);}
304 return clearOwner(kmer, owner, cell);
305 }
306
307 public final boolean clearOwner(final long kmer, final int owner, final int cell){
308 assert(array[cell]==kmer);
309 boolean success=owners.compareAndSet(cell, owner, NO_OWNER);
310 return success;
311 }
312
313 @Override
314 public final int getOwner(final long kmer){
315 final int cell=findKmer(kmer);
316 assert(cell!=NOT_PRESENT);
317 if(cell==HASH_COLLISION){return victims.getOwner(kmer);}
318 return getCellOwner(cell);
319 }
320
321 public final int getCellOwner(final int cell){
322 return owners.get(cell);
323 }
324
325 /*--------------------------------------------------------------*/
326 /*---------------- Nonpublic Methods ----------------*/
327 /*--------------------------------------------------------------*/
328
329 protected abstract void insertValue(final long kmer, final int v, final int cell);
330
331 // protected abstract void insertValue(final long kmer, final int[] vals, final int cell);
332
333 /** This is for IntList3 support with HashArrayHybridFast */
334 protected abstract void insertValue(final long kmer, final int[] vals, final int cell, final int vlen);
335
336 protected abstract int readCellValue(int cell);
337 protected abstract int[] readCellValues(int cell, int[] singleton);
338
339 @Override
340 final Object get(long kmer){
341 throw new RuntimeException("Unimplemented.");
342 }
343
344 final int findKmer(long kmer){
345 return findKmer(kmer, kmerToCell(kmer));
346 }
347
348 final int findKmer(final long kmer, final int startCell){
349 int cell=startCell;
350 for(final int max=cell+extra; cell<max; cell++){
351 final long n=array[cell];
352 if(n==kmer){return cell;}
353 else if(n==NOT_PRESENT){return NOT_PRESENT;}
354 }
355 return HASH_COLLISION;
356 }
357
358 final int findKmerOrEmpty(long kmer){
359 int cell=kmerToCell(kmer);
360 for(final int max=cell+extra; cell<max; cell++){
361 final long n=array[cell];
362 if(n==kmer || n==NOT_PRESENT){return cell;}
363 }
364 return HASH_COLLISION;
365 }
366
367 /*--------------------------------------------------------------*/
368 /*---------------- Resizing and Rebalancing ----------------*/
369 /*--------------------------------------------------------------*/
370
371 @Override
372 final boolean canResize() {return true;}
373
374 @Override
375 final public long size() {return size;}
376
377 @Override
378 final public int arrayLength() {return array.length;}
379
380 @Override
381 protected abstract void resize();
382
383 /*--------------------------------------------------------------*/
384 /*---------------- Info Dumping ----------------*/
385 /*--------------------------------------------------------------*/
386
387 @Override
388 public final boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount, int maxcount){
389 if(twoD){
390 final int[] singleton=new int[1];
391 for(int i=0; i<array.length; i++){
392 long kmer=array[i];
393 if(kmer!=NOT_PRESENT){
394 tsw.print(toText(kmer, readCellValues(i, singleton), k).append('\n'));
395 }
396 }
397 }else{
398 for(int i=0; i<array.length; i++){
399 long kmer=array[i];
400 if(kmer!=NOT_PRESENT && (mincount<2 || readCellValue(i)>=mincount)){
401 tsw.print(toText(kmer, readCellValue(i), k).append('\n'));
402 }
403 }
404 }
405 if(victims!=null){
406 victims.dumpKmersAsText(tsw, k, mincount, maxcount);
407 }
408 return true;
409 }
410
411 @Override
412 public final boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount, int maxcount, AtomicLong remaining){
413 if(twoD){
414 final int[] singleton=new int[1];
415 for(int i=0; i<array.length; i++){
416 long kmer=array[i];
417 if(kmer!=NOT_PRESENT){
418 if(remaining!=null && remaining.decrementAndGet()<0){return true;}
419 bsw.printlnKmer(kmer, readCellValues(i, singleton), k);
420 }
421 }
422 }else{
423 for(int i=0; i<array.length; i++){
424 long kmer=array[i];
425 if(kmer!=NOT_PRESENT && (mincount<2 || readCellValue(i)>=mincount)){
426 if(remaining!=null && remaining.decrementAndGet()<0){return true;}
427 bsw.printlnKmer(kmer, readCellValue(i), k);
428 }
429 }
430 }
431 if(victims!=null){
432 victims.dumpKmersAsBytes(bsw, k, mincount, maxcount, remaining);
433 }
434 return true;
435 }
436
437 @Override
438 public final boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount, int maxcount, AtomicLong remaining){
439 if(twoD){
440 final int[] singleton=new int[1];
441 for(int i=0; i<array.length; i++){
442 long kmer=array[i];
443 if(kmer!=NOT_PRESENT){
444 if(remaining!=null && remaining.decrementAndGet()<0){return true;}
445 toBytes(kmer, readCellValues(i, singleton), k, bb);
446 bb.nl();
447 if(bb.length()>=16000){
448 ByteBuilder bb2=new ByteBuilder(bb);
449 synchronized(bsw){bsw.addJob(bb2);}
450 bb.clear();
451 }
452 }
453 }
454 }else{
455 for(int i=0; i<array.length; i++){
456 long kmer=array[i];
457 if(kmer!=NOT_PRESENT && (mincount<2 || readCellValue(i)>=mincount)){
458 if(remaining!=null && remaining.decrementAndGet()<0){return true;}
459 toBytes(kmer, readCellValue(i), k, bb);
460 bb.nl();
461 if(bb.length()>=16000){
462 ByteBuilder bb2=new ByteBuilder(bb);
463 synchronized(bsw){bsw.addJob(bb2);}
464 bb.clear();
465 }
466 }
467 }
468 }
469 if(victims!=null){
470 victims.dumpKmersAsBytes_MT(bsw, bb, k, mincount, maxcount, remaining);
471 }
472 return true;
473 }
474
475 @Override
476 public final void fillHistogram(long[] ca, int max){
477 for(int i=0; i<array.length; i++){
478 long kmer=array[i];
479 if(kmer!=NOT_PRESENT){
480 int count=Tools.min(readCellValue(i), max);
481 ca[count]++;
482 }
483 }
484 if(victims!=null){
485 victims.fillHistogram(ca, max);
486 }
487 }
488
489 @Override
490 public void fillHistogram(SuperLongList sll){
491 for(int i=0; i<array.length; i++){
492 long kmer=array[i];
493 if(kmer!=NOT_PRESENT){
494 int count=readCellValue(i);
495 sll.add(count);
496 }
497 }
498 if(victims!=null){
499 victims.fillHistogram(sll);
500 }
501 }
502
503 @Override
504 public final void countGC(long[] gcCounts, int max){
505 for(int i=0; i<array.length; i++){
506 long kmer=array[i];
507 if(kmer!=NOT_PRESENT){
508 int count=readCellValue(i);
509 int index=Tools.min(count, max);
510 gcCounts[index]+=gc(kmer);
511 }
512 }
513 if(victims!=null){
514 victims.countGC(gcCounts, max);
515 }
516 }
517
518 public HashForest victims(){
519 return victims;
520 }
521
522 /*--------------------------------------------------------------*/
523 /*---------------- Fields ----------------*/
524 /*--------------------------------------------------------------*/
525
526 AtomicIntegerArray owners;
527 long[] array;
528 int prime;
529 long size=0;
530 long sizeLimit;
531 final HashForest victims;
532 final boolean autoResize;
533 public final boolean twoD;
534 private final Lock lock=new ReentrantLock();
535 private final long coreMask;//for ways
536 private final long coreMask2;//for cells
537
538 protected int nextScheduleSize(){
539 if(schedulePos<schedule.length-1){schedulePos++;}
540 return schedule[schedulePos];
541 }
542
543 protected boolean atMaxSize(){
544 return schedulePos>=schedule.length-1;
545 }
546
547 protected final int[] schedule;
548 private int schedulePos=0;
549
550 public long[] array(){return array;}
551
552 public AtomicIntegerArray owners() {return owners;}
553 @Override
554 final Lock getLock(){return lock;}
555
556 /*--------------------------------------------------------------*/
557 /*---------------- Static Fields ----------------*/
558 /*--------------------------------------------------------------*/
559
560 final static int victimRatio=16; //Initial divisor for victim cache size; it self-resizes.
561 final static int extra=60; //Amazingly, increasing this gave increasing returns past 300. Old default was 21. Could allow higher maxLoadFactorFinal and smaller victim cache.
562 final static int maxPrime=Primes.primeAtMost(Integer.MAX_VALUE-extra-20);
563 final static float resizeMult=2f; //Resize by a minimum of this much; not needed for schedule
564 final static float minLoadFactor=0.58f; //Resize by enough to get the load above this factor; not needed for schedule
565 final static float maxLoadFactor=0.88f; //Reaching this load triggers resizing
566 final static float maxLoadFactorFinal=0.95f; //Reaching this load triggers killing
567 final static float minLoadMult=1/minLoadFactor;
568 final static float maxLoadMult=1/maxLoadFactor;
569
570 }