comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/kmer/AbstractKmerTable.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package kmer;
2
3 import java.util.concurrent.atomic.AtomicIntegerArray;
4 import java.util.concurrent.atomic.AtomicLong;
5 import java.util.concurrent.locks.Lock;
6
7 import dna.AminoAcid;
8 import fileIO.ByteStreamWriter;
9 import fileIO.TextStreamWriter;
10 import shared.KillSwitch;
11 import shared.Shared;
12 import shared.Tools;
13 import structures.ByteBuilder;
14 import structures.SuperLongList;
15
16
17 /**
18 * @author Brian Bushnell
19 * @date Oct 23, 2013
20 *
21 */
22 public abstract class AbstractKmerTable {
23
24 /*--------------------------------------------------------------*/
25 /*---------------- Abstract Methods ----------------*/
26 /*--------------------------------------------------------------*/
27
28 // /** Returns count */
29 // public final int increment(long kmer){return increment(kmer, 1);}
30
31 /** Returns count */
32 public abstract int increment(final long kmer, final int incr);
33
34 // /** Returns number of entries created */
35 // public final int incrementAndReturnNumCreated(final long kmer){return incrementAndReturnNumCreated(kmer, 1);}
36
37 /** Returns number of entries created. Incr must be positive. */
38 public abstract int incrementAndReturnNumCreated(final long kmer, final int incr);
39
40 public abstract int set(long kmer, int value);
41
42 // public abstract int set(long kmer, int[] vals);
43
44 /** This is for IntList3 support with HashArrayHybridFast */
45 public abstract int set(long kmer, int[] vals, int vlen);
46
47 /** Returns number of kmers added */
48 public abstract int setIfNotPresent(long kmer, int value);
49
50 /**
51 * Fetch the value associated with a kmer.
52 * @param kmer
53 * @return A value. -1 means the kmer was not present.
54 */
55 public abstract int getValue(long kmer);
56
57 /**
58 * Fetch the values associated with a kmer.
59 * @param kmer
60 * @param singleton A blank array of length 1.
61 * @return An array filled with values. Values of -1 are invalid.
62 */
63 public abstract int[] getValues(long kmer, int[] singleton);
64
65 public abstract boolean contains(long kmer);
66
67 public final boolean contains(long kmer, int v){
68 assert(TESTMODE);
69 int[] set=getValues(kmer, new int[] {-1});
70 if(set==null){return false;}
71 for(int s : set){
72 if(s==-1){break;}
73 if(s==v){return true;}
74 }
75 return false;
76 }
77
78 public final boolean contains(long kmer, int[] vals){
79 assert(TESTMODE);
80 int[] set=getValues(kmer, new int[] {-1});
81 if(set==null){return false;}
82 boolean success=true;
83 for(int v : vals){
84 if(v==-1){break;}
85 success=false;
86 for(int s : set){
87 if(s==v){
88 success=true;
89 break;
90 }
91 }
92 if(!success){break;}
93 }
94 return success;
95 }
96
97 public abstract void rebalance();
98
99 public abstract long size();
100 public abstract int arrayLength();
101 public abstract boolean canRebalance();
102
103 public abstract boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount, int maxcount);
104 public abstract boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount, int maxcount, AtomicLong remaining);
105 public abstract boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount, int maxcount, AtomicLong remaining);
106
107 public abstract void fillHistogram(long[] ca, int max);
108 public abstract void fillHistogram(SuperLongList sll);
109 public abstract void countGC(long[] gcCounts, int max);
110
111 public static final int gc(long kmer){
112 int gc=0;
113 while(kmer>0){
114 long x=kmer&3;
115 kmer>>>=2;
116 if(x==1 || x==2){gc++;}
117 }
118 return gc;
119 }
120
121 abstract Object get(long kmer);
122 abstract void resize();
123 abstract boolean canResize();
124
125
126
127 /**
128 * Removes entries with a value of the limit or less.
129 * Rehashes the remainder.
130 * @return Number removed.
131 */
132 abstract long regenerate(int limit);
133
134 final void lock(){getLock().lock();}
135 final void unlock(){getLock().unlock();}
136 final boolean tryLock(){return getLock().tryLock();}
137 Lock getLock(){
138 throw new RuntimeException("Unimplemented.");
139 }
140
141 /*--------------------------------------------------------------*/
142 /*--------------- Allocation Methods ----------------*/
143 /*--------------------------------------------------------------*/
144
145 final static AtomicIntegerArray allocAtomicInt(int len){
146 return KillSwitch.allocAtomicInt(len);
147 }
148
149 final static long[] allocLong1D(int len){
150 return KillSwitch.allocLong1D(len);
151 }
152
153 final static long[][] allocLong2D(int mult, int len){
154 return KillSwitch.allocLong2D(mult, len);
155 }
156
157 final static int[] allocInt1D(int len){
158 return KillSwitch.allocInt1D(len);
159 }
160
161 final static int[][] allocInt2D(int len){
162 return KillSwitch.allocInt2D(len);
163 }
164
165 final static KmerNode[] allocKmerNodeArray(int len){
166 KmerNode[] ret=null;
167 try {
168 ret=new KmerNode[len];
169 } catch (OutOfMemoryError e) {
170 synchronized(killMessage){
171 e.printStackTrace();
172 System.err.println(killMessage);
173 // Shared.printMemory();
174 KillSwitch.killSilent();
175 }
176 }
177 return ret;
178 }
179
180 /*--------------------------------------------------------------*/
181 /*--------------- Ownership Methods ----------------*/
182 /*--------------------------------------------------------------*/
183
184 /** Set the thread owning this kmer. Return the new owner.
185 * Will only change the owner if newOwner is greater than current owner. */
186 public abstract int setOwner(long kmer, int newOwner);
187
188 /** Reset owner to -1 if this is the current owner. */
189 public abstract boolean clearOwner(long kmer, int owner);
190
191 /** Return the thread ID owning this kmer, or -1. */
192 public abstract int getOwner(long kmer);
193
194 /** Create data structures needed for ownership representation */
195 public abstract void initializeOwnership();
196
197 /** Eliminate ownership data structures or set them to -1. */
198 public abstract void clearOwnership();
199
200 /*--------------------------------------------------------------*/
201 /*---------------- Methods ----------------*/
202 /*--------------------------------------------------------------*/
203
204 public static final StringBuilder toText(long kmer, int k){
205 byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase);
206 int bits=(Shared.AMINO_IN ? 5 : 2);
207 int mask=(Shared.AMINO_IN ? 31 : 3);
208 StringBuilder sb=new StringBuilder(k);
209 for(int i=k-1; i>=0; i--){
210 int x=(int)((kmer>>(bits*i))&mask);
211 sb.append((char)lookup[x]);
212 }
213 return sb;
214 }
215
216 static final StringBuilder toText(long kmer, int count, int k){
217 StringBuilder sb=new StringBuilder(k+10);
218 return toText(kmer, count, k, sb);
219 }
220
221 static final ByteBuilder toBytes(long kmer, int count, int k){
222 ByteBuilder bb=new ByteBuilder(k+10);
223 return toBytes(kmer, count, k, bb);
224 }
225
226 static final StringBuilder toText(long kmer, int[] values, int k){
227 StringBuilder sb=new StringBuilder(k+10);
228 return toText(kmer, values, k, sb);
229 }
230
231 static final ByteBuilder toBytes(long kmer, int[] values, int k){
232 ByteBuilder bb=new ByteBuilder(k+10);
233 return toBytes(kmer, values, k, bb);
234 }
235
236 static final StringBuilder toText(long kmer, int count, int k, StringBuilder sb){
237 byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase);
238 int bits=(Shared.AMINO_IN ? 5 : 2);
239 int mask=(Shared.AMINO_IN ? 31 : 3);
240 if(FASTA_DUMP){
241 sb.append('>');
242 sb.append(count);
243 sb.append('\n');
244 for(int i=k-1; i>=0; i--){
245 int x=(int)((kmer>>(bits*i))&mask);
246 sb.append((char)lookup[x]);
247 }
248 }else{
249 for(int i=k-1; i>=0; i--){
250 int x=(int)((kmer>>(bits*i))&mask);
251 sb.append((char)lookup[x]);
252 }
253 sb.append('\t');
254 sb.append(count);
255 }
256 return sb;
257 }
258
259 static final StringBuilder toText(long kmer, int[] values, int k, StringBuilder sb){
260 byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase);
261 int bits=(Shared.AMINO_IN ? 5 : 2);
262 int mask=(Shared.AMINO_IN ? 31 : 3);
263 if(FASTA_DUMP){
264 sb.append('>');
265 for(int i=0; i<values.length; i++){
266 int x=values[i];
267 if(x==-1){break;}
268 if(i>0){sb.append(',');}
269 sb.append(x);
270 }
271 sb.append('\n');
272 for(int i=k-1; i>=0; i--){
273 int x=(int)((kmer>>(bits*i))&mask);
274 sb.append((char)lookup[x]);
275 }
276 }else{
277 for(int i=k-1; i>=0; i--){
278 int x=(int)((kmer>>(bits*i))&mask);
279 sb.append((char)lookup[x]);
280 }
281 sb.append('\t');
282 for(int i=0; i<values.length; i++){
283 int x=values[i];
284 if(x==-1){break;}
285 if(i>0){sb.append(',');}
286 sb.append(x);
287 }
288 }
289 return sb;
290 }
291
292 public static final ByteBuilder toBytes(long kmer, long count, int k, ByteBuilder bb){
293 byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase);
294 int bits=(Shared.AMINO_IN ? 5 : 2);
295 int mask=(Shared.AMINO_IN ? 31 : 3);
296 if(FASTA_DUMP){
297 bb.append('>');
298 bb.append(count);
299 bb.nl();
300 for(int i=k-1; i>=0; i--){
301 int x=(int)((kmer>>(bits*i))&mask);
302 bb.append(lookup[x]);
303 }
304 // assert(false) : kmer+"->\n"+bb+"\n"+AminoAcid.kmerToStringAA(kmer, k);
305 }else if(NUMERIC_DUMP){
306 bb.append(Long.toHexString(kmer));
307 bb.tab();
308 bb.append(count);
309 }else{
310 for(int i=k-1; i>=0; i--){
311 int x=(int)((kmer>>(bits*i))&mask);
312 bb.append(lookup[x]);
313 }
314 bb.tab();
315 bb.append(count);
316 }
317 return bb;
318 }
319
320 public static final ByteBuilder toBytes(long kmer, int[] values, int k, ByteBuilder bb){
321 byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase);
322 int bits=(Shared.AMINO_IN ? 5 : 2);
323 int mask=(Shared.AMINO_IN ? 31 : 3);
324 if(FASTA_DUMP){
325 bb.append('>');
326 for(int i=0; i<values.length; i++){
327 int x=values[i];
328 if(x==-1){break;}
329 if(i>0){bb.append(',');}
330 bb.append(x);
331 }
332 bb.nl();
333 for(int i=k-1; i>=0; i--){
334 int x=(int)((kmer>>(bits*i))&mask);
335 bb.append(lookup[x]);
336 }
337 }else if(NUMERIC_DUMP){
338 bb.append(Long.toHexString(kmer));
339 bb.tab();
340 for(int i=0; i<values.length; i++){
341 int x=values[i];
342 if(x==-1){break;}
343 if(i>0){bb.append(',');}
344 bb.append(x);
345 }
346 }else{
347 for(int i=k-1; i>=0; i--){
348 int x=(int)((kmer>>(bits*i))&mask);
349 bb.append(lookup[x]);
350 }
351 bb.tab();
352 for(int i=0; i<values.length; i++){
353 int x=values[i];
354 if(x==-1){break;}
355 if(i>0){bb.append(',');}
356 bb.append(x);
357 }
358 }
359 return bb;
360 }
361
362 // static void appendKmerText(long kmer, int count, int k, StringBuilder sb){
363 // sb.setLength(0);
364 // toText(kmer, count, k, sb);
365 // sb.append('\n');
366 // }
367
368 static void appendKmerText(long kmer, int count, int k, ByteBuilder bb){
369 bb.setLength(0);
370 toBytes(kmer, count, k, bb);
371 bb.nl();
372 }
373
374
375 /** For buffered tables. */
376 long flush(){
377 throw new RuntimeException("Unsupported.");
378 }
379
380 /**
381 * This allocates the data structures in multiple threads. Unfortunately, it does not lead to any speedup, at least for ARRAY type.
382 * @param ways
383 * @param tableType
384 * @param schedule
385 * @param mask
386 * @return The preallocated table
387 */
388 public static final AbstractKmerTable[] preallocate(int ways, int tableType, int[] schedule, long mask){
389
390 final AbstractKmerTable[] tables=new AbstractKmerTable[ways];
391
392 {
393 shared.Timer tm=new shared.Timer();
394 final int t=Tools.max(1, Tools.min(Shared.threads(), 2, ways)); //More than 2 still improves allocation time, but only slightly; ~25% faster at t=4.
395 final AllocThread[] allocators=new AllocThread[t];
396 for(int i=0; i<t; i++){
397 allocators[i]=new AllocThread(tableType, schedule, i, t, mask, tables);
398 }
399 for(AllocThread at : allocators){at.start();}
400 for(AllocThread at : allocators){
401 while(at.getState()!=Thread.State.TERMINATED){
402 try {
403 at.join();
404 } catch (InterruptedException e) {
405 // TODO Auto-generated catch block
406 e.printStackTrace();
407 }
408 }
409 }
410 tm.stop();
411 if(AbstractKmerTableSet.DISPLAY_PROGRESS){System.err.println(tm);}
412 }
413
414 synchronized(tables){
415 for(int i=0; i<tables.length; i++){
416 final AbstractKmerTable akt=tables[i];
417 if(akt==null){
418 throw new RuntimeException("KmerTable allocation failed, probably due to lack of RAM: "+i+", "+tables.length);
419 }
420 }
421 }
422
423 return tables;
424 }
425
426 /*--------------------------------------------------------------*/
427 /*---------------- Nested Classes ----------------*/
428 /*--------------------------------------------------------------*/
429
430 private static class AllocThread extends Thread{
431
432 AllocThread(int type_, int[] schedule_, int mod_, int div_,
433 long mask_, AbstractKmerTable[] tables_){
434 type=type_;
435 schedule=schedule_;
436 size=schedule[0];
437 mod=mod_;
438 div=div_;
439 mask=mask_;
440 growable=schedule.length>1;
441 tables=tables_;
442 }
443
444 @Override
445 public void run(){
446 //Initialize tables
447 long sum=0;
448
449 // Shared.printMemory();}
450 for(int i=mod; i<tables.length; i+=div){
451 // System.err.println("T"+i+" allocating "+i);
452 final AbstractKmerTable akt;
453 if(type==FOREST1D){
454 akt=new HashForest(size, growable, false);
455 }else if(type==TABLE){
456 akt=new KmerTable(size, growable);
457 }else if(type==ARRAY1D){
458 akt=new HashArray1D(schedule, mask);
459 // long len=((HashArray1D)akt).arrayLength();
460 // long mem=((HashArray1D)akt).calcMem();
461 // sum+=mem;
462 // akt=new HashArray1D(size, -1, mask, growable);//TODO: Set maxSize
463 }else if(type==NODE1D){
464 throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
465 // akt=new KmerNode2(-1, 0);
466 }else if(type==FOREST2D){
467 akt=new HashForest(size, growable, true);
468 }else if(type==TABLE2D){
469 throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
470 }else if(type==ARRAY2D){
471 akt=new HashArray2D(schedule, mask);
472 // akt=new HashArray2D(size, -1, mask, growable);//TODO: Set maxSize
473 }else if(type==NODE2D){
474 throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
475 // akt=new KmerNode(-1, 0);
476 }else if(type==ARRAYH){
477 akt=new HashArrayHybrid(schedule, mask);
478 // akt=new HashArrayHybrid(size, -1, mask, growable);//TODO: Set maxSize
479 }else if(type==ARRAYHF){
480 akt=new HashArrayHybridFast(schedule, mask);
481 // akt=new HashArrayHybrid(size, -1, mask, growable);//TODO: Set maxSize
482 }else{
483 throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
484 }
485 synchronized(tables){
486 tables[i]=akt;
487 }
488 // System.err.println("T"+i+" allocated "+i);
489
490 // if(i%100==0){
491 // System.err.println("Allocated "+(sum/1000000)+"MB");
492 // Shared.printMemory();
493 // }
494 }
495 // Shared.printMemory();
496 // assert(false) : ("Allocated "+(sum/1000000)+"MB");
497 }
498
499 private final int type;
500 private final int[] schedule;
501 private final int size;
502 private final int mod;
503 private final int div;
504 private final long mask;
505 private final boolean growable;
506 final AbstractKmerTable[] tables;
507
508 }
509
510 public Walker walk() {
511 throw new RuntimeException("Unimplemented");
512 }
513
514 /*--------------------------------------------------------------*/
515 /*---------------- Fields ----------------*/
516 /*--------------------------------------------------------------*/
517
518 public static boolean FASTA_DUMP=true;
519 public static boolean NUMERIC_DUMP=false;
520 public static boolean TWO_PASS_RESIZE=false;
521
522 public static final boolean verbose=false;
523 public static final boolean TESTMODE=false; //123 SLOW!
524
525 public static final int UNKNOWN=0, ARRAY1D=1, FOREST1D=2, TABLE=3, NODE1D=4, ARRAY2D=5, FOREST2D=6, TABLE2D=7, NODE2D=8, ARRAYH=9, ARRAYHF=10;
526
527 public static final int NOT_PRESENT=-1, HASH_COLLISION=-2;
528 public static final int NO_OWNER=-1;
529
530 private static final String killMessage=new String("\nThis program ran out of memory. Try increasing the -Xmx flag and setting prealloc.");
531
532 }