diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/kmer/HashArray.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/kmer/HashArray.java	Tue Mar 18 16:23:26 2025 -0400
@@ -0,0 +1,570 @@
+package kmer;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicIntegerArray;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+import fileIO.ByteStreamWriter;
+import fileIO.TextStreamWriter;
+import shared.Primes;
+import shared.Tools;
+import structures.ByteBuilder;
+import structures.SuperLongList;
+
+/**
+ * Stores kmers in a long[] and values in an int[][], with a victim cache.
+ * @author Brian Bushnell
+ * @date Nov 7, 2014
+ *
+ */
+public abstract class HashArray extends AbstractKmerTable {
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Initialization        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	HashArray(int[] schedule_, long coreMask_, boolean twod_){
+		schedule=schedule_;
+		autoResize=schedule.length>1;
+		prime=schedule[0];
+		
+		sizeLimit=(long)((schedule.length==1 ? maxLoadFactorFinal : maxLoadFactor)*prime);
+		array=allocLong1D(prime+extra);
+		victims=new HashForest(Tools.max(10, prime/victimRatio), autoResize, twod_);
+		Arrays.fill(array, NOT_PRESENT);
+		twoD=twod_;
+		coreMask=coreMask_;
+//		coreMask2=coreMask_|3;
+		coreMask2=coreMask_; //Simplifies fast fill
+	}
+	
+	HashArray(int initialSize, long coreMask_, boolean autoResize_, boolean twod_){
+		schedule=null;
+		prime=initialSize;
+		sizeLimit=(long)(maxLoadFactor*prime);
+		array=allocLong1D(prime+extra);
+		victims=new HashForest(Tools.max(10, initialSize/victimRatio), autoResize_, twod_);
+		Arrays.fill(array, NOT_PRESENT);
+		autoResize=autoResize_;
+		twoD=twod_;
+		coreMask=coreMask_;
+//		coreMask2=coreMask_|3;
+		coreMask2=coreMask_; //Simplifies fast fill
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Public Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Test Methods         ----------------*/
+	/*--------------------------------------------------------------*/
+	
+//	public final int set_Test(final long kmer, final int v){
+//		assert(TESTMODE);
+//		final int x;
+//		if(TWOD){
+//			int[] old=getValues(kmer, new int[1]);
+//			assert(old==null || contains(kmer, old));
+//			if(verbose){System.err.println("Fetched "+Arrays.toString(old));}
+//			x=set0(kmer, v);
+//			assert(old==null || contains(kmer, old)) : "old="+Arrays.toString(old)+", v="+v+", kmer="+kmer+
+//				", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
+//			assert(contains(kmer, v));
+//		}else{
+//			int old=getValue(kmer);
+//			assert(old==0 || old==-1 || contains(kmer, old));
+//			x=set0(kmer, v);
+//			assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
+//			assert(v==old || !contains(kmer, old));
+//		}
+//		return x;
+//	}
+//
+//	public final int set_Test(final long kmer, final int v[]){
+//		assert(TESTMODE);
+//		final int x;
+//		if(TWOD){
+//			final int[] singleton=new int[1];
+//			int[] old=getValues(kmer, singleton);
+//			assert(old==null || contains(kmer, old));
+//			if(verbose){System.err.println("Before: old="+Arrays.toString(old)+", v="+Arrays.toString(v));}
+//			x=set0(kmer, v);
+//			if(verbose){System.err.println("After:  old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", get()="+Arrays.toString(getValues(kmer, singleton)));}
+//			assert(old==null || contains(kmer, old)) : "old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", kmer="+kmer+
+//				", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
+//			assert(contains(kmer, v)) : "old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", kmer="+kmer+
+//				", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
+//		}else{
+//			int old=getValue(kmer);
+//			assert(old==0 || old==-1 || contains(kmer, old));
+//			x=set0(kmer, v);
+//			assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
+//			assert(v[0]==old || !contains(kmer, old));
+//		}
+//		return x;
+//	}
+//
+//	public final int setIfNotPresent_Test(long kmer, int v){
+//		assert(TESTMODE);
+//		final int x;
+//		if(TWOD){
+////			int[] vals=getValues(kmer, null);
+////			assert(vals==null || contains(kmer, vals));
+////			x=setIfNotPresent(kmer, v);
+////			assert(contains(kmer, vals));
+////			assert(contains(kmer, v));
+//			x=0;
+//			assert(false);
+//		}else{
+//			int old=getValue(kmer);
+//			assert(old==0 || old==-1 || contains(kmer, old));
+//			x=setIfNotPresent0(kmer, v);
+//			assert((old<1 && contains(kmer, v)) || (old>0 && contains(kmer, old))) : kmer+", "+old+", "+v;
+//		}
+//		return x;
+//	}
+	
+	/*--------------------------------------------------------------*/
+	
+	public final int kmerToCell(long kmer){
+		final int cell=(int)((kmer&coreMask)%prime);
+		return cell;
+	}
+	
+	@Override
+	public final int set(final long kmer, final int[] v, final int vlen){
+		int cell=kmerToCell(kmer);
+		
+		for(final int max=cell+extra; cell<max; cell++){
+			long n=array[cell];
+			if(n==kmer){
+				if(verbose){System.err.println("A2: Adding "+kmer+", "+Arrays.toString(v)+", "+cell);}
+				insertValue(kmer, v, cell, vlen);
+				if(verbose){System.err.println("A2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+				return 0;
+			}else if(n==NOT_PRESENT){
+				if(verbose){System.err.println("B2: Adding "+kmer+", "+Arrays.toString(v)+", "+cell);}
+				array[cell]=kmer;
+				insertValue(kmer, v, cell, vlen);
+				if(verbose){System.err.println("B2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+				size++;
+				if(autoResize && size+victims.size>sizeLimit){resize();}
+				return 1;
+			}
+		}
+		if(verbose){System.err.println("C2: Adding "+kmer+", "+v+", "+cell);}
+		final int x=victims.set(kmer, v, vlen);
+		if(autoResize && size+victims.size>sizeLimit){resize();}
+		if(verbose){System.err.println("C2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+		return x;
+	}
+	
+	@Override
+	public final int set(final long kmer, final int v){
+		int cell=kmerToCell(kmer);
+		
+//		assert(TESTMODE);
+//		ll.add(kmer);
+//		il.add(v);
+		
+		for(final int max=cell+extra; cell<max; cell++){
+			long n=array[cell];
+			if(n==kmer){
+				if(verbose){System.err.println("A1: Adding "+kmer+", "+v+", "+cell);}
+				insertValue(kmer, v, cell);
+				if(verbose){System.err.println("A1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+				return 0;
+			}else if(n==NOT_PRESENT){
+				if(verbose){System.err.println("B1: Adding "+kmer+", "+v+", "+cell);}
+				array[cell]=kmer;
+				insertValue(kmer, v, cell);
+				if(verbose){System.err.println("B1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+				size++;
+				if(autoResize && size+victims.size>sizeLimit){resize();}
+				return 1;
+			}
+		}
+		if(verbose){System.err.println("C1: Adding "+kmer+", "+v+", "+cell+
+				"; victims.get(kmer)="+Arrays.toString(victims.getValues(kmer, new int[1])));}
+		final int x=victims.set(kmer, v);
+		if(autoResize && size+victims.size>sizeLimit){resize();}
+		if(verbose){System.err.println("C1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1]))+
+				"; victims.get(kmer)="+Arrays.toString(victims.getValues(kmer, new int[1])));}
+		return x;
+	}
+	
+	@Override
+	public final int setIfNotPresent(long kmer, int value){
+		int cell=kmerToCell(kmer);
+		for(final int max=cell+extra; cell<max; cell++){
+			long n=array[cell];
+			if(n==kmer){
+				return 0;
+			}else if(n==NOT_PRESENT){
+				array[cell]=kmer;
+				insertValue(kmer, value, cell);
+				size++;
+				if(autoResize && size+victims.size>sizeLimit){resize();}
+				return 1;
+			}
+		}
+//		System.err.println("size="+size+", prime="+prime+", limit="+sizeLimit);
+		int x=victims.setIfNotPresent(kmer, value);
+		if(autoResize && size+victims.size>sizeLimit){resize();}
+		return x;
+	}
+	
+	@Override
+	public final int getValue(long kmer){
+		int cell=findKmer(kmer);
+		if(cell==NOT_PRESENT){return NOT_PRESENT;}
+		if(cell==HASH_COLLISION){return victims.getValue(kmer);}
+		return readCellValue(cell);
+	}
+	
+	public final int getValue(long kmer, int startCell){
+		int cell=findKmer(kmer, startCell);
+		if(cell==NOT_PRESENT){return NOT_PRESENT;}
+		if(cell==HASH_COLLISION){return victims.getValue(kmer);}
+		return readCellValue(cell);
+	}
+	
+	@Override
+	public final int[] getValues(long kmer, int[] singleton){
+		int cell=findKmer(kmer);
+		if(cell==NOT_PRESENT){
+			singleton[0]=NOT_PRESENT;
+			return singleton;
+		}
+		if(cell==HASH_COLLISION){return victims.getValues(kmer, singleton);}
+		return readCellValues(cell, singleton);
+	}
+	
+	@Override
+	public final boolean contains(long kmer){
+		int cell=findKmer(kmer);
+		if(cell==NOT_PRESENT){return false;}
+		if(cell==HASH_COLLISION){return victims.contains(kmer);}
+		return true;
+	}
+	
+	public final long getKmer(int cell) {
+		return array[cell];
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------          Ownership           ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	@Override
+	public final void initializeOwnership(){
+		assert(owners==null);
+		owners=allocAtomicInt(array.length);
+		for(int i=0; i<array.length; i++){
+			owners.set(i, NO_OWNER);
+		}
+		victims.initializeOwnership();
+	}
+	
+	@Override
+	public final void clearOwnership(){
+		owners=null;
+		victims.clearOwnership();
+	}
+	
+	@Override
+	public final int setOwner(final long kmer, final int newOwner){
+		final int cell=findKmer(kmer);
+		assert(cell!=NOT_PRESENT);
+		if(cell==HASH_COLLISION){return victims.setOwner(kmer, newOwner);}
+		return setOwner(kmer, newOwner, cell);
+	}
+	
+	public final int setOwner(final long kmer, final int newOwner, final int cell){
+		assert(array[cell]==kmer);
+		final int original=owners.get(cell);
+		int current=original;
+		while(current<newOwner){
+			boolean success=owners.compareAndSet(cell, current, newOwner);
+			if(!success){current=owners.get(cell);}
+			else{current=newOwner;}
+		}
+		assert(current>=original) : "original="+original+", current="+current+", newOwner="+newOwner+", re-read="+owners.get(cell);
+		return current;
+	}
+	
+	@Override
+	public final boolean clearOwner(final long kmer, final int owner){
+		final int cell=findKmer(kmer);
+		assert(cell!=NOT_PRESENT);
+		if(cell==HASH_COLLISION){return victims.clearOwner(kmer, owner);}
+		return clearOwner(kmer, owner, cell);
+	}
+	
+	public final boolean clearOwner(final long kmer, final int owner, final int cell){
+		assert(array[cell]==kmer);
+		boolean success=owners.compareAndSet(cell, owner, NO_OWNER);
+		return success;
+	}
+	
+	@Override
+	public final int getOwner(final long kmer){
+		final int cell=findKmer(kmer);
+		assert(cell!=NOT_PRESENT);
+		if(cell==HASH_COLLISION){return victims.getOwner(kmer);}
+		return getCellOwner(cell);
+	}
+	
+	public final int getCellOwner(final int cell){
+		return owners.get(cell);
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------      Nonpublic Methods       ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	protected abstract void insertValue(final long kmer, final int v, final int cell);
+
+//	protected abstract void insertValue(final long kmer, final int[] vals, final int cell);
+	
+	/** This is for IntList3 support with HashArrayHybridFast */
+	protected abstract void insertValue(final long kmer, final int[] vals, final int cell, final int vlen);
+	
+	protected abstract int readCellValue(int cell);
+	protected abstract int[] readCellValues(int cell, int[] singleton);
+	
+	@Override
+	final Object get(long kmer){
+		throw new RuntimeException("Unimplemented.");
+	}
+	
+	final int findKmer(long kmer){
+		return findKmer(kmer, kmerToCell(kmer));
+	}
+	
+	final int findKmer(final long kmer, final int startCell){
+		int cell=startCell;
+		for(final int max=cell+extra; cell<max; cell++){
+			final long n=array[cell];
+			if(n==kmer){return cell;}
+			else if(n==NOT_PRESENT){return NOT_PRESENT;}
+		}
+		return HASH_COLLISION;
+	}
+	
+	final int findKmerOrEmpty(long kmer){
+		int cell=kmerToCell(kmer);
+		for(final int max=cell+extra; cell<max; cell++){
+			final long n=array[cell];
+			if(n==kmer || n==NOT_PRESENT){return cell;}
+		}
+		return HASH_COLLISION;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------   Resizing and Rebalancing   ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	@Override
+	final boolean canResize() {return true;}
+	
+	@Override
+	final public long size() {return size;}
+	
+	@Override
+	final public int arrayLength() {return array.length;}
+	
+	@Override
+	protected abstract void resize();
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Info Dumping         ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	@Override
+	public final boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount, int maxcount){
+		if(twoD){
+			final int[] singleton=new int[1];
+			for(int i=0; i<array.length; i++){
+				long kmer=array[i];
+				if(kmer!=NOT_PRESENT){
+					tsw.print(toText(kmer, readCellValues(i, singleton), k).append('\n'));
+				}
+			}
+		}else{
+			for(int i=0; i<array.length; i++){
+				long kmer=array[i];
+				if(kmer!=NOT_PRESENT && (mincount<2 || readCellValue(i)>=mincount)){
+					tsw.print(toText(kmer, readCellValue(i), k).append('\n'));
+				}
+			}
+		}
+		if(victims!=null){
+			victims.dumpKmersAsText(tsw, k, mincount, maxcount);
+		}
+		return true;
+	}
+	
+	@Override
+	public final boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount, int maxcount, AtomicLong remaining){
+		if(twoD){
+			final int[] singleton=new int[1];
+			for(int i=0; i<array.length; i++){
+				long kmer=array[i];
+				if(kmer!=NOT_PRESENT){
+					if(remaining!=null && remaining.decrementAndGet()<0){return true;}
+					bsw.printlnKmer(kmer, readCellValues(i, singleton), k);
+				}
+			}
+		}else{
+			for(int i=0; i<array.length; i++){
+				long kmer=array[i];
+				if(kmer!=NOT_PRESENT && (mincount<2 || readCellValue(i)>=mincount)){
+					if(remaining!=null && remaining.decrementAndGet()<0){return true;}
+					bsw.printlnKmer(kmer, readCellValue(i), k);
+				}
+			}
+		}
+		if(victims!=null){
+			victims.dumpKmersAsBytes(bsw, k, mincount, maxcount, remaining);
+		}
+		return true;
+	}
+	
+	@Override
+	public final boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount, int maxcount, AtomicLong remaining){
+		if(twoD){
+			final int[] singleton=new int[1];
+			for(int i=0; i<array.length; i++){
+				long kmer=array[i];
+				if(kmer!=NOT_PRESENT){
+					if(remaining!=null && remaining.decrementAndGet()<0){return true;}
+					toBytes(kmer, readCellValues(i, singleton), k, bb);
+					bb.nl();
+					if(bb.length()>=16000){
+						ByteBuilder bb2=new ByteBuilder(bb);
+						synchronized(bsw){bsw.addJob(bb2);}
+						bb.clear();
+					}
+				}
+			}
+		}else{
+			for(int i=0; i<array.length; i++){
+				long kmer=array[i];
+				if(kmer!=NOT_PRESENT && (mincount<2 || readCellValue(i)>=mincount)){
+					if(remaining!=null && remaining.decrementAndGet()<0){return true;}
+					toBytes(kmer, readCellValue(i), k, bb);
+					bb.nl();
+					if(bb.length()>=16000){
+						ByteBuilder bb2=new ByteBuilder(bb);
+						synchronized(bsw){bsw.addJob(bb2);}
+						bb.clear();
+					}
+				}
+			}
+		}
+		if(victims!=null){
+			victims.dumpKmersAsBytes_MT(bsw, bb, k, mincount, maxcount, remaining);
+		}
+		return true;
+	}
+	
+	@Override
+	public final void fillHistogram(long[] ca, int max){
+		for(int i=0; i<array.length; i++){
+			long kmer=array[i];
+			if(kmer!=NOT_PRESENT){
+				int count=Tools.min(readCellValue(i), max);
+				ca[count]++;
+			}
+		}
+		if(victims!=null){
+			victims.fillHistogram(ca, max);
+		}
+	}
+	
+	@Override
+	public void fillHistogram(SuperLongList sll){
+		for(int i=0; i<array.length; i++){
+			long kmer=array[i];
+			if(kmer!=NOT_PRESENT){
+				int count=readCellValue(i);
+				sll.add(count);
+			}
+		}
+		if(victims!=null){
+			victims.fillHistogram(sll);
+		}
+	}
+	
+	@Override
+	public final void countGC(long[] gcCounts, int max){
+		for(int i=0; i<array.length; i++){
+			long kmer=array[i];
+			if(kmer!=NOT_PRESENT){
+				int count=readCellValue(i);
+				int index=Tools.min(count, max);
+				gcCounts[index]+=gc(kmer);
+			}
+		}
+		if(victims!=null){
+			victims.countGC(gcCounts, max);
+		}
+	}
+	
+	public HashForest victims(){
+		return victims;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------            Fields            ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	AtomicIntegerArray owners;
+	long[] array;
+	int prime;
+	long size=0;
+	long sizeLimit;
+	final HashForest victims;
+	final boolean autoResize;
+	public final boolean twoD;
+	private final Lock lock=new ReentrantLock();
+	private final long coreMask;//for ways
+	private final long coreMask2;//for cells
+	
+	protected int nextScheduleSize(){
+		if(schedulePos<schedule.length-1){schedulePos++;}
+		return schedule[schedulePos];
+	}
+	
+	protected boolean atMaxSize(){
+		return schedulePos>=schedule.length-1;
+	}
+	
+	protected final int[] schedule;
+	private int schedulePos=0;
+	
+	public long[] array(){return array;}
+	
+	public AtomicIntegerArray owners() {return owners;}
+	@Override
+	final Lock getLock(){return lock;}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Static Fields         ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	final static int victimRatio=16; //Initial divisor for victim cache size; it self-resizes.
+	final static int extra=60; //Amazingly, increasing this gave increasing returns past 300.  Old default was 21.  Could allow higher maxLoadFactorFinal and smaller victim cache.
+	final static int maxPrime=Primes.primeAtMost(Integer.MAX_VALUE-extra-20);
+	final static float resizeMult=2f; //Resize by a minimum of this much; not needed for schedule
+	final static float minLoadFactor=0.58f; //Resize by enough to get the load above this factor; not needed for schedule
+	final static float maxLoadFactor=0.88f; //Reaching this load triggers resizing
+	final static float maxLoadFactorFinal=0.95f; //Reaching this load triggers killing
+	final static float minLoadMult=1/minLoadFactor;
+	final static float maxLoadMult=1/maxLoadFactor;
+	
+}