view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/kmer/AbstractKmerTable.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line source
package kmer;

import java.util.concurrent.atomic.AtomicIntegerArray;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Lock;

import dna.AminoAcid;
import fileIO.ByteStreamWriter;
import fileIO.TextStreamWriter;
import shared.KillSwitch;
import shared.Shared;
import shared.Tools;
import structures.ByteBuilder;
import structures.SuperLongList;


/**
 * @author Brian Bushnell
 * @date Oct 23, 2013
 *
 */
public abstract class AbstractKmerTable {
	
	/*--------------------------------------------------------------*/
	/*----------------       Abstract Methods       ----------------*/
	/*--------------------------------------------------------------*/
	
//	/** Returns count */
//	public final int increment(long kmer){return increment(kmer, 1);}
	
	/** Returns count */
	public abstract int increment(final long kmer, final int incr);
	
//	/** Returns number of entries created */
//	public final int incrementAndReturnNumCreated(final long kmer){return incrementAndReturnNumCreated(kmer, 1);}
	
	/** Returns number of entries created.  Incr must be positive. */
	public abstract int incrementAndReturnNumCreated(final long kmer, final int incr);

	public abstract int set(long kmer, int value);

//	public abstract int set(long kmer, int[] vals);
	
	/** This is for IntList3 support with HashArrayHybridFast */
	public abstract int set(long kmer, int[] vals, int vlen);
	
	/** Returns number of kmers added */
	public abstract int setIfNotPresent(long kmer, int value);

	/**
	 * Fetch the value associated with a kmer.
	 * @param kmer
	 * @return A value.  -1 means the kmer was not present.
	 */
	public abstract int getValue(long kmer);
	
	/**
	 * Fetch the values associated with a kmer.
	 * @param kmer
	 * @param singleton A blank array of length 1.
	 * @return An array filled with values.  Values of -1 are invalid.
	 */
	public abstract int[] getValues(long kmer, int[] singleton);

	public abstract boolean contains(long kmer);
	
	public final boolean contains(long kmer, int v){
		assert(TESTMODE);
		int[] set=getValues(kmer, new int[] {-1});
		if(set==null){return false;}
		for(int s : set){
			if(s==-1){break;}
			if(s==v){return true;}
		}
		return false;
	}
	
	public final boolean contains(long kmer, int[] vals){
		assert(TESTMODE);
		int[] set=getValues(kmer, new int[] {-1});
		if(set==null){return false;}
		boolean success=true;
		for(int v : vals){
			if(v==-1){break;}
			success=false;
			for(int s : set){
				if(s==v){
					success=true;
					break;
				}
			}
			if(!success){break;}
		}
		return success;
	}

	public abstract void rebalance();

	public abstract long size();
	public abstract int arrayLength();
	public abstract boolean canRebalance();

	public abstract boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount, int maxcount);
	public abstract boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount, int maxcount, AtomicLong remaining);
	public abstract boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount, int maxcount, AtomicLong remaining);

	public abstract void fillHistogram(long[] ca, int max);
	public abstract void fillHistogram(SuperLongList sll);
	public abstract void countGC(long[] gcCounts, int max);
	
	public static final int gc(long kmer){
		int gc=0;
		while(kmer>0){
			long x=kmer&3;
			kmer>>>=2;
			if(x==1 || x==2){gc++;}
		}
		return gc;
	}
	
	abstract Object get(long kmer);
	abstract void resize();
	abstract boolean canResize();
	


	/**
	 * Removes entries with a value of the limit or less.
	 * Rehashes the remainder.
	 * @return Number removed.
	 */
	abstract long regenerate(int limit);

	final void lock(){getLock().lock();}
	final void unlock(){getLock().unlock();}
	final boolean tryLock(){return getLock().tryLock();}
	Lock getLock(){
		throw new RuntimeException("Unimplemented.");
	}
	
	/*--------------------------------------------------------------*/
	/*---------------       Allocation Methods      ----------------*/
	/*--------------------------------------------------------------*/
	
	final static AtomicIntegerArray allocAtomicInt(int len){
		return KillSwitch.allocAtomicInt(len);
	}
	
	final static long[] allocLong1D(int len){
		return KillSwitch.allocLong1D(len);
	}
	
	final static long[][] allocLong2D(int mult, int len){
		return KillSwitch.allocLong2D(mult, len);
	}
	
	final static int[] allocInt1D(int len){
		return KillSwitch.allocInt1D(len);
	}
	
	final static int[][] allocInt2D(int len){
		return KillSwitch.allocInt2D(len);
	}
	
	final static KmerNode[] allocKmerNodeArray(int len){
		KmerNode[] ret=null;
		try {
			ret=new KmerNode[len];
		} catch (OutOfMemoryError e) {
			synchronized(killMessage){
				e.printStackTrace();
				System.err.println(killMessage);
//				Shared.printMemory();
				KillSwitch.killSilent();
			}
		}
		return ret;
	}
	
	/*--------------------------------------------------------------*/
	/*---------------       Ownership Methods       ----------------*/
	/*--------------------------------------------------------------*/
	
	/** Set the thread owning this kmer.  Return the new owner.
	 * Will only change the owner if newOwner is greater than current owner. */
	public abstract int setOwner(long kmer, int newOwner);
	
	/** Reset owner to -1 if this is the current owner. */
	public abstract boolean clearOwner(long kmer, int owner);
	
	/** Return the thread ID owning this kmer, or -1. */
	public abstract int getOwner(long kmer);
	
	/** Create data structures needed for ownership representation */
	public abstract void initializeOwnership();
	
	/** Eliminate ownership data structures or set them to -1. */
	public abstract void clearOwnership();
	
	/*--------------------------------------------------------------*/
	/*----------------           Methods            ----------------*/
	/*--------------------------------------------------------------*/
	
	public static final StringBuilder toText(long kmer, int k){
		byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase);
		int bits=(Shared.AMINO_IN ? 5 : 2);
		int mask=(Shared.AMINO_IN ? 31 : 3);
		StringBuilder sb=new StringBuilder(k);
		for(int i=k-1; i>=0; i--){
			int x=(int)((kmer>>(bits*i))&mask);
			sb.append((char)lookup[x]);
		}
		return sb;
	}

	static final StringBuilder toText(long kmer, int count, int k){
		StringBuilder sb=new StringBuilder(k+10);
		return toText(kmer, count, k, sb);
	}

	static final ByteBuilder toBytes(long kmer, int count, int k){
		ByteBuilder bb=new ByteBuilder(k+10);
		return toBytes(kmer, count, k, bb);
	}

	static final StringBuilder toText(long kmer, int[] values, int k){
		StringBuilder sb=new StringBuilder(k+10);
		return toText(kmer, values, k, sb);
	}

	static final ByteBuilder toBytes(long kmer, int[] values, int k){
		ByteBuilder bb=new ByteBuilder(k+10);
		return toBytes(kmer, values, k, bb);
	}
	
	static final StringBuilder toText(long kmer, int count, int k, StringBuilder sb){
		byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase);
		int bits=(Shared.AMINO_IN ? 5 : 2);
		int mask=(Shared.AMINO_IN ? 31 : 3);
		if(FASTA_DUMP){
			sb.append('>');
			sb.append(count);
			sb.append('\n');
			for(int i=k-1; i>=0; i--){
				int x=(int)((kmer>>(bits*i))&mask);
				sb.append((char)lookup[x]);
			}
		}else{
			for(int i=k-1; i>=0; i--){
				int x=(int)((kmer>>(bits*i))&mask);
				sb.append((char)lookup[x]);
			}
			sb.append('\t');
			sb.append(count);
		}
		return sb;
	}
	
	static final StringBuilder toText(long kmer, int[] values, int k, StringBuilder sb){
		byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase);
		int bits=(Shared.AMINO_IN ? 5 : 2);
		int mask=(Shared.AMINO_IN ? 31 : 3);
		if(FASTA_DUMP){
			sb.append('>');
			for(int i=0; i<values.length; i++){
				int x=values[i];
				if(x==-1){break;}
				if(i>0){sb.append(',');}
				sb.append(x);
			}
			sb.append('\n');
			for(int i=k-1; i>=0; i--){
				int x=(int)((kmer>>(bits*i))&mask);
				sb.append((char)lookup[x]);
			}
		}else{
			for(int i=k-1; i>=0; i--){
				int x=(int)((kmer>>(bits*i))&mask);
				sb.append((char)lookup[x]);
			}
			sb.append('\t');
			for(int i=0; i<values.length; i++){
				int x=values[i];
				if(x==-1){break;}
				if(i>0){sb.append(',');}
				sb.append(x);
			}
		}
		return sb;
	}
	
	public static final ByteBuilder toBytes(long kmer, long count, int k, ByteBuilder bb){
		byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase);
		int bits=(Shared.AMINO_IN ? 5 : 2);
		int mask=(Shared.AMINO_IN ? 31 : 3);
		if(FASTA_DUMP){
			bb.append('>');
			bb.append(count);
			bb.nl();
			for(int i=k-1; i>=0; i--){
				int x=(int)((kmer>>(bits*i))&mask);
				bb.append(lookup[x]);
			}
//			assert(false) : kmer+"->\n"+bb+"\n"+AminoAcid.kmerToStringAA(kmer, k);
		}else if(NUMERIC_DUMP){
			bb.append(Long.toHexString(kmer));
			bb.tab();
			bb.append(count);
		}else{
			for(int i=k-1; i>=0; i--){
				int x=(int)((kmer>>(bits*i))&mask);
				bb.append(lookup[x]);
			}
			bb.tab();
			bb.append(count);
		}
		return bb;
	}
	
	public static final ByteBuilder toBytes(long kmer, int[] values, int k, ByteBuilder bb){
		byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase);
		int bits=(Shared.AMINO_IN ? 5 : 2);
		int mask=(Shared.AMINO_IN ? 31 : 3);
		if(FASTA_DUMP){
			bb.append('>');
			for(int i=0; i<values.length; i++){
				int x=values[i];
				if(x==-1){break;}
				if(i>0){bb.append(',');}
				bb.append(x);
			}
			bb.nl();
			for(int i=k-1; i>=0; i--){
				int x=(int)((kmer>>(bits*i))&mask);
				bb.append(lookup[x]);
			}
		}else if(NUMERIC_DUMP){
			bb.append(Long.toHexString(kmer));
			bb.tab();
			for(int i=0; i<values.length; i++){
				int x=values[i];
				if(x==-1){break;}
				if(i>0){bb.append(',');}
				bb.append(x);
			}
		}else{
			for(int i=k-1; i>=0; i--){
				int x=(int)((kmer>>(bits*i))&mask);
				bb.append(lookup[x]);
			}
			bb.tab();
			for(int i=0; i<values.length; i++){
				int x=values[i];
				if(x==-1){break;}
				if(i>0){bb.append(',');}
				bb.append(x);
			}
		}
		return bb;
	}
	
//	static void appendKmerText(long kmer, int count, int k, StringBuilder sb){
//		sb.setLength(0);
//		toText(kmer, count, k, sb);
//		sb.append('\n');
//	}
	
	static void appendKmerText(long kmer, int count, int k, ByteBuilder bb){
		bb.setLength(0);
		toBytes(kmer, count, k, bb);
		bb.nl();
	}
	
	
	/** For buffered tables. */
	long flush(){
		throw new RuntimeException("Unsupported.");
	}
	
	/**
	 * This allocates the data structures in multiple threads.  Unfortunately, it does not lead to any speedup, at least for ARRAY type.
	 * @param ways
	 * @param tableType
	 * @param schedule
	 * @param mask
	 * @return The preallocated table
	 */
	public static final AbstractKmerTable[] preallocate(int ways, int tableType, int[] schedule, long mask){

		final AbstractKmerTable[] tables=new AbstractKmerTable[ways];
		
		{
			shared.Timer tm=new shared.Timer();
			final int t=Tools.max(1, Tools.min(Shared.threads(), 2, ways)); //More than 2 still improves allocation time, but only slightly; ~25% faster at t=4.
			final AllocThread[] allocators=new AllocThread[t];
			for(int i=0; i<t; i++){
				allocators[i]=new AllocThread(tableType, schedule, i, t, mask, tables);
			}
			for(AllocThread at : allocators){at.start();}
			for(AllocThread at : allocators){
				while(at.getState()!=Thread.State.TERMINATED){
					try {
						at.join();
					} catch (InterruptedException e) {
						// TODO Auto-generated catch block
						e.printStackTrace();
					}
				}
			}
			tm.stop();
			if(AbstractKmerTableSet.DISPLAY_PROGRESS){System.err.println(tm);}
		}
		
		synchronized(tables){
			for(int i=0; i<tables.length; i++){
				final AbstractKmerTable akt=tables[i];
				if(akt==null){
					throw new RuntimeException("KmerTable allocation failed, probably due to lack of RAM: "+i+", "+tables.length);
				}
			}
		}
		
		return tables;
	}
	
	/*--------------------------------------------------------------*/
	/*----------------        Nested Classes        ----------------*/
	/*--------------------------------------------------------------*/
	
	private static class AllocThread extends Thread{
		
		AllocThread(int type_, int[] schedule_, int mod_, int div_,
				long mask_, AbstractKmerTable[] tables_){
			type=type_;
			schedule=schedule_;
			size=schedule[0];
			mod=mod_;
			div=div_;
			mask=mask_;
			growable=schedule.length>1;
			tables=tables_;
		}
		
		@Override
		public void run(){
			//Initialize tables
			long sum=0;

//			Shared.printMemory();}
			for(int i=mod; i<tables.length; i+=div){
//				System.err.println("T"+i+" allocating "+i);
				final AbstractKmerTable akt;
				if(type==FOREST1D){
					akt=new HashForest(size, growable, false);
				}else if(type==TABLE){
					akt=new KmerTable(size, growable);
				}else if(type==ARRAY1D){
					akt=new HashArray1D(schedule, mask);
//					long len=((HashArray1D)akt).arrayLength();
//					long mem=((HashArray1D)akt).calcMem();
//					sum+=mem;
//					akt=new HashArray1D(size, -1, mask, growable);//TODO: Set maxSize
				}else if(type==NODE1D){
					throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
//					akt=new KmerNode2(-1, 0);
				}else if(type==FOREST2D){
					akt=new HashForest(size, growable, true);
				}else if(type==TABLE2D){
					throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
				}else if(type==ARRAY2D){
					akt=new HashArray2D(schedule, mask);
//					akt=new HashArray2D(size, -1, mask, growable);//TODO: Set maxSize
				}else if(type==NODE2D){
					throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
//					akt=new KmerNode(-1, 0);
				}else if(type==ARRAYH){
					akt=new HashArrayHybrid(schedule, mask);
//					akt=new HashArrayHybrid(size, -1, mask, growable);//TODO: Set maxSize
				}else if(type==ARRAYHF){
					akt=new HashArrayHybridFast(schedule, mask);
//					akt=new HashArrayHybrid(size, -1, mask, growable);//TODO: Set maxSize
				}else{
					throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
				}
				synchronized(tables){
					tables[i]=akt;
				}
//				System.err.println("T"+i+" allocated "+i);
				
//				if(i%100==0){
//					System.err.println("Allocated "+(sum/1000000)+"MB");
//					Shared.printMemory();
//				}
			}
//			Shared.printMemory();
//			assert(false) : ("Allocated "+(sum/1000000)+"MB");
		}
		
		private final int type;
		private final int[] schedule;
		private final int size;
		private final int mod;
		private final int div;
		private final long mask;
		private final boolean growable;
		final AbstractKmerTable[] tables;
		
	}
	
	public Walker walk() {
		throw new RuntimeException("Unimplemented");
	}
	
	/*--------------------------------------------------------------*/
	/*----------------            Fields            ----------------*/
	/*--------------------------------------------------------------*/
	
	public static boolean FASTA_DUMP=true;
	public static boolean NUMERIC_DUMP=false;
	public static boolean TWO_PASS_RESIZE=false;
	
	public static final boolean verbose=false;
	public static final boolean TESTMODE=false; //123 SLOW!
	
	public static final int UNKNOWN=0, ARRAY1D=1, FOREST1D=2, TABLE=3, NODE1D=4, ARRAY2D=5, FOREST2D=6, TABLE2D=7, NODE2D=8, ARRAYH=9, ARRAYHF=10;
	
	public static final int NOT_PRESENT=-1, HASH_COLLISION=-2;
	public static final int NO_OWNER=-1;
	
	private static final String killMessage=new String("\nThis program ran out of memory.  Try increasing the -Xmx flag and setting prealloc.");
	
}