Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/kmer/AbstractKmerTable.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/kmer/AbstractKmerTable.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,532 @@ +package kmer; + +import java.util.concurrent.atomic.AtomicIntegerArray; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.Lock; + +import dna.AminoAcid; +import fileIO.ByteStreamWriter; +import fileIO.TextStreamWriter; +import shared.KillSwitch; +import shared.Shared; +import shared.Tools; +import structures.ByteBuilder; +import structures.SuperLongList; + + +/** + * @author Brian Bushnell + * @date Oct 23, 2013 + * + */ +public abstract class AbstractKmerTable { + + /*--------------------------------------------------------------*/ + /*---------------- Abstract Methods ----------------*/ + /*--------------------------------------------------------------*/ + +// /** Returns count */ +// public final int increment(long kmer){return increment(kmer, 1);} + + /** Returns count */ + public abstract int increment(final long kmer, final int incr); + +// /** Returns number of entries created */ +// public final int incrementAndReturnNumCreated(final long kmer){return incrementAndReturnNumCreated(kmer, 1);} + + /** Returns number of entries created. Incr must be positive. */ + public abstract int incrementAndReturnNumCreated(final long kmer, final int incr); + + public abstract int set(long kmer, int value); + +// public abstract int set(long kmer, int[] vals); + + /** This is for IntList3 support with HashArrayHybridFast */ + public abstract int set(long kmer, int[] vals, int vlen); + + /** Returns number of kmers added */ + public abstract int setIfNotPresent(long kmer, int value); + + /** + * Fetch the value associated with a kmer. + * @param kmer + * @return A value. -1 means the kmer was not present. + */ + public abstract int getValue(long kmer); + + /** + * Fetch the values associated with a kmer. + * @param kmer + * @param singleton A blank array of length 1. + * @return An array filled with values. Values of -1 are invalid. + */ + public abstract int[] getValues(long kmer, int[] singleton); + + public abstract boolean contains(long kmer); + + public final boolean contains(long kmer, int v){ + assert(TESTMODE); + int[] set=getValues(kmer, new int[] {-1}); + if(set==null){return false;} + for(int s : set){ + if(s==-1){break;} + if(s==v){return true;} + } + return false; + } + + public final boolean contains(long kmer, int[] vals){ + assert(TESTMODE); + int[] set=getValues(kmer, new int[] {-1}); + if(set==null){return false;} + boolean success=true; + for(int v : vals){ + if(v==-1){break;} + success=false; + for(int s : set){ + if(s==v){ + success=true; + break; + } + } + if(!success){break;} + } + return success; + } + + public abstract void rebalance(); + + public abstract long size(); + public abstract int arrayLength(); + public abstract boolean canRebalance(); + + public abstract boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount, int maxcount); + public abstract boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount, int maxcount, AtomicLong remaining); + public abstract boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount, int maxcount, AtomicLong remaining); + + public abstract void fillHistogram(long[] ca, int max); + public abstract void fillHistogram(SuperLongList sll); + public abstract void countGC(long[] gcCounts, int max); + + public static final int gc(long kmer){ + int gc=0; + while(kmer>0){ + long x=kmer&3; + kmer>>>=2; + if(x==1 || x==2){gc++;} + } + return gc; + } + + abstract Object get(long kmer); + abstract void resize(); + abstract boolean canResize(); + + + + /** + * Removes entries with a value of the limit or less. + * Rehashes the remainder. + * @return Number removed. + */ + abstract long regenerate(int limit); + + final void lock(){getLock().lock();} + final void unlock(){getLock().unlock();} + final boolean tryLock(){return getLock().tryLock();} + Lock getLock(){ + throw new RuntimeException("Unimplemented."); + } + + /*--------------------------------------------------------------*/ + /*--------------- Allocation Methods ----------------*/ + /*--------------------------------------------------------------*/ + + final static AtomicIntegerArray allocAtomicInt(int len){ + return KillSwitch.allocAtomicInt(len); + } + + final static long[] allocLong1D(int len){ + return KillSwitch.allocLong1D(len); + } + + final static long[][] allocLong2D(int mult, int len){ + return KillSwitch.allocLong2D(mult, len); + } + + final static int[] allocInt1D(int len){ + return KillSwitch.allocInt1D(len); + } + + final static int[][] allocInt2D(int len){ + return KillSwitch.allocInt2D(len); + } + + final static KmerNode[] allocKmerNodeArray(int len){ + KmerNode[] ret=null; + try { + ret=new KmerNode[len]; + } catch (OutOfMemoryError e) { + synchronized(killMessage){ + e.printStackTrace(); + System.err.println(killMessage); +// Shared.printMemory(); + KillSwitch.killSilent(); + } + } + return ret; + } + + /*--------------------------------------------------------------*/ + /*--------------- Ownership Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** Set the thread owning this kmer. Return the new owner. + * Will only change the owner if newOwner is greater than current owner. */ + public abstract int setOwner(long kmer, int newOwner); + + /** Reset owner to -1 if this is the current owner. */ + public abstract boolean clearOwner(long kmer, int owner); + + /** Return the thread ID owning this kmer, or -1. */ + public abstract int getOwner(long kmer); + + /** Create data structures needed for ownership representation */ + public abstract void initializeOwnership(); + + /** Eliminate ownership data structures or set them to -1. */ + public abstract void clearOwnership(); + + /*--------------------------------------------------------------*/ + /*---------------- Methods ----------------*/ + /*--------------------------------------------------------------*/ + + public static final StringBuilder toText(long kmer, int k){ + byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase); + int bits=(Shared.AMINO_IN ? 5 : 2); + int mask=(Shared.AMINO_IN ? 31 : 3); + StringBuilder sb=new StringBuilder(k); + for(int i=k-1; i>=0; i--){ + int x=(int)((kmer>>(bits*i))&mask); + sb.append((char)lookup[x]); + } + return sb; + } + + static final StringBuilder toText(long kmer, int count, int k){ + StringBuilder sb=new StringBuilder(k+10); + return toText(kmer, count, k, sb); + } + + static final ByteBuilder toBytes(long kmer, int count, int k){ + ByteBuilder bb=new ByteBuilder(k+10); + return toBytes(kmer, count, k, bb); + } + + static final StringBuilder toText(long kmer, int[] values, int k){ + StringBuilder sb=new StringBuilder(k+10); + return toText(kmer, values, k, sb); + } + + static final ByteBuilder toBytes(long kmer, int[] values, int k){ + ByteBuilder bb=new ByteBuilder(k+10); + return toBytes(kmer, values, k, bb); + } + + static final StringBuilder toText(long kmer, int count, int k, StringBuilder sb){ + byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase); + int bits=(Shared.AMINO_IN ? 5 : 2); + int mask=(Shared.AMINO_IN ? 31 : 3); + if(FASTA_DUMP){ + sb.append('>'); + sb.append(count); + sb.append('\n'); + for(int i=k-1; i>=0; i--){ + int x=(int)((kmer>>(bits*i))&mask); + sb.append((char)lookup[x]); + } + }else{ + for(int i=k-1; i>=0; i--){ + int x=(int)((kmer>>(bits*i))&mask); + sb.append((char)lookup[x]); + } + sb.append('\t'); + sb.append(count); + } + return sb; + } + + static final StringBuilder toText(long kmer, int[] values, int k, StringBuilder sb){ + byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase); + int bits=(Shared.AMINO_IN ? 5 : 2); + int mask=(Shared.AMINO_IN ? 31 : 3); + if(FASTA_DUMP){ + sb.append('>'); + for(int i=0; i<values.length; i++){ + int x=values[i]; + if(x==-1){break;} + if(i>0){sb.append(',');} + sb.append(x); + } + sb.append('\n'); + for(int i=k-1; i>=0; i--){ + int x=(int)((kmer>>(bits*i))&mask); + sb.append((char)lookup[x]); + } + }else{ + for(int i=k-1; i>=0; i--){ + int x=(int)((kmer>>(bits*i))&mask); + sb.append((char)lookup[x]); + } + sb.append('\t'); + for(int i=0; i<values.length; i++){ + int x=values[i]; + if(x==-1){break;} + if(i>0){sb.append(',');} + sb.append(x); + } + } + return sb; + } + + public static final ByteBuilder toBytes(long kmer, long count, int k, ByteBuilder bb){ + byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase); + int bits=(Shared.AMINO_IN ? 5 : 2); + int mask=(Shared.AMINO_IN ? 31 : 3); + if(FASTA_DUMP){ + bb.append('>'); + bb.append(count); + bb.nl(); + for(int i=k-1; i>=0; i--){ + int x=(int)((kmer>>(bits*i))&mask); + bb.append(lookup[x]); + } +// assert(false) : kmer+"->\n"+bb+"\n"+AminoAcid.kmerToStringAA(kmer, k); + }else if(NUMERIC_DUMP){ + bb.append(Long.toHexString(kmer)); + bb.tab(); + bb.append(count); + }else{ + for(int i=k-1; i>=0; i--){ + int x=(int)((kmer>>(bits*i))&mask); + bb.append(lookup[x]); + } + bb.tab(); + bb.append(count); + } + return bb; + } + + public static final ByteBuilder toBytes(long kmer, int[] values, int k, ByteBuilder bb){ + byte[] lookup=(Shared.AMINO_IN ? AminoAcid.numberToAcid : AminoAcid.numberToBase); + int bits=(Shared.AMINO_IN ? 5 : 2); + int mask=(Shared.AMINO_IN ? 31 : 3); + if(FASTA_DUMP){ + bb.append('>'); + for(int i=0; i<values.length; i++){ + int x=values[i]; + if(x==-1){break;} + if(i>0){bb.append(',');} + bb.append(x); + } + bb.nl(); + for(int i=k-1; i>=0; i--){ + int x=(int)((kmer>>(bits*i))&mask); + bb.append(lookup[x]); + } + }else if(NUMERIC_DUMP){ + bb.append(Long.toHexString(kmer)); + bb.tab(); + for(int i=0; i<values.length; i++){ + int x=values[i]; + if(x==-1){break;} + if(i>0){bb.append(',');} + bb.append(x); + } + }else{ + for(int i=k-1; i>=0; i--){ + int x=(int)((kmer>>(bits*i))&mask); + bb.append(lookup[x]); + } + bb.tab(); + for(int i=0; i<values.length; i++){ + int x=values[i]; + if(x==-1){break;} + if(i>0){bb.append(',');} + bb.append(x); + } + } + return bb; + } + +// static void appendKmerText(long kmer, int count, int k, StringBuilder sb){ +// sb.setLength(0); +// toText(kmer, count, k, sb); +// sb.append('\n'); +// } + + static void appendKmerText(long kmer, int count, int k, ByteBuilder bb){ + bb.setLength(0); + toBytes(kmer, count, k, bb); + bb.nl(); + } + + + /** For buffered tables. */ + long flush(){ + throw new RuntimeException("Unsupported."); + } + + /** + * This allocates the data structures in multiple threads. Unfortunately, it does not lead to any speedup, at least for ARRAY type. + * @param ways + * @param tableType + * @param schedule + * @param mask + * @return The preallocated table + */ + public static final AbstractKmerTable[] preallocate(int ways, int tableType, int[] schedule, long mask){ + + final AbstractKmerTable[] tables=new AbstractKmerTable[ways]; + + { + shared.Timer tm=new shared.Timer(); + final int t=Tools.max(1, Tools.min(Shared.threads(), 2, ways)); //More than 2 still improves allocation time, but only slightly; ~25% faster at t=4. + final AllocThread[] allocators=new AllocThread[t]; + for(int i=0; i<t; i++){ + allocators[i]=new AllocThread(tableType, schedule, i, t, mask, tables); + } + for(AllocThread at : allocators){at.start();} + for(AllocThread at : allocators){ + while(at.getState()!=Thread.State.TERMINATED){ + try { + at.join(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + tm.stop(); + if(AbstractKmerTableSet.DISPLAY_PROGRESS){System.err.println(tm);} + } + + synchronized(tables){ + for(int i=0; i<tables.length; i++){ + final AbstractKmerTable akt=tables[i]; + if(akt==null){ + throw new RuntimeException("KmerTable allocation failed, probably due to lack of RAM: "+i+", "+tables.length); + } + } + } + + return tables; + } + + /*--------------------------------------------------------------*/ + /*---------------- Nested Classes ----------------*/ + /*--------------------------------------------------------------*/ + + private static class AllocThread extends Thread{ + + AllocThread(int type_, int[] schedule_, int mod_, int div_, + long mask_, AbstractKmerTable[] tables_){ + type=type_; + schedule=schedule_; + size=schedule[0]; + mod=mod_; + div=div_; + mask=mask_; + growable=schedule.length>1; + tables=tables_; + } + + @Override + public void run(){ + //Initialize tables + long sum=0; + +// Shared.printMemory();} + for(int i=mod; i<tables.length; i+=div){ +// System.err.println("T"+i+" allocating "+i); + final AbstractKmerTable akt; + if(type==FOREST1D){ + akt=new HashForest(size, growable, false); + }else if(type==TABLE){ + akt=new KmerTable(size, growable); + }else if(type==ARRAY1D){ + akt=new HashArray1D(schedule, mask); +// long len=((HashArray1D)akt).arrayLength(); +// long mem=((HashArray1D)akt).calcMem(); +// sum+=mem; +// akt=new HashArray1D(size, -1, mask, growable);//TODO: Set maxSize + }else if(type==NODE1D){ + throw new RuntimeException("Must use forest, table, or array data structure. Type="+type); +// akt=new KmerNode2(-1, 0); + }else if(type==FOREST2D){ + akt=new HashForest(size, growable, true); + }else if(type==TABLE2D){ + throw new RuntimeException("Must use forest, table, or array data structure. Type="+type); + }else if(type==ARRAY2D){ + akt=new HashArray2D(schedule, mask); +// akt=new HashArray2D(size, -1, mask, growable);//TODO: Set maxSize + }else if(type==NODE2D){ + throw new RuntimeException("Must use forest, table, or array data structure. Type="+type); +// akt=new KmerNode(-1, 0); + }else if(type==ARRAYH){ + akt=new HashArrayHybrid(schedule, mask); +// akt=new HashArrayHybrid(size, -1, mask, growable);//TODO: Set maxSize + }else if(type==ARRAYHF){ + akt=new HashArrayHybridFast(schedule, mask); +// akt=new HashArrayHybrid(size, -1, mask, growable);//TODO: Set maxSize + }else{ + throw new RuntimeException("Must use forest, table, or array data structure. Type="+type); + } + synchronized(tables){ + tables[i]=akt; + } +// System.err.println("T"+i+" allocated "+i); + +// if(i%100==0){ +// System.err.println("Allocated "+(sum/1000000)+"MB"); +// Shared.printMemory(); +// } + } +// Shared.printMemory(); +// assert(false) : ("Allocated "+(sum/1000000)+"MB"); + } + + private final int type; + private final int[] schedule; + private final int size; + private final int mod; + private final int div; + private final long mask; + private final boolean growable; + final AbstractKmerTable[] tables; + + } + + public Walker walk() { + throw new RuntimeException("Unimplemented"); + } + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + public static boolean FASTA_DUMP=true; + public static boolean NUMERIC_DUMP=false; + public static boolean TWO_PASS_RESIZE=false; + + public static final boolean verbose=false; + public static final boolean TESTMODE=false; //123 SLOW! + + public static final int UNKNOWN=0, ARRAY1D=1, FOREST1D=2, TABLE=3, NODE1D=4, ARRAY2D=5, FOREST2D=6, TABLE2D=7, NODE2D=8, ARRAYH=9, ARRAYHF=10; + + public static final int NOT_PRESENT=-1, HASH_COLLISION=-2; + public static final int NO_OWNER=-1; + + private static final String killMessage=new String("\nThis program ran out of memory. Try increasing the -Xmx flag and setting prealloc."); + +}