diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchIndex.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchIndex.java	Tue Mar 18 16:23:26 2025 -0400
@@ -0,0 +1,358 @@
+package sketch;
+
+import java.util.ArrayList;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import kmer.AbstractKmerTable;
+import kmer.HashBuffer;
+import kmer.KmerTableSet;
+import shared.Shared;
+import shared.Timer;
+import shared.Tools;
+import structures.AbstractBitSet;
+import structures.IntHashMap;
+import structures.IntHashSetList;
+import structures.IntList;
+
+public class SketchIndex extends SketchObject {
+	
+	public SketchIndex(ArrayList<Sketch> refs){
+		refSketches=refs;
+		tables=new KmerTableSet(new String[] {"ways="+WAYS, "tabletype="+AbstractKmerTable.ARRAYHF, "prealloc="+(prealloc>0 ? ""+prealloc : "f")}, 
+				20+(defaultParams.trackCounts() ? 4 : 0)+18);//An extra 18 bytes per kmer because a lot of them occur multiple times
+		tables.allocateTables();
+		tableArray=tables.tables();
+	}
+	
+	public void load(){
+		spawnIndexThreads();
+		if(useWhitelist){
+			assert(!Whitelist.exists());
+			Whitelist.initialize(tableArray);
+		}
+	}
+	
+	/** Spawn index threads */
+	private void spawnIndexThreads(){
+		
+		//Do anything necessary prior to processing
+		
+		//Determine how many threads may be used
+		final int threads=Shared.threads();
+		ArrayList<IndexThread> alht=new ArrayList<IndexThread>(threads);
+		AtomicInteger ai=new AtomicInteger(0);
+		AtomicLong totalKeys=new AtomicLong(0);
+		AtomicLong uniqueKeys=new AtomicLong(0);
+		for(int i=0; i<threads; i++){
+			alht.add(new IndexThread(ai, totalKeys, uniqueKeys));
+		}
+		
+		//Start the threads
+		for(IndexThread pt : alht){
+			pt.start();
+		}
+		
+		//Wait for completion of all threads
+		boolean success=true;
+		long codesProcessed=0;
+		for(IndexThread pt : alht){
+			
+			//Wait until this thread has terminated
+			while(pt.getState()!=Thread.State.TERMINATED){
+				try {
+					//Attempt a join operation
+					pt.join();
+					synchronized(pt){
+						codesProcessed+=pt.codesProcessedT;
+					}
+				} catch (InterruptedException e) {
+					//Potentially handle this, if it is expected to occur
+					e.printStackTrace();
+				}
+			}
+			success&=pt.success;
+		}
+		
+		//Track whether any threads failed
+		if(!success){errorState=true;}
+		
+		System.err.println("Indexed "+uniqueKeys+" unique and "+totalKeys+" total hashcodes."); //For some reason codesProcessed is nondeterministic.
+		
+		//Do anything necessary after processing
+//		System.gc();
+	}
+	
+	/*--------------------------------------------------------------*/
+	
+	public SketchResults getSketches(Sketch a, DisplayParams params){
+		if(useIntMap){
+			return getSketchesMap(a, params);
+		}else{
+			return getSketchesList(a, params);
+		}
+	}
+	
+	/** Return true if added. */
+	private boolean addToTaxSet(int sketchID, IntHashSetList taxSet, int taxLevelExtended){
+		Sketch sk=refSketches.get(sketchID);
+		int taxID=sk.taxID;
+		if(taxID<0 || taxID>=minFakeID){return false;}
+		taxID=taxtree.getIdAtLevelExtended(taxID, taxLevelExtended);
+		return taxSet.add(taxID);
+	}
+	
+	public SketchResults getSketchesList(final Sketch a, DisplayParams params){
+		final int minHits=params.minHits, contamLevel=params.contamLevel();
+		final boolean countContamHits=params.needContamCounts();//, metaFilter=params.hasMetaFilters(), taxFilter=params.hasTaxFilters();
+		final Timer t=(printTime ? new Timer() : null);
+		
+		final int[] singleton=new int[1];
+		final IntList idList=new IntList(Tools.min(targetSketchSize, indexLimit, 1000));
+		AbstractBitSet abs=a.indexBitSet();
+		assert((abs==null)!=countContamHits);
+		
+		final IntHashSetList taxSet;
+		final int[][] taxHits;
+		if(contamLevel>=0){
+			taxSet=new IntHashSetList(31);
+			taxHits=new int[a.length()][];
+			assert(taxtree!=null) : "A TaxTree is required for this operation.";
+		}else{
+			taxSet=null;
+			taxHits=null;
+		}
+		
+		for(int i=0; i<a.keys.length; i++){
+			long key=a.keys[i];
+			AbstractKmerTable set=tableArray[(int)(key%WAYS)];
+//			System.err.println(set.getValue(key));
+			final int[] ids=set.getValues(key, singleton);
+//			System.err.println(Arrays.toString(ids));
+			if(ids!=null && ids[0]>=0){
+				int incr=0;
+				for(int id : ids){
+					if(id>=0){
+						final int trueID=id-1;//Minimum id is 1, indicating sketch 0.
+						idList.add(trueID);//Minimum id is 1, indicating sketch 0.
+						incr++;
+						if(taxSet!=null){addToTaxSet(trueID, taxSet, contamLevel);}
+					}
+				}
+				if(countContamHits && incr>0){abs.increment(i, incr);}
+				if(taxSet!=null && taxSet.size()>0){
+					taxHits[i]=taxSet.toArray();
+					taxSet.clear();
+				}
+			}
+			
+		}
+		
+//		assert(abs!=null);
+//		assert(false) : abs.cardinality();
+		
+		if(printTime){
+			t.stop("\nTime for searching index: \t");
+			t.start();
+		}
+		
+//		System.err.println("idList.size:"+idList.size);
+		if(idList.size<minHits){return new SketchResults(a);}//null breaks some things
+		idList.sort();
+		
+		if(printTime){
+			t.stop("Time for sorting "+idList.size()+" hits:\t");
+			t.start();
+		}
+		
+		ArrayList<Sketch> list=new ArrayList<Sketch>(Tools.min(8, idList.size));
+
+		int last=-1;
+		int hits=0;
+		for(int i=0; i<idList.size; i++){
+			int id=idList.get(i);
+			if(id==last){
+//				System.err.println("A: "+last+", "+id+", "+count+", "+minHits);
+				hits++;
+			}else{
+//				System.err.println("B: "+last+", "+id+", "+count+", "+minHits);
+				if(last>-1 && (hits>=minHits)){
+					final Sketch ref=refSketches.get(last);
+					list.add(ref);
+				}
+				last=id;
+				hits=0;
+			}
+		}
+		if(last>-1 && (hits>=minHits)){
+			final Sketch ref=refSketches.get(last);
+//			if((!metaFilter || ref.passesMeta(params)) && (!taxFilter || params.passesFilter(ref))){list.add(ref);}
+			list.add(ref);
+		}
+		if(printTime){
+			t.stop("Time for fetching sketches: \t");
+		}
+		return list.isEmpty() ? new SketchResults(a) : new SketchResults(a, list, taxHits);
+	}
+	
+//	static ThreadLocal<IntHashMap> intMapHolder=new ThreadLocal<IntHashMap>();
+	
+	public final int[] getSketchIdsMap(long key, int[] singleton){
+		AbstractKmerTable set=tableArray[(int)(key%WAYS)];
+		final int[] ids=set.getValues(key, singleton);
+		return ids;
+	}
+	
+	public SketchResults getSketchesMap(final Sketch a, DisplayParams params){
+		final int minHits=params.minHits, contamLevel=params.contamLevel();
+		final boolean countContamHits=params.needContamCounts();//, metaFilter=params.hasMetaFilters(), taxFilter=params.hasTaxFilters();
+		final Timer t=(printTime ? new Timer() : null);
+		final int[] singleton=new int[1];
+		
+		final IntHashMap idMap=new IntHashMap(Tools.min(targetSketchSize, indexLimit, intMapSize), 0.7f);
+		
+		AbstractBitSet abs=a.indexBitSet();
+		assert((abs==null)!=countContamHits);
+		
+		final IntHashSetList taxSet;
+		final int[][] taxHits;
+		if(contamLevel>=0){
+			taxSet=new IntHashSetList(31);
+			taxHits=new int[a.length()][];
+			assert(taxtree!=null) : "A TaxTree is required for this operation.";
+		}else{
+			taxSet=null;
+			taxHits=null;
+		}
+//		assert(false) : (taxHits==null)+", "+contamLevel;
+		
+		if(printTime){
+			t.stop("\nTime for allocation:      \t");
+			t.start();
+		}
+		
+		int[] refHitCounts;
+		if(params.printRefHits){
+			refHitCounts=new int[a.keys.length];
+			a.setRefHitCounts(refHitCounts);
+		}else{refHitCounts=null;}
+		
+		for(int i=0; i<a.keys.length; i++){
+			long key=a.keys[i];
+			final int[] ids=getSketchIdsMap(key, singleton);
+			
+			if(ids!=null && ids[0]>=0){
+				int incr=0;
+				for(int id : ids){
+					if(id>=0){
+						final int trueID=id-1;//Minimum id is 1, indicating sketch 0.
+						idMap.increment(trueID);
+						if(!allToAll || compareSelf){incr++;}
+						if(taxSet!=null){addToTaxSet(trueID, taxSet, contamLevel);}
+						if(refHitCounts!=null && trueID!=a.sketchID){refHitCounts[i]++;}
+					}
+				}
+				if(countContamHits && incr>0){abs.increment(i, incr);}
+				if(taxSet!=null && taxSet.size()>0){
+					taxHits[i]=taxSet.toArray();
+					taxSet.clear();
+				}
+			}
+		}
+		
+		if(printTime){
+			t.stop("Time for searching index: \t");
+			System.err.println("Size:                   \t"+idMap.size());
+			t.start();
+		}
+		
+//		System.err.println("idList.size:"+idList.size);
+		final int size=idMap.size();
+		if(size==0){return new SketchResults(a);}//null breaks some things
+		
+		ArrayList<Sketch> list=new ArrayList<Sketch>(Tools.min(8, size));
+
+		final int[] keys=idMap.keys();
+		final int[] values=idMap.values();
+		
+		for(int i=0; i<keys.length; i++){
+			int value=values[i];
+			if(value>=minHits){
+				int id=keys[i];
+				final Sketch ref=refSketches.get(id);
+//				if((!metaFilter || ref.passesMeta(params)) && (!taxFilter || params.passesFilter(ref))){list.add(ref);}
+				list.add(ref);
+			}
+		}
+		if(printTime){
+			t.stop("Time for fetching sketches: \t");
+		}
+		return list.isEmpty() ? new SketchResults(a) : new SketchResults(a, list, taxHits);
+	}
+	
+	/*--------------------------------------------------------------*/
+	
+	public class IndexThread extends Thread {
+		
+		public IndexThread(AtomicInteger nextIndex_, AtomicLong keyCount_, AtomicLong uniqueKeyCount_){
+			buffer=new HashBuffer(tableArray, 1000, 31, true, false);
+			nextIndex=nextIndex_;
+			keyCount=keyCount_;
+			uniqueKeyCount=uniqueKeyCount_;
+		}
+		
+		@Override
+		public void run(){
+//			System.err.println("Thread running.");
+			int id=nextIndex.getAndIncrement();
+			final int numSketches=refSketches.size();
+			final int limit0=Tools.min((AUTOSIZE || AUTOSIZE_LINEAR ? Integer.MAX_VALUE : targetSketchSize), indexLimit);
+//			System.err.println("numSketches="+numSketches);
+			while(id<numSketches){
+				final Sketch sk=refSketches.get(id);
+				final long[] array=sk.keys;
+				final int limit=Tools.min(array.length, limit0);
+//				System.err.println("limit="+limit);
+				for(int i=0; i<limit; i++){
+					long key=array[i];
+					buffer.set(key, id+1);//Must be greater than zero
+					codesProcessedT++;
+				}
+				id=nextIndex.getAndIncrement();
+			}
+			long temp=buffer.flush();
+			
+			synchronized(this){
+				codesProcessedT+=0;
+				success=true;
+				keyCount.getAndAdd(codesProcessedT);
+				uniqueKeyCount.getAndAdd(buffer.uniqueAdded);
+//				if(codesProcessedT>0){System.err.println(codesProcessedT);}
+			}
+		}
+		
+		AtomicInteger nextIndex;
+		AtomicLong keyCount;
+		AtomicLong uniqueKeyCount;
+		long codesProcessedT=0;
+		HashBuffer buffer;
+		boolean success=false;
+		
+	}
+	
+	/*--------------------------------------------------------------*/
+	
+	public final KmerTableSet tables;
+	public final AbstractKmerTable[] tableArray;
+	public final ArrayList<Sketch> refSketches;
+	
+	public boolean errorState=false;
+
+	private static final boolean printTime=false;
+	public static boolean useIntMap=true;
+//	public static boolean useIntMapBinary=false;
+	public static int intMapSize=1000;
+	public static int indexLimit=Integer.MAX_VALUE;
+	public static final int WAYS=31;
+	
+}