diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bbmin/Minimizer.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bbmin/Minimizer.java	Tue Mar 18 16:23:26 2025 -0400
@@ -0,0 +1,159 @@
+package bbmin;
+
+import java.util.Arrays;
+
+/**
+ * Generates an array of minimal hash codes (as positive 64-bit longs) for an input sequence.<br>
+ * The resulting array is guaranteed to contain the minimal hash code<br>
+ * for every window, with no duplicates.
+ * On average this is expected to yield 2*(L-K)/W hash codes for sequence length L and window size W.
+ * 
+ * @author Brian Bushnell
+ * @date October 8, 2021
+ *
+ */
+public class Minimizer {
+	
+	public static void main(String[] args){
+		int k=4, w=7;
+		String seq="ACGTCTGAGCCTTGACACATGACT";
+		try {
+			k=Integer.parseInt(args[0]);
+			w=Integer.parseInt(args[1]);
+			seq=args[2];
+		} catch (NumberFormatException e) {
+			//e.printStackTrace();
+			System.err.println("Usage: bbmin.Minimizer kmerlen window seq\n"
+					+ "E.G.\n"
+					+ "bbmin.Minimizer 4 7 ACGTCTGAGCCTTGACACATGACT");
+			System.exit(1);
+		}
+		Minimizer minnow=new Minimizer(k, w);
+		long[] array=minnow.minimize(seq.getBytes());
+		System.err.println(Arrays.toString(array));
+	}
+
+	public Minimizer(int k_, int window_){this(k_, window_, 2);}
+	public Minimizer(int k_, int window_, int bitsPerSymbol_){
+		k=k_;
+		window=window_;
+		bitsPerSymbol=bitsPerSymbol_;
+		shift=bitsPerSymbol*k;
+		shift2=shift-bitsPerSymbol;
+		mask=(shift>63 ? -1L : ~((-1L)<<shift));
+	}
+	
+	public long[] minimize(String str){
+		return minimize(str.getBytes());
+	}
+
+	public long[] minimize(byte[] bases){
+		return minimize(bases, new LongList(16), new LongHashSet(16));
+	}
+
+	/** This method is typically faster since you don't need to construct a new set each time. */
+	public long[] minimize(byte[] bases, LongList list, LongHashSet set){
+		list.clear();
+		//If the set is way too big, resize it
+		if(set.capacity()*(long)window>100L+16L*bases.length){
+			set.resizeDestructive(16);
+		}else{
+			set.clear();
+		}
+		
+		long kmersProcessed=0;
+		long kmer=0;
+		long rkmer=0;
+		int len=0;
+		
+		long bestCode=Long.MAX_VALUE;
+		int bestPosition=-1;
+		long bestKmer=-1;
+		long bestRkmer=-1;
+		int currentWindow=0;
+		
+		for(int i=0; i<bases.length; i++){
+			byte b=bases[i];
+			long x=baseToNumber[b];
+			long x2=baseToComplementNumber[b];
+			
+			kmer=((kmer<<2)|x)&mask;
+			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
+			if(x<0){
+				len=0;
+				rkmer=0;
+			}else{
+				len++;
+			}
+			
+			if(len>=k){
+				kmersProcessed++;
+				currentWindow++;
+
+				final long hashcode=hash(kmer, rkmer);
+				System.err.println("i="+i+", code="+hashcode);
+
+				//Track the best code in the window and its state
+				if(hashcode>=minCode && hashcode<=bestCode){
+					bestCode=hashcode;
+					bestPosition=i;
+					bestKmer=kmer;
+					bestRkmer=rkmer;
+				}
+				
+				//Once the window size is met, store the best code,
+				//and backtrack to its position to start the next window
+				if(currentWindow>=window && bestPosition>=0){
+					if(!set.contains(bestCode)){
+						set.add(bestCode);
+						list.add(bestCode);
+					}
+					i=bestPosition;
+					kmer=bestKmer;
+					rkmer=bestRkmer;
+					len=k;
+					
+					bestCode=Long.MAX_VALUE;
+					bestPosition=-1;
+					currentWindow=0;
+				}
+			}
+		}
+		list.sort();//optional
+		return list.toArray();
+	}
+
+	public static long canon(long kmer, long rkmer){return max(kmer, rkmer);}
+	public static long hash(long kmer, long rkmer){return hash(canon(kmer, rkmer));}
+	public static long hash(long key) {
+		key = (~key) + (key << 21); // key = (key << 21) - key - 1;
+		key = key ^ (key >>> 24);
+		key = (key + (key << 3)) + (key << 8); // key * 265
+		key = key ^ (key >>> 14);
+		key = (key + (key << 2)) + (key << 4); // key * 21
+		key = key ^ (key >>> 28);
+		key = key + (key << 31);
+		return key;
+	}
+	private static final long max(long x, long y){return x>y ? x : y;}
+
+	public final int k;
+	public final int window;
+	public final int bitsPerSymbol; //2 for nucleotides, 5 for amino acids.
+	private final int shift;
+	private final int shift2;
+	private final long mask;
+	private final long minCode=0;
+	
+	static final byte[] baseToNumber = new byte[128];
+	static final byte[] baseToComplementNumber = new byte[128];
+	
+	static {
+		Arrays.fill(baseToNumber, (byte)-1);
+		Arrays.fill(baseToComplementNumber, (byte)-1);
+		baseToNumber['A']=baseToNumber['a']=baseToComplementNumber['T']=baseToComplementNumber['t']=0;
+		baseToNumber['C']=baseToNumber['c']=baseToComplementNumber['G']=baseToComplementNumber['c']=1;
+		baseToNumber['G']=baseToNumber['g']=baseToComplementNumber['C']=baseToComplementNumber['g']=2;
+		baseToNumber['T']=baseToNumber['t']=baseToComplementNumber['A']=baseToComplementNumber['a']=3;
+	}
+}