view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bbmin/Minimizer.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line source
package bbmin;

import java.util.Arrays;

/**
 * Generates an array of minimal hash codes (as positive 64-bit longs) for an input sequence.<br>
 * The resulting array is guaranteed to contain the minimal hash code<br>
 * for every window, with no duplicates.
 * On average this is expected to yield 2*(L-K)/W hash codes for sequence length L and window size W.
 * 
 * @author Brian Bushnell
 * @date October 8, 2021
 *
 */
public class Minimizer {
	
	public static void main(String[] args){
		int k=4, w=7;
		String seq="ACGTCTGAGCCTTGACACATGACT";
		try {
			k=Integer.parseInt(args[0]);
			w=Integer.parseInt(args[1]);
			seq=args[2];
		} catch (NumberFormatException e) {
			//e.printStackTrace();
			System.err.println("Usage: bbmin.Minimizer kmerlen window seq\n"
					+ "E.G.\n"
					+ "bbmin.Minimizer 4 7 ACGTCTGAGCCTTGACACATGACT");
			System.exit(1);
		}
		Minimizer minnow=new Minimizer(k, w);
		long[] array=minnow.minimize(seq.getBytes());
		System.err.println(Arrays.toString(array));
	}

	public Minimizer(int k_, int window_){this(k_, window_, 2);}
	public Minimizer(int k_, int window_, int bitsPerSymbol_){
		k=k_;
		window=window_;
		bitsPerSymbol=bitsPerSymbol_;
		shift=bitsPerSymbol*k;
		shift2=shift-bitsPerSymbol;
		mask=(shift>63 ? -1L : ~((-1L)<<shift));
	}
	
	public long[] minimize(String str){
		return minimize(str.getBytes());
	}

	public long[] minimize(byte[] bases){
		return minimize(bases, new LongList(16), new LongHashSet(16));
	}

	/** This method is typically faster since you don't need to construct a new set each time. */
	public long[] minimize(byte[] bases, LongList list, LongHashSet set){
		list.clear();
		//If the set is way too big, resize it
		if(set.capacity()*(long)window>100L+16L*bases.length){
			set.resizeDestructive(16);
		}else{
			set.clear();
		}
		
		long kmersProcessed=0;
		long kmer=0;
		long rkmer=0;
		int len=0;
		
		long bestCode=Long.MAX_VALUE;
		int bestPosition=-1;
		long bestKmer=-1;
		long bestRkmer=-1;
		int currentWindow=0;
		
		for(int i=0; i<bases.length; i++){
			byte b=bases[i];
			long x=baseToNumber[b];
			long x2=baseToComplementNumber[b];
			
			kmer=((kmer<<2)|x)&mask;
			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
			if(x<0){
				len=0;
				rkmer=0;
			}else{
				len++;
			}
			
			if(len>=k){
				kmersProcessed++;
				currentWindow++;

				final long hashcode=hash(kmer, rkmer);
				System.err.println("i="+i+", code="+hashcode);

				//Track the best code in the window and its state
				if(hashcode>=minCode && hashcode<=bestCode){
					bestCode=hashcode;
					bestPosition=i;
					bestKmer=kmer;
					bestRkmer=rkmer;
				}
				
				//Once the window size is met, store the best code,
				//and backtrack to its position to start the next window
				if(currentWindow>=window && bestPosition>=0){
					if(!set.contains(bestCode)){
						set.add(bestCode);
						list.add(bestCode);
					}
					i=bestPosition;
					kmer=bestKmer;
					rkmer=bestRkmer;
					len=k;
					
					bestCode=Long.MAX_VALUE;
					bestPosition=-1;
					currentWindow=0;
				}
			}
		}
		list.sort();//optional
		return list.toArray();
	}

	public static long canon(long kmer, long rkmer){return max(kmer, rkmer);}
	public static long hash(long kmer, long rkmer){return hash(canon(kmer, rkmer));}
	public static long hash(long key) {
		key = (~key) + (key << 21); // key = (key << 21) - key - 1;
		key = key ^ (key >>> 24);
		key = (key + (key << 3)) + (key << 8); // key * 265
		key = key ^ (key >>> 14);
		key = (key + (key << 2)) + (key << 4); // key * 21
		key = key ^ (key >>> 28);
		key = key + (key << 31);
		return key;
	}
	private static final long max(long x, long y){return x>y ? x : y;}

	public final int k;
	public final int window;
	public final int bitsPerSymbol; //2 for nucleotides, 5 for amino acids.
	private final int shift;
	private final int shift2;
	private final long mask;
	private final long minCode=0;
	
	static final byte[] baseToNumber = new byte[128];
	static final byte[] baseToComplementNumber = new byte[128];
	
	static {
		Arrays.fill(baseToNumber, (byte)-1);
		Arrays.fill(baseToComplementNumber, (byte)-1);
		baseToNumber['A']=baseToNumber['a']=baseToComplementNumber['T']=baseToComplementNumber['t']=0;
		baseToNumber['C']=baseToNumber['c']=baseToComplementNumber['G']=baseToComplementNumber['c']=1;
		baseToNumber['G']=baseToNumber['g']=baseToComplementNumber['C']=baseToComplementNumber['g']=2;
		baseToNumber['T']=baseToNumber['t']=baseToComplementNumber['A']=baseToComplementNumber['a']=3;
	}
}