jpayne@68: package bbmin;
jpayne@68:
jpayne@68: import java.util.Arrays;
jpayne@68:
jpayne@68: /**
jpayne@68: * Generates an array of minimal hash codes (as positive 64-bit longs) for an input sequence.
jpayne@68: * The resulting array is guaranteed to contain the minimal hash code
jpayne@68: * for every window, with no duplicates.
jpayne@68: * On average this is expected to yield 2*(L-K)/W hash codes for sequence length L and window size W.
jpayne@68: *
jpayne@68: * @author Brian Bushnell
jpayne@68: * @date October 8, 2021
jpayne@68: *
jpayne@68: */
jpayne@68: public class Minimizer {
jpayne@68:
jpayne@68: public static void main(String[] args){
jpayne@68: int k=4, w=7;
jpayne@68: String seq="ACGTCTGAGCCTTGACACATGACT";
jpayne@68: try {
jpayne@68: k=Integer.parseInt(args[0]);
jpayne@68: w=Integer.parseInt(args[1]);
jpayne@68: seq=args[2];
jpayne@68: } catch (NumberFormatException e) {
jpayne@68: //e.printStackTrace();
jpayne@68: System.err.println("Usage: bbmin.Minimizer kmerlen window seq\n"
jpayne@68: + "E.G.\n"
jpayne@68: + "bbmin.Minimizer 4 7 ACGTCTGAGCCTTGACACATGACT");
jpayne@68: System.exit(1);
jpayne@68: }
jpayne@68: Minimizer minnow=new Minimizer(k, w);
jpayne@68: long[] array=minnow.minimize(seq.getBytes());
jpayne@68: System.err.println(Arrays.toString(array));
jpayne@68: }
jpayne@68:
jpayne@68: public Minimizer(int k_, int window_){this(k_, window_, 2);}
jpayne@68: public Minimizer(int k_, int window_, int bitsPerSymbol_){
jpayne@68: k=k_;
jpayne@68: window=window_;
jpayne@68: bitsPerSymbol=bitsPerSymbol_;
jpayne@68: shift=bitsPerSymbol*k;
jpayne@68: shift2=shift-bitsPerSymbol;
jpayne@68: mask=(shift>63 ? -1L : ~((-1L)<100L+16L*bases.length){
jpayne@68: set.resizeDestructive(16);
jpayne@68: }else{
jpayne@68: set.clear();
jpayne@68: }
jpayne@68:
jpayne@68: long kmersProcessed=0;
jpayne@68: long kmer=0;
jpayne@68: long rkmer=0;
jpayne@68: int len=0;
jpayne@68:
jpayne@68: long bestCode=Long.MAX_VALUE;
jpayne@68: int bestPosition=-1;
jpayne@68: long bestKmer=-1;
jpayne@68: long bestRkmer=-1;
jpayne@68: int currentWindow=0;
jpayne@68:
jpayne@68: for(int i=0; i>>2)|(x2<=k){
jpayne@68: kmersProcessed++;
jpayne@68: currentWindow++;
jpayne@68:
jpayne@68: final long hashcode=hash(kmer, rkmer);
jpayne@68: System.err.println("i="+i+", code="+hashcode);
jpayne@68:
jpayne@68: //Track the best code in the window and its state
jpayne@68: if(hashcode>=minCode && hashcode<=bestCode){
jpayne@68: bestCode=hashcode;
jpayne@68: bestPosition=i;
jpayne@68: bestKmer=kmer;
jpayne@68: bestRkmer=rkmer;
jpayne@68: }
jpayne@68:
jpayne@68: //Once the window size is met, store the best code,
jpayne@68: //and backtrack to its position to start the next window
jpayne@68: if(currentWindow>=window && bestPosition>=0){
jpayne@68: if(!set.contains(bestCode)){
jpayne@68: set.add(bestCode);
jpayne@68: list.add(bestCode);
jpayne@68: }
jpayne@68: i=bestPosition;
jpayne@68: kmer=bestKmer;
jpayne@68: rkmer=bestRkmer;
jpayne@68: len=k;
jpayne@68:
jpayne@68: bestCode=Long.MAX_VALUE;
jpayne@68: bestPosition=-1;
jpayne@68: currentWindow=0;
jpayne@68: }
jpayne@68: }
jpayne@68: }
jpayne@68: list.sort();//optional
jpayne@68: return list.toArray();
jpayne@68: }
jpayne@68:
jpayne@68: public static long canon(long kmer, long rkmer){return max(kmer, rkmer);}
jpayne@68: public static long hash(long kmer, long rkmer){return hash(canon(kmer, rkmer));}
jpayne@68: public static long hash(long key) {
jpayne@68: key = (~key) + (key << 21); // key = (key << 21) - key - 1;
jpayne@68: key = key ^ (key >>> 24);
jpayne@68: key = (key + (key << 3)) + (key << 8); // key * 265
jpayne@68: key = key ^ (key >>> 14);
jpayne@68: key = (key + (key << 2)) + (key << 4); // key * 21
jpayne@68: key = key ^ (key >>> 28);
jpayne@68: key = key + (key << 31);
jpayne@68: return key;
jpayne@68: }
jpayne@68: private static final long max(long x, long y){return x>y ? x : y;}
jpayne@68:
jpayne@68: public final int k;
jpayne@68: public final int window;
jpayne@68: public final int bitsPerSymbol; //2 for nucleotides, 5 for amino acids.
jpayne@68: private final int shift;
jpayne@68: private final int shift2;
jpayne@68: private final long mask;
jpayne@68: private final long minCode=0;
jpayne@68:
jpayne@68: static final byte[] baseToNumber = new byte[128];
jpayne@68: static final byte[] baseToComplementNumber = new byte[128];
jpayne@68:
jpayne@68: static {
jpayne@68: Arrays.fill(baseToNumber, (byte)-1);
jpayne@68: Arrays.fill(baseToComplementNumber, (byte)-1);
jpayne@68: baseToNumber['A']=baseToNumber['a']=baseToComplementNumber['T']=baseToComplementNumber['t']=0;
jpayne@68: baseToNumber['C']=baseToNumber['c']=baseToComplementNumber['G']=baseToComplementNumber['c']=1;
jpayne@68: baseToNumber['G']=baseToNumber['g']=baseToComplementNumber['C']=baseToComplementNumber['g']=2;
jpayne@68: baseToNumber['T']=baseToNumber['t']=baseToComplementNumber['A']=baseToComplementNumber['a']=3;
jpayne@68: }
jpayne@68: }