jpayne@68: package bbmin; jpayne@68: jpayne@68: import java.util.Arrays; jpayne@68: jpayne@68: /** jpayne@68: * Generates an array of minimal hash codes (as positive 64-bit longs) for an input sequence.
jpayne@68: * The resulting array is guaranteed to contain the minimal hash code
jpayne@68: * for every window, with no duplicates. jpayne@68: * On average this is expected to yield 2*(L-K)/W hash codes for sequence length L and window size W. jpayne@68: * jpayne@68: * @author Brian Bushnell jpayne@68: * @date October 8, 2021 jpayne@68: * jpayne@68: */ jpayne@68: public class Minimizer { jpayne@68: jpayne@68: public static void main(String[] args){ jpayne@68: int k=4, w=7; jpayne@68: String seq="ACGTCTGAGCCTTGACACATGACT"; jpayne@68: try { jpayne@68: k=Integer.parseInt(args[0]); jpayne@68: w=Integer.parseInt(args[1]); jpayne@68: seq=args[2]; jpayne@68: } catch (NumberFormatException e) { jpayne@68: //e.printStackTrace(); jpayne@68: System.err.println("Usage: bbmin.Minimizer kmerlen window seq\n" jpayne@68: + "E.G.\n" jpayne@68: + "bbmin.Minimizer 4 7 ACGTCTGAGCCTTGACACATGACT"); jpayne@68: System.exit(1); jpayne@68: } jpayne@68: Minimizer minnow=new Minimizer(k, w); jpayne@68: long[] array=minnow.minimize(seq.getBytes()); jpayne@68: System.err.println(Arrays.toString(array)); jpayne@68: } jpayne@68: jpayne@68: public Minimizer(int k_, int window_){this(k_, window_, 2);} jpayne@68: public Minimizer(int k_, int window_, int bitsPerSymbol_){ jpayne@68: k=k_; jpayne@68: window=window_; jpayne@68: bitsPerSymbol=bitsPerSymbol_; jpayne@68: shift=bitsPerSymbol*k; jpayne@68: shift2=shift-bitsPerSymbol; jpayne@68: mask=(shift>63 ? -1L : ~((-1L)<100L+16L*bases.length){ jpayne@68: set.resizeDestructive(16); jpayne@68: }else{ jpayne@68: set.clear(); jpayne@68: } jpayne@68: jpayne@68: long kmersProcessed=0; jpayne@68: long kmer=0; jpayne@68: long rkmer=0; jpayne@68: int len=0; jpayne@68: jpayne@68: long bestCode=Long.MAX_VALUE; jpayne@68: int bestPosition=-1; jpayne@68: long bestKmer=-1; jpayne@68: long bestRkmer=-1; jpayne@68: int currentWindow=0; jpayne@68: jpayne@68: for(int i=0; i>>2)|(x2<=k){ jpayne@68: kmersProcessed++; jpayne@68: currentWindow++; jpayne@68: jpayne@68: final long hashcode=hash(kmer, rkmer); jpayne@68: System.err.println("i="+i+", code="+hashcode); jpayne@68: jpayne@68: //Track the best code in the window and its state jpayne@68: if(hashcode>=minCode && hashcode<=bestCode){ jpayne@68: bestCode=hashcode; jpayne@68: bestPosition=i; jpayne@68: bestKmer=kmer; jpayne@68: bestRkmer=rkmer; jpayne@68: } jpayne@68: jpayne@68: //Once the window size is met, store the best code, jpayne@68: //and backtrack to its position to start the next window jpayne@68: if(currentWindow>=window && bestPosition>=0){ jpayne@68: if(!set.contains(bestCode)){ jpayne@68: set.add(bestCode); jpayne@68: list.add(bestCode); jpayne@68: } jpayne@68: i=bestPosition; jpayne@68: kmer=bestKmer; jpayne@68: rkmer=bestRkmer; jpayne@68: len=k; jpayne@68: jpayne@68: bestCode=Long.MAX_VALUE; jpayne@68: bestPosition=-1; jpayne@68: currentWindow=0; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: list.sort();//optional jpayne@68: return list.toArray(); jpayne@68: } jpayne@68: jpayne@68: public static long canon(long kmer, long rkmer){return max(kmer, rkmer);} jpayne@68: public static long hash(long kmer, long rkmer){return hash(canon(kmer, rkmer));} jpayne@68: public static long hash(long key) { jpayne@68: key = (~key) + (key << 21); // key = (key << 21) - key - 1; jpayne@68: key = key ^ (key >>> 24); jpayne@68: key = (key + (key << 3)) + (key << 8); // key * 265 jpayne@68: key = key ^ (key >>> 14); jpayne@68: key = (key + (key << 2)) + (key << 4); // key * 21 jpayne@68: key = key ^ (key >>> 28); jpayne@68: key = key + (key << 31); jpayne@68: return key; jpayne@68: } jpayne@68: private static final long max(long x, long y){return x>y ? x : y;} jpayne@68: jpayne@68: public final int k; jpayne@68: public final int window; jpayne@68: public final int bitsPerSymbol; //2 for nucleotides, 5 for amino acids. jpayne@68: private final int shift; jpayne@68: private final int shift2; jpayne@68: private final long mask; jpayne@68: private final long minCode=0; jpayne@68: jpayne@68: static final byte[] baseToNumber = new byte[128]; jpayne@68: static final byte[] baseToComplementNumber = new byte[128]; jpayne@68: jpayne@68: static { jpayne@68: Arrays.fill(baseToNumber, (byte)-1); jpayne@68: Arrays.fill(baseToComplementNumber, (byte)-1); jpayne@68: baseToNumber['A']=baseToNumber['a']=baseToComplementNumber['T']=baseToComplementNumber['t']=0; jpayne@68: baseToNumber['C']=baseToNumber['c']=baseToComplementNumber['G']=baseToComplementNumber['c']=1; jpayne@68: baseToNumber['G']=baseToNumber['g']=baseToComplementNumber['C']=baseToComplementNumber['g']=2; jpayne@68: baseToNumber['T']=baseToNumber['t']=baseToComplementNumber['A']=baseToComplementNumber['a']=3; jpayne@68: } jpayne@68: }