jpayne@68
|
1 package bbmin;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.util.Arrays;
|
jpayne@68
|
4
|
jpayne@68
|
5 /**
|
jpayne@68
|
6 * Generates an array of minimal hash codes (as positive 64-bit longs) for an input sequence.<br>
|
jpayne@68
|
7 * The resulting array is guaranteed to contain the minimal hash code<br>
|
jpayne@68
|
8 * for every window, with no duplicates.
|
jpayne@68
|
9 * On average this is expected to yield 2*(L-K)/W hash codes for sequence length L and window size W.
|
jpayne@68
|
10 *
|
jpayne@68
|
11 * @author Brian Bushnell
|
jpayne@68
|
12 * @date October 8, 2021
|
jpayne@68
|
13 *
|
jpayne@68
|
14 */
|
jpayne@68
|
15 public class Minimizer {
|
jpayne@68
|
16
|
jpayne@68
|
17 public static void main(String[] args){
|
jpayne@68
|
18 int k=4, w=7;
|
jpayne@68
|
19 String seq="ACGTCTGAGCCTTGACACATGACT";
|
jpayne@68
|
20 try {
|
jpayne@68
|
21 k=Integer.parseInt(args[0]);
|
jpayne@68
|
22 w=Integer.parseInt(args[1]);
|
jpayne@68
|
23 seq=args[2];
|
jpayne@68
|
24 } catch (NumberFormatException e) {
|
jpayne@68
|
25 //e.printStackTrace();
|
jpayne@68
|
26 System.err.println("Usage: bbmin.Minimizer kmerlen window seq\n"
|
jpayne@68
|
27 + "E.G.\n"
|
jpayne@68
|
28 + "bbmin.Minimizer 4 7 ACGTCTGAGCCTTGACACATGACT");
|
jpayne@68
|
29 System.exit(1);
|
jpayne@68
|
30 }
|
jpayne@68
|
31 Minimizer minnow=new Minimizer(k, w);
|
jpayne@68
|
32 long[] array=minnow.minimize(seq.getBytes());
|
jpayne@68
|
33 System.err.println(Arrays.toString(array));
|
jpayne@68
|
34 }
|
jpayne@68
|
35
|
jpayne@68
|
36 public Minimizer(int k_, int window_){this(k_, window_, 2);}
|
jpayne@68
|
37 public Minimizer(int k_, int window_, int bitsPerSymbol_){
|
jpayne@68
|
38 k=k_;
|
jpayne@68
|
39 window=window_;
|
jpayne@68
|
40 bitsPerSymbol=bitsPerSymbol_;
|
jpayne@68
|
41 shift=bitsPerSymbol*k;
|
jpayne@68
|
42 shift2=shift-bitsPerSymbol;
|
jpayne@68
|
43 mask=(shift>63 ? -1L : ~((-1L)<<shift));
|
jpayne@68
|
44 }
|
jpayne@68
|
45
|
jpayne@68
|
46 public long[] minimize(String str){
|
jpayne@68
|
47 return minimize(str.getBytes());
|
jpayne@68
|
48 }
|
jpayne@68
|
49
|
jpayne@68
|
50 public long[] minimize(byte[] bases){
|
jpayne@68
|
51 return minimize(bases, new LongList(16), new LongHashSet(16));
|
jpayne@68
|
52 }
|
jpayne@68
|
53
|
jpayne@68
|
54 /** This method is typically faster since you don't need to construct a new set each time. */
|
jpayne@68
|
55 public long[] minimize(byte[] bases, LongList list, LongHashSet set){
|
jpayne@68
|
56 list.clear();
|
jpayne@68
|
57 //If the set is way too big, resize it
|
jpayne@68
|
58 if(set.capacity()*(long)window>100L+16L*bases.length){
|
jpayne@68
|
59 set.resizeDestructive(16);
|
jpayne@68
|
60 }else{
|
jpayne@68
|
61 set.clear();
|
jpayne@68
|
62 }
|
jpayne@68
|
63
|
jpayne@68
|
64 long kmersProcessed=0;
|
jpayne@68
|
65 long kmer=0;
|
jpayne@68
|
66 long rkmer=0;
|
jpayne@68
|
67 int len=0;
|
jpayne@68
|
68
|
jpayne@68
|
69 long bestCode=Long.MAX_VALUE;
|
jpayne@68
|
70 int bestPosition=-1;
|
jpayne@68
|
71 long bestKmer=-1;
|
jpayne@68
|
72 long bestRkmer=-1;
|
jpayne@68
|
73 int currentWindow=0;
|
jpayne@68
|
74
|
jpayne@68
|
75 for(int i=0; i<bases.length; i++){
|
jpayne@68
|
76 byte b=bases[i];
|
jpayne@68
|
77 long x=baseToNumber[b];
|
jpayne@68
|
78 long x2=baseToComplementNumber[b];
|
jpayne@68
|
79
|
jpayne@68
|
80 kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
81 rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
|
jpayne@68
|
82 if(x<0){
|
jpayne@68
|
83 len=0;
|
jpayne@68
|
84 rkmer=0;
|
jpayne@68
|
85 }else{
|
jpayne@68
|
86 len++;
|
jpayne@68
|
87 }
|
jpayne@68
|
88
|
jpayne@68
|
89 if(len>=k){
|
jpayne@68
|
90 kmersProcessed++;
|
jpayne@68
|
91 currentWindow++;
|
jpayne@68
|
92
|
jpayne@68
|
93 final long hashcode=hash(kmer, rkmer);
|
jpayne@68
|
94 System.err.println("i="+i+", code="+hashcode);
|
jpayne@68
|
95
|
jpayne@68
|
96 //Track the best code in the window and its state
|
jpayne@68
|
97 if(hashcode>=minCode && hashcode<=bestCode){
|
jpayne@68
|
98 bestCode=hashcode;
|
jpayne@68
|
99 bestPosition=i;
|
jpayne@68
|
100 bestKmer=kmer;
|
jpayne@68
|
101 bestRkmer=rkmer;
|
jpayne@68
|
102 }
|
jpayne@68
|
103
|
jpayne@68
|
104 //Once the window size is met, store the best code,
|
jpayne@68
|
105 //and backtrack to its position to start the next window
|
jpayne@68
|
106 if(currentWindow>=window && bestPosition>=0){
|
jpayne@68
|
107 if(!set.contains(bestCode)){
|
jpayne@68
|
108 set.add(bestCode);
|
jpayne@68
|
109 list.add(bestCode);
|
jpayne@68
|
110 }
|
jpayne@68
|
111 i=bestPosition;
|
jpayne@68
|
112 kmer=bestKmer;
|
jpayne@68
|
113 rkmer=bestRkmer;
|
jpayne@68
|
114 len=k;
|
jpayne@68
|
115
|
jpayne@68
|
116 bestCode=Long.MAX_VALUE;
|
jpayne@68
|
117 bestPosition=-1;
|
jpayne@68
|
118 currentWindow=0;
|
jpayne@68
|
119 }
|
jpayne@68
|
120 }
|
jpayne@68
|
121 }
|
jpayne@68
|
122 list.sort();//optional
|
jpayne@68
|
123 return list.toArray();
|
jpayne@68
|
124 }
|
jpayne@68
|
125
|
jpayne@68
|
126 public static long canon(long kmer, long rkmer){return max(kmer, rkmer);}
|
jpayne@68
|
127 public static long hash(long kmer, long rkmer){return hash(canon(kmer, rkmer));}
|
jpayne@68
|
128 public static long hash(long key) {
|
jpayne@68
|
129 key = (~key) + (key << 21); // key = (key << 21) - key - 1;
|
jpayne@68
|
130 key = key ^ (key >>> 24);
|
jpayne@68
|
131 key = (key + (key << 3)) + (key << 8); // key * 265
|
jpayne@68
|
132 key = key ^ (key >>> 14);
|
jpayne@68
|
133 key = (key + (key << 2)) + (key << 4); // key * 21
|
jpayne@68
|
134 key = key ^ (key >>> 28);
|
jpayne@68
|
135 key = key + (key << 31);
|
jpayne@68
|
136 return key;
|
jpayne@68
|
137 }
|
jpayne@68
|
138 private static final long max(long x, long y){return x>y ? x : y;}
|
jpayne@68
|
139
|
jpayne@68
|
140 public final int k;
|
jpayne@68
|
141 public final int window;
|
jpayne@68
|
142 public final int bitsPerSymbol; //2 for nucleotides, 5 for amino acids.
|
jpayne@68
|
143 private final int shift;
|
jpayne@68
|
144 private final int shift2;
|
jpayne@68
|
145 private final long mask;
|
jpayne@68
|
146 private final long minCode=0;
|
jpayne@68
|
147
|
jpayne@68
|
148 static final byte[] baseToNumber = new byte[128];
|
jpayne@68
|
149 static final byte[] baseToComplementNumber = new byte[128];
|
jpayne@68
|
150
|
jpayne@68
|
151 static {
|
jpayne@68
|
152 Arrays.fill(baseToNumber, (byte)-1);
|
jpayne@68
|
153 Arrays.fill(baseToComplementNumber, (byte)-1);
|
jpayne@68
|
154 baseToNumber['A']=baseToNumber['a']=baseToComplementNumber['T']=baseToComplementNumber['t']=0;
|
jpayne@68
|
155 baseToNumber['C']=baseToNumber['c']=baseToComplementNumber['G']=baseToComplementNumber['c']=1;
|
jpayne@68
|
156 baseToNumber['G']=baseToNumber['g']=baseToComplementNumber['C']=baseToComplementNumber['g']=2;
|
jpayne@68
|
157 baseToNumber['T']=baseToNumber['t']=baseToComplementNumber['A']=baseToComplementNumber['a']=3;
|
jpayne@68
|
158 }
|
jpayne@68
|
159 }
|