jpayne@68
|
1 package sketch;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.util.Arrays;
|
jpayne@68
|
4 import java.util.Locale;
|
jpayne@68
|
5
|
jpayne@68
|
6 import dna.AminoAcid;
|
jpayne@68
|
7 import shared.KillSwitch;
|
jpayne@68
|
8 import shared.Tools;
|
jpayne@68
|
9 import structures.LongHashMap;
|
jpayne@68
|
10 import structures.LongHeap;
|
jpayne@68
|
11 import structures.LongHeapMap;
|
jpayne@68
|
12 import structures.LongHeapSet;
|
jpayne@68
|
13 import structures.LongHeapSetInterface;
|
jpayne@68
|
14
|
jpayne@68
|
15 public class SketchHeap {
|
jpayne@68
|
16
|
jpayne@68
|
17 SketchHeap(int limit, int minKeyOccuranceCount_, boolean trackCounts){
|
jpayne@68
|
18 minKeyOccuranceCount=minKeyOccuranceCount_;
|
jpayne@68
|
19 setMode=minKeyOccuranceCount<2 && !trackCounts;
|
jpayne@68
|
20 if(setMode){
|
jpayne@68
|
21 setOrMap=set=new LongHeapSet(limit);
|
jpayne@68
|
22 map=null;
|
jpayne@68
|
23 heap=set.heap;
|
jpayne@68
|
24 }else{
|
jpayne@68
|
25 if(minKeyOccuranceCount>1){limit=(int)Tools.min(10000000, limit*SketchObject.sketchHeapFactor);}
|
jpayne@68
|
26 setOrMap=map=new LongHeapMap(limit);
|
jpayne@68
|
27 set=null;
|
jpayne@68
|
28 heap=map.heap;
|
jpayne@68
|
29 }
|
jpayne@68
|
30 }
|
jpayne@68
|
31
|
jpayne@68
|
32 public void clear(boolean clearFname){
|
jpayne@68
|
33 taxID=-1;
|
jpayne@68
|
34 imgID=-1;
|
jpayne@68
|
35 genomeSizeBases=0;
|
jpayne@68
|
36 genomeSizeKmers=0;
|
jpayne@68
|
37 genomeSequences=0;
|
jpayne@68
|
38 if(baseCounts!=null){Arrays.fill(baseCounts, 0);}
|
jpayne@68
|
39 r16S=null;
|
jpayne@68
|
40 r18S=null;
|
jpayne@68
|
41 probSum=0;
|
jpayne@68
|
42 taxName=null;
|
jpayne@68
|
43 name0=null;
|
jpayne@68
|
44 if(clearFname){fname=null;}
|
jpayne@68
|
45 setOrMap.clear();
|
jpayne@68
|
46 }
|
jpayne@68
|
47
|
jpayne@68
|
48 public void add(SketchHeap b){
|
jpayne@68
|
49 if(taxID<0){taxID=b.taxID;}
|
jpayne@68
|
50 if(imgID<0){imgID=b.imgID;}
|
jpayne@68
|
51 if(taxName==null){taxName=b.taxName;}
|
jpayne@68
|
52 if(name0==null){name0=b.name0;}
|
jpayne@68
|
53 if(fname==null){fname=b.fname;}
|
jpayne@68
|
54 genomeSizeBases+=b.genomeSizeBases;
|
jpayne@68
|
55 genomeSizeKmers+=b.genomeSizeKmers;
|
jpayne@68
|
56 genomeSequences+=b.genomeSequences;
|
jpayne@68
|
57 if(baseCounts!=null){Tools.add(baseCounts, b.baseCounts);}
|
jpayne@68
|
58 set16S(b.r16S);
|
jpayne@68
|
59 set18S(b.r18S);
|
jpayne@68
|
60 probSum+=b.probSum;
|
jpayne@68
|
61 if(setMode){
|
jpayne@68
|
62 set.add(b.set);
|
jpayne@68
|
63 }else{
|
jpayne@68
|
64 map.add(b.map);
|
jpayne@68
|
65 }
|
jpayne@68
|
66 }
|
jpayne@68
|
67
|
jpayne@68
|
68 public void add(Sketch b){
|
jpayne@68
|
69 if(taxID<0){taxID=b.taxID;}
|
jpayne@68
|
70 if(imgID<0){imgID=b.imgID;}
|
jpayne@68
|
71 if(taxName==null){taxName=b.taxName();}
|
jpayne@68
|
72 if(name0==null){name0=b.name0();}
|
jpayne@68
|
73 if(fname==null){fname=b.fname();}
|
jpayne@68
|
74 genomeSizeBases+=b.genomeSizeBases;
|
jpayne@68
|
75 genomeSizeKmers+=b.genomeSizeKmers;
|
jpayne@68
|
76 genomeSequences+=b.genomeSequences;
|
jpayne@68
|
77 if(baseCounts!=null && b.baseCounts!=null){Tools.add(baseCounts, b.baseCounts);}
|
jpayne@68
|
78 set16S(b.r16S());
|
jpayne@68
|
79 set18S(b.r18S());
|
jpayne@68
|
80
|
jpayne@68
|
81 long[] keys=b.keys;
|
jpayne@68
|
82 int[] counts=b.keyCounts;
|
jpayne@68
|
83 assert(keys.length==b.length()) : keys.length+", "+b.length(); //Otherwise, change to loop through the size
|
jpayne@68
|
84 for(int i=0; i<keys.length; i++){
|
jpayne@68
|
85 long key=Long.MAX_VALUE-keys[i];
|
jpayne@68
|
86 int count=(counts==null ? 1 : counts[i]);
|
jpayne@68
|
87 assert((key>=SketchObject.minHashValue)==(count>0));
|
jpayne@68
|
88 increment(key, count);
|
jpayne@68
|
89 }
|
jpayne@68
|
90 }
|
jpayne@68
|
91
|
jpayne@68
|
92 public StringBuilder toHeader(){
|
jpayne@68
|
93 StringBuilder sb=new StringBuilder();
|
jpayne@68
|
94 sb.append("#SZ:"+setOrMap.size());
|
jpayne@68
|
95
|
jpayne@68
|
96 sb.append("\tCD:");
|
jpayne@68
|
97 sb.append(SketchObject.codingArray[SketchObject.CODING]);
|
jpayne@68
|
98 if(SketchObject.deltaOut){sb.append('D');}
|
jpayne@68
|
99 if(SketchObject.aminoOrTranslate()){sb.append('M');}
|
jpayne@68
|
100 if(SketchObject.amino8){sb.append('8');}
|
jpayne@68
|
101
|
jpayne@68
|
102 sb.append("\tK:").append(SketchObject.k);
|
jpayne@68
|
103 if(SketchObject.k2>0){sb.append(",").append(SketchObject.k2);}
|
jpayne@68
|
104 if(SketchObject.HASH_VERSION>1){sb.append("\tH:").append(SketchObject.HASH_VERSION);}
|
jpayne@68
|
105
|
jpayne@68
|
106 if(genomeSizeBases>0){sb.append("\tGS:"+genomeSizeBases);}
|
jpayne@68
|
107 if(genomeSizeKmers>0){sb.append("\tGK:"+genomeSizeKmers);}
|
jpayne@68
|
108 final long ge=genomeSizeEstimate();
|
jpayne@68
|
109 if(ge>0){sb.append("\tGE:").append(ge);}
|
jpayne@68
|
110 if(genomeSequences>0){sb.append("\tGQ:"+genomeSequences);}
|
jpayne@68
|
111 if(baseCounts!=null && !SketchObject.aminoOrTranslate()){
|
jpayne@68
|
112 sb.append("\tBC:").append(baseCounts[0]).append(',').append(baseCounts[1]).append(',');
|
jpayne@68
|
113 sb.append(baseCounts[2]).append(',').append(baseCounts[3]);
|
jpayne@68
|
114 }
|
jpayne@68
|
115 if(probSum>0){sb.append("\tPC:"+String.format(Locale.ROOT, "%.4f",probSum/genomeSizeKmers));}
|
jpayne@68
|
116 if(taxID>=0){sb.append("\tID:"+taxID);}
|
jpayne@68
|
117 if(imgID>=0){sb.append("\tIMG:"+imgID);}
|
jpayne@68
|
118 if(taxName!=null){sb.append("\tNM:"+taxName);}
|
jpayne@68
|
119 if(name0!=null){sb.append("\tNM0:"+name0);}
|
jpayne@68
|
120 if(fname!=null){sb.append("\tFN:"+fname);}
|
jpayne@68
|
121
|
jpayne@68
|
122 if(r16S!=null){sb.append("\t16S:"+r16S.length);}
|
jpayne@68
|
123 if(r18S!=null){sb.append("\t18S:"+r18S.length);}
|
jpayne@68
|
124 if(r16S!=null){
|
jpayne@68
|
125 sb.append('\n').append("#16S:");
|
jpayne@68
|
126 for(byte b : r16S){sb.append((char)b);}
|
jpayne@68
|
127 }
|
jpayne@68
|
128 if(r18S!=null){
|
jpayne@68
|
129 sb.append('\n').append("#18S:");
|
jpayne@68
|
130 for(byte b : r18S){sb.append((char)b);}
|
jpayne@68
|
131 }
|
jpayne@68
|
132 return sb;
|
jpayne@68
|
133 }
|
jpayne@68
|
134
|
jpayne@68
|
135 public boolean checkAndAdd(long value){
|
jpayne@68
|
136 assert(value>=SketchObject.minHashValue);
|
jpayne@68
|
137
|
jpayne@68
|
138 // if(!heap.hasRoom() && value<=heap.peek()){return false;}
|
jpayne@68
|
139 // if(Blacklist.contains(value)){return false;}
|
jpayne@68
|
140 // if(!Whitelist.contains(value)){return false;}
|
jpayne@68
|
141
|
jpayne@68
|
142 if(Blacklist.exists() || Whitelist.exists()){
|
jpayne@68
|
143 if(!heap.hasRoom() && value<=heap.peek()){return false;}
|
jpayne@68
|
144 if(Blacklist.contains(value)){return false;}
|
jpayne@68
|
145 if(!Whitelist.containsRaw(value)){return false;}
|
jpayne@68
|
146 }
|
jpayne@68
|
147
|
jpayne@68
|
148 return add(value);
|
jpayne@68
|
149 }
|
jpayne@68
|
150
|
jpayne@68
|
151 public final int maxLen(){
|
jpayne@68
|
152 return SketchObject.toSketchSize(genomeSizeBases, genomeSizeKmers, genomeSizeEstimate(), SketchObject.targetSketchSize);
|
jpayne@68
|
153 }
|
jpayne@68
|
154
|
jpayne@68
|
155 public final long[] toSketchArray(){
|
jpayne@68
|
156 int maxLen=maxLen();
|
jpayne@68
|
157 return toSketchArray(maxLen, minKeyOccuranceCount);
|
jpayne@68
|
158 }
|
jpayne@68
|
159
|
jpayne@68
|
160 public final long[] toSketchArray_minCount(int minKeyOccuranceCount_){
|
jpayne@68
|
161 int maxLen=maxLen();
|
jpayne@68
|
162 return toSketchArray(maxLen, minKeyOccuranceCount_);
|
jpayne@68
|
163 }
|
jpayne@68
|
164
|
jpayne@68
|
165 final long[] toSketchArray_maxLen(int maxLen){
|
jpayne@68
|
166 return toSketchArray(maxLen, minKeyOccuranceCount);
|
jpayne@68
|
167 }
|
jpayne@68
|
168
|
jpayne@68
|
169 private final long[] toSketchArrayOld(int maxLen){//Destructive
|
jpayne@68
|
170 final int initial=heap.size();
|
jpayne@68
|
171 final int len=Tools.min(maxLen, initial);
|
jpayne@68
|
172 final long[] array=KillSwitch.allocLong1D(len);
|
jpayne@68
|
173
|
jpayne@68
|
174 int toSkip=heap.size()-len;
|
jpayne@68
|
175 for(int i=0; i<toSkip; i++){heap.poll();}
|
jpayne@68
|
176 for(int i=0; i<len; i++){
|
jpayne@68
|
177 array[i]=Long.MAX_VALUE-heap.poll();
|
jpayne@68
|
178 }
|
jpayne@68
|
179 Tools.reverseInPlace(array);
|
jpayne@68
|
180 assert(heap.size()==0) : heap.size()+", "+len+", "+maxLen+", "+initial;
|
jpayne@68
|
181 return array;
|
jpayne@68
|
182 }
|
jpayne@68
|
183
|
jpayne@68
|
184 private final long[] toSketchArray(int maxLen, int minKeyOccuranceCount_){//Non-destructive
|
jpayne@68
|
185 if(minKeyOccuranceCount_<0){minKeyOccuranceCount_=minKeyOccuranceCount;}
|
jpayne@68
|
186 if(setMode){return toSketchArrayOld(maxLen);}
|
jpayne@68
|
187 long[] keys=map().toArray(minKeyOccuranceCount_);
|
jpayne@68
|
188 for(int i=0; i<keys.length; i++){
|
jpayne@68
|
189 // assert(keys[i]>0) : Arrays.toString(keys);
|
jpayne@68
|
190 keys[i]=Long.MAX_VALUE-keys[i];
|
jpayne@68
|
191 // assert(keys[i]>0) : Arrays.toString(keys);
|
jpayne@68
|
192 }
|
jpayne@68
|
193 Arrays.sort(keys);
|
jpayne@68
|
194 if(keys.length>maxLen){
|
jpayne@68
|
195 keys=Arrays.copyOf(keys, maxLen);
|
jpayne@68
|
196 }
|
jpayne@68
|
197
|
jpayne@68
|
198 // final LongHeap heap=heap;
|
jpayne@68
|
199 // heap.clear();
|
jpayne@68
|
200 // assert(heap.size()==0) : heap.size()+", "+maxLen;
|
jpayne@68
|
201 return keys;
|
jpayne@68
|
202 }
|
jpayne@68
|
203
|
jpayne@68
|
204 @Override
|
jpayne@68
|
205 public int hashCode(){
|
jpayne@68
|
206 long gSize=genomeSizeKmers>0 ? genomeSizeKmers : genomeSizeBases;
|
jpayne@68
|
207 int code=(int) ((gSize^taxID^imgID^(name0==null ? 0 : name0.hashCode()))&Integer.MAX_VALUE);
|
jpayne@68
|
208 return code;
|
jpayne@68
|
209 }
|
jpayne@68
|
210
|
jpayne@68
|
211 public long genomeSizeEstimate() {
|
jpayne@68
|
212 int size=size();
|
jpayne@68
|
213 if(size==0){return 0;}
|
jpayne@68
|
214 long min=peek();
|
jpayne@68
|
215 long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min, size));
|
jpayne@68
|
216 // assert(est<30000000) : min+", "+(Long.MAX_VALUE-min)+", "+size+", "+genomeSizeKmers+", "+Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min, size));
|
jpayne@68
|
217 return est;
|
jpayne@68
|
218 }
|
jpayne@68
|
219
|
jpayne@68
|
220 public long genomeSizeEstimate(int minCount) {
|
jpayne@68
|
221 if(minCount<2){return genomeSizeEstimate();}
|
jpayne@68
|
222 if(size()==0){return 0;}
|
jpayne@68
|
223 long[] min=map.map.getMin(minCount);
|
jpayne@68
|
224 if(min[1]==0){return 0;}
|
jpayne@68
|
225 long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min[0], (int)min[1]));
|
jpayne@68
|
226 return est;
|
jpayne@68
|
227 }
|
jpayne@68
|
228
|
jpayne@68
|
229 public long sketchSizeEstimate(){
|
jpayne@68
|
230 return SketchObject.toSketchSize(genomeSizeBases, genomeSizeKmers, genomeSizeEstimate(), SketchObject.targetSketchSize);
|
jpayne@68
|
231 }
|
jpayne@68
|
232
|
jpayne@68
|
233 public boolean contains(long key) {
|
jpayne@68
|
234 return setOrMap.contains(key);
|
jpayne@68
|
235 }
|
jpayne@68
|
236
|
jpayne@68
|
237 @Override
|
jpayne@68
|
238 public String toString(){return toHeader().toString();}
|
jpayne@68
|
239
|
jpayne@68
|
240 public String name(){return taxName==null ? name0 : taxName;}
|
jpayne@68
|
241 public String taxName(){return taxName;}
|
jpayne@68
|
242 public String name0(){return name0;}
|
jpayne@68
|
243 public String fname(){return fname;}
|
jpayne@68
|
244 public long[] baseCounts(boolean original){return baseCounts==null ? null : original ? baseCounts : baseCounts.clone();}
|
jpayne@68
|
245 public void setTaxName(String s){taxName=s;}
|
jpayne@68
|
246 public void setName0(String s){name0=s;}
|
jpayne@68
|
247 public void setFname(String s){fname=s;}
|
jpayne@68
|
248
|
jpayne@68
|
249 public byte[] r16S(){return r16S;}
|
jpayne@68
|
250 public int r16SLen(){return r16S==null ? 0 : r16S.length;}
|
jpayne@68
|
251 public void set16S(byte[] b){
|
jpayne@68
|
252 if(b==null || b.length<SketchObject.min_SSU_len){return;}
|
jpayne@68
|
253
|
jpayne@68
|
254 if(r16S==null || score16S(b)>score16S(r16S)){
|
jpayne@68
|
255 r16S=b;
|
jpayne@68
|
256 }
|
jpayne@68
|
257 }
|
jpayne@68
|
258
|
jpayne@68
|
259 private float score16S(byte[] seq){return scoreSSU(seq, 1533);}
|
jpayne@68
|
260 private float score18S(byte[] seq){return scoreSSU(seq, 1858);}
|
jpayne@68
|
261
|
jpayne@68
|
262 private float scoreSSU(byte[] seq, int idealLen){
|
jpayne@68
|
263 float lengthScore=lengthScore(seq.length, idealLen);
|
jpayne@68
|
264 float definedScore=(seq.length-AminoAcid.countUndefined(seq))/(float)seq.length;
|
jpayne@68
|
265 return lengthScore*definedScore;
|
jpayne@68
|
266 }
|
jpayne@68
|
267 private float lengthScore(int len, int ideal){return Tools.min(len, ideal)/(float)Tools.max(len, ideal);}
|
jpayne@68
|
268
|
jpayne@68
|
269 public byte[] r18S(){return r18S;}
|
jpayne@68
|
270 public int r18SLen(){return r18S==null ? 0 : r18S.length;}
|
jpayne@68
|
271 public void set18S(byte[] b){
|
jpayne@68
|
272 if(b==null || b.length<SketchObject.min_SSU_len){return;}
|
jpayne@68
|
273 if(r18S==null || score18S(b)>score16S(r18S)){
|
jpayne@68
|
274 r18S=b;
|
jpayne@68
|
275 }
|
jpayne@68
|
276 }
|
jpayne@68
|
277
|
jpayne@68
|
278 boolean isEukaryote(){
|
jpayne@68
|
279 if(taxID<1 || taxID>=SketchObject.minFakeID){return false;}
|
jpayne@68
|
280 if(SketchObject.taxtree==null){return false;}
|
jpayne@68
|
281 return SketchObject.taxtree.isEukaryote((int)taxID);
|
jpayne@68
|
282 }
|
jpayne@68
|
283
|
jpayne@68
|
284 private String taxName;
|
jpayne@68
|
285 private String name0;
|
jpayne@68
|
286 private String fname;
|
jpayne@68
|
287 public long taxID=-1;
|
jpayne@68
|
288 public long imgID=-1;
|
jpayne@68
|
289 public long genomeSizeBases=0;
|
jpayne@68
|
290 public long genomeSizeKmers=0;
|
jpayne@68
|
291 public long genomeSequences=0;
|
jpayne@68
|
292 public final long[] baseCounts=(SketchObject.aminoOrTranslate() ? null : new long[4]);
|
jpayne@68
|
293 private byte[] r16S;
|
jpayne@68
|
294 private byte[] r18S;
|
jpayne@68
|
295 double probSum=0;
|
jpayne@68
|
296
|
jpayne@68
|
297 public float probCorrect(){return probSum<=0 ? 0f : (float)(probSum/Tools.max(genomeSizeKmers, 1f));}
|
jpayne@68
|
298 public int capacity(){return heap.capacity();}
|
jpayne@68
|
299 public boolean hasRoom(){return heap.hasRoom();}
|
jpayne@68
|
300 public long peek(){return heap.peek();}
|
jpayne@68
|
301 public int size(){return heap.size();}
|
jpayne@68
|
302 public LongHashMap map(){return map.map;}
|
jpayne@68
|
303
|
jpayne@68
|
304 public void clear(){
|
jpayne@68
|
305 setOrMap.clear();
|
jpayne@68
|
306 }
|
jpayne@68
|
307 public void clearSet(){
|
jpayne@68
|
308 if(set==null){map.map.clear();}
|
jpayne@68
|
309 else{set.set.clear();}
|
jpayne@68
|
310 }
|
jpayne@68
|
311 public boolean add(long key){return setOrMap.add(key);}
|
jpayne@68
|
312 public int increment(long key, int incr){return setOrMap.increment(key, incr);}
|
jpayne@68
|
313
|
jpayne@68
|
314 private final LongHeapSet set;
|
jpayne@68
|
315 private final LongHeapMap map;
|
jpayne@68
|
316 private final LongHeapSetInterface setOrMap;
|
jpayne@68
|
317 public final LongHeap heap;
|
jpayne@68
|
318 public final int minKeyOccuranceCount;
|
jpayne@68
|
319 /** Determines whether to use LongHeapSet or LongHeapMap */
|
jpayne@68
|
320 public final boolean setMode;
|
jpayne@68
|
321 }
|