Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchHeap.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package sketch; | |
2 | |
3 import java.util.Arrays; | |
4 import java.util.Locale; | |
5 | |
6 import dna.AminoAcid; | |
7 import shared.KillSwitch; | |
8 import shared.Tools; | |
9 import structures.LongHashMap; | |
10 import structures.LongHeap; | |
11 import structures.LongHeapMap; | |
12 import structures.LongHeapSet; | |
13 import structures.LongHeapSetInterface; | |
14 | |
15 public class SketchHeap { | |
16 | |
17 SketchHeap(int limit, int minKeyOccuranceCount_, boolean trackCounts){ | |
18 minKeyOccuranceCount=minKeyOccuranceCount_; | |
19 setMode=minKeyOccuranceCount<2 && !trackCounts; | |
20 if(setMode){ | |
21 setOrMap=set=new LongHeapSet(limit); | |
22 map=null; | |
23 heap=set.heap; | |
24 }else{ | |
25 if(minKeyOccuranceCount>1){limit=(int)Tools.min(10000000, limit*SketchObject.sketchHeapFactor);} | |
26 setOrMap=map=new LongHeapMap(limit); | |
27 set=null; | |
28 heap=map.heap; | |
29 } | |
30 } | |
31 | |
32 public void clear(boolean clearFname){ | |
33 taxID=-1; | |
34 imgID=-1; | |
35 genomeSizeBases=0; | |
36 genomeSizeKmers=0; | |
37 genomeSequences=0; | |
38 if(baseCounts!=null){Arrays.fill(baseCounts, 0);} | |
39 r16S=null; | |
40 r18S=null; | |
41 probSum=0; | |
42 taxName=null; | |
43 name0=null; | |
44 if(clearFname){fname=null;} | |
45 setOrMap.clear(); | |
46 } | |
47 | |
48 public void add(SketchHeap b){ | |
49 if(taxID<0){taxID=b.taxID;} | |
50 if(imgID<0){imgID=b.imgID;} | |
51 if(taxName==null){taxName=b.taxName;} | |
52 if(name0==null){name0=b.name0;} | |
53 if(fname==null){fname=b.fname;} | |
54 genomeSizeBases+=b.genomeSizeBases; | |
55 genomeSizeKmers+=b.genomeSizeKmers; | |
56 genomeSequences+=b.genomeSequences; | |
57 if(baseCounts!=null){Tools.add(baseCounts, b.baseCounts);} | |
58 set16S(b.r16S); | |
59 set18S(b.r18S); | |
60 probSum+=b.probSum; | |
61 if(setMode){ | |
62 set.add(b.set); | |
63 }else{ | |
64 map.add(b.map); | |
65 } | |
66 } | |
67 | |
68 public void add(Sketch b){ | |
69 if(taxID<0){taxID=b.taxID;} | |
70 if(imgID<0){imgID=b.imgID;} | |
71 if(taxName==null){taxName=b.taxName();} | |
72 if(name0==null){name0=b.name0();} | |
73 if(fname==null){fname=b.fname();} | |
74 genomeSizeBases+=b.genomeSizeBases; | |
75 genomeSizeKmers+=b.genomeSizeKmers; | |
76 genomeSequences+=b.genomeSequences; | |
77 if(baseCounts!=null && b.baseCounts!=null){Tools.add(baseCounts, b.baseCounts);} | |
78 set16S(b.r16S()); | |
79 set18S(b.r18S()); | |
80 | |
81 long[] keys=b.keys; | |
82 int[] counts=b.keyCounts; | |
83 assert(keys.length==b.length()) : keys.length+", "+b.length(); //Otherwise, change to loop through the size | |
84 for(int i=0; i<keys.length; i++){ | |
85 long key=Long.MAX_VALUE-keys[i]; | |
86 int count=(counts==null ? 1 : counts[i]); | |
87 assert((key>=SketchObject.minHashValue)==(count>0)); | |
88 increment(key, count); | |
89 } | |
90 } | |
91 | |
92 public StringBuilder toHeader(){ | |
93 StringBuilder sb=new StringBuilder(); | |
94 sb.append("#SZ:"+setOrMap.size()); | |
95 | |
96 sb.append("\tCD:"); | |
97 sb.append(SketchObject.codingArray[SketchObject.CODING]); | |
98 if(SketchObject.deltaOut){sb.append('D');} | |
99 if(SketchObject.aminoOrTranslate()){sb.append('M');} | |
100 if(SketchObject.amino8){sb.append('8');} | |
101 | |
102 sb.append("\tK:").append(SketchObject.k); | |
103 if(SketchObject.k2>0){sb.append(",").append(SketchObject.k2);} | |
104 if(SketchObject.HASH_VERSION>1){sb.append("\tH:").append(SketchObject.HASH_VERSION);} | |
105 | |
106 if(genomeSizeBases>0){sb.append("\tGS:"+genomeSizeBases);} | |
107 if(genomeSizeKmers>0){sb.append("\tGK:"+genomeSizeKmers);} | |
108 final long ge=genomeSizeEstimate(); | |
109 if(ge>0){sb.append("\tGE:").append(ge);} | |
110 if(genomeSequences>0){sb.append("\tGQ:"+genomeSequences);} | |
111 if(baseCounts!=null && !SketchObject.aminoOrTranslate()){ | |
112 sb.append("\tBC:").append(baseCounts[0]).append(',').append(baseCounts[1]).append(','); | |
113 sb.append(baseCounts[2]).append(',').append(baseCounts[3]); | |
114 } | |
115 if(probSum>0){sb.append("\tPC:"+String.format(Locale.ROOT, "%.4f",probSum/genomeSizeKmers));} | |
116 if(taxID>=0){sb.append("\tID:"+taxID);} | |
117 if(imgID>=0){sb.append("\tIMG:"+imgID);} | |
118 if(taxName!=null){sb.append("\tNM:"+taxName);} | |
119 if(name0!=null){sb.append("\tNM0:"+name0);} | |
120 if(fname!=null){sb.append("\tFN:"+fname);} | |
121 | |
122 if(r16S!=null){sb.append("\t16S:"+r16S.length);} | |
123 if(r18S!=null){sb.append("\t18S:"+r18S.length);} | |
124 if(r16S!=null){ | |
125 sb.append('\n').append("#16S:"); | |
126 for(byte b : r16S){sb.append((char)b);} | |
127 } | |
128 if(r18S!=null){ | |
129 sb.append('\n').append("#18S:"); | |
130 for(byte b : r18S){sb.append((char)b);} | |
131 } | |
132 return sb; | |
133 } | |
134 | |
135 public boolean checkAndAdd(long value){ | |
136 assert(value>=SketchObject.minHashValue); | |
137 | |
138 // if(!heap.hasRoom() && value<=heap.peek()){return false;} | |
139 // if(Blacklist.contains(value)){return false;} | |
140 // if(!Whitelist.contains(value)){return false;} | |
141 | |
142 if(Blacklist.exists() || Whitelist.exists()){ | |
143 if(!heap.hasRoom() && value<=heap.peek()){return false;} | |
144 if(Blacklist.contains(value)){return false;} | |
145 if(!Whitelist.containsRaw(value)){return false;} | |
146 } | |
147 | |
148 return add(value); | |
149 } | |
150 | |
151 public final int maxLen(){ | |
152 return SketchObject.toSketchSize(genomeSizeBases, genomeSizeKmers, genomeSizeEstimate(), SketchObject.targetSketchSize); | |
153 } | |
154 | |
155 public final long[] toSketchArray(){ | |
156 int maxLen=maxLen(); | |
157 return toSketchArray(maxLen, minKeyOccuranceCount); | |
158 } | |
159 | |
160 public final long[] toSketchArray_minCount(int minKeyOccuranceCount_){ | |
161 int maxLen=maxLen(); | |
162 return toSketchArray(maxLen, minKeyOccuranceCount_); | |
163 } | |
164 | |
165 final long[] toSketchArray_maxLen(int maxLen){ | |
166 return toSketchArray(maxLen, minKeyOccuranceCount); | |
167 } | |
168 | |
169 private final long[] toSketchArrayOld(int maxLen){//Destructive | |
170 final int initial=heap.size(); | |
171 final int len=Tools.min(maxLen, initial); | |
172 final long[] array=KillSwitch.allocLong1D(len); | |
173 | |
174 int toSkip=heap.size()-len; | |
175 for(int i=0; i<toSkip; i++){heap.poll();} | |
176 for(int i=0; i<len; i++){ | |
177 array[i]=Long.MAX_VALUE-heap.poll(); | |
178 } | |
179 Tools.reverseInPlace(array); | |
180 assert(heap.size()==0) : heap.size()+", "+len+", "+maxLen+", "+initial; | |
181 return array; | |
182 } | |
183 | |
184 private final long[] toSketchArray(int maxLen, int minKeyOccuranceCount_){//Non-destructive | |
185 if(minKeyOccuranceCount_<0){minKeyOccuranceCount_=minKeyOccuranceCount;} | |
186 if(setMode){return toSketchArrayOld(maxLen);} | |
187 long[] keys=map().toArray(minKeyOccuranceCount_); | |
188 for(int i=0; i<keys.length; i++){ | |
189 // assert(keys[i]>0) : Arrays.toString(keys); | |
190 keys[i]=Long.MAX_VALUE-keys[i]; | |
191 // assert(keys[i]>0) : Arrays.toString(keys); | |
192 } | |
193 Arrays.sort(keys); | |
194 if(keys.length>maxLen){ | |
195 keys=Arrays.copyOf(keys, maxLen); | |
196 } | |
197 | |
198 // final LongHeap heap=heap; | |
199 // heap.clear(); | |
200 // assert(heap.size()==0) : heap.size()+", "+maxLen; | |
201 return keys; | |
202 } | |
203 | |
204 @Override | |
205 public int hashCode(){ | |
206 long gSize=genomeSizeKmers>0 ? genomeSizeKmers : genomeSizeBases; | |
207 int code=(int) ((gSize^taxID^imgID^(name0==null ? 0 : name0.hashCode()))&Integer.MAX_VALUE); | |
208 return code; | |
209 } | |
210 | |
211 public long genomeSizeEstimate() { | |
212 int size=size(); | |
213 if(size==0){return 0;} | |
214 long min=peek(); | |
215 long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min, size)); | |
216 // assert(est<30000000) : min+", "+(Long.MAX_VALUE-min)+", "+size+", "+genomeSizeKmers+", "+Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min, size)); | |
217 return est; | |
218 } | |
219 | |
220 public long genomeSizeEstimate(int minCount) { | |
221 if(minCount<2){return genomeSizeEstimate();} | |
222 if(size()==0){return 0;} | |
223 long[] min=map.map.getMin(minCount); | |
224 if(min[1]==0){return 0;} | |
225 long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(Long.MAX_VALUE-min[0], (int)min[1])); | |
226 return est; | |
227 } | |
228 | |
229 public long sketchSizeEstimate(){ | |
230 return SketchObject.toSketchSize(genomeSizeBases, genomeSizeKmers, genomeSizeEstimate(), SketchObject.targetSketchSize); | |
231 } | |
232 | |
233 public boolean contains(long key) { | |
234 return setOrMap.contains(key); | |
235 } | |
236 | |
237 @Override | |
238 public String toString(){return toHeader().toString();} | |
239 | |
240 public String name(){return taxName==null ? name0 : taxName;} | |
241 public String taxName(){return taxName;} | |
242 public String name0(){return name0;} | |
243 public String fname(){return fname;} | |
244 public long[] baseCounts(boolean original){return baseCounts==null ? null : original ? baseCounts : baseCounts.clone();} | |
245 public void setTaxName(String s){taxName=s;} | |
246 public void setName0(String s){name0=s;} | |
247 public void setFname(String s){fname=s;} | |
248 | |
249 public byte[] r16S(){return r16S;} | |
250 public int r16SLen(){return r16S==null ? 0 : r16S.length;} | |
251 public void set16S(byte[] b){ | |
252 if(b==null || b.length<SketchObject.min_SSU_len){return;} | |
253 | |
254 if(r16S==null || score16S(b)>score16S(r16S)){ | |
255 r16S=b; | |
256 } | |
257 } | |
258 | |
259 private float score16S(byte[] seq){return scoreSSU(seq, 1533);} | |
260 private float score18S(byte[] seq){return scoreSSU(seq, 1858);} | |
261 | |
262 private float scoreSSU(byte[] seq, int idealLen){ | |
263 float lengthScore=lengthScore(seq.length, idealLen); | |
264 float definedScore=(seq.length-AminoAcid.countUndefined(seq))/(float)seq.length; | |
265 return lengthScore*definedScore; | |
266 } | |
267 private float lengthScore(int len, int ideal){return Tools.min(len, ideal)/(float)Tools.max(len, ideal);} | |
268 | |
269 public byte[] r18S(){return r18S;} | |
270 public int r18SLen(){return r18S==null ? 0 : r18S.length;} | |
271 public void set18S(byte[] b){ | |
272 if(b==null || b.length<SketchObject.min_SSU_len){return;} | |
273 if(r18S==null || score18S(b)>score16S(r18S)){ | |
274 r18S=b; | |
275 } | |
276 } | |
277 | |
278 boolean isEukaryote(){ | |
279 if(taxID<1 || taxID>=SketchObject.minFakeID){return false;} | |
280 if(SketchObject.taxtree==null){return false;} | |
281 return SketchObject.taxtree.isEukaryote((int)taxID); | |
282 } | |
283 | |
284 private String taxName; | |
285 private String name0; | |
286 private String fname; | |
287 public long taxID=-1; | |
288 public long imgID=-1; | |
289 public long genomeSizeBases=0; | |
290 public long genomeSizeKmers=0; | |
291 public long genomeSequences=0; | |
292 public final long[] baseCounts=(SketchObject.aminoOrTranslate() ? null : new long[4]); | |
293 private byte[] r16S; | |
294 private byte[] r18S; | |
295 double probSum=0; | |
296 | |
297 public float probCorrect(){return probSum<=0 ? 0f : (float)(probSum/Tools.max(genomeSizeKmers, 1f));} | |
298 public int capacity(){return heap.capacity();} | |
299 public boolean hasRoom(){return heap.hasRoom();} | |
300 public long peek(){return heap.peek();} | |
301 public int size(){return heap.size();} | |
302 public LongHashMap map(){return map.map;} | |
303 | |
304 public void clear(){ | |
305 setOrMap.clear(); | |
306 } | |
307 public void clearSet(){ | |
308 if(set==null){map.map.clear();} | |
309 else{set.set.clear();} | |
310 } | |
311 public boolean add(long key){return setOrMap.add(key);} | |
312 public int increment(long key, int incr){return setOrMap.increment(key, incr);} | |
313 | |
314 private final LongHeapSet set; | |
315 private final LongHeapMap map; | |
316 private final LongHeapSetInterface setOrMap; | |
317 public final LongHeap heap; | |
318 public final int minKeyOccuranceCount; | |
319 /** Determines whether to use LongHeapSet or LongHeapMap */ | |
320 public final boolean setMode; | |
321 } |