Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/Sketch.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package sketch; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.Arrays; | |
5 import java.util.Collections; | |
6 import java.util.HashMap; | |
7 import java.util.Locale; | |
8 import java.util.concurrent.atomic.AtomicInteger; | |
9 | |
10 import dna.AminoAcid; | |
11 import fileIO.ReadWrite; | |
12 import shared.KillSwitch; | |
13 import shared.Tools; | |
14 import structures.AbstractBitSet; | |
15 import structures.ByteBuilder; | |
16 import structures.IntList; | |
17 import structures.LongHashMap; | |
18 import structures.LongList; | |
19 import structures.LongPair; | |
20 import tax.ImgRecord; | |
21 | |
22 /** | |
23 * @author Brian Bushnell | |
24 * @date July 7, 2016 | |
25 * | |
26 */ | |
27 public class Sketch extends SketchObject implements Comparable<Sketch>, Cloneable { | |
28 | |
29 /*--------------------------------------------------------------*/ | |
30 /*---------------- Constructors ----------------*/ | |
31 /*--------------------------------------------------------------*/ | |
32 | |
33 @Override | |
34 public Sketch clone() { | |
35 Sketch copy=null; | |
36 try { | |
37 copy = (Sketch)super.clone(); | |
38 } catch (CloneNotSupportedException e) { | |
39 // e.printStackTrace(); | |
40 throw new RuntimeException(e); | |
41 } | |
42 copy.compareBitSet=null; | |
43 copy.indexBitSet=null; | |
44 return copy; | |
45 } | |
46 | |
47 //Array should already be hashed, sorted, unique, subtracted from Long.MAX_VALUE, then reversed. | |
48 public Sketch(long[] keys_, int[] keyCounts_, long[] baseCounts_, byte[] r16S_, byte[] r18S_, ArrayList<String> meta_){ | |
49 this(keys_, keyCounts_, baseCounts_, r16S_, r18S_, -1, -1, -1, -1, -1, -1, null, null, null, meta_); | |
50 } | |
51 | |
52 public Sketch(SketchHeap heap, boolean clearFname, boolean keepCounts, ArrayList<String> meta_){ | |
53 this(heap, clearFname, keepCounts, meta_, -1); | |
54 } | |
55 | |
56 public Sketch(SketchHeap heap, boolean clearFname, boolean keepCounts, ArrayList<String> meta_, int minKeyOccuranceCount){ | |
57 this(heap.toSketchArray_minCount(minKeyOccuranceCount), null, heap.baseCounts(false), heap.r16S(), heap.r18S(), (int)heap.taxID, heap.imgID, | |
58 heap.genomeSizeBases, heap.genomeSizeKmers, heap.genomeSequences, | |
59 heap.probCorrect(), heap.taxName(), heap.name0(), heap.fname(), meta_); | |
60 assert(keyCounts==null); | |
61 if(!heap.setMode && keepCounts){ | |
62 LongHashMap map=heap.map(); | |
63 keyCounts=new int[keys.length]; | |
64 for(int i=0; i<keys.length; i++){ | |
65 int count=map.get(Long.MAX_VALUE-keys[i]); | |
66 assert(count>0) : keys[i]+" -> "+count+"\n"+Arrays.toString(map.values())+"\n"+Arrays.toString(map.keys()); | |
67 keyCounts[i]=count; | |
68 } | |
69 } | |
70 if(heap.setMode){heap.clearSet();} | |
71 heap.clear(clearFname); | |
72 // System.err.println("size="+size+", genome="+this.genomeSize+", m"); : (int)(2+maxGenomeFraction*heap.genomeSize)+", "+this.array.length; | |
73 // assert(false) : (int)(2+maxGenomeFraction*heap.genomeSize)+", "+this.array.length; | |
74 // assert(false) : (counts==null)+", "+heap.setMode+", "+keepCounts; | |
75 } | |
76 | |
77 public Sketch(long[] keys_, int[] keyCounts_, long[] baseCounts_, byte[] r16S_, byte[] r18S_, int taxID_, long imgID_, long gSizeBases_, | |
78 long gSizeKmers_, long gSequences_, double probCorrect_, String taxName_, String name0_, String fname_, ArrayList<String> meta_){ | |
79 // Exception e=new Exception(baseCounts_+", "+Arrays.toString(baseCounts_)); | |
80 // e.printStackTrace(); | |
81 keys=keys_; | |
82 keyCounts=keyCounts_; | |
83 // fixKeyCounts(); | |
84 baseCounts=baseCounts_; | |
85 r16S=r16S_; | |
86 r18S=r18S_; | |
87 assert(keyCounts==null || keys==null || keyCounts.length==keys.length) : (keys==null ? "null" : keys.length)+", "+(keyCounts==null ? "null" : keyCounts.length); | |
88 taxID=taxID_; | |
89 imgID=imgID_; | |
90 sketchID=nextSketch.getAndIncrement(); | |
91 genomeSizeBases=gSizeBases_; | |
92 genomeSizeKmers=gSizeKmers_; | |
93 genomeSequences=gSequences_; | |
94 probCorrect=probCorrect_<=0 ? 0f : (float)probCorrect_; | |
95 meta=meta_; | |
96 | |
97 taxName=fix(taxName_); | |
98 name0=fix(name0_); | |
99 fname=fix(fname_); | |
100 if(fname!=null && (fname.startsWith("stdin.") || fname.endsWith(".sketch") || fname.endsWith(".sketch.gz"))){fname=null;} | |
101 | |
102 // if(k2>0){ | |
103 // if(useToValue2) { | |
104 // int count=0; | |
105 // for(long x : array){ | |
106 // count+=(int)(x&1); | |
107 // } | |
108 // k1Count=count; | |
109 // }else { | |
110 // k1Count=array.length/2; | |
111 // } | |
112 // }else{ | |
113 // k1Count=array.length; | |
114 // } | |
115 | |
116 if(ImgRecord.imgMap!=null && imgID>=0 && taxID<0){ | |
117 ImgRecord record=ImgRecord.imgMap.get(imgID); | |
118 if(record!=null){ | |
119 if(record.name!=null && taxName==null){taxName=record.name;} | |
120 taxID=record.taxID; | |
121 } | |
122 } | |
123 } | |
124 | |
125 void loadSSU(){ | |
126 if(taxID>0 && taxID<minFakeID){ | |
127 if(r16S==null && SSUMap.r16SMap!=null){ | |
128 r16S=SSUMap.r16SMap.get(taxID); | |
129 } | |
130 if(r18S==null && SSUMap.r18SMap!=null){ | |
131 r18S=SSUMap.r18SMap.get(taxID); | |
132 } | |
133 } | |
134 // new Exception().printStackTrace(); | |
135 // System.err.println(taxID+", "+ssu+", "+SSUMap.ssuMap.size()); | |
136 } | |
137 | |
138 void addMeta(String s){ | |
139 s=fixMeta(s); | |
140 if(s==null){return;} | |
141 if(meta==null){meta=new ArrayList<String>(1);} | |
142 meta.add(s); | |
143 } | |
144 | |
145 void setMeta(ArrayList<String> list){ | |
146 assert(meta==null); | |
147 meta=list; | |
148 } | |
149 | |
150 private static String fix(String s){ | |
151 if(s==null){return null;} | |
152 return s.replace('\t', ' '); | |
153 } | |
154 | |
155 /*--------------------------------------------------------------*/ | |
156 /*---------------- Methods ----------------*/ | |
157 /*--------------------------------------------------------------*/ | |
158 | |
159 public void add(Sketch other, int maxlen){ | |
160 assert(keyCounts==null && other.keyCounts==null); | |
161 final long[] a=keys; | |
162 final long[] b=other.keys; | |
163 if(maxlen<1){ | |
164 assert(false); | |
165 maxlen=1000000; | |
166 } | |
167 LongList list=new LongList(Tools.min(maxlen, a.length+b.length)); | |
168 | |
169 for(int i=0, j=0; i<a.length && j<b.length; ){ | |
170 final long ka=a[i], kb=b[j]; | |
171 if(ka==kb){//match | |
172 list.add(ka); | |
173 i++; | |
174 j++; | |
175 }else if(ka<kb){ | |
176 list.add(ka); | |
177 i++; | |
178 }else{ | |
179 list.add(kb); | |
180 j++; | |
181 } | |
182 if(list.size()>=maxlen){break;} | |
183 } | |
184 | |
185 if(keys.length==list.size()){ | |
186 for(int i=0; i<list.size; i++){ | |
187 keys[i]=list.array[i]; | |
188 } | |
189 }else{ | |
190 keys=list.toArray(); | |
191 } | |
192 } | |
193 | |
194 public void resize(int newSize) { | |
195 if(newSize>=length()){return;} | |
196 keys=Arrays.copyOf(keys, newSize); | |
197 // fixKeyCounts(newSize); | |
198 if(keyCounts!=null){ | |
199 for(int i=0; i<newSize; i++) | |
200 keyCounts=Arrays.copyOf(keyCounts, newSize); | |
201 } | |
202 } | |
203 | |
204 int fixKeyCounts(){ | |
205 return fixKeyCounts(keyCounts==null ? maxCount : keyCounts.length); | |
206 } | |
207 | |
208 int fixKeyCounts(int maxLen){ | |
209 int max=0; | |
210 if(keyCounts!=null){ | |
211 for(int i=0; i<maxLen && max<2; i++){ | |
212 max=Tools.max(keyCounts[i], max); | |
213 } | |
214 } | |
215 if(max<2){keyCounts=null;} | |
216 maxCount=Tools.max((maxCount>0 ? 1 : 0), max); | |
217 return maxCount; | |
218 } | |
219 | |
220 public int applyBlacklist() { | |
221 assert(blacklist!=null); | |
222 LongList keylist=new LongList(keys.length); | |
223 IntList countlist=(keyCounts==null ? null : new IntList(keys.length)); | |
224 int removed=0; | |
225 for(int i=0; i<keys.length; i++){ | |
226 long key=keys[i]; | |
227 if(!Blacklist.contains(Long.MAX_VALUE-key)){ | |
228 keylist.add(key); | |
229 if(keyCounts!=null){countlist.add(keyCounts[i]);} | |
230 }else{ | |
231 removed++; | |
232 } | |
233 } | |
234 if(keylist.size()!=keys.length){ | |
235 keys=keylist.toArray(); | |
236 if(keyCounts!=null){keyCounts=countlist.toArray();} | |
237 } | |
238 // if(removed>0){ | |
239 // fixKeyCounts(); | |
240 // } | |
241 return removed; | |
242 } | |
243 | |
244 /*--------------------------------------------------------------*/ | |
245 /*---------------- Set Operations ----------------*/ | |
246 /*--------------------------------------------------------------*/ | |
247 | |
248 public static final Sketch intersection(Sketch sa, Sketch sb){ | |
249 Sketch shared=intersection(sa.keys, sb.keys, sa.keyCounts); | |
250 if(shared!=null){ | |
251 shared.taxID=sb.taxID; | |
252 shared.taxName=sb.taxName; | |
253 shared.name0=sb.name0; | |
254 shared.fname=sb.fname; | |
255 shared.meta=sb.meta; | |
256 shared.imgID=sb.imgID; | |
257 shared.spid=sb.spid; | |
258 } | |
259 return shared; | |
260 } | |
261 | |
262 public static final Sketch intersection(long[] a, long[] b, int[] aCounts){ | |
263 int i=0, j=0, matches=0; | |
264 LongList ll=new LongList(); | |
265 IntList il=new IntList(); | |
266 for(; i<a.length && j<b.length; ){ | |
267 final long ka=a[i], kb=b[j]; | |
268 if(ka==kb){ | |
269 matches++; | |
270 ll.add(ka); | |
271 if(aCounts!=null){ | |
272 il.add(aCounts[i]); | |
273 } | |
274 i++; | |
275 j++; | |
276 }else if(ka<kb){ | |
277 i++; | |
278 }else{ | |
279 j++; | |
280 } | |
281 } | |
282 if(matches<1){return null;} | |
283 | |
284 return new Sketch(ll.toArray(), il.size>0 ? il.toArray() : null, null, null, null, null); | |
285 } | |
286 | |
287 public static final Sketch union(Sketch sa, Sketch sb){ | |
288 Sketch shared=union(sa.keys, sb.keys, sa.keyCounts, sb.keyCounts, sa.baseCounts, sb.baseCounts, sa.r16S, sb.r16S, sa.r18S, sb.r18S); | |
289 if(shared!=null){ | |
290 shared.taxID=sa.taxID; | |
291 shared.taxName=sa.taxName; | |
292 shared.name0=sa.name0; | |
293 shared.fname=sa.fname; | |
294 shared.meta=sa.meta; | |
295 shared.imgID=sa.imgID; | |
296 shared.spid=sa.spid; | |
297 } | |
298 return shared; | |
299 } | |
300 | |
301 public static final Sketch union(long[] a, long[] b, int[] aCounts, int[] bCounts, long[] aBaseCounts, long[] bBaseCounts, | |
302 byte[] a16S, byte[] b16S, byte[] a18S, byte[] b18S){ | |
303 int i=0, j=0, matches=0; | |
304 LongList ll=new LongList(); | |
305 IntList il=(aCounts==null || bCounts==null ? null : new IntList()); | |
306 byte[] r16S=(b16S==null ? a16S : a16S==null ? b16S : a16S.length>b16S.length ? a16S : b16S); | |
307 byte[] r18S=(b18S==null ? a18S : a18S==null ? b18S : a18S.length>b18S.length ? a18S : b18S); | |
308 long[] baseCounts=(aBaseCounts==null || bBaseCounts==null ? null : new long[aBaseCounts.length]); | |
309 if(baseCounts!=null){ | |
310 for(int k=0; k<baseCounts.length; k++) {baseCounts[k]+=(aBaseCounts[k]+bBaseCounts[k]);} | |
311 } | |
312 for(; i<a.length && j<b.length; ){ | |
313 final long ka=a[i], kb=b[j]; | |
314 if(ka==kb){ | |
315 matches++; | |
316 ll.add(ka); | |
317 if(il!=null){ | |
318 il.add(aCounts[i]+bCounts[i]); | |
319 } | |
320 i++; | |
321 j++; | |
322 }else if(ka<kb){ | |
323 ll.add(ka); | |
324 if(il!=null){ | |
325 il.add(aCounts[i]); | |
326 } | |
327 i++; | |
328 }else{ | |
329 ll.add(kb); | |
330 if(il!=null){ | |
331 il.add(bCounts[i]); | |
332 } | |
333 j++; | |
334 } | |
335 } | |
336 if(matches<1){return null;} | |
337 | |
338 return new Sketch(ll.toArray(), il.size>0 ? il.toArray() : null, baseCounts, r16S, r18S, null); | |
339 } | |
340 | |
341 /*--------------------------------------------------------------*/ | |
342 /*---------------- Filtering ----------------*/ | |
343 /*--------------------------------------------------------------*/ | |
344 | |
345 boolean passesMeta(DisplayParams params) { | |
346 return passesMeta(params.requiredMeta, params.bannedMeta, /*params.requiredTaxid, params.bannedTaxid,*/ params.requiredMetaAnd); | |
347 } | |
348 | |
349 boolean passesMeta(ArrayList<String> requiredMeta, ArrayList<String> bannedMeta, /*IntList requiredTaxid, IntList bannedTaxid,*/ boolean requiredMetaAnd) { | |
350 assert(requiredMeta!=null || bannedMeta!=null /*|| requiredTaxid!=null || bannedTaxid!=null*/); | |
351 assert(requiredMeta==null || requiredMeta.size()>0); | |
352 assert(bannedMeta==null || bannedMeta.size()>0); | |
353 | |
354 // if(requiredTaxid!=null && !requiredTaxid.contains(taxID)){return false;} | |
355 // if(bannedTaxid!=null && bannedTaxid.contains(taxID)){return false;} | |
356 | |
357 if(requiredMeta!=null){ | |
358 if(meta==null){return false;} | |
359 for(String tag : requiredMeta){ | |
360 if(meta.contains(tag)){ | |
361 if(!requiredMetaAnd){break;} | |
362 }else if(requiredMetaAnd){ | |
363 return false; | |
364 } | |
365 } | |
366 } | |
367 if(bannedMeta!=null && meta!=null){ | |
368 for(String tag : bannedMeta){ | |
369 if(!meta.contains(tag)){ | |
370 return false; | |
371 } | |
372 } | |
373 } | |
374 return true; | |
375 } | |
376 | |
377 /*--------------------------------------------------------------*/ | |
378 /*---------------- Comparison ----------------*/ | |
379 /*--------------------------------------------------------------*/ | |
380 | |
381 public int countMatches(Sketch other, CompareBuffer buffer, AbstractBitSet present, boolean fillPresent, int[][] taxHits, int contamLevel){ | |
382 return countMatches(keys, other.keys, keyCounts, other.keyCounts, refHitCounts, other.taxID, buffer, present, fillPresent, taxHits, contamLevel); | |
383 } | |
384 | |
385 public static final int countMatches(long[] a, long[] b, int[] aCounts, int[] bCounts, int[] refHitCounts, int bid, | |
386 CompareBuffer buffer, AbstractBitSet present, boolean fillPresent, int[][] taxHits, int contamLevel){ | |
387 | |
388 | |
389 // if(verbose2){System.err.println("fillPresent: "+fillPresent+", "+present);} | |
390 // assert(fillPresent) : bid+", "+minFakeID+", "+(taxHits!=null); | |
391 | |
392 if(bid>0 && bid<minFakeID && taxHits!=null){ | |
393 bid=taxtree.getIdAtLevelExtended(bid, contamLevel); | |
394 }else{ | |
395 bid=-1; | |
396 } | |
397 | |
398 // assert(false) : (buffer==null)+", "+fillPresent+", "+present.cardinality(); | |
399 assert(a.length>0 && b.length>0); | |
400 | |
401 //Kmers hitting this reference | |
402 int matches=0; | |
403 | |
404 //Kmers hitting this reference and others | |
405 int multiMatches=0; | |
406 | |
407 //Kmers hitting nothing | |
408 int noHits=0; | |
409 | |
410 //Kmers hitting some organism but not this reference | |
411 int contamHits=0; | |
412 | |
413 //Kmers hitting something in this taxa, but not this reference | |
414 int sameTax=0; | |
415 | |
416 //Kmers hitting this organism and no other taxa | |
417 int unique2=0; | |
418 | |
419 //Kmers hitting only this taxa but not this organism (this count may not include everything due to early exit) | |
420 int unique3_temp=0; | |
421 | |
422 //Kmers hitting multiple organisms but not this reference | |
423 int multiContamHits=0; | |
424 | |
425 //Sum of query counts for shared kmers | |
426 long depthSum=0; | |
427 | |
428 //Sum of query counts for shared kmers divided by ref counts for those kmers | |
429 double depthSum2=0;//Slow, but necessary. | |
430 | |
431 //Number of times matching query keys occurred in all references | |
432 long refHitSum=0; | |
433 | |
434 //Matches from k1 | |
435 int k1hits=0; | |
436 | |
437 //Query kmers from k1 | |
438 int k1seenQ=0; | |
439 | |
440 //Reference kmers from k1 | |
441 int k1seenR=0; | |
442 int i=0, j=0; | |
443 assert(present==null || present.capacity()==a.length); | |
444 // assert(false) : buffer.rbs.capacity()+", "+buffer.rbs+", "+present; | |
445 if(present!=null){ | |
446 if(fillPresent){ | |
447 for(; i<a.length && j<b.length; ){ | |
448 final long ka=a[i], kb=b[j]; | |
449 final int bit=(int)(ka&1); | |
450 if(ka==kb){ | |
451 present.increment(i); | |
452 matches++; | |
453 k1hits+=bit; | |
454 k1seenQ+=bit; | |
455 k1seenR+=bit; | |
456 if(aCounts!=null){ | |
457 depthSum+=aCounts[i]; | |
458 if(bCounts!=null){ | |
459 depthSum2+=aCounts[i]/(double)bCounts[j]; | |
460 } | |
461 } | |
462 if(refHitCounts!=null){refHitSum+=refHitCounts[i];} | |
463 i++; | |
464 j++; | |
465 }else if(ka<kb){ | |
466 i++; | |
467 k1seenQ+=bit; | |
468 }else{ | |
469 j++; | |
470 k1seenR+=bit; | |
471 } | |
472 } | |
473 }else{ | |
474 for(; i<a.length && j<b.length; ){ | |
475 final long ka=a[i], kb=b[j]; | |
476 final int bit=(int)(ka&1); | |
477 if(ka==kb){ | |
478 final int count=present.getCount(i); | |
479 if(count>1){ | |
480 multiMatches++; | |
481 } | |
482 | |
483 matches++; | |
484 k1hits+=bit; | |
485 k1seenQ+=bit; | |
486 k1seenR+=bit; | |
487 if(aCounts!=null){ | |
488 depthSum+=aCounts[i]; | |
489 if(bCounts!=null){ | |
490 depthSum2+=aCounts[i]/(double)bCounts[j]; | |
491 } | |
492 } | |
493 if(refHitCounts!=null){refHitSum+=refHitCounts[i];} | |
494 if(bid>0){ | |
495 int[] taxHitsRow=taxHits[i]; | |
496 if(taxHitsRow!=null && taxHitsRow.length==1 && taxHitsRow[0]==bid){unique2++;} | |
497 } | |
498 | |
499 i++; | |
500 j++; | |
501 }else if(ka<kb){ | |
502 k1seenQ+=bit; | |
503 final int count=present.getCount(i); | |
504 if(count>0){ | |
505 contamHits++; | |
506 if(count>1){ | |
507 multiContamHits++; | |
508 } | |
509 }else{ | |
510 noHits++; | |
511 } | |
512 | |
513 if(bid>0){ | |
514 int[] taxHitsRow=taxHits[i]; | |
515 if(taxHitsRow!=null){ | |
516 if(taxHitsRow!=null && taxHitsRow.length==1 && taxHitsRow[0]==bid){unique3_temp++;} | |
517 for(int tid : taxHitsRow){ | |
518 if(tid==bid){ | |
519 sameTax++; | |
520 break; | |
521 } | |
522 } | |
523 } | |
524 } | |
525 | |
526 i++; | |
527 }else{ | |
528 k1seenR+=bit; | |
529 j++; | |
530 } | |
531 } | |
532 | |
533 //For the remaining query kmers, we don't know whether the reference sketch would have shared them had it been longer. | |
534 //This section can be disabled to prevent them from being displayed. | |
535 if(bid>0 && i<a.length-1){ | |
536 for(; i<a.length; i++){ | |
537 int[] taxHitsRow=taxHits[i]; | |
538 if(taxHitsRow!=null){ | |
539 if(taxHitsRow!=null && taxHitsRow.length==1 && taxHitsRow[0]==bid){unique3_temp++;} | |
540 } | |
541 } | |
542 } | |
543 } | |
544 }else{ | |
545 for(; i<a.length && j<b.length; ){ | |
546 final long ka=a[i], kb=b[j]; | |
547 final int bit=(int)(ka&1); | |
548 final int count=present.getCount(i); | |
549 if(ka==kb){ | |
550 matches++; | |
551 k1hits+=bit; | |
552 k1seenQ+=bit; | |
553 k1seenR+=bit; | |
554 if(aCounts!=null){depthSum+=aCounts[i];} | |
555 if(refHitCounts!=null){refHitSum+=refHitCounts[i];} | |
556 i++; | |
557 j++; | |
558 }else if(ka<kb){ | |
559 i++; | |
560 k1seenQ+=bit; | |
561 }else{ | |
562 j++; | |
563 k1seenR+=bit; | |
564 } | |
565 } | |
566 } | |
567 | |
568 if(k2<1){ | |
569 k1hits=matches; | |
570 k1seenQ=i; | |
571 k1seenR=j; | |
572 } | |
573 | |
574 // if(taxHits!=null){ | |
575 // System.err.println("matches="+matches+", noHits="+noHits+", contamHits="+contamHits+", sameTax="+sameTax+", multiContamHits="+multiContamHits); | |
576 // } | |
577 | |
578 // assert(bid<1 || unique2>=(matches-multiMatches)) : bid+", "+unique2+", "+unique3_temp+", "+matches+", "+multiMatches; | |
579 // assert(matches<1000 || multiMatches==0) : bid+", "+unique2+", "+unique3_temp+", "+matches+", "+multiMatches+", "+fillPresent; | |
580 | |
581 if(buffer!=null){ | |
582 // System.err.println("*A) "+matches+", "+multiMatches+", "+unique2+", "+unique3_temp); | |
583 // new Exception().printStackTrace(); | |
584 // assert(k1hits<=(matches-k1hits)) : k1hits+", "+matches; | |
585 buffer.set(matches, multiMatches, unique2, unique2+unique3_temp, noHits, | |
586 contamHits, contamHits-sameTax, multiContamHits, i, j, | |
587 a.length, b.length, depthSum, depthSum2, refHitSum, k1hits, k1seenQ, k1seenR); | |
588 } | |
589 return matches; | |
590 } | |
591 | |
592 // public float identity(Sketch b, float[] ret){ | |
593 // if(ret!=null){Arrays.fill(ret, 0);} | |
594 // return identityWeighted(array, b.array, ret); | |
595 // } | |
596 // | |
597 // public static float identity(long[] a, long[] b){ | |
598 // int matches=countMatches(a, b); | |
599 // return matches/(float)(Tools.max(1, Tools.min(a.length, b.length))); | |
600 // } | |
601 | |
602 @Override | |
603 public int hashCode(){ | |
604 long gSize=genomeSizeKmers>0 ? genomeSizeKmers : genomeSizeBases; | |
605 int code=(int) ((gSize^taxID^imgID^(name0==null ? 0 : name0.hashCode()))&Integer.MAX_VALUE); | |
606 // System.err.println(code+", "+gSize+", "+taxID+", "+imgID+", "+name0); | |
607 return code; | |
608 } | |
609 | |
610 @Override | |
611 public int compareTo(Sketch b){ | |
612 if(this==b){return 0;} | |
613 if(taxID>-1 && b.taxID>-1){return taxID-b.taxID;} | |
614 int x=taxName.compareTo(b.taxName); | |
615 if(x!=0){return x;} | |
616 if(name0!=null && b.name0!=null){return name0.compareTo(b.name0);} | |
617 return name0!=null ? 1 : b.name0!=null ? -1 : 0; | |
618 } | |
619 | |
620 @Override | |
621 public boolean equals(Object b){ | |
622 if(this==b){return true;} | |
623 if(b==null || this.getClass()!=b.getClass()){return false;} | |
624 return equals((Sketch)b); | |
625 } | |
626 | |
627 public boolean equals(Sketch b){ | |
628 return compareTo(b)==0; | |
629 } | |
630 | |
631 /*--------------------------------------------------------------*/ | |
632 /*---------------- Formatting ----------------*/ | |
633 /*--------------------------------------------------------------*/ | |
634 | |
635 public ByteBuilder toHeader(){ | |
636 ByteBuilder sb=new ByteBuilder(); | |
637 return toHeader(sb); | |
638 } | |
639 | |
640 public ByteBuilder toHeader(ByteBuilder bb){ | |
641 bb.append("#SZ:").append(keys.length); | |
642 bb.append("\tCD:"); | |
643 bb.append(codingArray[CODING]); | |
644 if(deltaOut){bb.append('D');} | |
645 if(keyCounts!=null){bb.append('C');} | |
646 if(aminoOrTranslate()){bb.append('M');} | |
647 if(amino8){bb.append('8');} | |
648 | |
649 bb.append("\tK:").append(k); | |
650 if(k2!=0){bb.append(",").append(k2);} | |
651 if(HASH_VERSION>1){bb.append("\tH:").append(HASH_VERSION);} | |
652 // if(maxCount>0){bb.append("\tMC:").append(maxCount);} | |
653 | |
654 if(genomeSizeBases>0){bb.append("\tGS:").append(genomeSizeBases);} | |
655 if(genomeSizeKmers>0){bb.append("\tGK:").append(genomeSizeKmers);} | |
656 final long ge=genomeSizeEstimate(); | |
657 if(ge>0){bb.append("\tGE:").append(ge);} | |
658 if(genomeSequences>0){bb.append("\tGQ:"+genomeSequences);} | |
659 if(baseCounts!=null && !aminoOrTranslate()){ | |
660 bb.append("\tBC:").append(baseCounts[0]).append(',').append(baseCounts[1]).append(','); | |
661 bb.append(baseCounts[2]).append(',').append(baseCounts[3]); | |
662 } | |
663 if(probCorrect>0){bb.append("\tPC:"+String.format(Locale.ROOT, "%.4f", probCorrect));} | |
664 if(taxID>=0){bb.append("\tID:").append(taxID);} | |
665 if(imgID>=0){bb.append("\tIMG:").append(imgID);} | |
666 if(spid>0){bb.append("\tSPID:").append(spid);} | |
667 if(fname!=null){bb.append("\tFN:").append(fname);} | |
668 if(taxName!=null){bb.append("\tNM:").append(taxName);} | |
669 if(name0!=null){bb.append("\tNM0:").append(name0);} | |
670 if(meta!=null){ | |
671 for(String s : meta){ | |
672 bb.append("\tMT_").append(s); | |
673 } | |
674 } | |
675 | |
676 if(r16S!=null){ | |
677 bb.append("\t16S:").append(r16S.length); | |
678 } | |
679 if(r18S!=null){ | |
680 bb.append("\t18S:").append(r18S.length); | |
681 } | |
682 if(r16S!=null){ | |
683 bb.append('\n'); | |
684 bb.append("#16S:").append(r16S); | |
685 } | |
686 if(r18S!=null){ | |
687 bb.append('\n'); | |
688 bb.append("#18S:").append(r18S); | |
689 } | |
690 return bb; | |
691 } | |
692 | |
693 public ByteBuilder toBytes(){ | |
694 return toBytes(new ByteBuilder()); | |
695 } | |
696 | |
697 public ByteBuilder toBytes(ByteBuilder bb){ | |
698 if(CODING==A48 && deltaOut){return toBytesA48D(bb);} | |
699 long prev=0; | |
700 toHeader(bb); | |
701 bb.append("\n"); | |
702 byte[] temp=null; | |
703 if(CODING==A48){temp=KillSwitch.allocByte1D(12);} | |
704 for(int i=0; i<keys.length; i++){ | |
705 long key=keys[i]; | |
706 int count=(keyCounts==null ? 1 : keyCounts[i]); | |
707 long x=key-prev; | |
708 if(CODING==A48){ | |
709 appendA48(x, bb, temp); | |
710 if(count>1){ | |
711 bb.append('\t'); | |
712 appendA48(count-1, bb, temp); | |
713 } | |
714 bb.append('\n'); | |
715 }else if(CODING==HEX){ | |
716 bb.append(Long.toHexString(x)).append('\n'); | |
717 }else if(CODING==RAW){ | |
718 bb.append(x).append('\n'); | |
719 }else{ | |
720 assert(false); | |
721 } | |
722 if(deltaOut){prev=key;} | |
723 } | |
724 return bb; | |
725 } | |
726 | |
727 //This is to make the common case fast | |
728 private ByteBuilder toBytesA48D(ByteBuilder bb){ | |
729 assert(CODING==A48 && deltaOut); | |
730 long prev=0; | |
731 toHeader(bb); | |
732 bb.append("\n"); | |
733 final byte[] temp=KillSwitch.allocByte1D(12); | |
734 | |
735 if(keyCounts==null){ | |
736 for(int i=0; i<keys.length; i++){ | |
737 long key=keys[i]; | |
738 long x=key-prev; | |
739 if(CODING==A48){ | |
740 appendA48(x, bb, temp); | |
741 bb.append('\n'); | |
742 } | |
743 prev=key; | |
744 } | |
745 }else{ | |
746 for(int i=0; i<keys.length; i++){ | |
747 long key=keys[i]; | |
748 int count=keyCounts[i]; | |
749 long x=key-prev; | |
750 if(CODING==A48){ | |
751 appendA48(x, bb, temp); | |
752 if(count>1){ | |
753 bb.append('\t'); | |
754 appendA48(count-1, bb, temp); | |
755 } | |
756 bb.append('\n'); | |
757 } | |
758 prev=key; | |
759 } | |
760 } | |
761 return bb; | |
762 } | |
763 | |
764 public static final void appendA48(long value, ByteBuilder bb, byte[] temp){ | |
765 int i=0; | |
766 // long value=value0; | |
767 while(value!=0){ | |
768 byte b=(byte)(value&0x3F); | |
769 // assert(i<temp.length) : i+", "+temp.length+", "+value0; | |
770 temp[i]=b; | |
771 value=value>>6; | |
772 i++; | |
773 } | |
774 if(i==0){ | |
775 bb.append((byte)'0'); | |
776 }else{ | |
777 for(i--;i>=0;i--){ | |
778 bb.append((char)(temp[i]+48)); | |
779 } | |
780 } | |
781 } | |
782 | |
783 public static final String toA48(long value){ | |
784 int i=0; | |
785 // long value=value0; | |
786 StringBuilder sb=new StringBuilder(12); | |
787 while(value!=0){ | |
788 byte b=(byte)(value&0x3F); | |
789 // assert(i<temp.length) : i+", "+temp.length+", "+value0; | |
790 sb.append((char)(b+48)); | |
791 value=value>>6; | |
792 i++; | |
793 } | |
794 if(i==0){ | |
795 sb.append((byte)'0'); | |
796 }else{ | |
797 sb.reverse(); | |
798 } | |
799 return sb.toString(); | |
800 } | |
801 | |
802 @Override | |
803 public String toString(){ | |
804 return toBytes().toString(); | |
805 } | |
806 | |
807 /*--------------------------------------------------------------*/ | |
808 /*---------------- String Parsing ----------------*/ | |
809 /*--------------------------------------------------------------*/ | |
810 | |
811 public static long parseA48(String line){ | |
812 if(line.length()==0){return 0;} | |
813 long x=0; | |
814 for(int i=0; i<line.length(); i++){ | |
815 x<<=6; | |
816 long c=line.charAt(i); | |
817 x|=(c-48); | |
818 } | |
819 return x; | |
820 } | |
821 | |
822 /** Parses coverage too */ | |
823 public static long parseA48C(String line, IntList covList){ | |
824 if(line.length()==0){ | |
825 covList.add(1); | |
826 return 0; | |
827 } | |
828 long key=0, cov=0; | |
829 int i=0, len=line.length(); | |
830 for(; i<len; i++){ | |
831 long c=line.charAt(i); | |
832 if(c<48){break;} | |
833 key<<=6; | |
834 key|=(c-48); | |
835 } | |
836 for(i++; i<len; i++){ | |
837 long c=line.charAt(i); | |
838 cov<<=6; | |
839 cov|=(c-48); | |
840 } | |
841 covList.add((int)(cov+1)); | |
842 return key; | |
843 } | |
844 | |
845 public static long parseHex(String line){ | |
846 if(line.length()==0){return 0;} | |
847 long x=0; | |
848 for(int i=0; i<line.length(); i++){ | |
849 x<<=4; | |
850 x|=hexTable[line.charAt(i)]; | |
851 } | |
852 if(line.charAt(0)=='-'){x*=-1;} | |
853 return x; | |
854 } | |
855 | |
856 /*--------------------------------------------------------------*/ | |
857 /*---------------- Array Parsing ----------------*/ | |
858 /*--------------------------------------------------------------*/ | |
859 | |
860 public static long parseA48(byte[] line){ | |
861 if(line.length==0){return 0;} | |
862 long x=0; | |
863 for(byte b : line){ | |
864 x<<=6; | |
865 x|=(((long)b)-48); | |
866 } | |
867 return x; | |
868 } | |
869 | |
870 public static long parseNuc(String line){ | |
871 return parseNuc(line.getBytes()); | |
872 } | |
873 | |
874 /** Returns the maximal key in the sequence */ | |
875 public static long parseNuc(byte[] bases){ | |
876 if(bases.length<k){return -1;} | |
877 final int shift=2*k; | |
878 final int shift2=shift-2; | |
879 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32 | |
880 | |
881 long kmer=0, rkmer=0; | |
882 int len=0; | |
883 | |
884 long key=-1; | |
885 for(int i=0; i<bases.length; i++){ | |
886 byte b=bases[i]; | |
887 long x=AminoAcid.baseToNumber[b]; | |
888 long x2=AminoAcid.baseToComplementNumber[b]; | |
889 kmer=((kmer<<2)|x)&mask; | |
890 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; | |
891 if(x<0){len=0; rkmer=0;}else{len++;} | |
892 if(len>=k){ | |
893 // long z=Tools.max(kmer, rkmer); | |
894 final long hashcode=hash(kmer, rkmer); | |
895 key=Tools.max(key, hashcode); | |
896 } | |
897 } | |
898 return key<minHashValue ? -1 : Long.MAX_VALUE-key; | |
899 } | |
900 | |
901 /** Parses coverage too */ | |
902 public static long parseA48C(byte[] line, IntList covList){ | |
903 if(line.length==0){ | |
904 covList.add(1); | |
905 return 0; | |
906 } | |
907 long key=0, cov=0; | |
908 int i=0, len=line.length; | |
909 for(; i<len; i++){ | |
910 long b=line[i]; | |
911 if(b<48){break;} | |
912 key<<=6; | |
913 key|=(b-48); | |
914 } | |
915 for(i++; i<len; i++){ | |
916 long b=line[i]; | |
917 cov<<=6; | |
918 cov|=(b-48); | |
919 } | |
920 covList.add((int)(cov+1)); | |
921 return key; | |
922 } | |
923 | |
924 public static long parseHex(byte[] line){ | |
925 if(line.length==0){return 0;} | |
926 long x=0; | |
927 for(byte b : line){ | |
928 x<<=4; | |
929 x|=hexTable[b]; | |
930 } | |
931 if(line[0]=='-'){x*=-1;} | |
932 return x; | |
933 } | |
934 | |
935 /*--------------------------------------------------------------*/ | |
936 /*---------------- Parsing ----------------*/ | |
937 /*--------------------------------------------------------------*/ | |
938 | |
939 public static long parseA48(ByteBuilder bb){ | |
940 final int len=bb.length; | |
941 final byte[] line=bb.array; | |
942 if(len==0){return 0;} | |
943 long x=0; | |
944 for(int i=0; i<len; i++){ | |
945 x<<=6; | |
946 x|=(((long)line[i])-48); | |
947 } | |
948 return x; | |
949 } | |
950 | |
951 public static long parseNuc(ByteBuilder bb){ | |
952 return parseNuc(bb.toBytes()); | |
953 } | |
954 | |
955 /** Parses coverage too */ | |
956 public static long parseA48C(ByteBuilder bb, IntList covList){ | |
957 final int len=bb.length; | |
958 final byte[] line=bb.array; | |
959 if(len==0){ | |
960 covList.add(1); | |
961 return 0; | |
962 } | |
963 long key=0, cov=0; | |
964 int i=0; | |
965 for(; i<len; i++){ | |
966 long b=line[i]; | |
967 if(b<48){break;} | |
968 key<<=6; | |
969 key|=(b-48); | |
970 } | |
971 for(i++; i<len; i++){ | |
972 long b=line[i]; | |
973 cov<<=6; | |
974 cov|=(b-48); | |
975 } | |
976 covList.add((int)(cov+1)); | |
977 return key; | |
978 } | |
979 | |
980 public static long parseHex(ByteBuilder bb){ | |
981 final int len=bb.length; | |
982 final byte[] line=bb.array; | |
983 if(line.length==0){return 0;} | |
984 long x=0; | |
985 for(int i=0; i<len; i++){ | |
986 x<<=4; | |
987 x|=hexTable[line[i]]; | |
988 } | |
989 if(line[0]=='-'){x*=-1;} | |
990 return x; | |
991 } | |
992 | |
993 /*--------------------------------------------------------------*/ | |
994 /*---------------- Getters ----------------*/ | |
995 /*--------------------------------------------------------------*/ | |
996 | |
997 public long genomeSizeEstimate() { | |
998 return keys.length==0 ? 0 : Tools.min(genomeSizeKmers, genomeSizeEstimate(keys[keys.length-1], keys.length)); | |
999 } | |
1000 | |
1001 public long genomeSizeEstimate(int minCount) { | |
1002 if(minCount<2){return genomeSizeEstimate();} | |
1003 if(length()==0){return 0;} | |
1004 long max=0; | |
1005 int num=0; | |
1006 for(int i=0; i<keyCounts.length; i++){ | |
1007 if(keyCounts[i]>=minCount){ | |
1008 max=keys[i]; | |
1009 num++; | |
1010 } | |
1011 } | |
1012 if(max==0){return 0;} | |
1013 long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(max, num)); | |
1014 return est; | |
1015 } | |
1016 | |
1017 public float gc(){ | |
1018 if(baseCounts==null){return 0;} | |
1019 long at=baseCounts[0]+baseCounts[3]; | |
1020 long gc=baseCounts[1]+baseCounts[2]; | |
1021 return gc/(Tools.max(1.0f, at+gc)); | |
1022 } | |
1023 | |
1024 public String filePrefix(){return ReadWrite.stripToCore(fname);} | |
1025 public String name(){return taxName!=null ? taxName : name0!=null ? name0 : fname;} | |
1026 public String taxName(){return taxName;} | |
1027 public String name0(){return name0;} | |
1028 public String fname(){return fname;} | |
1029 public int length(){return keys.length;} | |
1030 public void setTaxName(String s){taxName=s;} | |
1031 public void setName0(String s){name0=s;} | |
1032 public void setFname(String s){ | |
1033 // assert(!s.endsWith("sketch")) : s; //123 | |
1034 fname=(s==null ? s : ReadWrite.stripPath(s)); | |
1035 } | |
1036 | |
1037 // public float k1Fraction(){return k1Count/(float)array.length;} | |
1038 | |
1039 /*--------------------------------------------------------------*/ | |
1040 /*---------------- Assorted ----------------*/ | |
1041 /*--------------------------------------------------------------*/ | |
1042 | |
1043 public ArrayList<LongPair> toKhist() { | |
1044 HashMap<Long, LongPair> map=new HashMap<Long, LongPair>(); | |
1045 for(int i=0; i<keyCounts.length; i++){ | |
1046 int a=keyCounts[i]; | |
1047 Long key=(long)a; | |
1048 LongPair value=map.get(key); | |
1049 if(value==null){ | |
1050 value=new LongPair(); | |
1051 value.a=a; | |
1052 map.put(key, value); | |
1053 } | |
1054 value.b++; | |
1055 } | |
1056 ArrayList<LongPair> list=new ArrayList<LongPair>(map.size()); | |
1057 list.addAll(map.values()); | |
1058 Collections.sort(list); | |
1059 return list; | |
1060 } | |
1061 | |
1062 // static boolean warned=false; | |
1063 public void addSSU(){ | |
1064 if(useSSUMapOnly){r16S=r18S=null;} | |
1065 if(taxID<1 || taxID>=minFakeID){return;} | |
1066 if(!SSUMap.hasMap()){return;} | |
1067 if(taxtree==null/* && !warned*/){ | |
1068 // warned=true; | |
1069 // System.err.println("*** Warning - no taxtree loaded. A taxtree is recommended when adding SSUs. ***"); | |
1070 assert(false) : "Please set a path to the taxtree when adding SSUs."; | |
1071 } | |
1072 final boolean euk=taxtree!=null && (preferSSUMapForEuks || useSSUMapOnlyForEuks || ban16SForEuks) ? taxtree.isEukaryote(taxID) : false; | |
1073 final boolean prok=taxtree!=null && ban18SForProks ? taxtree.isProkaryote(taxID) : false; | |
1074 | |
1075 final boolean mapOnly=(useSSUMapOnly || (useSSUMapOnlyForEuks && euk)); | |
1076 final boolean preferMap=(mapOnly || preferSSUMap || (preferSSUMapForEuks && euk)); | |
1077 if(mapOnly){r16S=r18S=null;} | |
1078 | |
1079 if(SSUMap.r16SMap!=null && (r16S==null || preferMap)){ | |
1080 byte[] x=SSUMap.r16SMap.get(taxID); | |
1081 if(x!=null){r16S=x;} | |
1082 } | |
1083 if(SSUMap.r18SMap!=null && (r18S==null || preferMap)){ | |
1084 byte[] x=SSUMap.r18SMap.get(taxID); | |
1085 if(x!=null){r18S=x;} | |
1086 } | |
1087 if(ban18SForProks && prok){r18S=null;} | |
1088 if(ban16SForEuks && euk){r16S=null;} | |
1089 } | |
1090 | |
1091 /*--------------------------------------------------------------*/ | |
1092 /*---------------- BitSet ----------------*/ | |
1093 /*--------------------------------------------------------------*/ | |
1094 | |
1095 public void makeBitSets(boolean printContam, boolean index){ | |
1096 assert(compareBitSet==null && indexBitSet==null); | |
1097 if(!printContam){return;} | |
1098 compareBitSet=AbstractBitSet.make(length(), bitSetBits); | |
1099 if(index){indexBitSet=AbstractBitSet.make(length(), bitSetBits);} | |
1100 } | |
1101 | |
1102 public void addToBitSet(AbstractBitSet rbs){ | |
1103 compareBitSet.add(rbs); | |
1104 } | |
1105 | |
1106 public AbstractBitSet compareBitSet(){return compareBitSet;} | |
1107 | |
1108 public AbstractBitSet indexBitSet(){return indexBitSet;} | |
1109 | |
1110 public void mergeBitSets(){ | |
1111 assert(!mergedBitSets); | |
1112 if(compareBitSet!=null && indexBitSet!=null){ | |
1113 compareBitSet.setToMax(indexBitSet); | |
1114 } | |
1115 indexBitSet=null; | |
1116 mergedBitSets=true; | |
1117 } | |
1118 | |
1119 public boolean merged(){return mergedBitSets;} | |
1120 | |
1121 public int r16SLen(){return r16S==null ? 0 : r16S.length;} | |
1122 public int r18SLen(){return r18S==null ? 0 : r18S.length;} | |
1123 public byte[] r16S(){return r16S;} | |
1124 public byte[] r18S(){return r18S;} | |
1125 public boolean hasSSU(){return r16S!=null || r18S!=null;} | |
1126 public boolean sharesSSU(Sketch b){ | |
1127 return (r16S!=null && b.r16S!=null) || (r18S!=null && b.r18S!=null); | |
1128 } | |
1129 | |
1130 /*--------------------------------------------------------------*/ | |
1131 /*---------------- Fields ----------------*/ | |
1132 /*--------------------------------------------------------------*/ | |
1133 | |
1134 /** Stores sorted hashcodes, ascending, as Long.MAX_VALUE-(raw hashcode) */ | |
1135 public long[] keys; | |
1136 /** Stores kmer (hashcode) observation counts */ | |
1137 int[] keyCounts; | |
1138 /** Stores base (ACGTN) counts */ | |
1139 final long[] baseCounts; | |
1140 private byte[] r16S; | |
1141 private byte[] r18S; | |
1142 public int taxID; | |
1143 public int maxCount; | |
1144 int sketchID;//Based on loading order | |
1145 public final long genomeSequences; | |
1146 public final long genomeSizeBases; | |
1147 public final long genomeSizeKmers; | |
1148 public final float probCorrect; | |
1149 // public final int k1Count; //Number of keys made from k1 rather than k2 | |
1150 private String taxName; | |
1151 private String name0; | |
1152 private String fname; | |
1153 ArrayList<String> meta; | |
1154 | |
1155 //TODO: These should move to SketchResults. | |
1156 private AbstractBitSet compareBitSet; //Used for comparison | |
1157 private AbstractBitSet indexBitSet; | |
1158 | |
1159 | |
1160 //Extended information | |
1161 public long imgID=-1; | |
1162 public long spid=-1; | |
1163 // public String seqUnitName=null; | |
1164 | |
1165 private boolean mergedBitSets=false; //TODO: Temporary for debugging | |
1166 /** Tracks the number of reference sketches sharing each kmer. | |
1167 * Should be set to null when no longer needed. */ | |
1168 private int[] refHitCounts; | |
1169 | |
1170 public int[] refHitCounts(){return refHitCounts;} | |
1171 public void clearRefHitCounts(){ | |
1172 refHitCounts=null; | |
1173 } | |
1174 public void setRefHitCounts(int[] x) { | |
1175 refHitCounts=x; | |
1176 assert(x!=null); | |
1177 } | |
1178 | |
1179 private static AtomicInteger nextSketch=new AtomicInteger(1); | |
1180 | |
1181 } |