comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/Sketch.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package sketch;
2
3 import java.util.ArrayList;
4 import java.util.Arrays;
5 import java.util.Collections;
6 import java.util.HashMap;
7 import java.util.Locale;
8 import java.util.concurrent.atomic.AtomicInteger;
9
10 import dna.AminoAcid;
11 import fileIO.ReadWrite;
12 import shared.KillSwitch;
13 import shared.Tools;
14 import structures.AbstractBitSet;
15 import structures.ByteBuilder;
16 import structures.IntList;
17 import structures.LongHashMap;
18 import structures.LongList;
19 import structures.LongPair;
20 import tax.ImgRecord;
21
22 /**
23 * @author Brian Bushnell
24 * @date July 7, 2016
25 *
26 */
27 public class Sketch extends SketchObject implements Comparable<Sketch>, Cloneable {
28
29 /*--------------------------------------------------------------*/
30 /*---------------- Constructors ----------------*/
31 /*--------------------------------------------------------------*/
32
33 @Override
34 public Sketch clone() {
35 Sketch copy=null;
36 try {
37 copy = (Sketch)super.clone();
38 } catch (CloneNotSupportedException e) {
39 // e.printStackTrace();
40 throw new RuntimeException(e);
41 }
42 copy.compareBitSet=null;
43 copy.indexBitSet=null;
44 return copy;
45 }
46
47 //Array should already be hashed, sorted, unique, subtracted from Long.MAX_VALUE, then reversed.
48 public Sketch(long[] keys_, int[] keyCounts_, long[] baseCounts_, byte[] r16S_, byte[] r18S_, ArrayList<String> meta_){
49 this(keys_, keyCounts_, baseCounts_, r16S_, r18S_, -1, -1, -1, -1, -1, -1, null, null, null, meta_);
50 }
51
52 public Sketch(SketchHeap heap, boolean clearFname, boolean keepCounts, ArrayList<String> meta_){
53 this(heap, clearFname, keepCounts, meta_, -1);
54 }
55
56 public Sketch(SketchHeap heap, boolean clearFname, boolean keepCounts, ArrayList<String> meta_, int minKeyOccuranceCount){
57 this(heap.toSketchArray_minCount(minKeyOccuranceCount), null, heap.baseCounts(false), heap.r16S(), heap.r18S(), (int)heap.taxID, heap.imgID,
58 heap.genomeSizeBases, heap.genomeSizeKmers, heap.genomeSequences,
59 heap.probCorrect(), heap.taxName(), heap.name0(), heap.fname(), meta_);
60 assert(keyCounts==null);
61 if(!heap.setMode && keepCounts){
62 LongHashMap map=heap.map();
63 keyCounts=new int[keys.length];
64 for(int i=0; i<keys.length; i++){
65 int count=map.get(Long.MAX_VALUE-keys[i]);
66 assert(count>0) : keys[i]+" -> "+count+"\n"+Arrays.toString(map.values())+"\n"+Arrays.toString(map.keys());
67 keyCounts[i]=count;
68 }
69 }
70 if(heap.setMode){heap.clearSet();}
71 heap.clear(clearFname);
72 // System.err.println("size="+size+", genome="+this.genomeSize+", m"); : (int)(2+maxGenomeFraction*heap.genomeSize)+", "+this.array.length;
73 // assert(false) : (int)(2+maxGenomeFraction*heap.genomeSize)+", "+this.array.length;
74 // assert(false) : (counts==null)+", "+heap.setMode+", "+keepCounts;
75 }
76
77 public Sketch(long[] keys_, int[] keyCounts_, long[] baseCounts_, byte[] r16S_, byte[] r18S_, int taxID_, long imgID_, long gSizeBases_,
78 long gSizeKmers_, long gSequences_, double probCorrect_, String taxName_, String name0_, String fname_, ArrayList<String> meta_){
79 // Exception e=new Exception(baseCounts_+", "+Arrays.toString(baseCounts_));
80 // e.printStackTrace();
81 keys=keys_;
82 keyCounts=keyCounts_;
83 // fixKeyCounts();
84 baseCounts=baseCounts_;
85 r16S=r16S_;
86 r18S=r18S_;
87 assert(keyCounts==null || keys==null || keyCounts.length==keys.length) : (keys==null ? "null" : keys.length)+", "+(keyCounts==null ? "null" : keyCounts.length);
88 taxID=taxID_;
89 imgID=imgID_;
90 sketchID=nextSketch.getAndIncrement();
91 genomeSizeBases=gSizeBases_;
92 genomeSizeKmers=gSizeKmers_;
93 genomeSequences=gSequences_;
94 probCorrect=probCorrect_<=0 ? 0f : (float)probCorrect_;
95 meta=meta_;
96
97 taxName=fix(taxName_);
98 name0=fix(name0_);
99 fname=fix(fname_);
100 if(fname!=null && (fname.startsWith("stdin.") || fname.endsWith(".sketch") || fname.endsWith(".sketch.gz"))){fname=null;}
101
102 // if(k2>0){
103 // if(useToValue2) {
104 // int count=0;
105 // for(long x : array){
106 // count+=(int)(x&1);
107 // }
108 // k1Count=count;
109 // }else {
110 // k1Count=array.length/2;
111 // }
112 // }else{
113 // k1Count=array.length;
114 // }
115
116 if(ImgRecord.imgMap!=null && imgID>=0 && taxID<0){
117 ImgRecord record=ImgRecord.imgMap.get(imgID);
118 if(record!=null){
119 if(record.name!=null && taxName==null){taxName=record.name;}
120 taxID=record.taxID;
121 }
122 }
123 }
124
125 void loadSSU(){
126 if(taxID>0 && taxID<minFakeID){
127 if(r16S==null && SSUMap.r16SMap!=null){
128 r16S=SSUMap.r16SMap.get(taxID);
129 }
130 if(r18S==null && SSUMap.r18SMap!=null){
131 r18S=SSUMap.r18SMap.get(taxID);
132 }
133 }
134 // new Exception().printStackTrace();
135 // System.err.println(taxID+", "+ssu+", "+SSUMap.ssuMap.size());
136 }
137
138 void addMeta(String s){
139 s=fixMeta(s);
140 if(s==null){return;}
141 if(meta==null){meta=new ArrayList<String>(1);}
142 meta.add(s);
143 }
144
145 void setMeta(ArrayList<String> list){
146 assert(meta==null);
147 meta=list;
148 }
149
150 private static String fix(String s){
151 if(s==null){return null;}
152 return s.replace('\t', ' ');
153 }
154
155 /*--------------------------------------------------------------*/
156 /*---------------- Methods ----------------*/
157 /*--------------------------------------------------------------*/
158
159 public void add(Sketch other, int maxlen){
160 assert(keyCounts==null && other.keyCounts==null);
161 final long[] a=keys;
162 final long[] b=other.keys;
163 if(maxlen<1){
164 assert(false);
165 maxlen=1000000;
166 }
167 LongList list=new LongList(Tools.min(maxlen, a.length+b.length));
168
169 for(int i=0, j=0; i<a.length && j<b.length; ){
170 final long ka=a[i], kb=b[j];
171 if(ka==kb){//match
172 list.add(ka);
173 i++;
174 j++;
175 }else if(ka<kb){
176 list.add(ka);
177 i++;
178 }else{
179 list.add(kb);
180 j++;
181 }
182 if(list.size()>=maxlen){break;}
183 }
184
185 if(keys.length==list.size()){
186 for(int i=0; i<list.size; i++){
187 keys[i]=list.array[i];
188 }
189 }else{
190 keys=list.toArray();
191 }
192 }
193
194 public void resize(int newSize) {
195 if(newSize>=length()){return;}
196 keys=Arrays.copyOf(keys, newSize);
197 // fixKeyCounts(newSize);
198 if(keyCounts!=null){
199 for(int i=0; i<newSize; i++)
200 keyCounts=Arrays.copyOf(keyCounts, newSize);
201 }
202 }
203
204 int fixKeyCounts(){
205 return fixKeyCounts(keyCounts==null ? maxCount : keyCounts.length);
206 }
207
208 int fixKeyCounts(int maxLen){
209 int max=0;
210 if(keyCounts!=null){
211 for(int i=0; i<maxLen && max<2; i++){
212 max=Tools.max(keyCounts[i], max);
213 }
214 }
215 if(max<2){keyCounts=null;}
216 maxCount=Tools.max((maxCount>0 ? 1 : 0), max);
217 return maxCount;
218 }
219
220 public int applyBlacklist() {
221 assert(blacklist!=null);
222 LongList keylist=new LongList(keys.length);
223 IntList countlist=(keyCounts==null ? null : new IntList(keys.length));
224 int removed=0;
225 for(int i=0; i<keys.length; i++){
226 long key=keys[i];
227 if(!Blacklist.contains(Long.MAX_VALUE-key)){
228 keylist.add(key);
229 if(keyCounts!=null){countlist.add(keyCounts[i]);}
230 }else{
231 removed++;
232 }
233 }
234 if(keylist.size()!=keys.length){
235 keys=keylist.toArray();
236 if(keyCounts!=null){keyCounts=countlist.toArray();}
237 }
238 // if(removed>0){
239 // fixKeyCounts();
240 // }
241 return removed;
242 }
243
244 /*--------------------------------------------------------------*/
245 /*---------------- Set Operations ----------------*/
246 /*--------------------------------------------------------------*/
247
248 public static final Sketch intersection(Sketch sa, Sketch sb){
249 Sketch shared=intersection(sa.keys, sb.keys, sa.keyCounts);
250 if(shared!=null){
251 shared.taxID=sb.taxID;
252 shared.taxName=sb.taxName;
253 shared.name0=sb.name0;
254 shared.fname=sb.fname;
255 shared.meta=sb.meta;
256 shared.imgID=sb.imgID;
257 shared.spid=sb.spid;
258 }
259 return shared;
260 }
261
262 public static final Sketch intersection(long[] a, long[] b, int[] aCounts){
263 int i=0, j=0, matches=0;
264 LongList ll=new LongList();
265 IntList il=new IntList();
266 for(; i<a.length && j<b.length; ){
267 final long ka=a[i], kb=b[j];
268 if(ka==kb){
269 matches++;
270 ll.add(ka);
271 if(aCounts!=null){
272 il.add(aCounts[i]);
273 }
274 i++;
275 j++;
276 }else if(ka<kb){
277 i++;
278 }else{
279 j++;
280 }
281 }
282 if(matches<1){return null;}
283
284 return new Sketch(ll.toArray(), il.size>0 ? il.toArray() : null, null, null, null, null);
285 }
286
287 public static final Sketch union(Sketch sa, Sketch sb){
288 Sketch shared=union(sa.keys, sb.keys, sa.keyCounts, sb.keyCounts, sa.baseCounts, sb.baseCounts, sa.r16S, sb.r16S, sa.r18S, sb.r18S);
289 if(shared!=null){
290 shared.taxID=sa.taxID;
291 shared.taxName=sa.taxName;
292 shared.name0=sa.name0;
293 shared.fname=sa.fname;
294 shared.meta=sa.meta;
295 shared.imgID=sa.imgID;
296 shared.spid=sa.spid;
297 }
298 return shared;
299 }
300
301 public static final Sketch union(long[] a, long[] b, int[] aCounts, int[] bCounts, long[] aBaseCounts, long[] bBaseCounts,
302 byte[] a16S, byte[] b16S, byte[] a18S, byte[] b18S){
303 int i=0, j=0, matches=0;
304 LongList ll=new LongList();
305 IntList il=(aCounts==null || bCounts==null ? null : new IntList());
306 byte[] r16S=(b16S==null ? a16S : a16S==null ? b16S : a16S.length>b16S.length ? a16S : b16S);
307 byte[] r18S=(b18S==null ? a18S : a18S==null ? b18S : a18S.length>b18S.length ? a18S : b18S);
308 long[] baseCounts=(aBaseCounts==null || bBaseCounts==null ? null : new long[aBaseCounts.length]);
309 if(baseCounts!=null){
310 for(int k=0; k<baseCounts.length; k++) {baseCounts[k]+=(aBaseCounts[k]+bBaseCounts[k]);}
311 }
312 for(; i<a.length && j<b.length; ){
313 final long ka=a[i], kb=b[j];
314 if(ka==kb){
315 matches++;
316 ll.add(ka);
317 if(il!=null){
318 il.add(aCounts[i]+bCounts[i]);
319 }
320 i++;
321 j++;
322 }else if(ka<kb){
323 ll.add(ka);
324 if(il!=null){
325 il.add(aCounts[i]);
326 }
327 i++;
328 }else{
329 ll.add(kb);
330 if(il!=null){
331 il.add(bCounts[i]);
332 }
333 j++;
334 }
335 }
336 if(matches<1){return null;}
337
338 return new Sketch(ll.toArray(), il.size>0 ? il.toArray() : null, baseCounts, r16S, r18S, null);
339 }
340
341 /*--------------------------------------------------------------*/
342 /*---------------- Filtering ----------------*/
343 /*--------------------------------------------------------------*/
344
345 boolean passesMeta(DisplayParams params) {
346 return passesMeta(params.requiredMeta, params.bannedMeta, /*params.requiredTaxid, params.bannedTaxid,*/ params.requiredMetaAnd);
347 }
348
349 boolean passesMeta(ArrayList<String> requiredMeta, ArrayList<String> bannedMeta, /*IntList requiredTaxid, IntList bannedTaxid,*/ boolean requiredMetaAnd) {
350 assert(requiredMeta!=null || bannedMeta!=null /*|| requiredTaxid!=null || bannedTaxid!=null*/);
351 assert(requiredMeta==null || requiredMeta.size()>0);
352 assert(bannedMeta==null || bannedMeta.size()>0);
353
354 // if(requiredTaxid!=null && !requiredTaxid.contains(taxID)){return false;}
355 // if(bannedTaxid!=null && bannedTaxid.contains(taxID)){return false;}
356
357 if(requiredMeta!=null){
358 if(meta==null){return false;}
359 for(String tag : requiredMeta){
360 if(meta.contains(tag)){
361 if(!requiredMetaAnd){break;}
362 }else if(requiredMetaAnd){
363 return false;
364 }
365 }
366 }
367 if(bannedMeta!=null && meta!=null){
368 for(String tag : bannedMeta){
369 if(!meta.contains(tag)){
370 return false;
371 }
372 }
373 }
374 return true;
375 }
376
377 /*--------------------------------------------------------------*/
378 /*---------------- Comparison ----------------*/
379 /*--------------------------------------------------------------*/
380
381 public int countMatches(Sketch other, CompareBuffer buffer, AbstractBitSet present, boolean fillPresent, int[][] taxHits, int contamLevel){
382 return countMatches(keys, other.keys, keyCounts, other.keyCounts, refHitCounts, other.taxID, buffer, present, fillPresent, taxHits, contamLevel);
383 }
384
385 public static final int countMatches(long[] a, long[] b, int[] aCounts, int[] bCounts, int[] refHitCounts, int bid,
386 CompareBuffer buffer, AbstractBitSet present, boolean fillPresent, int[][] taxHits, int contamLevel){
387
388
389 // if(verbose2){System.err.println("fillPresent: "+fillPresent+", "+present);}
390 // assert(fillPresent) : bid+", "+minFakeID+", "+(taxHits!=null);
391
392 if(bid>0 && bid<minFakeID && taxHits!=null){
393 bid=taxtree.getIdAtLevelExtended(bid, contamLevel);
394 }else{
395 bid=-1;
396 }
397
398 // assert(false) : (buffer==null)+", "+fillPresent+", "+present.cardinality();
399 assert(a.length>0 && b.length>0);
400
401 //Kmers hitting this reference
402 int matches=0;
403
404 //Kmers hitting this reference and others
405 int multiMatches=0;
406
407 //Kmers hitting nothing
408 int noHits=0;
409
410 //Kmers hitting some organism but not this reference
411 int contamHits=0;
412
413 //Kmers hitting something in this taxa, but not this reference
414 int sameTax=0;
415
416 //Kmers hitting this organism and no other taxa
417 int unique2=0;
418
419 //Kmers hitting only this taxa but not this organism (this count may not include everything due to early exit)
420 int unique3_temp=0;
421
422 //Kmers hitting multiple organisms but not this reference
423 int multiContamHits=0;
424
425 //Sum of query counts for shared kmers
426 long depthSum=0;
427
428 //Sum of query counts for shared kmers divided by ref counts for those kmers
429 double depthSum2=0;//Slow, but necessary.
430
431 //Number of times matching query keys occurred in all references
432 long refHitSum=0;
433
434 //Matches from k1
435 int k1hits=0;
436
437 //Query kmers from k1
438 int k1seenQ=0;
439
440 //Reference kmers from k1
441 int k1seenR=0;
442 int i=0, j=0;
443 assert(present==null || present.capacity()==a.length);
444 // assert(false) : buffer.rbs.capacity()+", "+buffer.rbs+", "+present;
445 if(present!=null){
446 if(fillPresent){
447 for(; i<a.length && j<b.length; ){
448 final long ka=a[i], kb=b[j];
449 final int bit=(int)(ka&1);
450 if(ka==kb){
451 present.increment(i);
452 matches++;
453 k1hits+=bit;
454 k1seenQ+=bit;
455 k1seenR+=bit;
456 if(aCounts!=null){
457 depthSum+=aCounts[i];
458 if(bCounts!=null){
459 depthSum2+=aCounts[i]/(double)bCounts[j];
460 }
461 }
462 if(refHitCounts!=null){refHitSum+=refHitCounts[i];}
463 i++;
464 j++;
465 }else if(ka<kb){
466 i++;
467 k1seenQ+=bit;
468 }else{
469 j++;
470 k1seenR+=bit;
471 }
472 }
473 }else{
474 for(; i<a.length && j<b.length; ){
475 final long ka=a[i], kb=b[j];
476 final int bit=(int)(ka&1);
477 if(ka==kb){
478 final int count=present.getCount(i);
479 if(count>1){
480 multiMatches++;
481 }
482
483 matches++;
484 k1hits+=bit;
485 k1seenQ+=bit;
486 k1seenR+=bit;
487 if(aCounts!=null){
488 depthSum+=aCounts[i];
489 if(bCounts!=null){
490 depthSum2+=aCounts[i]/(double)bCounts[j];
491 }
492 }
493 if(refHitCounts!=null){refHitSum+=refHitCounts[i];}
494 if(bid>0){
495 int[] taxHitsRow=taxHits[i];
496 if(taxHitsRow!=null && taxHitsRow.length==1 && taxHitsRow[0]==bid){unique2++;}
497 }
498
499 i++;
500 j++;
501 }else if(ka<kb){
502 k1seenQ+=bit;
503 final int count=present.getCount(i);
504 if(count>0){
505 contamHits++;
506 if(count>1){
507 multiContamHits++;
508 }
509 }else{
510 noHits++;
511 }
512
513 if(bid>0){
514 int[] taxHitsRow=taxHits[i];
515 if(taxHitsRow!=null){
516 if(taxHitsRow!=null && taxHitsRow.length==1 && taxHitsRow[0]==bid){unique3_temp++;}
517 for(int tid : taxHitsRow){
518 if(tid==bid){
519 sameTax++;
520 break;
521 }
522 }
523 }
524 }
525
526 i++;
527 }else{
528 k1seenR+=bit;
529 j++;
530 }
531 }
532
533 //For the remaining query kmers, we don't know whether the reference sketch would have shared them had it been longer.
534 //This section can be disabled to prevent them from being displayed.
535 if(bid>0 && i<a.length-1){
536 for(; i<a.length; i++){
537 int[] taxHitsRow=taxHits[i];
538 if(taxHitsRow!=null){
539 if(taxHitsRow!=null && taxHitsRow.length==1 && taxHitsRow[0]==bid){unique3_temp++;}
540 }
541 }
542 }
543 }
544 }else{
545 for(; i<a.length && j<b.length; ){
546 final long ka=a[i], kb=b[j];
547 final int bit=(int)(ka&1);
548 final int count=present.getCount(i);
549 if(ka==kb){
550 matches++;
551 k1hits+=bit;
552 k1seenQ+=bit;
553 k1seenR+=bit;
554 if(aCounts!=null){depthSum+=aCounts[i];}
555 if(refHitCounts!=null){refHitSum+=refHitCounts[i];}
556 i++;
557 j++;
558 }else if(ka<kb){
559 i++;
560 k1seenQ+=bit;
561 }else{
562 j++;
563 k1seenR+=bit;
564 }
565 }
566 }
567
568 if(k2<1){
569 k1hits=matches;
570 k1seenQ=i;
571 k1seenR=j;
572 }
573
574 // if(taxHits!=null){
575 // System.err.println("matches="+matches+", noHits="+noHits+", contamHits="+contamHits+", sameTax="+sameTax+", multiContamHits="+multiContamHits);
576 // }
577
578 // assert(bid<1 || unique2>=(matches-multiMatches)) : bid+", "+unique2+", "+unique3_temp+", "+matches+", "+multiMatches;
579 // assert(matches<1000 || multiMatches==0) : bid+", "+unique2+", "+unique3_temp+", "+matches+", "+multiMatches+", "+fillPresent;
580
581 if(buffer!=null){
582 // System.err.println("*A) "+matches+", "+multiMatches+", "+unique2+", "+unique3_temp);
583 // new Exception().printStackTrace();
584 // assert(k1hits<=(matches-k1hits)) : k1hits+", "+matches;
585 buffer.set(matches, multiMatches, unique2, unique2+unique3_temp, noHits,
586 contamHits, contamHits-sameTax, multiContamHits, i, j,
587 a.length, b.length, depthSum, depthSum2, refHitSum, k1hits, k1seenQ, k1seenR);
588 }
589 return matches;
590 }
591
592 // public float identity(Sketch b, float[] ret){
593 // if(ret!=null){Arrays.fill(ret, 0);}
594 // return identityWeighted(array, b.array, ret);
595 // }
596 //
597 // public static float identity(long[] a, long[] b){
598 // int matches=countMatches(a, b);
599 // return matches/(float)(Tools.max(1, Tools.min(a.length, b.length)));
600 // }
601
602 @Override
603 public int hashCode(){
604 long gSize=genomeSizeKmers>0 ? genomeSizeKmers : genomeSizeBases;
605 int code=(int) ((gSize^taxID^imgID^(name0==null ? 0 : name0.hashCode()))&Integer.MAX_VALUE);
606 // System.err.println(code+", "+gSize+", "+taxID+", "+imgID+", "+name0);
607 return code;
608 }
609
610 @Override
611 public int compareTo(Sketch b){
612 if(this==b){return 0;}
613 if(taxID>-1 && b.taxID>-1){return taxID-b.taxID;}
614 int x=taxName.compareTo(b.taxName);
615 if(x!=0){return x;}
616 if(name0!=null && b.name0!=null){return name0.compareTo(b.name0);}
617 return name0!=null ? 1 : b.name0!=null ? -1 : 0;
618 }
619
620 @Override
621 public boolean equals(Object b){
622 if(this==b){return true;}
623 if(b==null || this.getClass()!=b.getClass()){return false;}
624 return equals((Sketch)b);
625 }
626
627 public boolean equals(Sketch b){
628 return compareTo(b)==0;
629 }
630
631 /*--------------------------------------------------------------*/
632 /*---------------- Formatting ----------------*/
633 /*--------------------------------------------------------------*/
634
635 public ByteBuilder toHeader(){
636 ByteBuilder sb=new ByteBuilder();
637 return toHeader(sb);
638 }
639
640 public ByteBuilder toHeader(ByteBuilder bb){
641 bb.append("#SZ:").append(keys.length);
642 bb.append("\tCD:");
643 bb.append(codingArray[CODING]);
644 if(deltaOut){bb.append('D');}
645 if(keyCounts!=null){bb.append('C');}
646 if(aminoOrTranslate()){bb.append('M');}
647 if(amino8){bb.append('8');}
648
649 bb.append("\tK:").append(k);
650 if(k2!=0){bb.append(",").append(k2);}
651 if(HASH_VERSION>1){bb.append("\tH:").append(HASH_VERSION);}
652 // if(maxCount>0){bb.append("\tMC:").append(maxCount);}
653
654 if(genomeSizeBases>0){bb.append("\tGS:").append(genomeSizeBases);}
655 if(genomeSizeKmers>0){bb.append("\tGK:").append(genomeSizeKmers);}
656 final long ge=genomeSizeEstimate();
657 if(ge>0){bb.append("\tGE:").append(ge);}
658 if(genomeSequences>0){bb.append("\tGQ:"+genomeSequences);}
659 if(baseCounts!=null && !aminoOrTranslate()){
660 bb.append("\tBC:").append(baseCounts[0]).append(',').append(baseCounts[1]).append(',');
661 bb.append(baseCounts[2]).append(',').append(baseCounts[3]);
662 }
663 if(probCorrect>0){bb.append("\tPC:"+String.format(Locale.ROOT, "%.4f", probCorrect));}
664 if(taxID>=0){bb.append("\tID:").append(taxID);}
665 if(imgID>=0){bb.append("\tIMG:").append(imgID);}
666 if(spid>0){bb.append("\tSPID:").append(spid);}
667 if(fname!=null){bb.append("\tFN:").append(fname);}
668 if(taxName!=null){bb.append("\tNM:").append(taxName);}
669 if(name0!=null){bb.append("\tNM0:").append(name0);}
670 if(meta!=null){
671 for(String s : meta){
672 bb.append("\tMT_").append(s);
673 }
674 }
675
676 if(r16S!=null){
677 bb.append("\t16S:").append(r16S.length);
678 }
679 if(r18S!=null){
680 bb.append("\t18S:").append(r18S.length);
681 }
682 if(r16S!=null){
683 bb.append('\n');
684 bb.append("#16S:").append(r16S);
685 }
686 if(r18S!=null){
687 bb.append('\n');
688 bb.append("#18S:").append(r18S);
689 }
690 return bb;
691 }
692
693 public ByteBuilder toBytes(){
694 return toBytes(new ByteBuilder());
695 }
696
697 public ByteBuilder toBytes(ByteBuilder bb){
698 if(CODING==A48 && deltaOut){return toBytesA48D(bb);}
699 long prev=0;
700 toHeader(bb);
701 bb.append("\n");
702 byte[] temp=null;
703 if(CODING==A48){temp=KillSwitch.allocByte1D(12);}
704 for(int i=0; i<keys.length; i++){
705 long key=keys[i];
706 int count=(keyCounts==null ? 1 : keyCounts[i]);
707 long x=key-prev;
708 if(CODING==A48){
709 appendA48(x, bb, temp);
710 if(count>1){
711 bb.append('\t');
712 appendA48(count-1, bb, temp);
713 }
714 bb.append('\n');
715 }else if(CODING==HEX){
716 bb.append(Long.toHexString(x)).append('\n');
717 }else if(CODING==RAW){
718 bb.append(x).append('\n');
719 }else{
720 assert(false);
721 }
722 if(deltaOut){prev=key;}
723 }
724 return bb;
725 }
726
727 //This is to make the common case fast
728 private ByteBuilder toBytesA48D(ByteBuilder bb){
729 assert(CODING==A48 && deltaOut);
730 long prev=0;
731 toHeader(bb);
732 bb.append("\n");
733 final byte[] temp=KillSwitch.allocByte1D(12);
734
735 if(keyCounts==null){
736 for(int i=0; i<keys.length; i++){
737 long key=keys[i];
738 long x=key-prev;
739 if(CODING==A48){
740 appendA48(x, bb, temp);
741 bb.append('\n');
742 }
743 prev=key;
744 }
745 }else{
746 for(int i=0; i<keys.length; i++){
747 long key=keys[i];
748 int count=keyCounts[i];
749 long x=key-prev;
750 if(CODING==A48){
751 appendA48(x, bb, temp);
752 if(count>1){
753 bb.append('\t');
754 appendA48(count-1, bb, temp);
755 }
756 bb.append('\n');
757 }
758 prev=key;
759 }
760 }
761 return bb;
762 }
763
764 public static final void appendA48(long value, ByteBuilder bb, byte[] temp){
765 int i=0;
766 // long value=value0;
767 while(value!=0){
768 byte b=(byte)(value&0x3F);
769 // assert(i<temp.length) : i+", "+temp.length+", "+value0;
770 temp[i]=b;
771 value=value>>6;
772 i++;
773 }
774 if(i==0){
775 bb.append((byte)'0');
776 }else{
777 for(i--;i>=0;i--){
778 bb.append((char)(temp[i]+48));
779 }
780 }
781 }
782
783 public static final String toA48(long value){
784 int i=0;
785 // long value=value0;
786 StringBuilder sb=new StringBuilder(12);
787 while(value!=0){
788 byte b=(byte)(value&0x3F);
789 // assert(i<temp.length) : i+", "+temp.length+", "+value0;
790 sb.append((char)(b+48));
791 value=value>>6;
792 i++;
793 }
794 if(i==0){
795 sb.append((byte)'0');
796 }else{
797 sb.reverse();
798 }
799 return sb.toString();
800 }
801
802 @Override
803 public String toString(){
804 return toBytes().toString();
805 }
806
807 /*--------------------------------------------------------------*/
808 /*---------------- String Parsing ----------------*/
809 /*--------------------------------------------------------------*/
810
811 public static long parseA48(String line){
812 if(line.length()==0){return 0;}
813 long x=0;
814 for(int i=0; i<line.length(); i++){
815 x<<=6;
816 long c=line.charAt(i);
817 x|=(c-48);
818 }
819 return x;
820 }
821
822 /** Parses coverage too */
823 public static long parseA48C(String line, IntList covList){
824 if(line.length()==0){
825 covList.add(1);
826 return 0;
827 }
828 long key=0, cov=0;
829 int i=0, len=line.length();
830 for(; i<len; i++){
831 long c=line.charAt(i);
832 if(c<48){break;}
833 key<<=6;
834 key|=(c-48);
835 }
836 for(i++; i<len; i++){
837 long c=line.charAt(i);
838 cov<<=6;
839 cov|=(c-48);
840 }
841 covList.add((int)(cov+1));
842 return key;
843 }
844
845 public static long parseHex(String line){
846 if(line.length()==0){return 0;}
847 long x=0;
848 for(int i=0; i<line.length(); i++){
849 x<<=4;
850 x|=hexTable[line.charAt(i)];
851 }
852 if(line.charAt(0)=='-'){x*=-1;}
853 return x;
854 }
855
856 /*--------------------------------------------------------------*/
857 /*---------------- Array Parsing ----------------*/
858 /*--------------------------------------------------------------*/
859
860 public static long parseA48(byte[] line){
861 if(line.length==0){return 0;}
862 long x=0;
863 for(byte b : line){
864 x<<=6;
865 x|=(((long)b)-48);
866 }
867 return x;
868 }
869
870 public static long parseNuc(String line){
871 return parseNuc(line.getBytes());
872 }
873
874 /** Returns the maximal key in the sequence */
875 public static long parseNuc(byte[] bases){
876 if(bases.length<k){return -1;}
877 final int shift=2*k;
878 final int shift2=shift-2;
879 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32
880
881 long kmer=0, rkmer=0;
882 int len=0;
883
884 long key=-1;
885 for(int i=0; i<bases.length; i++){
886 byte b=bases[i];
887 long x=AminoAcid.baseToNumber[b];
888 long x2=AminoAcid.baseToComplementNumber[b];
889 kmer=((kmer<<2)|x)&mask;
890 rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
891 if(x<0){len=0; rkmer=0;}else{len++;}
892 if(len>=k){
893 // long z=Tools.max(kmer, rkmer);
894 final long hashcode=hash(kmer, rkmer);
895 key=Tools.max(key, hashcode);
896 }
897 }
898 return key<minHashValue ? -1 : Long.MAX_VALUE-key;
899 }
900
901 /** Parses coverage too */
902 public static long parseA48C(byte[] line, IntList covList){
903 if(line.length==0){
904 covList.add(1);
905 return 0;
906 }
907 long key=0, cov=0;
908 int i=0, len=line.length;
909 for(; i<len; i++){
910 long b=line[i];
911 if(b<48){break;}
912 key<<=6;
913 key|=(b-48);
914 }
915 for(i++; i<len; i++){
916 long b=line[i];
917 cov<<=6;
918 cov|=(b-48);
919 }
920 covList.add((int)(cov+1));
921 return key;
922 }
923
924 public static long parseHex(byte[] line){
925 if(line.length==0){return 0;}
926 long x=0;
927 for(byte b : line){
928 x<<=4;
929 x|=hexTable[b];
930 }
931 if(line[0]=='-'){x*=-1;}
932 return x;
933 }
934
935 /*--------------------------------------------------------------*/
936 /*---------------- Parsing ----------------*/
937 /*--------------------------------------------------------------*/
938
939 public static long parseA48(ByteBuilder bb){
940 final int len=bb.length;
941 final byte[] line=bb.array;
942 if(len==0){return 0;}
943 long x=0;
944 for(int i=0; i<len; i++){
945 x<<=6;
946 x|=(((long)line[i])-48);
947 }
948 return x;
949 }
950
951 public static long parseNuc(ByteBuilder bb){
952 return parseNuc(bb.toBytes());
953 }
954
955 /** Parses coverage too */
956 public static long parseA48C(ByteBuilder bb, IntList covList){
957 final int len=bb.length;
958 final byte[] line=bb.array;
959 if(len==0){
960 covList.add(1);
961 return 0;
962 }
963 long key=0, cov=0;
964 int i=0;
965 for(; i<len; i++){
966 long b=line[i];
967 if(b<48){break;}
968 key<<=6;
969 key|=(b-48);
970 }
971 for(i++; i<len; i++){
972 long b=line[i];
973 cov<<=6;
974 cov|=(b-48);
975 }
976 covList.add((int)(cov+1));
977 return key;
978 }
979
980 public static long parseHex(ByteBuilder bb){
981 final int len=bb.length;
982 final byte[] line=bb.array;
983 if(line.length==0){return 0;}
984 long x=0;
985 for(int i=0; i<len; i++){
986 x<<=4;
987 x|=hexTable[line[i]];
988 }
989 if(line[0]=='-'){x*=-1;}
990 return x;
991 }
992
993 /*--------------------------------------------------------------*/
994 /*---------------- Getters ----------------*/
995 /*--------------------------------------------------------------*/
996
997 public long genomeSizeEstimate() {
998 return keys.length==0 ? 0 : Tools.min(genomeSizeKmers, genomeSizeEstimate(keys[keys.length-1], keys.length));
999 }
1000
1001 public long genomeSizeEstimate(int minCount) {
1002 if(minCount<2){return genomeSizeEstimate();}
1003 if(length()==0){return 0;}
1004 long max=0;
1005 int num=0;
1006 for(int i=0; i<keyCounts.length; i++){
1007 if(keyCounts[i]>=minCount){
1008 max=keys[i];
1009 num++;
1010 }
1011 }
1012 if(max==0){return 0;}
1013 long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(max, num));
1014 return est;
1015 }
1016
1017 public float gc(){
1018 if(baseCounts==null){return 0;}
1019 long at=baseCounts[0]+baseCounts[3];
1020 long gc=baseCounts[1]+baseCounts[2];
1021 return gc/(Tools.max(1.0f, at+gc));
1022 }
1023
1024 public String filePrefix(){return ReadWrite.stripToCore(fname);}
1025 public String name(){return taxName!=null ? taxName : name0!=null ? name0 : fname;}
1026 public String taxName(){return taxName;}
1027 public String name0(){return name0;}
1028 public String fname(){return fname;}
1029 public int length(){return keys.length;}
1030 public void setTaxName(String s){taxName=s;}
1031 public void setName0(String s){name0=s;}
1032 public void setFname(String s){
1033 // assert(!s.endsWith("sketch")) : s; //123
1034 fname=(s==null ? s : ReadWrite.stripPath(s));
1035 }
1036
1037 // public float k1Fraction(){return k1Count/(float)array.length;}
1038
1039 /*--------------------------------------------------------------*/
1040 /*---------------- Assorted ----------------*/
1041 /*--------------------------------------------------------------*/
1042
1043 public ArrayList<LongPair> toKhist() {
1044 HashMap<Long, LongPair> map=new HashMap<Long, LongPair>();
1045 for(int i=0; i<keyCounts.length; i++){
1046 int a=keyCounts[i];
1047 Long key=(long)a;
1048 LongPair value=map.get(key);
1049 if(value==null){
1050 value=new LongPair();
1051 value.a=a;
1052 map.put(key, value);
1053 }
1054 value.b++;
1055 }
1056 ArrayList<LongPair> list=new ArrayList<LongPair>(map.size());
1057 list.addAll(map.values());
1058 Collections.sort(list);
1059 return list;
1060 }
1061
1062 // static boolean warned=false;
1063 public void addSSU(){
1064 if(useSSUMapOnly){r16S=r18S=null;}
1065 if(taxID<1 || taxID>=minFakeID){return;}
1066 if(!SSUMap.hasMap()){return;}
1067 if(taxtree==null/* && !warned*/){
1068 // warned=true;
1069 // System.err.println("*** Warning - no taxtree loaded. A taxtree is recommended when adding SSUs. ***");
1070 assert(false) : "Please set a path to the taxtree when adding SSUs.";
1071 }
1072 final boolean euk=taxtree!=null && (preferSSUMapForEuks || useSSUMapOnlyForEuks || ban16SForEuks) ? taxtree.isEukaryote(taxID) : false;
1073 final boolean prok=taxtree!=null && ban18SForProks ? taxtree.isProkaryote(taxID) : false;
1074
1075 final boolean mapOnly=(useSSUMapOnly || (useSSUMapOnlyForEuks && euk));
1076 final boolean preferMap=(mapOnly || preferSSUMap || (preferSSUMapForEuks && euk));
1077 if(mapOnly){r16S=r18S=null;}
1078
1079 if(SSUMap.r16SMap!=null && (r16S==null || preferMap)){
1080 byte[] x=SSUMap.r16SMap.get(taxID);
1081 if(x!=null){r16S=x;}
1082 }
1083 if(SSUMap.r18SMap!=null && (r18S==null || preferMap)){
1084 byte[] x=SSUMap.r18SMap.get(taxID);
1085 if(x!=null){r18S=x;}
1086 }
1087 if(ban18SForProks && prok){r18S=null;}
1088 if(ban16SForEuks && euk){r16S=null;}
1089 }
1090
1091 /*--------------------------------------------------------------*/
1092 /*---------------- BitSet ----------------*/
1093 /*--------------------------------------------------------------*/
1094
1095 public void makeBitSets(boolean printContam, boolean index){
1096 assert(compareBitSet==null && indexBitSet==null);
1097 if(!printContam){return;}
1098 compareBitSet=AbstractBitSet.make(length(), bitSetBits);
1099 if(index){indexBitSet=AbstractBitSet.make(length(), bitSetBits);}
1100 }
1101
1102 public void addToBitSet(AbstractBitSet rbs){
1103 compareBitSet.add(rbs);
1104 }
1105
1106 public AbstractBitSet compareBitSet(){return compareBitSet;}
1107
1108 public AbstractBitSet indexBitSet(){return indexBitSet;}
1109
1110 public void mergeBitSets(){
1111 assert(!mergedBitSets);
1112 if(compareBitSet!=null && indexBitSet!=null){
1113 compareBitSet.setToMax(indexBitSet);
1114 }
1115 indexBitSet=null;
1116 mergedBitSets=true;
1117 }
1118
1119 public boolean merged(){return mergedBitSets;}
1120
1121 public int r16SLen(){return r16S==null ? 0 : r16S.length;}
1122 public int r18SLen(){return r18S==null ? 0 : r18S.length;}
1123 public byte[] r16S(){return r16S;}
1124 public byte[] r18S(){return r18S;}
1125 public boolean hasSSU(){return r16S!=null || r18S!=null;}
1126 public boolean sharesSSU(Sketch b){
1127 return (r16S!=null && b.r16S!=null) || (r18S!=null && b.r18S!=null);
1128 }
1129
1130 /*--------------------------------------------------------------*/
1131 /*---------------- Fields ----------------*/
1132 /*--------------------------------------------------------------*/
1133
1134 /** Stores sorted hashcodes, ascending, as Long.MAX_VALUE-(raw hashcode) */
1135 public long[] keys;
1136 /** Stores kmer (hashcode) observation counts */
1137 int[] keyCounts;
1138 /** Stores base (ACGTN) counts */
1139 final long[] baseCounts;
1140 private byte[] r16S;
1141 private byte[] r18S;
1142 public int taxID;
1143 public int maxCount;
1144 int sketchID;//Based on loading order
1145 public final long genomeSequences;
1146 public final long genomeSizeBases;
1147 public final long genomeSizeKmers;
1148 public final float probCorrect;
1149 // public final int k1Count; //Number of keys made from k1 rather than k2
1150 private String taxName;
1151 private String name0;
1152 private String fname;
1153 ArrayList<String> meta;
1154
1155 //TODO: These should move to SketchResults.
1156 private AbstractBitSet compareBitSet; //Used for comparison
1157 private AbstractBitSet indexBitSet;
1158
1159
1160 //Extended information
1161 public long imgID=-1;
1162 public long spid=-1;
1163 // public String seqUnitName=null;
1164
1165 private boolean mergedBitSets=false; //TODO: Temporary for debugging
1166 /** Tracks the number of reference sketches sharing each kmer.
1167 * Should be set to null when no longer needed. */
1168 private int[] refHitCounts;
1169
1170 public int[] refHitCounts(){return refHitCounts;}
1171 public void clearRefHitCounts(){
1172 refHitCounts=null;
1173 }
1174 public void setRefHitCounts(int[] x) {
1175 refHitCounts=x;
1176 assert(x!=null);
1177 }
1178
1179 private static AtomicInteger nextSketch=new AtomicInteger(1);
1180
1181 }