Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/Sketch.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/Sketch.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,1181 @@ +package sketch; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Locale; +import java.util.concurrent.atomic.AtomicInteger; + +import dna.AminoAcid; +import fileIO.ReadWrite; +import shared.KillSwitch; +import shared.Tools; +import structures.AbstractBitSet; +import structures.ByteBuilder; +import structures.IntList; +import structures.LongHashMap; +import structures.LongList; +import structures.LongPair; +import tax.ImgRecord; + +/** + * @author Brian Bushnell + * @date July 7, 2016 + * + */ +public class Sketch extends SketchObject implements Comparable<Sketch>, Cloneable { + + /*--------------------------------------------------------------*/ + /*---------------- Constructors ----------------*/ + /*--------------------------------------------------------------*/ + + @Override + public Sketch clone() { + Sketch copy=null; + try { + copy = (Sketch)super.clone(); + } catch (CloneNotSupportedException e) { +// e.printStackTrace(); + throw new RuntimeException(e); + } + copy.compareBitSet=null; + copy.indexBitSet=null; + return copy; + } + + //Array should already be hashed, sorted, unique, subtracted from Long.MAX_VALUE, then reversed. + public Sketch(long[] keys_, int[] keyCounts_, long[] baseCounts_, byte[] r16S_, byte[] r18S_, ArrayList<String> meta_){ + this(keys_, keyCounts_, baseCounts_, r16S_, r18S_, -1, -1, -1, -1, -1, -1, null, null, null, meta_); + } + + public Sketch(SketchHeap heap, boolean clearFname, boolean keepCounts, ArrayList<String> meta_){ + this(heap, clearFname, keepCounts, meta_, -1); + } + + public Sketch(SketchHeap heap, boolean clearFname, boolean keepCounts, ArrayList<String> meta_, int minKeyOccuranceCount){ + this(heap.toSketchArray_minCount(minKeyOccuranceCount), null, heap.baseCounts(false), heap.r16S(), heap.r18S(), (int)heap.taxID, heap.imgID, + heap.genomeSizeBases, heap.genomeSizeKmers, heap.genomeSequences, + heap.probCorrect(), heap.taxName(), heap.name0(), heap.fname(), meta_); + assert(keyCounts==null); + if(!heap.setMode && keepCounts){ + LongHashMap map=heap.map(); + keyCounts=new int[keys.length]; + for(int i=0; i<keys.length; i++){ + int count=map.get(Long.MAX_VALUE-keys[i]); + assert(count>0) : keys[i]+" -> "+count+"\n"+Arrays.toString(map.values())+"\n"+Arrays.toString(map.keys()); + keyCounts[i]=count; + } + } + if(heap.setMode){heap.clearSet();} + heap.clear(clearFname); +// System.err.println("size="+size+", genome="+this.genomeSize+", m"); : (int)(2+maxGenomeFraction*heap.genomeSize)+", "+this.array.length; +// assert(false) : (int)(2+maxGenomeFraction*heap.genomeSize)+", "+this.array.length; +// assert(false) : (counts==null)+", "+heap.setMode+", "+keepCounts; + } + + public Sketch(long[] keys_, int[] keyCounts_, long[] baseCounts_, byte[] r16S_, byte[] r18S_, int taxID_, long imgID_, long gSizeBases_, + long gSizeKmers_, long gSequences_, double probCorrect_, String taxName_, String name0_, String fname_, ArrayList<String> meta_){ +// Exception e=new Exception(baseCounts_+", "+Arrays.toString(baseCounts_)); +// e.printStackTrace(); + keys=keys_; + keyCounts=keyCounts_; +// fixKeyCounts(); + baseCounts=baseCounts_; + r16S=r16S_; + r18S=r18S_; + assert(keyCounts==null || keys==null || keyCounts.length==keys.length) : (keys==null ? "null" : keys.length)+", "+(keyCounts==null ? "null" : keyCounts.length); + taxID=taxID_; + imgID=imgID_; + sketchID=nextSketch.getAndIncrement(); + genomeSizeBases=gSizeBases_; + genomeSizeKmers=gSizeKmers_; + genomeSequences=gSequences_; + probCorrect=probCorrect_<=0 ? 0f : (float)probCorrect_; + meta=meta_; + + taxName=fix(taxName_); + name0=fix(name0_); + fname=fix(fname_); + if(fname!=null && (fname.startsWith("stdin.") || fname.endsWith(".sketch") || fname.endsWith(".sketch.gz"))){fname=null;} + +// if(k2>0){ +// if(useToValue2) { +// int count=0; +// for(long x : array){ +// count+=(int)(x&1); +// } +// k1Count=count; +// }else { +// k1Count=array.length/2; +// } +// }else{ +// k1Count=array.length; +// } + + if(ImgRecord.imgMap!=null && imgID>=0 && taxID<0){ + ImgRecord record=ImgRecord.imgMap.get(imgID); + if(record!=null){ + if(record.name!=null && taxName==null){taxName=record.name;} + taxID=record.taxID; + } + } + } + + void loadSSU(){ + if(taxID>0 && taxID<minFakeID){ + if(r16S==null && SSUMap.r16SMap!=null){ + r16S=SSUMap.r16SMap.get(taxID); + } + if(r18S==null && SSUMap.r18SMap!=null){ + r18S=SSUMap.r18SMap.get(taxID); + } + } +// new Exception().printStackTrace(); +// System.err.println(taxID+", "+ssu+", "+SSUMap.ssuMap.size()); + } + + void addMeta(String s){ + s=fixMeta(s); + if(s==null){return;} + if(meta==null){meta=new ArrayList<String>(1);} + meta.add(s); + } + + void setMeta(ArrayList<String> list){ + assert(meta==null); + meta=list; + } + + private static String fix(String s){ + if(s==null){return null;} + return s.replace('\t', ' '); + } + + /*--------------------------------------------------------------*/ + /*---------------- Methods ----------------*/ + /*--------------------------------------------------------------*/ + + public void add(Sketch other, int maxlen){ + assert(keyCounts==null && other.keyCounts==null); + final long[] a=keys; + final long[] b=other.keys; + if(maxlen<1){ + assert(false); + maxlen=1000000; + } + LongList list=new LongList(Tools.min(maxlen, a.length+b.length)); + + for(int i=0, j=0; i<a.length && j<b.length; ){ + final long ka=a[i], kb=b[j]; + if(ka==kb){//match + list.add(ka); + i++; + j++; + }else if(ka<kb){ + list.add(ka); + i++; + }else{ + list.add(kb); + j++; + } + if(list.size()>=maxlen){break;} + } + + if(keys.length==list.size()){ + for(int i=0; i<list.size; i++){ + keys[i]=list.array[i]; + } + }else{ + keys=list.toArray(); + } + } + + public void resize(int newSize) { + if(newSize>=length()){return;} + keys=Arrays.copyOf(keys, newSize); +// fixKeyCounts(newSize); + if(keyCounts!=null){ + for(int i=0; i<newSize; i++) + keyCounts=Arrays.copyOf(keyCounts, newSize); + } + } + + int fixKeyCounts(){ + return fixKeyCounts(keyCounts==null ? maxCount : keyCounts.length); + } + + int fixKeyCounts(int maxLen){ + int max=0; + if(keyCounts!=null){ + for(int i=0; i<maxLen && max<2; i++){ + max=Tools.max(keyCounts[i], max); + } + } + if(max<2){keyCounts=null;} + maxCount=Tools.max((maxCount>0 ? 1 : 0), max); + return maxCount; + } + + public int applyBlacklist() { + assert(blacklist!=null); + LongList keylist=new LongList(keys.length); + IntList countlist=(keyCounts==null ? null : new IntList(keys.length)); + int removed=0; + for(int i=0; i<keys.length; i++){ + long key=keys[i]; + if(!Blacklist.contains(Long.MAX_VALUE-key)){ + keylist.add(key); + if(keyCounts!=null){countlist.add(keyCounts[i]);} + }else{ + removed++; + } + } + if(keylist.size()!=keys.length){ + keys=keylist.toArray(); + if(keyCounts!=null){keyCounts=countlist.toArray();} + } +// if(removed>0){ +// fixKeyCounts(); +// } + return removed; + } + + /*--------------------------------------------------------------*/ + /*---------------- Set Operations ----------------*/ + /*--------------------------------------------------------------*/ + + public static final Sketch intersection(Sketch sa, Sketch sb){ + Sketch shared=intersection(sa.keys, sb.keys, sa.keyCounts); + if(shared!=null){ + shared.taxID=sb.taxID; + shared.taxName=sb.taxName; + shared.name0=sb.name0; + shared.fname=sb.fname; + shared.meta=sb.meta; + shared.imgID=sb.imgID; + shared.spid=sb.spid; + } + return shared; + } + + public static final Sketch intersection(long[] a, long[] b, int[] aCounts){ + int i=0, j=0, matches=0; + LongList ll=new LongList(); + IntList il=new IntList(); + for(; i<a.length && j<b.length; ){ + final long ka=a[i], kb=b[j]; + if(ka==kb){ + matches++; + ll.add(ka); + if(aCounts!=null){ + il.add(aCounts[i]); + } + i++; + j++; + }else if(ka<kb){ + i++; + }else{ + j++; + } + } + if(matches<1){return null;} + + return new Sketch(ll.toArray(), il.size>0 ? il.toArray() : null, null, null, null, null); + } + + public static final Sketch union(Sketch sa, Sketch sb){ + Sketch shared=union(sa.keys, sb.keys, sa.keyCounts, sb.keyCounts, sa.baseCounts, sb.baseCounts, sa.r16S, sb.r16S, sa.r18S, sb.r18S); + if(shared!=null){ + shared.taxID=sa.taxID; + shared.taxName=sa.taxName; + shared.name0=sa.name0; + shared.fname=sa.fname; + shared.meta=sa.meta; + shared.imgID=sa.imgID; + shared.spid=sa.spid; + } + return shared; + } + + public static final Sketch union(long[] a, long[] b, int[] aCounts, int[] bCounts, long[] aBaseCounts, long[] bBaseCounts, + byte[] a16S, byte[] b16S, byte[] a18S, byte[] b18S){ + int i=0, j=0, matches=0; + LongList ll=new LongList(); + IntList il=(aCounts==null || bCounts==null ? null : new IntList()); + byte[] r16S=(b16S==null ? a16S : a16S==null ? b16S : a16S.length>b16S.length ? a16S : b16S); + byte[] r18S=(b18S==null ? a18S : a18S==null ? b18S : a18S.length>b18S.length ? a18S : b18S); + long[] baseCounts=(aBaseCounts==null || bBaseCounts==null ? null : new long[aBaseCounts.length]); + if(baseCounts!=null){ + for(int k=0; k<baseCounts.length; k++) {baseCounts[k]+=(aBaseCounts[k]+bBaseCounts[k]);} + } + for(; i<a.length && j<b.length; ){ + final long ka=a[i], kb=b[j]; + if(ka==kb){ + matches++; + ll.add(ka); + if(il!=null){ + il.add(aCounts[i]+bCounts[i]); + } + i++; + j++; + }else if(ka<kb){ + ll.add(ka); + if(il!=null){ + il.add(aCounts[i]); + } + i++; + }else{ + ll.add(kb); + if(il!=null){ + il.add(bCounts[i]); + } + j++; + } + } + if(matches<1){return null;} + + return new Sketch(ll.toArray(), il.size>0 ? il.toArray() : null, baseCounts, r16S, r18S, null); + } + + /*--------------------------------------------------------------*/ + /*---------------- Filtering ----------------*/ + /*--------------------------------------------------------------*/ + + boolean passesMeta(DisplayParams params) { + return passesMeta(params.requiredMeta, params.bannedMeta, /*params.requiredTaxid, params.bannedTaxid,*/ params.requiredMetaAnd); + } + + boolean passesMeta(ArrayList<String> requiredMeta, ArrayList<String> bannedMeta, /*IntList requiredTaxid, IntList bannedTaxid,*/ boolean requiredMetaAnd) { + assert(requiredMeta!=null || bannedMeta!=null /*|| requiredTaxid!=null || bannedTaxid!=null*/); + assert(requiredMeta==null || requiredMeta.size()>0); + assert(bannedMeta==null || bannedMeta.size()>0); + +// if(requiredTaxid!=null && !requiredTaxid.contains(taxID)){return false;} +// if(bannedTaxid!=null && bannedTaxid.contains(taxID)){return false;} + + if(requiredMeta!=null){ + if(meta==null){return false;} + for(String tag : requiredMeta){ + if(meta.contains(tag)){ + if(!requiredMetaAnd){break;} + }else if(requiredMetaAnd){ + return false; + } + } + } + if(bannedMeta!=null && meta!=null){ + for(String tag : bannedMeta){ + if(!meta.contains(tag)){ + return false; + } + } + } + return true; + } + + /*--------------------------------------------------------------*/ + /*---------------- Comparison ----------------*/ + /*--------------------------------------------------------------*/ + + public int countMatches(Sketch other, CompareBuffer buffer, AbstractBitSet present, boolean fillPresent, int[][] taxHits, int contamLevel){ + return countMatches(keys, other.keys, keyCounts, other.keyCounts, refHitCounts, other.taxID, buffer, present, fillPresent, taxHits, contamLevel); + } + + public static final int countMatches(long[] a, long[] b, int[] aCounts, int[] bCounts, int[] refHitCounts, int bid, + CompareBuffer buffer, AbstractBitSet present, boolean fillPresent, int[][] taxHits, int contamLevel){ + + +// if(verbose2){System.err.println("fillPresent: "+fillPresent+", "+present);} +// assert(fillPresent) : bid+", "+minFakeID+", "+(taxHits!=null); + + if(bid>0 && bid<minFakeID && taxHits!=null){ + bid=taxtree.getIdAtLevelExtended(bid, contamLevel); + }else{ + bid=-1; + } + +// assert(false) : (buffer==null)+", "+fillPresent+", "+present.cardinality(); + assert(a.length>0 && b.length>0); + + //Kmers hitting this reference + int matches=0; + + //Kmers hitting this reference and others + int multiMatches=0; + + //Kmers hitting nothing + int noHits=0; + + //Kmers hitting some organism but not this reference + int contamHits=0; + + //Kmers hitting something in this taxa, but not this reference + int sameTax=0; + + //Kmers hitting this organism and no other taxa + int unique2=0; + + //Kmers hitting only this taxa but not this organism (this count may not include everything due to early exit) + int unique3_temp=0; + + //Kmers hitting multiple organisms but not this reference + int multiContamHits=0; + + //Sum of query counts for shared kmers + long depthSum=0; + + //Sum of query counts for shared kmers divided by ref counts for those kmers + double depthSum2=0;//Slow, but necessary. + + //Number of times matching query keys occurred in all references + long refHitSum=0; + + //Matches from k1 + int k1hits=0; + + //Query kmers from k1 + int k1seenQ=0; + + //Reference kmers from k1 + int k1seenR=0; + int i=0, j=0; + assert(present==null || present.capacity()==a.length); +// assert(false) : buffer.rbs.capacity()+", "+buffer.rbs+", "+present; + if(present!=null){ + if(fillPresent){ + for(; i<a.length && j<b.length; ){ + final long ka=a[i], kb=b[j]; + final int bit=(int)(ka&1); + if(ka==kb){ + present.increment(i); + matches++; + k1hits+=bit; + k1seenQ+=bit; + k1seenR+=bit; + if(aCounts!=null){ + depthSum+=aCounts[i]; + if(bCounts!=null){ + depthSum2+=aCounts[i]/(double)bCounts[j]; + } + } + if(refHitCounts!=null){refHitSum+=refHitCounts[i];} + i++; + j++; + }else if(ka<kb){ + i++; + k1seenQ+=bit; + }else{ + j++; + k1seenR+=bit; + } + } + }else{ + for(; i<a.length && j<b.length; ){ + final long ka=a[i], kb=b[j]; + final int bit=(int)(ka&1); + if(ka==kb){ + final int count=present.getCount(i); + if(count>1){ + multiMatches++; + } + + matches++; + k1hits+=bit; + k1seenQ+=bit; + k1seenR+=bit; + if(aCounts!=null){ + depthSum+=aCounts[i]; + if(bCounts!=null){ + depthSum2+=aCounts[i]/(double)bCounts[j]; + } + } + if(refHitCounts!=null){refHitSum+=refHitCounts[i];} + if(bid>0){ + int[] taxHitsRow=taxHits[i]; + if(taxHitsRow!=null && taxHitsRow.length==1 && taxHitsRow[0]==bid){unique2++;} + } + + i++; + j++; + }else if(ka<kb){ + k1seenQ+=bit; + final int count=present.getCount(i); + if(count>0){ + contamHits++; + if(count>1){ + multiContamHits++; + } + }else{ + noHits++; + } + + if(bid>0){ + int[] taxHitsRow=taxHits[i]; + if(taxHitsRow!=null){ + if(taxHitsRow!=null && taxHitsRow.length==1 && taxHitsRow[0]==bid){unique3_temp++;} + for(int tid : taxHitsRow){ + if(tid==bid){ + sameTax++; + break; + } + } + } + } + + i++; + }else{ + k1seenR+=bit; + j++; + } + } + + //For the remaining query kmers, we don't know whether the reference sketch would have shared them had it been longer. + //This section can be disabled to prevent them from being displayed. + if(bid>0 && i<a.length-1){ + for(; i<a.length; i++){ + int[] taxHitsRow=taxHits[i]; + if(taxHitsRow!=null){ + if(taxHitsRow!=null && taxHitsRow.length==1 && taxHitsRow[0]==bid){unique3_temp++;} + } + } + } + } + }else{ + for(; i<a.length && j<b.length; ){ + final long ka=a[i], kb=b[j]; + final int bit=(int)(ka&1); + final int count=present.getCount(i); + if(ka==kb){ + matches++; + k1hits+=bit; + k1seenQ+=bit; + k1seenR+=bit; + if(aCounts!=null){depthSum+=aCounts[i];} + if(refHitCounts!=null){refHitSum+=refHitCounts[i];} + i++; + j++; + }else if(ka<kb){ + i++; + k1seenQ+=bit; + }else{ + j++; + k1seenR+=bit; + } + } + } + + if(k2<1){ + k1hits=matches; + k1seenQ=i; + k1seenR=j; + } + +// if(taxHits!=null){ +// System.err.println("matches="+matches+", noHits="+noHits+", contamHits="+contamHits+", sameTax="+sameTax+", multiContamHits="+multiContamHits); +// } + +// assert(bid<1 || unique2>=(matches-multiMatches)) : bid+", "+unique2+", "+unique3_temp+", "+matches+", "+multiMatches; +// assert(matches<1000 || multiMatches==0) : bid+", "+unique2+", "+unique3_temp+", "+matches+", "+multiMatches+", "+fillPresent; + + if(buffer!=null){ +// System.err.println("*A) "+matches+", "+multiMatches+", "+unique2+", "+unique3_temp); +// new Exception().printStackTrace(); +// assert(k1hits<=(matches-k1hits)) : k1hits+", "+matches; + buffer.set(matches, multiMatches, unique2, unique2+unique3_temp, noHits, + contamHits, contamHits-sameTax, multiContamHits, i, j, + a.length, b.length, depthSum, depthSum2, refHitSum, k1hits, k1seenQ, k1seenR); + } + return matches; + } + +// public float identity(Sketch b, float[] ret){ +// if(ret!=null){Arrays.fill(ret, 0);} +// return identityWeighted(array, b.array, ret); +// } +// +// public static float identity(long[] a, long[] b){ +// int matches=countMatches(a, b); +// return matches/(float)(Tools.max(1, Tools.min(a.length, b.length))); +// } + + @Override + public int hashCode(){ + long gSize=genomeSizeKmers>0 ? genomeSizeKmers : genomeSizeBases; + int code=(int) ((gSize^taxID^imgID^(name0==null ? 0 : name0.hashCode()))&Integer.MAX_VALUE); +// System.err.println(code+", "+gSize+", "+taxID+", "+imgID+", "+name0); + return code; + } + + @Override + public int compareTo(Sketch b){ + if(this==b){return 0;} + if(taxID>-1 && b.taxID>-1){return taxID-b.taxID;} + int x=taxName.compareTo(b.taxName); + if(x!=0){return x;} + if(name0!=null && b.name0!=null){return name0.compareTo(b.name0);} + return name0!=null ? 1 : b.name0!=null ? -1 : 0; + } + + @Override + public boolean equals(Object b){ + if(this==b){return true;} + if(b==null || this.getClass()!=b.getClass()){return false;} + return equals((Sketch)b); + } + + public boolean equals(Sketch b){ + return compareTo(b)==0; + } + + /*--------------------------------------------------------------*/ + /*---------------- Formatting ----------------*/ + /*--------------------------------------------------------------*/ + + public ByteBuilder toHeader(){ + ByteBuilder sb=new ByteBuilder(); + return toHeader(sb); + } + + public ByteBuilder toHeader(ByteBuilder bb){ + bb.append("#SZ:").append(keys.length); + bb.append("\tCD:"); + bb.append(codingArray[CODING]); + if(deltaOut){bb.append('D');} + if(keyCounts!=null){bb.append('C');} + if(aminoOrTranslate()){bb.append('M');} + if(amino8){bb.append('8');} + + bb.append("\tK:").append(k); + if(k2!=0){bb.append(",").append(k2);} + if(HASH_VERSION>1){bb.append("\tH:").append(HASH_VERSION);} +// if(maxCount>0){bb.append("\tMC:").append(maxCount);} + + if(genomeSizeBases>0){bb.append("\tGS:").append(genomeSizeBases);} + if(genomeSizeKmers>0){bb.append("\tGK:").append(genomeSizeKmers);} + final long ge=genomeSizeEstimate(); + if(ge>0){bb.append("\tGE:").append(ge);} + if(genomeSequences>0){bb.append("\tGQ:"+genomeSequences);} + if(baseCounts!=null && !aminoOrTranslate()){ + bb.append("\tBC:").append(baseCounts[0]).append(',').append(baseCounts[1]).append(','); + bb.append(baseCounts[2]).append(',').append(baseCounts[3]); + } + if(probCorrect>0){bb.append("\tPC:"+String.format(Locale.ROOT, "%.4f", probCorrect));} + if(taxID>=0){bb.append("\tID:").append(taxID);} + if(imgID>=0){bb.append("\tIMG:").append(imgID);} + if(spid>0){bb.append("\tSPID:").append(spid);} + if(fname!=null){bb.append("\tFN:").append(fname);} + if(taxName!=null){bb.append("\tNM:").append(taxName);} + if(name0!=null){bb.append("\tNM0:").append(name0);} + if(meta!=null){ + for(String s : meta){ + bb.append("\tMT_").append(s); + } + } + + if(r16S!=null){ + bb.append("\t16S:").append(r16S.length); + } + if(r18S!=null){ + bb.append("\t18S:").append(r18S.length); + } + if(r16S!=null){ + bb.append('\n'); + bb.append("#16S:").append(r16S); + } + if(r18S!=null){ + bb.append('\n'); + bb.append("#18S:").append(r18S); + } + return bb; + } + + public ByteBuilder toBytes(){ + return toBytes(new ByteBuilder()); + } + + public ByteBuilder toBytes(ByteBuilder bb){ + if(CODING==A48 && deltaOut){return toBytesA48D(bb);} + long prev=0; + toHeader(bb); + bb.append("\n"); + byte[] temp=null; + if(CODING==A48){temp=KillSwitch.allocByte1D(12);} + for(int i=0; i<keys.length; i++){ + long key=keys[i]; + int count=(keyCounts==null ? 1 : keyCounts[i]); + long x=key-prev; + if(CODING==A48){ + appendA48(x, bb, temp); + if(count>1){ + bb.append('\t'); + appendA48(count-1, bb, temp); + } + bb.append('\n'); + }else if(CODING==HEX){ + bb.append(Long.toHexString(x)).append('\n'); + }else if(CODING==RAW){ + bb.append(x).append('\n'); + }else{ + assert(false); + } + if(deltaOut){prev=key;} + } + return bb; + } + + //This is to make the common case fast + private ByteBuilder toBytesA48D(ByteBuilder bb){ + assert(CODING==A48 && deltaOut); + long prev=0; + toHeader(bb); + bb.append("\n"); + final byte[] temp=KillSwitch.allocByte1D(12); + + if(keyCounts==null){ + for(int i=0; i<keys.length; i++){ + long key=keys[i]; + long x=key-prev; + if(CODING==A48){ + appendA48(x, bb, temp); + bb.append('\n'); + } + prev=key; + } + }else{ + for(int i=0; i<keys.length; i++){ + long key=keys[i]; + int count=keyCounts[i]; + long x=key-prev; + if(CODING==A48){ + appendA48(x, bb, temp); + if(count>1){ + bb.append('\t'); + appendA48(count-1, bb, temp); + } + bb.append('\n'); + } + prev=key; + } + } + return bb; + } + + public static final void appendA48(long value, ByteBuilder bb, byte[] temp){ + int i=0; +// long value=value0; + while(value!=0){ + byte b=(byte)(value&0x3F); +// assert(i<temp.length) : i+", "+temp.length+", "+value0; + temp[i]=b; + value=value>>6; + i++; + } + if(i==0){ + bb.append((byte)'0'); + }else{ + for(i--;i>=0;i--){ + bb.append((char)(temp[i]+48)); + } + } + } + + public static final String toA48(long value){ + int i=0; +// long value=value0; + StringBuilder sb=new StringBuilder(12); + while(value!=0){ + byte b=(byte)(value&0x3F); +// assert(i<temp.length) : i+", "+temp.length+", "+value0; + sb.append((char)(b+48)); + value=value>>6; + i++; + } + if(i==0){ + sb.append((byte)'0'); + }else{ + sb.reverse(); + } + return sb.toString(); + } + + @Override + public String toString(){ + return toBytes().toString(); + } + + /*--------------------------------------------------------------*/ + /*---------------- String Parsing ----------------*/ + /*--------------------------------------------------------------*/ + + public static long parseA48(String line){ + if(line.length()==0){return 0;} + long x=0; + for(int i=0; i<line.length(); i++){ + x<<=6; + long c=line.charAt(i); + x|=(c-48); + } + return x; + } + + /** Parses coverage too */ + public static long parseA48C(String line, IntList covList){ + if(line.length()==0){ + covList.add(1); + return 0; + } + long key=0, cov=0; + int i=0, len=line.length(); + for(; i<len; i++){ + long c=line.charAt(i); + if(c<48){break;} + key<<=6; + key|=(c-48); + } + for(i++; i<len; i++){ + long c=line.charAt(i); + cov<<=6; + cov|=(c-48); + } + covList.add((int)(cov+1)); + return key; + } + + public static long parseHex(String line){ + if(line.length()==0){return 0;} + long x=0; + for(int i=0; i<line.length(); i++){ + x<<=4; + x|=hexTable[line.charAt(i)]; + } + if(line.charAt(0)=='-'){x*=-1;} + return x; + } + + /*--------------------------------------------------------------*/ + /*---------------- Array Parsing ----------------*/ + /*--------------------------------------------------------------*/ + + public static long parseA48(byte[] line){ + if(line.length==0){return 0;} + long x=0; + for(byte b : line){ + x<<=6; + x|=(((long)b)-48); + } + return x; + } + + public static long parseNuc(String line){ + return parseNuc(line.getBytes()); + } + + /** Returns the maximal key in the sequence */ + public static long parseNuc(byte[] bases){ + if(bases.length<k){return -1;} + final int shift=2*k; + final int shift2=shift-2; + final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32 + + long kmer=0, rkmer=0; + int len=0; + + long key=-1; + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + long x=AminoAcid.baseToNumber[b]; + long x2=AminoAcid.baseToComplementNumber[b]; + kmer=((kmer<<2)|x)&mask; + rkmer=((rkmer>>>2)|(x2<<shift2))&mask; + if(x<0){len=0; rkmer=0;}else{len++;} + if(len>=k){ +// long z=Tools.max(kmer, rkmer); + final long hashcode=hash(kmer, rkmer); + key=Tools.max(key, hashcode); + } + } + return key<minHashValue ? -1 : Long.MAX_VALUE-key; + } + + /** Parses coverage too */ + public static long parseA48C(byte[] line, IntList covList){ + if(line.length==0){ + covList.add(1); + return 0; + } + long key=0, cov=0; + int i=0, len=line.length; + for(; i<len; i++){ + long b=line[i]; + if(b<48){break;} + key<<=6; + key|=(b-48); + } + for(i++; i<len; i++){ + long b=line[i]; + cov<<=6; + cov|=(b-48); + } + covList.add((int)(cov+1)); + return key; + } + + public static long parseHex(byte[] line){ + if(line.length==0){return 0;} + long x=0; + for(byte b : line){ + x<<=4; + x|=hexTable[b]; + } + if(line[0]=='-'){x*=-1;} + return x; + } + + /*--------------------------------------------------------------*/ + /*---------------- Parsing ----------------*/ + /*--------------------------------------------------------------*/ + + public static long parseA48(ByteBuilder bb){ + final int len=bb.length; + final byte[] line=bb.array; + if(len==0){return 0;} + long x=0; + for(int i=0; i<len; i++){ + x<<=6; + x|=(((long)line[i])-48); + } + return x; + } + + public static long parseNuc(ByteBuilder bb){ + return parseNuc(bb.toBytes()); + } + + /** Parses coverage too */ + public static long parseA48C(ByteBuilder bb, IntList covList){ + final int len=bb.length; + final byte[] line=bb.array; + if(len==0){ + covList.add(1); + return 0; + } + long key=0, cov=0; + int i=0; + for(; i<len; i++){ + long b=line[i]; + if(b<48){break;} + key<<=6; + key|=(b-48); + } + for(i++; i<len; i++){ + long b=line[i]; + cov<<=6; + cov|=(b-48); + } + covList.add((int)(cov+1)); + return key; + } + + public static long parseHex(ByteBuilder bb){ + final int len=bb.length; + final byte[] line=bb.array; + if(line.length==0){return 0;} + long x=0; + for(int i=0; i<len; i++){ + x<<=4; + x|=hexTable[line[i]]; + } + if(line[0]=='-'){x*=-1;} + return x; + } + + /*--------------------------------------------------------------*/ + /*---------------- Getters ----------------*/ + /*--------------------------------------------------------------*/ + + public long genomeSizeEstimate() { + return keys.length==0 ? 0 : Tools.min(genomeSizeKmers, genomeSizeEstimate(keys[keys.length-1], keys.length)); + } + + public long genomeSizeEstimate(int minCount) { + if(minCount<2){return genomeSizeEstimate();} + if(length()==0){return 0;} + long max=0; + int num=0; + for(int i=0; i<keyCounts.length; i++){ + if(keyCounts[i]>=minCount){ + max=keys[i]; + num++; + } + } + if(max==0){return 0;} + long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(max, num)); + return est; + } + + public float gc(){ + if(baseCounts==null){return 0;} + long at=baseCounts[0]+baseCounts[3]; + long gc=baseCounts[1]+baseCounts[2]; + return gc/(Tools.max(1.0f, at+gc)); + } + + public String filePrefix(){return ReadWrite.stripToCore(fname);} + public String name(){return taxName!=null ? taxName : name0!=null ? name0 : fname;} + public String taxName(){return taxName;} + public String name0(){return name0;} + public String fname(){return fname;} + public int length(){return keys.length;} + public void setTaxName(String s){taxName=s;} + public void setName0(String s){name0=s;} + public void setFname(String s){ +// assert(!s.endsWith("sketch")) : s; //123 + fname=(s==null ? s : ReadWrite.stripPath(s)); + } + +// public float k1Fraction(){return k1Count/(float)array.length;} + + /*--------------------------------------------------------------*/ + /*---------------- Assorted ----------------*/ + /*--------------------------------------------------------------*/ + + public ArrayList<LongPair> toKhist() { + HashMap<Long, LongPair> map=new HashMap<Long, LongPair>(); + for(int i=0; i<keyCounts.length; i++){ + int a=keyCounts[i]; + Long key=(long)a; + LongPair value=map.get(key); + if(value==null){ + value=new LongPair(); + value.a=a; + map.put(key, value); + } + value.b++; + } + ArrayList<LongPair> list=new ArrayList<LongPair>(map.size()); + list.addAll(map.values()); + Collections.sort(list); + return list; + } + +// static boolean warned=false; + public void addSSU(){ + if(useSSUMapOnly){r16S=r18S=null;} + if(taxID<1 || taxID>=minFakeID){return;} + if(!SSUMap.hasMap()){return;} + if(taxtree==null/* && !warned*/){ +// warned=true; +// System.err.println("*** Warning - no taxtree loaded. A taxtree is recommended when adding SSUs. ***"); + assert(false) : "Please set a path to the taxtree when adding SSUs."; + } + final boolean euk=taxtree!=null && (preferSSUMapForEuks || useSSUMapOnlyForEuks || ban16SForEuks) ? taxtree.isEukaryote(taxID) : false; + final boolean prok=taxtree!=null && ban18SForProks ? taxtree.isProkaryote(taxID) : false; + + final boolean mapOnly=(useSSUMapOnly || (useSSUMapOnlyForEuks && euk)); + final boolean preferMap=(mapOnly || preferSSUMap || (preferSSUMapForEuks && euk)); + if(mapOnly){r16S=r18S=null;} + + if(SSUMap.r16SMap!=null && (r16S==null || preferMap)){ + byte[] x=SSUMap.r16SMap.get(taxID); + if(x!=null){r16S=x;} + } + if(SSUMap.r18SMap!=null && (r18S==null || preferMap)){ + byte[] x=SSUMap.r18SMap.get(taxID); + if(x!=null){r18S=x;} + } + if(ban18SForProks && prok){r18S=null;} + if(ban16SForEuks && euk){r16S=null;} + } + + /*--------------------------------------------------------------*/ + /*---------------- BitSet ----------------*/ + /*--------------------------------------------------------------*/ + + public void makeBitSets(boolean printContam, boolean index){ + assert(compareBitSet==null && indexBitSet==null); + if(!printContam){return;} + compareBitSet=AbstractBitSet.make(length(), bitSetBits); + if(index){indexBitSet=AbstractBitSet.make(length(), bitSetBits);} + } + + public void addToBitSet(AbstractBitSet rbs){ + compareBitSet.add(rbs); + } + + public AbstractBitSet compareBitSet(){return compareBitSet;} + + public AbstractBitSet indexBitSet(){return indexBitSet;} + + public void mergeBitSets(){ + assert(!mergedBitSets); + if(compareBitSet!=null && indexBitSet!=null){ + compareBitSet.setToMax(indexBitSet); + } + indexBitSet=null; + mergedBitSets=true; + } + + public boolean merged(){return mergedBitSets;} + + public int r16SLen(){return r16S==null ? 0 : r16S.length;} + public int r18SLen(){return r18S==null ? 0 : r18S.length;} + public byte[] r16S(){return r16S;} + public byte[] r18S(){return r18S;} + public boolean hasSSU(){return r16S!=null || r18S!=null;} + public boolean sharesSSU(Sketch b){ + return (r16S!=null && b.r16S!=null) || (r18S!=null && b.r18S!=null); + } + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Stores sorted hashcodes, ascending, as Long.MAX_VALUE-(raw hashcode) */ + public long[] keys; + /** Stores kmer (hashcode) observation counts */ + int[] keyCounts; + /** Stores base (ACGTN) counts */ + final long[] baseCounts; + private byte[] r16S; + private byte[] r18S; + public int taxID; + public int maxCount; + int sketchID;//Based on loading order + public final long genomeSequences; + public final long genomeSizeBases; + public final long genomeSizeKmers; + public final float probCorrect; +// public final int k1Count; //Number of keys made from k1 rather than k2 + private String taxName; + private String name0; + private String fname; + ArrayList<String> meta; + + //TODO: These should move to SketchResults. + private AbstractBitSet compareBitSet; //Used for comparison + private AbstractBitSet indexBitSet; + + + //Extended information + public long imgID=-1; + public long spid=-1; +// public String seqUnitName=null; + + private boolean mergedBitSets=false; //TODO: Temporary for debugging + /** Tracks the number of reference sketches sharing each kmer. + * Should be set to null when no longer needed. */ + private int[] refHitCounts; + + public int[] refHitCounts(){return refHitCounts;} + public void clearRefHitCounts(){ + refHitCounts=null; + } + public void setRefHitCounts(int[] x) { + refHitCounts=x; + assert(x!=null); + } + + private static AtomicInteger nextSketch=new AtomicInteger(1); + +}