Mercurial > repos > rliterman > csp2
view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/Sketch.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line source
package sketch; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.Locale; import java.util.concurrent.atomic.AtomicInteger; import dna.AminoAcid; import fileIO.ReadWrite; import shared.KillSwitch; import shared.Tools; import structures.AbstractBitSet; import structures.ByteBuilder; import structures.IntList; import structures.LongHashMap; import structures.LongList; import structures.LongPair; import tax.ImgRecord; /** * @author Brian Bushnell * @date July 7, 2016 * */ public class Sketch extends SketchObject implements Comparable<Sketch>, Cloneable { /*--------------------------------------------------------------*/ /*---------------- Constructors ----------------*/ /*--------------------------------------------------------------*/ @Override public Sketch clone() { Sketch copy=null; try { copy = (Sketch)super.clone(); } catch (CloneNotSupportedException e) { // e.printStackTrace(); throw new RuntimeException(e); } copy.compareBitSet=null; copy.indexBitSet=null; return copy; } //Array should already be hashed, sorted, unique, subtracted from Long.MAX_VALUE, then reversed. public Sketch(long[] keys_, int[] keyCounts_, long[] baseCounts_, byte[] r16S_, byte[] r18S_, ArrayList<String> meta_){ this(keys_, keyCounts_, baseCounts_, r16S_, r18S_, -1, -1, -1, -1, -1, -1, null, null, null, meta_); } public Sketch(SketchHeap heap, boolean clearFname, boolean keepCounts, ArrayList<String> meta_){ this(heap, clearFname, keepCounts, meta_, -1); } public Sketch(SketchHeap heap, boolean clearFname, boolean keepCounts, ArrayList<String> meta_, int minKeyOccuranceCount){ this(heap.toSketchArray_minCount(minKeyOccuranceCount), null, heap.baseCounts(false), heap.r16S(), heap.r18S(), (int)heap.taxID, heap.imgID, heap.genomeSizeBases, heap.genomeSizeKmers, heap.genomeSequences, heap.probCorrect(), heap.taxName(), heap.name0(), heap.fname(), meta_); assert(keyCounts==null); if(!heap.setMode && keepCounts){ LongHashMap map=heap.map(); keyCounts=new int[keys.length]; for(int i=0; i<keys.length; i++){ int count=map.get(Long.MAX_VALUE-keys[i]); assert(count>0) : keys[i]+" -> "+count+"\n"+Arrays.toString(map.values())+"\n"+Arrays.toString(map.keys()); keyCounts[i]=count; } } if(heap.setMode){heap.clearSet();} heap.clear(clearFname); // System.err.println("size="+size+", genome="+this.genomeSize+", m"); : (int)(2+maxGenomeFraction*heap.genomeSize)+", "+this.array.length; // assert(false) : (int)(2+maxGenomeFraction*heap.genomeSize)+", "+this.array.length; // assert(false) : (counts==null)+", "+heap.setMode+", "+keepCounts; } public Sketch(long[] keys_, int[] keyCounts_, long[] baseCounts_, byte[] r16S_, byte[] r18S_, int taxID_, long imgID_, long gSizeBases_, long gSizeKmers_, long gSequences_, double probCorrect_, String taxName_, String name0_, String fname_, ArrayList<String> meta_){ // Exception e=new Exception(baseCounts_+", "+Arrays.toString(baseCounts_)); // e.printStackTrace(); keys=keys_; keyCounts=keyCounts_; // fixKeyCounts(); baseCounts=baseCounts_; r16S=r16S_; r18S=r18S_; assert(keyCounts==null || keys==null || keyCounts.length==keys.length) : (keys==null ? "null" : keys.length)+", "+(keyCounts==null ? "null" : keyCounts.length); taxID=taxID_; imgID=imgID_; sketchID=nextSketch.getAndIncrement(); genomeSizeBases=gSizeBases_; genomeSizeKmers=gSizeKmers_; genomeSequences=gSequences_; probCorrect=probCorrect_<=0 ? 0f : (float)probCorrect_; meta=meta_; taxName=fix(taxName_); name0=fix(name0_); fname=fix(fname_); if(fname!=null && (fname.startsWith("stdin.") || fname.endsWith(".sketch") || fname.endsWith(".sketch.gz"))){fname=null;} // if(k2>0){ // if(useToValue2) { // int count=0; // for(long x : array){ // count+=(int)(x&1); // } // k1Count=count; // }else { // k1Count=array.length/2; // } // }else{ // k1Count=array.length; // } if(ImgRecord.imgMap!=null && imgID>=0 && taxID<0){ ImgRecord record=ImgRecord.imgMap.get(imgID); if(record!=null){ if(record.name!=null && taxName==null){taxName=record.name;} taxID=record.taxID; } } } void loadSSU(){ if(taxID>0 && taxID<minFakeID){ if(r16S==null && SSUMap.r16SMap!=null){ r16S=SSUMap.r16SMap.get(taxID); } if(r18S==null && SSUMap.r18SMap!=null){ r18S=SSUMap.r18SMap.get(taxID); } } // new Exception().printStackTrace(); // System.err.println(taxID+", "+ssu+", "+SSUMap.ssuMap.size()); } void addMeta(String s){ s=fixMeta(s); if(s==null){return;} if(meta==null){meta=new ArrayList<String>(1);} meta.add(s); } void setMeta(ArrayList<String> list){ assert(meta==null); meta=list; } private static String fix(String s){ if(s==null){return null;} return s.replace('\t', ' '); } /*--------------------------------------------------------------*/ /*---------------- Methods ----------------*/ /*--------------------------------------------------------------*/ public void add(Sketch other, int maxlen){ assert(keyCounts==null && other.keyCounts==null); final long[] a=keys; final long[] b=other.keys; if(maxlen<1){ assert(false); maxlen=1000000; } LongList list=new LongList(Tools.min(maxlen, a.length+b.length)); for(int i=0, j=0; i<a.length && j<b.length; ){ final long ka=a[i], kb=b[j]; if(ka==kb){//match list.add(ka); i++; j++; }else if(ka<kb){ list.add(ka); i++; }else{ list.add(kb); j++; } if(list.size()>=maxlen){break;} } if(keys.length==list.size()){ for(int i=0; i<list.size; i++){ keys[i]=list.array[i]; } }else{ keys=list.toArray(); } } public void resize(int newSize) { if(newSize>=length()){return;} keys=Arrays.copyOf(keys, newSize); // fixKeyCounts(newSize); if(keyCounts!=null){ for(int i=0; i<newSize; i++) keyCounts=Arrays.copyOf(keyCounts, newSize); } } int fixKeyCounts(){ return fixKeyCounts(keyCounts==null ? maxCount : keyCounts.length); } int fixKeyCounts(int maxLen){ int max=0; if(keyCounts!=null){ for(int i=0; i<maxLen && max<2; i++){ max=Tools.max(keyCounts[i], max); } } if(max<2){keyCounts=null;} maxCount=Tools.max((maxCount>0 ? 1 : 0), max); return maxCount; } public int applyBlacklist() { assert(blacklist!=null); LongList keylist=new LongList(keys.length); IntList countlist=(keyCounts==null ? null : new IntList(keys.length)); int removed=0; for(int i=0; i<keys.length; i++){ long key=keys[i]; if(!Blacklist.contains(Long.MAX_VALUE-key)){ keylist.add(key); if(keyCounts!=null){countlist.add(keyCounts[i]);} }else{ removed++; } } if(keylist.size()!=keys.length){ keys=keylist.toArray(); if(keyCounts!=null){keyCounts=countlist.toArray();} } // if(removed>0){ // fixKeyCounts(); // } return removed; } /*--------------------------------------------------------------*/ /*---------------- Set Operations ----------------*/ /*--------------------------------------------------------------*/ public static final Sketch intersection(Sketch sa, Sketch sb){ Sketch shared=intersection(sa.keys, sb.keys, sa.keyCounts); if(shared!=null){ shared.taxID=sb.taxID; shared.taxName=sb.taxName; shared.name0=sb.name0; shared.fname=sb.fname; shared.meta=sb.meta; shared.imgID=sb.imgID; shared.spid=sb.spid; } return shared; } public static final Sketch intersection(long[] a, long[] b, int[] aCounts){ int i=0, j=0, matches=0; LongList ll=new LongList(); IntList il=new IntList(); for(; i<a.length && j<b.length; ){ final long ka=a[i], kb=b[j]; if(ka==kb){ matches++; ll.add(ka); if(aCounts!=null){ il.add(aCounts[i]); } i++; j++; }else if(ka<kb){ i++; }else{ j++; } } if(matches<1){return null;} return new Sketch(ll.toArray(), il.size>0 ? il.toArray() : null, null, null, null, null); } public static final Sketch union(Sketch sa, Sketch sb){ Sketch shared=union(sa.keys, sb.keys, sa.keyCounts, sb.keyCounts, sa.baseCounts, sb.baseCounts, sa.r16S, sb.r16S, sa.r18S, sb.r18S); if(shared!=null){ shared.taxID=sa.taxID; shared.taxName=sa.taxName; shared.name0=sa.name0; shared.fname=sa.fname; shared.meta=sa.meta; shared.imgID=sa.imgID; shared.spid=sa.spid; } return shared; } public static final Sketch union(long[] a, long[] b, int[] aCounts, int[] bCounts, long[] aBaseCounts, long[] bBaseCounts, byte[] a16S, byte[] b16S, byte[] a18S, byte[] b18S){ int i=0, j=0, matches=0; LongList ll=new LongList(); IntList il=(aCounts==null || bCounts==null ? null : new IntList()); byte[] r16S=(b16S==null ? a16S : a16S==null ? b16S : a16S.length>b16S.length ? a16S : b16S); byte[] r18S=(b18S==null ? a18S : a18S==null ? b18S : a18S.length>b18S.length ? a18S : b18S); long[] baseCounts=(aBaseCounts==null || bBaseCounts==null ? null : new long[aBaseCounts.length]); if(baseCounts!=null){ for(int k=0; k<baseCounts.length; k++) {baseCounts[k]+=(aBaseCounts[k]+bBaseCounts[k]);} } for(; i<a.length && j<b.length; ){ final long ka=a[i], kb=b[j]; if(ka==kb){ matches++; ll.add(ka); if(il!=null){ il.add(aCounts[i]+bCounts[i]); } i++; j++; }else if(ka<kb){ ll.add(ka); if(il!=null){ il.add(aCounts[i]); } i++; }else{ ll.add(kb); if(il!=null){ il.add(bCounts[i]); } j++; } } if(matches<1){return null;} return new Sketch(ll.toArray(), il.size>0 ? il.toArray() : null, baseCounts, r16S, r18S, null); } /*--------------------------------------------------------------*/ /*---------------- Filtering ----------------*/ /*--------------------------------------------------------------*/ boolean passesMeta(DisplayParams params) { return passesMeta(params.requiredMeta, params.bannedMeta, /*params.requiredTaxid, params.bannedTaxid,*/ params.requiredMetaAnd); } boolean passesMeta(ArrayList<String> requiredMeta, ArrayList<String> bannedMeta, /*IntList requiredTaxid, IntList bannedTaxid,*/ boolean requiredMetaAnd) { assert(requiredMeta!=null || bannedMeta!=null /*|| requiredTaxid!=null || bannedTaxid!=null*/); assert(requiredMeta==null || requiredMeta.size()>0); assert(bannedMeta==null || bannedMeta.size()>0); // if(requiredTaxid!=null && !requiredTaxid.contains(taxID)){return false;} // if(bannedTaxid!=null && bannedTaxid.contains(taxID)){return false;} if(requiredMeta!=null){ if(meta==null){return false;} for(String tag : requiredMeta){ if(meta.contains(tag)){ if(!requiredMetaAnd){break;} }else if(requiredMetaAnd){ return false; } } } if(bannedMeta!=null && meta!=null){ for(String tag : bannedMeta){ if(!meta.contains(tag)){ return false; } } } return true; } /*--------------------------------------------------------------*/ /*---------------- Comparison ----------------*/ /*--------------------------------------------------------------*/ public int countMatches(Sketch other, CompareBuffer buffer, AbstractBitSet present, boolean fillPresent, int[][] taxHits, int contamLevel){ return countMatches(keys, other.keys, keyCounts, other.keyCounts, refHitCounts, other.taxID, buffer, present, fillPresent, taxHits, contamLevel); } public static final int countMatches(long[] a, long[] b, int[] aCounts, int[] bCounts, int[] refHitCounts, int bid, CompareBuffer buffer, AbstractBitSet present, boolean fillPresent, int[][] taxHits, int contamLevel){ // if(verbose2){System.err.println("fillPresent: "+fillPresent+", "+present);} // assert(fillPresent) : bid+", "+minFakeID+", "+(taxHits!=null); if(bid>0 && bid<minFakeID && taxHits!=null){ bid=taxtree.getIdAtLevelExtended(bid, contamLevel); }else{ bid=-1; } // assert(false) : (buffer==null)+", "+fillPresent+", "+present.cardinality(); assert(a.length>0 && b.length>0); //Kmers hitting this reference int matches=0; //Kmers hitting this reference and others int multiMatches=0; //Kmers hitting nothing int noHits=0; //Kmers hitting some organism but not this reference int contamHits=0; //Kmers hitting something in this taxa, but not this reference int sameTax=0; //Kmers hitting this organism and no other taxa int unique2=0; //Kmers hitting only this taxa but not this organism (this count may not include everything due to early exit) int unique3_temp=0; //Kmers hitting multiple organisms but not this reference int multiContamHits=0; //Sum of query counts for shared kmers long depthSum=0; //Sum of query counts for shared kmers divided by ref counts for those kmers double depthSum2=0;//Slow, but necessary. //Number of times matching query keys occurred in all references long refHitSum=0; //Matches from k1 int k1hits=0; //Query kmers from k1 int k1seenQ=0; //Reference kmers from k1 int k1seenR=0; int i=0, j=0; assert(present==null || present.capacity()==a.length); // assert(false) : buffer.rbs.capacity()+", "+buffer.rbs+", "+present; if(present!=null){ if(fillPresent){ for(; i<a.length && j<b.length; ){ final long ka=a[i], kb=b[j]; final int bit=(int)(ka&1); if(ka==kb){ present.increment(i); matches++; k1hits+=bit; k1seenQ+=bit; k1seenR+=bit; if(aCounts!=null){ depthSum+=aCounts[i]; if(bCounts!=null){ depthSum2+=aCounts[i]/(double)bCounts[j]; } } if(refHitCounts!=null){refHitSum+=refHitCounts[i];} i++; j++; }else if(ka<kb){ i++; k1seenQ+=bit; }else{ j++; k1seenR+=bit; } } }else{ for(; i<a.length && j<b.length; ){ final long ka=a[i], kb=b[j]; final int bit=(int)(ka&1); if(ka==kb){ final int count=present.getCount(i); if(count>1){ multiMatches++; } matches++; k1hits+=bit; k1seenQ+=bit; k1seenR+=bit; if(aCounts!=null){ depthSum+=aCounts[i]; if(bCounts!=null){ depthSum2+=aCounts[i]/(double)bCounts[j]; } } if(refHitCounts!=null){refHitSum+=refHitCounts[i];} if(bid>0){ int[] taxHitsRow=taxHits[i]; if(taxHitsRow!=null && taxHitsRow.length==1 && taxHitsRow[0]==bid){unique2++;} } i++; j++; }else if(ka<kb){ k1seenQ+=bit; final int count=present.getCount(i); if(count>0){ contamHits++; if(count>1){ multiContamHits++; } }else{ noHits++; } if(bid>0){ int[] taxHitsRow=taxHits[i]; if(taxHitsRow!=null){ if(taxHitsRow!=null && taxHitsRow.length==1 && taxHitsRow[0]==bid){unique3_temp++;} for(int tid : taxHitsRow){ if(tid==bid){ sameTax++; break; } } } } i++; }else{ k1seenR+=bit; j++; } } //For the remaining query kmers, we don't know whether the reference sketch would have shared them had it been longer. //This section can be disabled to prevent them from being displayed. if(bid>0 && i<a.length-1){ for(; i<a.length; i++){ int[] taxHitsRow=taxHits[i]; if(taxHitsRow!=null){ if(taxHitsRow!=null && taxHitsRow.length==1 && taxHitsRow[0]==bid){unique3_temp++;} } } } } }else{ for(; i<a.length && j<b.length; ){ final long ka=a[i], kb=b[j]; final int bit=(int)(ka&1); final int count=present.getCount(i); if(ka==kb){ matches++; k1hits+=bit; k1seenQ+=bit; k1seenR+=bit; if(aCounts!=null){depthSum+=aCounts[i];} if(refHitCounts!=null){refHitSum+=refHitCounts[i];} i++; j++; }else if(ka<kb){ i++; k1seenQ+=bit; }else{ j++; k1seenR+=bit; } } } if(k2<1){ k1hits=matches; k1seenQ=i; k1seenR=j; } // if(taxHits!=null){ // System.err.println("matches="+matches+", noHits="+noHits+", contamHits="+contamHits+", sameTax="+sameTax+", multiContamHits="+multiContamHits); // } // assert(bid<1 || unique2>=(matches-multiMatches)) : bid+", "+unique2+", "+unique3_temp+", "+matches+", "+multiMatches; // assert(matches<1000 || multiMatches==0) : bid+", "+unique2+", "+unique3_temp+", "+matches+", "+multiMatches+", "+fillPresent; if(buffer!=null){ // System.err.println("*A) "+matches+", "+multiMatches+", "+unique2+", "+unique3_temp); // new Exception().printStackTrace(); // assert(k1hits<=(matches-k1hits)) : k1hits+", "+matches; buffer.set(matches, multiMatches, unique2, unique2+unique3_temp, noHits, contamHits, contamHits-sameTax, multiContamHits, i, j, a.length, b.length, depthSum, depthSum2, refHitSum, k1hits, k1seenQ, k1seenR); } return matches; } // public float identity(Sketch b, float[] ret){ // if(ret!=null){Arrays.fill(ret, 0);} // return identityWeighted(array, b.array, ret); // } // // public static float identity(long[] a, long[] b){ // int matches=countMatches(a, b); // return matches/(float)(Tools.max(1, Tools.min(a.length, b.length))); // } @Override public int hashCode(){ long gSize=genomeSizeKmers>0 ? genomeSizeKmers : genomeSizeBases; int code=(int) ((gSize^taxID^imgID^(name0==null ? 0 : name0.hashCode()))&Integer.MAX_VALUE); // System.err.println(code+", "+gSize+", "+taxID+", "+imgID+", "+name0); return code; } @Override public int compareTo(Sketch b){ if(this==b){return 0;} if(taxID>-1 && b.taxID>-1){return taxID-b.taxID;} int x=taxName.compareTo(b.taxName); if(x!=0){return x;} if(name0!=null && b.name0!=null){return name0.compareTo(b.name0);} return name0!=null ? 1 : b.name0!=null ? -1 : 0; } @Override public boolean equals(Object b){ if(this==b){return true;} if(b==null || this.getClass()!=b.getClass()){return false;} return equals((Sketch)b); } public boolean equals(Sketch b){ return compareTo(b)==0; } /*--------------------------------------------------------------*/ /*---------------- Formatting ----------------*/ /*--------------------------------------------------------------*/ public ByteBuilder toHeader(){ ByteBuilder sb=new ByteBuilder(); return toHeader(sb); } public ByteBuilder toHeader(ByteBuilder bb){ bb.append("#SZ:").append(keys.length); bb.append("\tCD:"); bb.append(codingArray[CODING]); if(deltaOut){bb.append('D');} if(keyCounts!=null){bb.append('C');} if(aminoOrTranslate()){bb.append('M');} if(amino8){bb.append('8');} bb.append("\tK:").append(k); if(k2!=0){bb.append(",").append(k2);} if(HASH_VERSION>1){bb.append("\tH:").append(HASH_VERSION);} // if(maxCount>0){bb.append("\tMC:").append(maxCount);} if(genomeSizeBases>0){bb.append("\tGS:").append(genomeSizeBases);} if(genomeSizeKmers>0){bb.append("\tGK:").append(genomeSizeKmers);} final long ge=genomeSizeEstimate(); if(ge>0){bb.append("\tGE:").append(ge);} if(genomeSequences>0){bb.append("\tGQ:"+genomeSequences);} if(baseCounts!=null && !aminoOrTranslate()){ bb.append("\tBC:").append(baseCounts[0]).append(',').append(baseCounts[1]).append(','); bb.append(baseCounts[2]).append(',').append(baseCounts[3]); } if(probCorrect>0){bb.append("\tPC:"+String.format(Locale.ROOT, "%.4f", probCorrect));} if(taxID>=0){bb.append("\tID:").append(taxID);} if(imgID>=0){bb.append("\tIMG:").append(imgID);} if(spid>0){bb.append("\tSPID:").append(spid);} if(fname!=null){bb.append("\tFN:").append(fname);} if(taxName!=null){bb.append("\tNM:").append(taxName);} if(name0!=null){bb.append("\tNM0:").append(name0);} if(meta!=null){ for(String s : meta){ bb.append("\tMT_").append(s); } } if(r16S!=null){ bb.append("\t16S:").append(r16S.length); } if(r18S!=null){ bb.append("\t18S:").append(r18S.length); } if(r16S!=null){ bb.append('\n'); bb.append("#16S:").append(r16S); } if(r18S!=null){ bb.append('\n'); bb.append("#18S:").append(r18S); } return bb; } public ByteBuilder toBytes(){ return toBytes(new ByteBuilder()); } public ByteBuilder toBytes(ByteBuilder bb){ if(CODING==A48 && deltaOut){return toBytesA48D(bb);} long prev=0; toHeader(bb); bb.append("\n"); byte[] temp=null; if(CODING==A48){temp=KillSwitch.allocByte1D(12);} for(int i=0; i<keys.length; i++){ long key=keys[i]; int count=(keyCounts==null ? 1 : keyCounts[i]); long x=key-prev; if(CODING==A48){ appendA48(x, bb, temp); if(count>1){ bb.append('\t'); appendA48(count-1, bb, temp); } bb.append('\n'); }else if(CODING==HEX){ bb.append(Long.toHexString(x)).append('\n'); }else if(CODING==RAW){ bb.append(x).append('\n'); }else{ assert(false); } if(deltaOut){prev=key;} } return bb; } //This is to make the common case fast private ByteBuilder toBytesA48D(ByteBuilder bb){ assert(CODING==A48 && deltaOut); long prev=0; toHeader(bb); bb.append("\n"); final byte[] temp=KillSwitch.allocByte1D(12); if(keyCounts==null){ for(int i=0; i<keys.length; i++){ long key=keys[i]; long x=key-prev; if(CODING==A48){ appendA48(x, bb, temp); bb.append('\n'); } prev=key; } }else{ for(int i=0; i<keys.length; i++){ long key=keys[i]; int count=keyCounts[i]; long x=key-prev; if(CODING==A48){ appendA48(x, bb, temp); if(count>1){ bb.append('\t'); appendA48(count-1, bb, temp); } bb.append('\n'); } prev=key; } } return bb; } public static final void appendA48(long value, ByteBuilder bb, byte[] temp){ int i=0; // long value=value0; while(value!=0){ byte b=(byte)(value&0x3F); // assert(i<temp.length) : i+", "+temp.length+", "+value0; temp[i]=b; value=value>>6; i++; } if(i==0){ bb.append((byte)'0'); }else{ for(i--;i>=0;i--){ bb.append((char)(temp[i]+48)); } } } public static final String toA48(long value){ int i=0; // long value=value0; StringBuilder sb=new StringBuilder(12); while(value!=0){ byte b=(byte)(value&0x3F); // assert(i<temp.length) : i+", "+temp.length+", "+value0; sb.append((char)(b+48)); value=value>>6; i++; } if(i==0){ sb.append((byte)'0'); }else{ sb.reverse(); } return sb.toString(); } @Override public String toString(){ return toBytes().toString(); } /*--------------------------------------------------------------*/ /*---------------- String Parsing ----------------*/ /*--------------------------------------------------------------*/ public static long parseA48(String line){ if(line.length()==0){return 0;} long x=0; for(int i=0; i<line.length(); i++){ x<<=6; long c=line.charAt(i); x|=(c-48); } return x; } /** Parses coverage too */ public static long parseA48C(String line, IntList covList){ if(line.length()==0){ covList.add(1); return 0; } long key=0, cov=0; int i=0, len=line.length(); for(; i<len; i++){ long c=line.charAt(i); if(c<48){break;} key<<=6; key|=(c-48); } for(i++; i<len; i++){ long c=line.charAt(i); cov<<=6; cov|=(c-48); } covList.add((int)(cov+1)); return key; } public static long parseHex(String line){ if(line.length()==0){return 0;} long x=0; for(int i=0; i<line.length(); i++){ x<<=4; x|=hexTable[line.charAt(i)]; } if(line.charAt(0)=='-'){x*=-1;} return x; } /*--------------------------------------------------------------*/ /*---------------- Array Parsing ----------------*/ /*--------------------------------------------------------------*/ public static long parseA48(byte[] line){ if(line.length==0){return 0;} long x=0; for(byte b : line){ x<<=6; x|=(((long)b)-48); } return x; } public static long parseNuc(String line){ return parseNuc(line.getBytes()); } /** Returns the maximal key in the sequence */ public static long parseNuc(byte[] bases){ if(bases.length<k){return -1;} final int shift=2*k; final int shift2=shift-2; final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32 long kmer=0, rkmer=0; int len=0; long key=-1; for(int i=0; i<bases.length; i++){ byte b=bases[i]; long x=AminoAcid.baseToNumber[b]; long x2=AminoAcid.baseToComplementNumber[b]; kmer=((kmer<<2)|x)&mask; rkmer=((rkmer>>>2)|(x2<<shift2))&mask; if(x<0){len=0; rkmer=0;}else{len++;} if(len>=k){ // long z=Tools.max(kmer, rkmer); final long hashcode=hash(kmer, rkmer); key=Tools.max(key, hashcode); } } return key<minHashValue ? -1 : Long.MAX_VALUE-key; } /** Parses coverage too */ public static long parseA48C(byte[] line, IntList covList){ if(line.length==0){ covList.add(1); return 0; } long key=0, cov=0; int i=0, len=line.length; for(; i<len; i++){ long b=line[i]; if(b<48){break;} key<<=6; key|=(b-48); } for(i++; i<len; i++){ long b=line[i]; cov<<=6; cov|=(b-48); } covList.add((int)(cov+1)); return key; } public static long parseHex(byte[] line){ if(line.length==0){return 0;} long x=0; for(byte b : line){ x<<=4; x|=hexTable[b]; } if(line[0]=='-'){x*=-1;} return x; } /*--------------------------------------------------------------*/ /*---------------- Parsing ----------------*/ /*--------------------------------------------------------------*/ public static long parseA48(ByteBuilder bb){ final int len=bb.length; final byte[] line=bb.array; if(len==0){return 0;} long x=0; for(int i=0; i<len; i++){ x<<=6; x|=(((long)line[i])-48); } return x; } public static long parseNuc(ByteBuilder bb){ return parseNuc(bb.toBytes()); } /** Parses coverage too */ public static long parseA48C(ByteBuilder bb, IntList covList){ final int len=bb.length; final byte[] line=bb.array; if(len==0){ covList.add(1); return 0; } long key=0, cov=0; int i=0; for(; i<len; i++){ long b=line[i]; if(b<48){break;} key<<=6; key|=(b-48); } for(i++; i<len; i++){ long b=line[i]; cov<<=6; cov|=(b-48); } covList.add((int)(cov+1)); return key; } public static long parseHex(ByteBuilder bb){ final int len=bb.length; final byte[] line=bb.array; if(line.length==0){return 0;} long x=0; for(int i=0; i<len; i++){ x<<=4; x|=hexTable[line[i]]; } if(line[0]=='-'){x*=-1;} return x; } /*--------------------------------------------------------------*/ /*---------------- Getters ----------------*/ /*--------------------------------------------------------------*/ public long genomeSizeEstimate() { return keys.length==0 ? 0 : Tools.min(genomeSizeKmers, genomeSizeEstimate(keys[keys.length-1], keys.length)); } public long genomeSizeEstimate(int minCount) { if(minCount<2){return genomeSizeEstimate();} if(length()==0){return 0;} long max=0; int num=0; for(int i=0; i<keyCounts.length; i++){ if(keyCounts[i]>=minCount){ max=keys[i]; num++; } } if(max==0){return 0;} long est=Tools.min(genomeSizeKmers, SketchObject.genomeSizeEstimate(max, num)); return est; } public float gc(){ if(baseCounts==null){return 0;} long at=baseCounts[0]+baseCounts[3]; long gc=baseCounts[1]+baseCounts[2]; return gc/(Tools.max(1.0f, at+gc)); } public String filePrefix(){return ReadWrite.stripToCore(fname);} public String name(){return taxName!=null ? taxName : name0!=null ? name0 : fname;} public String taxName(){return taxName;} public String name0(){return name0;} public String fname(){return fname;} public int length(){return keys.length;} public void setTaxName(String s){taxName=s;} public void setName0(String s){name0=s;} public void setFname(String s){ // assert(!s.endsWith("sketch")) : s; //123 fname=(s==null ? s : ReadWrite.stripPath(s)); } // public float k1Fraction(){return k1Count/(float)array.length;} /*--------------------------------------------------------------*/ /*---------------- Assorted ----------------*/ /*--------------------------------------------------------------*/ public ArrayList<LongPair> toKhist() { HashMap<Long, LongPair> map=new HashMap<Long, LongPair>(); for(int i=0; i<keyCounts.length; i++){ int a=keyCounts[i]; Long key=(long)a; LongPair value=map.get(key); if(value==null){ value=new LongPair(); value.a=a; map.put(key, value); } value.b++; } ArrayList<LongPair> list=new ArrayList<LongPair>(map.size()); list.addAll(map.values()); Collections.sort(list); return list; } // static boolean warned=false; public void addSSU(){ if(useSSUMapOnly){r16S=r18S=null;} if(taxID<1 || taxID>=minFakeID){return;} if(!SSUMap.hasMap()){return;} if(taxtree==null/* && !warned*/){ // warned=true; // System.err.println("*** Warning - no taxtree loaded. A taxtree is recommended when adding SSUs. ***"); assert(false) : "Please set a path to the taxtree when adding SSUs."; } final boolean euk=taxtree!=null && (preferSSUMapForEuks || useSSUMapOnlyForEuks || ban16SForEuks) ? taxtree.isEukaryote(taxID) : false; final boolean prok=taxtree!=null && ban18SForProks ? taxtree.isProkaryote(taxID) : false; final boolean mapOnly=(useSSUMapOnly || (useSSUMapOnlyForEuks && euk)); final boolean preferMap=(mapOnly || preferSSUMap || (preferSSUMapForEuks && euk)); if(mapOnly){r16S=r18S=null;} if(SSUMap.r16SMap!=null && (r16S==null || preferMap)){ byte[] x=SSUMap.r16SMap.get(taxID); if(x!=null){r16S=x;} } if(SSUMap.r18SMap!=null && (r18S==null || preferMap)){ byte[] x=SSUMap.r18SMap.get(taxID); if(x!=null){r18S=x;} } if(ban18SForProks && prok){r18S=null;} if(ban16SForEuks && euk){r16S=null;} } /*--------------------------------------------------------------*/ /*---------------- BitSet ----------------*/ /*--------------------------------------------------------------*/ public void makeBitSets(boolean printContam, boolean index){ assert(compareBitSet==null && indexBitSet==null); if(!printContam){return;} compareBitSet=AbstractBitSet.make(length(), bitSetBits); if(index){indexBitSet=AbstractBitSet.make(length(), bitSetBits);} } public void addToBitSet(AbstractBitSet rbs){ compareBitSet.add(rbs); } public AbstractBitSet compareBitSet(){return compareBitSet;} public AbstractBitSet indexBitSet(){return indexBitSet;} public void mergeBitSets(){ assert(!mergedBitSets); if(compareBitSet!=null && indexBitSet!=null){ compareBitSet.setToMax(indexBitSet); } indexBitSet=null; mergedBitSets=true; } public boolean merged(){return mergedBitSets;} public int r16SLen(){return r16S==null ? 0 : r16S.length;} public int r18SLen(){return r18S==null ? 0 : r18S.length;} public byte[] r16S(){return r16S;} public byte[] r18S(){return r18S;} public boolean hasSSU(){return r16S!=null || r18S!=null;} public boolean sharesSSU(Sketch b){ return (r16S!=null && b.r16S!=null) || (r18S!=null && b.r18S!=null); } /*--------------------------------------------------------------*/ /*---------------- Fields ----------------*/ /*--------------------------------------------------------------*/ /** Stores sorted hashcodes, ascending, as Long.MAX_VALUE-(raw hashcode) */ public long[] keys; /** Stores kmer (hashcode) observation counts */ int[] keyCounts; /** Stores base (ACGTN) counts */ final long[] baseCounts; private byte[] r16S; private byte[] r18S; public int taxID; public int maxCount; int sketchID;//Based on loading order public final long genomeSequences; public final long genomeSizeBases; public final long genomeSizeKmers; public final float probCorrect; // public final int k1Count; //Number of keys made from k1 rather than k2 private String taxName; private String name0; private String fname; ArrayList<String> meta; //TODO: These should move to SketchResults. private AbstractBitSet compareBitSet; //Used for comparison private AbstractBitSet indexBitSet; //Extended information public long imgID=-1; public long spid=-1; // public String seqUnitName=null; private boolean mergedBitSets=false; //TODO: Temporary for debugging /** Tracks the number of reference sketches sharing each kmer. * Should be set to null when no longer needed. */ private int[] refHitCounts; public int[] refHitCounts(){return refHitCounts;} public void clearRefHitCounts(){ refHitCounts=null; } public void setRefHitCounts(int[] x) { refHitCounts=x; assert(x!=null); } private static AtomicInteger nextSketch=new AtomicInteger(1); }