Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/ReadCounter.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/ReadCounter.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,1071 @@ +package bloom; + +import java.io.File; +import java.lang.Thread.State; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.Locale; +import dna.AminoAcid; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import jgi.BBMerge; +import kmer.KmerTableSet; +import shared.Parse; +import shared.Parser; +import shared.PreParser; +import shared.Shared; +import shared.Timer; +import shared.Tools; +import sketch.SketchObject; +import stream.ConcurrentReadInputStream; +import stream.FastaReadInputStream; +import stream.Read; +import structures.ListNum; +import structures.LongList; +import ukmer.Kmer; + +/** + * @author Brian Bushnell + * @date Jul 5, 2012 + * + */ +public class ReadCounter extends KmerCountAbstract { + + public static void main(String[] args){ + + {//Preparse block for help, config files, and outstream + PreParser pp=new PreParser(args, new Object() { }.getClass().getEnclosingClass(), false); + args=pp.args; +// outstream=pp.outstream; + } + + Timer t=new Timer(); + + String fname1=args[0]; + String fname2=(args.length>1 ? args[1] : null); + int k=14; + int cbits=16; + int matrixbits=-1; + int hashes=1; + + for(int i=2; i<args.length; i++){ + final String arg=args[i]; + final String[] split=arg.split("="); + String a=split[0].toLowerCase(); + String b=split.length>1 ? split[1] : null; + + if(a.equals("k") || a.equals("kmer")){ + k=Integer.parseInt(b); + }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ + cbits=Integer.parseInt(b); + }else if(a.startsWith("reads") || a.startsWith("maxreads")){ + maxReads=Parse.parseKMG(b); + }else if(a.startsWith("matrixbits")){ + matrixbits=Integer.parseInt(b); + }else if(a.startsWith("hashes")){ + hashes=Integer.parseInt(b); + }else if(a.equals("canonical")){ + CANONICAL=Parse.parseBoolean(b); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + {//Process parser fields + Parser.processQuality(); + } + + int kbits=Tools.min(2*k, 62); + if(matrixbits<0){ + matrixbits=kbits; + } + matrixbits=Tools.min(kbits, matrixbits); + + if(fileIO.FileFormat.hasFastaExtension(fname1)){ + assert(!FastaReadInputStream.SPLIT_READS); + FastaReadInputStream.MIN_READ_LEN=k; + } + + KCountArray counts=KCountArray.makeNew(1L<<matrixbits, cbits, hashes); + ReadCounter rc=new ReadCounter(k, true, false, false, false); + try { + rc.count(fname1, fname2, counts); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + counts.shutdown(); + +// verbose=true; + + t.stop(); + System.out.println("Finished counting; time = "+t); + + rc.printStatistics(counts); + + } + + /** Defaults for nucleotides. */ + public ReadCounter(final int k_){this(k_, true, false, false, false);} + + public ReadCounter(final int k_, final boolean rcomp_, + final boolean ecco_, final boolean merge_, final boolean amino_){ + k=k_; + rcomp=rcomp_; + ecco=ecco_; + merge=merge_; + amino=amino_; + + final int bitsPerChar=(amino ? AminoAcid.AMINO_SHIFT : 2); + aminoShift=AminoAcid.AMINO_SHIFT; + shift=bitsPerChar*k; + shift2=shift-bitsPerChar; + mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32 + + assert(!amino || k*bitsPerChar<64); + assert(!amino || !rcomp); + assert(k>0); + } + + public void printStatistics(KCountArray counts){ + long[] freq=counts.transformToFrequency(); + +// System.out.println(count+"\n"); +// System.out.println(Arrays.toString(freq)+"\n"); + + long sum=sum(freq); + System.out.println("Kmer fraction:"); + int lim1=8, lim2=16; + for(int i=0; i<lim1; i++){ + String prefix=i+""; + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]); + } + while(lim1<=freq.length){ + int x=0; + for(int i=lim1; i<lim2; i++){ + x+=freq[i]; + } + String prefix=lim1+"-"+(lim2-1); + if(lim2>=freq.length){prefix=lim1+"+";} + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x); + lim1*=2; + lim2=min(lim2*2, freq.length); + } + + long sum2=sum-freq[0]; + long x=freq[1]; + System.out.println(); + System.out.println("Keys Counted: \t \t"+keysCounted); + System.out.println("Unique: \t \t"+sum2); + System.out.println("Avg Sites/Key: \t \t"+String.format(Locale.ROOT, "%.3f ",(keysCounted*1d/sum2))); + System.out.println(); + System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + x=sum2-x; + System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + } + + public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, int cbits){ + return makeKca(fname1, fname2, extraFiles, cbits, Tools.min(2*k, 35), 1, minQuality, + maxReads, 1, 1, 1, 2); + } + + public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, + int cbits, int matrixbits, int hashes, int minqual, long maxreads){ + assert(matrixbits<63); + return makeKca(fname1, fname2, extraFiles, cbits, matrixbits, hashes, minqual, + maxreads, 1, 1, 1, 2); + } + + public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, + int cbits, int matrixbits, int hashes, int minqual, + long maxreads, int passes, int stepsize, int thresh1, int thresh2){ + assert(matrixbits<63); + return makeKca(fname1, fname2, extraFiles, + cbits, 1L<<matrixbits, hashes, minqual, + maxreads, passes, stepsize, thresh1, thresh2, null, 0); + } + + public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, + int cbits, long cells, int hashes, int minqual, + long maxreads, int passes, int stepsize, int thresh1, int thresh2){ + return makeKca(fname1, fname2, extraFiles, + cbits, cells, hashes, minqual, + maxreads, passes, stepsize, thresh1, thresh2, null, 0); + } + + public KCountArray makeKca_als(ArrayList<String> fname1, ArrayList<String> fname2, Iterable<String> extraFiles, + int cbits, long cells, int hashes, int minqual, + long maxreads, int passes, int stepsize, int thresh1, int thresh2, + KCountArray prefilter, int prefilterLimit_){ + String a=null, b=null; + ArrayList<String> list=new ArrayList<String>(); + if(fname1!=null){ + for(int i=0; i<fname1.size(); i++){ + if(i==0){a=fname1.get(i);} + else{list.add(fname1.get(i));} + } + } + if(fname2!=null){ + for(int i=0; i<fname2.size(); i++){ + if(i==0){b=fname2.get(i);} + else{list.add(fname2.get(i));} + } + } + if(extraFiles!=null){ + for(String s : extraFiles){ + list.add(s); + } + } + return makeKca(a, b, list.isEmpty() ? null : list, cbits, cells, hashes, minqual, + maxreads, passes, stepsize, thresh1, thresh2, + prefilter, prefilterLimit_); + } + + public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, + int cbits, long cells, int hashes, int minqual, + long maxreads, int passes, int stepsize, int thresh1, int thresh2, + KCountArray prefilter, int prefilterLimit_){ +// verbose=true; + if(verbose){System.err.println("Making kca from ("+fname1+", "+fname2+")\nk="+k+", cells="+Tools.toKMG(cells)+", cbits="+cbits);} + + if(fname1==null && fname2==null && extraFiles==null){ + return KCountArray.makeNew(cells, cbits, hashes, prefilter, prefilterLimit_); + } + + boolean oldsplit=FastaReadInputStream.SPLIT_READS; + long oldmax=maxReads; + byte oldq=minQuality; + maxReads=maxreads; + minQuality=(byte)minqual; + // System.out.println("kbits="+(kbits)+" -> "+(1L<<kbits)+", matrixbits="+(matrixbits)+" -> "+(1L<<matrixbits)+", cbits="+cbits+", gap="+gap+", hashes="+hashes); + KCountArray kca=KCountArray.makeNew(cells, cbits, hashes, prefilter, prefilterLimit_); + +// System.out.println("a"); + {//For processing input lists + ArrayList<String> extra2=null; + if(fname1!=null && fname1.contains(",")){ + String[] s=fname1.split(","); + if(extra2==null){extra2=new ArrayList<String>();} + for(int i=1; i<s.length; i++){extra2.add(s[i]);} + fname1=s[0]; + } + if(fname2!=null && fname2.contains(",")){ + String[] s=fname2.split(","); + if(extra2==null){extra2=new ArrayList<String>();} + for(int i=1; i<s.length; i++){extra2.add(s[i]);} + fname2=s[0]; + } + if(extra2!=null){ + if(extraFiles!=null){ + for(String s : extraFiles){ + extra2.add(s); + } + } + extraFiles=extra2; + } + } +// System.out.println("b"); + + if(extraFiles!=null){ + for(String s : extraFiles){ + if(fileIO.FileFormat.hasFastaExtension(s)){ + assert(!FastaReadInputStream.SPLIT_READS); + } + } + } + +// System.out.println("c"); + + if(passes==1){ +// System.out.println("c1"); + if(fname1!=null){ + try { + count(fname1, fname2, kca); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + if(extraFiles!=null){ + maxReads=-1; + for(String s : extraFiles){ + try { + count(s, null, kca); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + kca.shutdown(); + + }else{ +// System.out.println("c2"); + assert(passes>1); + KCountArray trusted=null; + for(int i=1; i<passes; i++){ + boolean conservative=i>2;// /*or, alternately, (trusted==null || trusted.capacity()>0.3) + int step=(stepsize==1 ? 1 : stepsize+i%2); + // if(!conservative){step=(step+3)/4;} + if(!conservative){step=Tools.min(3, (step+3)/4);} + + try { + count(fname1, fname2, cbits, kca, trusted, maxreads, thresh1, step, conservative); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(extraFiles!=null){ + maxReads=-1; + for(String s : extraFiles){ + try { + count(s, null, cbits, kca, trusted, maxreads, thresh1, step, conservative); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + kca.shutdown(); + + System.out.println("Trusted: \t"+kca.toShortString()); + trusted=kca; + kca=KCountArray.makeNew(cells, cbits, hashes, prefilter, prefilterLimit_); + + } + + try { + count(fname1, fname2, cbits, kca, trusted, maxreads, thresh2, stepsize, true); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(extraFiles!=null){ + maxReads=-1; + for(String s : extraFiles){ + try { + count(s, null, cbits, kca, trusted, maxreads, thresh2, stepsize, true); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + kca.shutdown(); + } +// System.out.println("d"); + minQuality=oldq; + maxReads=oldmax; + FastaReadInputStream.SPLIT_READS=oldsplit; + + + return kca; + } + + public KCountArray count(String reads1, String reads2, KCountArray counts) throws Exception{ +// System.err.println("countFastq... making a new cris"); + assert(counts!=null); + + { + int pound=reads1.lastIndexOf('#'); + if(pound>=0 && reads2==null && !new File(reads1).exists()){ + String a=reads1.substring(0, pound); + String b=reads1.substring(pound+1); + reads1=a+1+b; + reads2=a+2+b; + } + } + + final ConcurrentReadInputStream cris; + { + FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); + if(ff2==null){ff1.preferShreds=true;} +// if(ff2!=null){ //TODO - interleaved flag + cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); + cris.start(); //4567 + } + + assert(cris!=null) : reads1; + if(verbose){System.err.println("Started cris");} + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + +// countFastq(cris, count); +// assert(false) : THREADS; + CountThread[] cta=new CountThread[Shared.threads()]; + for(int i=0; i<cta.length; i++){ + cta[i]=new CountThread(cris, counts); + cta[i].start(); + } +// System.out.println("~1"); + for(int i=0; i<cta.length; i++){ +// System.out.println("~2"); + CountThread ct=cta[i]; + synchronized(ct){ +// System.out.println("~3"); + while(ct.getState()!=State.TERMINATED){ +// System.out.println("~4"); + try { + ct.join(2000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } +// System.out.println("~5"); + } + } + } +// System.out.println("~6"); + + ReadWrite.closeStream(cris); + if(verbose){System.err.println("Closed stream");} + if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} + + + return counts; + } + + public KCountArray count(String reads1, String reads2, final int cbits, + KCountArray counts, final KCountArray trusted, final long maxReads, final int thresh, + final int detectStepsize, final boolean conservative) + throws Exception{ + + assert(counts!=null); + + { + int pound=reads1.lastIndexOf('#'); + if(pound>=0 && reads2==null && !new File(reads1).exists()){ + String a=reads1.substring(0, pound); + String b=reads1.substring(pound+1); + reads1=a+1+b; + reads2=a+2+b; + } + } + + final ConcurrentReadInputStream cris; + { + FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); + if(ff2==null){ff1.preferShreds=true;} + cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); + cris.start(); //4567 + } + + assert(cris!=null) : reads1; + if(verbose){System.err.println("Started cris");} + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + +// countFastq(cris, count, trusted, thresh, detectStepsize, conservative); + +// assert(false) : THREADS; + CountThread[] cta=new CountThread[Shared.threads()]; + for(int i=0; i<cta.length; i++){ + cta[i]=new CountThread(cris, counts, trusted, thresh, detectStepsize, conservative); + cta[i].start(); + } + + for(int i=0; i<cta.length; i++){ + CountThread ct=cta[i]; + synchronized(ct){ + while(ct.isAlive()){ + try { + ct.join(1000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + } + + cris.close(); + if(verbose){System.err.println("Closed stream");} + +// System.out.println("*** after ***"); +// System.out.println("\ntrusted=\n"+trusted); +// System.out.println("\ncount=\n"+count); + + return counts; + } + + private final int findOverlap(Read r1, Read r2, boolean ecc){ + if(vstrict){ + return BBMerge.findOverlapVStrict(r1, r2, ecc); + }else{ + return BBMerge.findOverlapStrict(r1, r2, ecc); + } + } + + private class CountThread extends Thread{ + + CountThread(final ConcurrentReadInputStream cris_, final KCountArray counts_){ + this(cris_, counts_, null, 2, 1, true); + } + + CountThread(final ConcurrentReadInputStream cris_, + final KCountArray counts_, final KCountArray trusted_, final int thresh_, + final int detectStepsize_, final boolean conservative_){ + cris=cris_; + counts=counts_; + trusted=trusted_; + thresh=thresh_; + detectStepsize=detectStepsize_; + conservative=conservative_; + } + + @Override + public void run(){ +// System.out.println("Running"); + if(trusted==null){ + count(cris); + }else{ + count(cris, thresh, detectStepsize, conservative); + } +// System.out.println("Finished: "+readsProcessedLocal); + + if(BUFFERED){dumpBuffer();} + + synchronized(getClass()){ + keysCounted+=keysCountedLocal; + increments+=incrementsLocal; + readsProcessed+=readsProcessedLocal; + + if(verbose){System.err.println(keysCounted+", "+keysCountedLocal);} + if(verbose){System.err.println(readsProcessed+", "+readsProcessedLocal);} + } + } + + private void increment(final long key){ + if(BUFFERED){ + buffer.add(key); + if(buffer.size>=BUFFERLEN){ + dumpBuffer(); + } + }else{ + incrementByAmount(key, 1); + } + } + + + //TODO: All this 'sum' nonsense is just to count kmers added. Could be derived from buffer length. + private void dumpBuffer(){ + final int lim=buffer.size; + if(lim<1){return;} + final long[] array=buffer.array; +// if(SORT_SERIAL){ +// buffer.sortSerial(); +// }else{ +// buffer.sort(); +// } + buffer.sort();//Can be disabled via parallelsort flag but only affects arrays>10k + long kmer=array[0]-1;//Ensures a nonmatch + int streak=0; + for(int i=0; i<lim; i++){ + long x=array[i]; + if(x==kmer){streak++;} + else{ + if(streak>0){incrementByAmount(kmer, streak);} + kmer=x; + streak=1; + } + } + assert(streak>0); + incrementByAmount(kmer, streak); + buffer.clear(); + } + + private void incrementByAmount(final long key, int amt){ + if(SKETCH_MODE){ + final long code=SketchObject.hash(key); + if(code<minHashValue){return;} + counts.increment(STORE_HASHED ? code : key, amt); + }else{ + counts.increment(key, amt); + } + keysCountedLocal+=amt; + incrementsLocal++; + } + + private final void count(ConcurrentReadInputStream cris){ + assert(k>=1 && counts!=null); +// System.out.println("Waiting for list"); + ListNum<Read> ln=cris.nextList(); + ArrayList<Read> reads=(ln!=null ? ln.list : null); +// System.out.println("Got list: "+(ln==null ? "null" : ln.id)+", "+(ln==null || ln.list==null ? "null" : ln.list.size())); + + long[] array=null; + final Kmer kmer=new Kmer(k); + + while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning + //System.err.println("reads.size()="+reads.size()); + for(Read r1 : reads){ + + Read r2=r1.mate; + if((ecco || merge) && r2!=null){ + if(merge){ + final int insert=findOverlap(r1, r2, false); + if(insert>0){ + r2.reverseComplement(); + r1=r1.joinRead(insert); + r2=null; + } + }else if(ecco){ + findOverlap(r1, r2, true); + } + } + readsProcessedLocal++; + + if(k<=maxShortKmerLength){ + array=addRead_Advanced(r1, array); + }else{ + addReadBig(r1, kmer); + addReadBig(r1.mate, kmer); + } + // System.out.println(r); + // System.out.println("kmers hashed: "+keysCountedLocal); + } + //System.err.println("returning list"); + cris.returnList(ln); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + + + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln); + if(verbose){System.err.println("Returned list");} + } + + + + + private final void count(final ConcurrentReadInputStream cris, final int thresh, + final int detectStepsize, final boolean conservative){ + + ListNum<Read> ln=cris.nextList(); + ArrayList<Read> reads=(ln!=null ? ln.list : null); + + long[] array=null; + while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning + //System.err.println("reads.size()="+reads.size()); + for(Read r1 : reads){ + + Read r2=r1.mate; + if((ecco || merge) && r2!=null){ + if(merge){ + final int insert=findOverlap(r1, r2, false); + if(insert>0){ + r2.reverseComplement(); + r1=r1.joinRead(insert); + r2=null; + } + }else if(ecco){ + findOverlap(r1, r2, true); + } + } + { + if(trusted!=null){ + BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r1, trusted, k, thresh, detectStepsize) : + ErrorCorrect.detectTrusted(r1, trusted, k, thresh, detectStepsize)); +// System.out.println("\n"+toString(bs, r.length())); +// System.out.println(new String(r.bases)); + if(bs!=null){ + for(int i=bs.nextClearBit(0); i<r1.length(); i=bs.nextClearBit(i+1)){ + r1.bases[i]='N'; + if(r1.quality!=null){r1.quality[i]=0;} + } + } +// System.out.println(new String(r.bases)); +// System.out.println("used = "+String.format(Locale.ROOT, "%.3f%%",count.usedFraction()*100)); +// System.out.println("used = "+((KCountArray4)count).cellsUsed()); +// if(bs.length()<r.length()){r=null;} + } +// if(r!=null){addRead(r, count, rcomp);} + } + if(r2!=null){ + if(trusted!=null){ + BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) : + ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize)); + if(bs!=null){ + for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){ + r2.bases[i]='N'; + if(r2.quality!=null){r2.quality[i]=0;} + } + } + } + } + array=addRead_Advanced(r1, array); + + } + //System.err.println("returning list"); + cris.returnList(ln); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln); + if(verbose){System.err.println("Returned list");} + } + + + + /** + * Hash a read's kmers into the KCountArray. + * Advanced mode processes paired reads together and sorts kmers to eliminate spurious duplicates. + * @param r1 + * @param counts + * @param k + * @param mask + * @param rcomp + */ + private final long[] addRead_Advanced(Read r1, long[] array){ + if(PREJOIN && r1.mate!=null && r1.insert()>0){ + r1.mate.reverseComplement(); + r1=r1.joinRead(); + } + Read r2=r1.mate; + final int len1=Tools.max(0, r1.length()-k+1); + final int len2=(r2==null || r2.bases==null) ? 0 : Tools.max(0, r2.length()-k+1); + final int len=len1+len2; + if(len<1){return array;} + + if(!KEEP_DUPLICATE_KMERS){ + if(array==null || array.length!=len){array=new long[len];} + Arrays.fill(array, -1); + fillKmerArray(r1, array, 0, len1); + if(r2!=null){fillKmerArray(r2, array, len1, len);} + + Arrays.sort(array); + long prev=-1; + for(int i=0; i<array.length; i++){ + long kmer=array[i]; + if(kmer!=prev){ + increment(kmer); + prev=kmer; + } + } + }else{ + if(len1>0){addRead(r1);} + if(len2>0){addRead(r2);} + } + return array; + } + + private final void addReadBig(Read r, Kmer kmer){ + if(r==null || r.bases==null){return;} + final byte[] bases=r.bases; + final byte[] quals=r.quality; + int len=0; + + if(bases==null || bases.length<k){return;} + kmer.clear(); + + /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ + float prob=1; + for(int i=0; i<bases.length; i++){ + final byte b=bases[i]; + final long x=AminoAcid.baseToNumber[b]; + + //Update kmers + kmer.addRight(b); + + if(minProb>0 && quals!=null){//Update probability + prob=prob*KmerTableSet.PROB_CORRECT[quals[i]]; + if(len>k){ + byte oldq=quals[i-k]; + prob=prob*KmerTableSet.PROB_CORRECT_INVERSE[oldq]; + } + } + + //Handle Ns + if(x<0){ + len=0; + prob=1; + }else{len++;} + + assert(len==kmer.len); + + if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+"\t"+new String(bases, Tools.max(0, i-k), Tools.min(i+1, k)));} + if(len>=k && prob>=minProb){ +// System.err.println("Incrementing xor()="+kmer.xor()); + increment(kmer.xor()); +// counts.incrementAndReturnUnincremented(kmer.xor(), 1); +// keysCountedLocal++; + } + } + } + + private final void fillKmerArray(Read r, final long[] array, final int start, final int stop){ + if(amino){ + fillKmerArrayAmino(r, array, start, stop); + return; + } + if(k>maxShortKmerLength){ + fillKmerArrayLong(r, array, start, stop); + return; + } + assert(k<=maxShortKmerLength); + assert(!PREJOIN || r.mate==null); + assert(CANONICAL); + assert(array!=null); + + final byte[] bases=r.bases; + final byte[] quals=r.quality; + + if(bases==null || bases.length<k){return;} + + final int passes=(rcomp ? 2 : 1); + for(int pass=0; pass<passes; pass++){ + int len=0; + int idx=(pass==0 ? start-k+1 : stop+k-2); + long kmer=0; + float prob=1; + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + assert(b>=0) : r.id+", "+bases.length+", "+i+", "+b+"\n"+(bases.length<20000 ? r.toFasta().toString() : ""); + int x=AminoAcid.baseToNumber[b]; + +// int x=AminoAcid.baseToNumber[b<0 ? 'N' : b]; + + byte q; + if(quals==null){ + q=50; + }else{ + q=quals[i]; + prob=prob*align2.QualityTools.PROB_CORRECT[q]; + if(len>k){ + byte oldq=quals[i-k]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; + } + } + + if(x<0 || q<minQuality){ + len=0; + kmer=0; + prob=1; + }else{ + kmer=((kmer<<2)|x)&mask; + len++; + if(len>=k && prob>=minProb){ + array[idx]=Tools.max(array[idx], kmer); + } + } + if(pass==0){idx++;}else{idx--;} + } +// System.out.println(Arrays.toString(array)); + r.reverseComplement(); + } + } + + private final void fillKmerArrayAmino(Read r, final long[] array, final int start, final int stop){ + assert(false) : "TODO"; //TODO + if(k>maxShortKmerLength){ + fillKmerArrayLong(r, array, start, stop); + return; + } + assert(k<=maxShortKmerLength); + assert(!PREJOIN || r.mate==null); + assert(CANONICAL); + assert(array!=null); + + final byte[] bases=r.bases; + final byte[] quals=r.quality; + + if(bases==null || bases.length<k){return;} + + for(int pass=0; pass<2; pass++){ + int len=0; + int idx=(pass==0 ? start-k+1 : stop+k-2); + long kmer=0; + float prob=1; + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + assert(b>=0) : r.id+", "+bases.length+", "+i+", "+b+"\n"+(bases.length<20000 ? r.toFasta().toString() : ""); + int x=AminoAcid.baseToNumber[b]; + +// int x=AminoAcid.baseToNumber[b<0 ? 'N' : b]; + + byte q; + if(quals==null){ + q=50; + }else{ + q=quals[i]; + prob=prob*align2.QualityTools.PROB_CORRECT[q]; + if(len>k){ + byte oldq=quals[i-k]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; + } + } + + if(x<0 || q<minQuality){ + len=0; + kmer=0; + prob=1; + }else{ + kmer=((kmer<<2)|x)&mask; + len++; + if(len>=k && prob>=minProb){ + array[idx]=Tools.max(array[idx], kmer); + } + } + if(pass==0){idx++;}else{idx--;} + } +// System.out.println(Arrays.toString(array)); + r.reverseComplement(); + } + } + + private final void addRead(Read r){ + if(amino){ + addReadAmino(r); + return; + } + assert(k<=maxShortKmerLength); + assert(!PREJOIN || r.mate==null); + assert(CANONICAL); + + final byte[] bases=r.bases; + final byte[] quals=r.quality; + + if(bases==null || bases.length<k){return;} + + long kmer=0; + long rkmer=0; + int len=0; + float prob=1; + + for(int i=0; i<bases.length; i++){ + final byte b=bases[i]; + long x=AminoAcid.baseToNumber[b]; + long x2=AminoAcid.baseToComplementNumber[b]; + kmer=((kmer<<2)|x)&mask; + rkmer=((rkmer>>>2)|(x2<<shift2))&mask; + + final byte q; + if(quals==null){ + q=50; + }else{ + q=quals[i]; + prob=prob*align2.QualityTools.PROB_CORRECT[q]; + if(len>k){ + byte oldq=quals[i-k]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; + } + } + + if(x<0 || q<minQuality){ + len=0; + kmer=rkmer=0; + prob=1; + }else{ + len++; + if(len>=k && prob>=minProb){ + long key=(rcomp ? Tools.max(kmer, rkmer) : kmer); + increment(key); + } + } + } + } + + private final void addReadAmino(Read r){ + assert(!PREJOIN || r.mate==null); + + final byte[] bases=r.bases; + + if(bases==null || bases.length<k){return;} + + long kmer=0; + int len=0; + + for(int i=0; i<bases.length; i++){ + final byte b=bases[i]; + long x=AminoAcid.acidToNumber[b]; + kmer=((kmer<<aminoShift)|x)&mask; + + if(x<0){ + len=0; + kmer=0; + }else{ + len++; + if(len>=k){ + increment(kmer); + } + } + } + } + + private final void fillKmerArrayLong(Read r, final long[] array, final int start, final int stop){ + assert(k>maxShortKmerLength) : k; + assert(!PREJOIN || r.mate==null); + assert(CANONICAL); + assert(array!=null); + Kmer kmer=new Kmer(k); + + float prob=1; + byte[] bases=r.bases; + byte[] quals=r.quality; + + kmer.clear(); + + for(int i=0, idx=start-k+1; i<bases.length; i++, idx++){ + byte b=bases[i]; + kmer.addRight(b); + + byte q; + if(quals==null){ + q=50; + }else{ + q=quals[i]; + prob=prob*align2.QualityTools.PROB_CORRECT[q]; + if(kmer.len>k){ + byte oldq=quals[i-k]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; + } + } + + if(!AminoAcid.isFullyDefined(b) || q<minQuality){ + kmer.clear(); + prob=1; + } + if(kmer.len>=k && prob>=minProb){ + array[idx]=kmer.xor(); + } + } + } + + private final ConcurrentReadInputStream cris; + + private final KCountArray counts; + private final KCountArray trusted; + private final int thresh; + private final int detectStepsize; + private final boolean conservative; + private long keysCountedLocal=0; + private long incrementsLocal=0; + private long readsProcessedLocal=0; + private final long minHashValue=SketchObject.minHashValue; + private final LongList buffer=(BUFFERED ? new LongList(BUFFERLEN) : null); + } + + public static boolean vstrict=false; + + private final int k; + private final int aminoShift; + private final int shift; + private final int shift2; + private final long mask; + private final boolean rcomp; + private final boolean ecco; + private final boolean merge; + private final boolean amino; + +}