Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/LargeKmerCount2.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/LargeKmerCount2.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,329 @@ +package bloom; + +import java.util.ArrayList; +import java.util.Locale; +import java.util.Random; + +import dna.AminoAcid; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import shared.Timer; +import stream.ConcurrentReadInputStream; +import stream.FastaReadInputStream; +import stream.Read; +import structures.ListNum; + +/** + * @author Brian Bushnell + * @date Jul 6, 2012 + * + */ +public class LargeKmerCount2 { + +public static void main(String[] args){ + + Timer t=new Timer(); + + String fname1=args[0]; + String fname2=(args.length>4 || args[1].contains(".") ? args[1] : null); + int indexbits=Integer.parseInt(args[args.length-3]); + int cbits=Integer.parseInt(args[args.length-2]); + int k=Integer.parseInt(args[args.length-1]); + + KCountArray2 count=null; + + if(fileIO.FileFormat.hasFastaExtension(fname1)){ + FastaReadInputStream.MIN_READ_LEN=k; + } + count=countFastq(fname1, fname2, indexbits, cbits, k); + + t.stop(); + System.out.println("Finished counting; time = "+t); + + long[] freq=count.transformToFrequency(); + +// System.out.println(count+"\n"); +// System.out.println(Arrays.toString(freq)+"\n"); + + long sum=sum(freq); + System.out.println("Kmer fraction:"); + int lim1=8, lim2=16; + for(int i=0; i<lim1; i++){ + String prefix=i+""; + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]); + } + while(lim1<=freq.length){ + int x=0; + for(int i=lim1; i<lim2; i++){ + x+=freq[i]; + } + String prefix=lim1+"-"+(lim2-1); + if(lim2>=freq.length){prefix=lim1+"+";} + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x); + lim1*=2; + lim2=min(lim2*2, freq.length); + } + + long estKmers=load+min(actualCollisions, (long)expectedCollisions); + + long sum2=sum-freq[0]; + long x=freq[1]; + System.out.println(); + System.out.println("Keys Counted: \t \t"+keysCounted); + System.out.println("Unique: \t \t"+sum2); + System.out.println("probCollisions:\t \t"+(long)probNewKeyCollisions); + System.out.println("EstimateP: \t \t"+(sum2+(long)probNewKeyCollisions)); + System.out.println("expectedColl: \t \t"+(long)expectedCollisions); + System.out.println("actualColl: \t \t"+(long)actualCollisions); + System.out.println("estimateKmers: \t \t"+estKmers); + System.out.println(); + System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + x=sum2-x; + System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + + } + + public static KCountArray2 countFastq(String reads1, String reads2, int indexbits, int cbits, int k){ + assert(indexbits>=1 && indexbits<40); + final long cells=1L<<indexbits; + final int kbits=ROTATE_DIST*k; + final int xorShift=kbits%64; + final long[] rotMasks=makeRotMasks(xorShift); + final int[] buffer=new int[k]; + if(verbose){System.err.println("k="+k+", kbits="+kbits+", indexbits="+indexbits+", cells="+cells+", cbits="+cbits);} + if(verbose){System.err.println("xorShift="+xorShift+", rotMasks[3]="+Long.toHexString(rotMasks[3]));} + final KCountArray2 count=new KCountArray2(cells, cbits); + load=0; + probNewKeyCollisions=0; + invCells=1d/cells; + invKmerSpace=Math.pow(0.5, 2*k); + if(cells>=Math.pow(4, k)){invCells=0;} + + final ConcurrentReadInputStream cris; + { + FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); + if(verbose){System.err.println("Started cris");} + cris.start(); //4567 + } + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + long kmer=0; //current kmer + int len=0; //distance since last contig start or ambiguous base + + { + ListNum<Read> ln=cris.nextList(); + ArrayList<Read> reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + + len=0; + kmer=0; + byte[] bases=r.bases; + byte[] quals=r.quality; + + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + int x=AminoAcid.baseToNumber[b]; + int x2=buffer[len%buffer.length]; + buffer[len%buffer.length]=x; + if(x<0){ + len=0; + kmer=0; + }else{ + kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x); + len++; + if(len>=k){ + keysCounted++; + if(len>k){kmer=kmer^rotMasks[x2];} + long hashcode=kmer&0x7fffffffffffffffL; +// hashcode=randy.nextLong()&~((-1L)<<(2*k)); + long code1=hashcode%(cells-3); +// long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); + int value=count.increment2(code1, 1); + + double probCollision=load*invCells; +// expectedCollisions+=probCollision; + expectedCollisions+=probCollision*(1-(load+min(expectedCollisions, actualCollisions))*invKmerSpace); + if(value==0){load++;} + else{ + actualCollisions++; + double probNewKey=(load*invCells)*expectedCollisions/(min(expectedCollisions, actualCollisions)); + double estKeys=load+probNewKeyCollisions; + double probOldKey=estKeys*invKmerSpace; + probNewKeyCollisions+=probNewKey*(1-probOldKey); + +// double estKmers=load+min(actualCollisions, expectedCollisions); +// double probOldKmer=estKmers*invKmerSpace; +// probNewKeyCollisions+=(prob*(1-prob2)); + } + +//// probCollisions+=(load*invCells); +// if(value==0){load++;} +// else{ +//// long load2=keysCounted-load; +// double prob=Math.sqrt(load*invCells); +// double estKmers=load+probNewKeyCollisions; +// double prob2=estKmers*invKmerSpace; +//// probCollisions+=(prob*(1-prob2)); +//// probCollisions+=Math.sqrt(prob*(1-prob2)); +// probNewKeyCollisions+=Math.sqrt(prob*(1-prob2)); +//// probCollisions+=min(prob, 1-prob2); +//// probCollisions+=(load*invCells); +// } + } + } + } + + + if(r.mate!=null){ + len=0; + kmer=0; + bases=r.mate.bases; + quals=r.mate.quality; + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + int x=AminoAcid.baseToNumber[b]; + int x2=buffer[len%buffer.length]; + buffer[len%buffer.length]=x; + if(x<0){ + len=0; + kmer=0; + }else{ + kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x); + len++; + if(len>=k){ + keysCounted++; + if(len>k){kmer=kmer^rotMasks[x2];} + long hashcode=kmer&0x7fffffffffffffffL; +// hashcode=randy.nextLong()&~((-1L)<<(2*k)); + long code1=hashcode%(cells-3); +// long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); + int value=count.increment2(code1, 1); + + double probCollision=load*invCells; +// expectedCollisions+=probCollision; + expectedCollisions+=probCollision*(1-(load+min(expectedCollisions, actualCollisions))*invKmerSpace); + if(value==0){load++;} + else{ + actualCollisions++; + double probNewKey=(load*invCells)*expectedCollisions/(min(expectedCollisions, actualCollisions)); + double estKeys=load+probNewKeyCollisions; + double probOldKey=estKeys*invKmerSpace; + probNewKeyCollisions+=probNewKey*(1-probOldKey); + +// double estKmers=load+min(actualCollisions, expectedCollisions); +// double probOldKmer=estKmers*invKmerSpace; +// probNewKeyCollisions+=(prob*(1-prob2)); + } + +//// probCollisions+=(load*invCells); +// if(value==0){load++;} +// else{ +//// long load2=keysCounted-load; +// double prob=Math.sqrt(load*invCells); +// double estKmers=load+probNewKeyCollisions; +// double prob2=estKmers*invKmerSpace; +//// probCollisions+=(prob*(1-prob2)); +//// probCollisions+=Math.sqrt(prob*(1-prob2)); +// probNewKeyCollisions+=Math.sqrt(prob*(1-prob2)); +//// probCollisions+=min(prob, 1-prob2); +//// probCollisions+=(load*invCells); +// } + } + } + } + } + + } + //System.err.println("returning list"); + cris.returnList(ln); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + System.err.println("Finished reading"); + cris.returnList(ln); + System.err.println("Returned list"); + ReadWrite.closeStream(cris); + System.err.println("Closed stream"); + System.err.println("Processed "+readsProcessed+" reads."); + } + + return count; + } + + public static final long[] makeRotMasks(int rotDist){ + long[] masks=new long[4]; + for(long i=0; i<4; i++){ + masks[(int)i]=Long.rotateLeft(i, rotDist); + } + return masks; + } + + public static long[] transformToFrequency(int[] count){ + long[] freq=new long[2000]; + int max=freq.length-1; + for(int i=0; i<count.length; i++){ + int x=count[i]; + x=min(x, max); + freq[x]++; + } + return freq; + } + + public static long sum(int[] array){ + long x=0; + for(int y : array){x+=y;} + return x; + } + + public static long sum(long[] array){ + long x=0; + for(long y : array){x+=y;} + return x; + } + + public static final int min(int x, int y){return x<y ? x : y;} + public static final int max(int x, int y){return x>y ? x : y;} + public static final long min(long x, long y){return x<y ? x : y;} + public static final long max(long x, long y){return x>y ? x : y;} + public static final double min(double x, double y){return x<y ? x : y;} + public static final double max(double x, double y){return x>y ? x : y;} + + public static boolean verbose=true; + public static byte minQuality=-5; + public static long readsProcessed=0; + public static long maxReads=10000000L; + public static final int ROTATE_DIST=2; + + /** Non-empty cells in hash table */ + public static long load; + /** Number of expected collisions */ + public static double expectedCollisions; + /** Number of actual collisions (possibly by same value) */ + public static long actualCollisions; + /** Number of probable collisions caused by new keys */ + public static double probNewKeyCollisions; + /** Inverse of hash table size */ + public static double invCells; + /** Inverse of number of potential kmers */ + public static double invKmerSpace; + /** Inverse of number of potential kmers */ + public static long keysCounted; + + public static final Random randy=new Random(1); + +}