Mercurial > repos > rliterman > csp2
view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/LargeKmerCount2.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line source
package bloom; import java.util.ArrayList; import java.util.Locale; import java.util.Random; import dna.AminoAcid; import fileIO.FileFormat; import fileIO.ReadWrite; import shared.Timer; import stream.ConcurrentReadInputStream; import stream.FastaReadInputStream; import stream.Read; import structures.ListNum; /** * @author Brian Bushnell * @date Jul 6, 2012 * */ public class LargeKmerCount2 { public static void main(String[] args){ Timer t=new Timer(); String fname1=args[0]; String fname2=(args.length>4 || args[1].contains(".") ? args[1] : null); int indexbits=Integer.parseInt(args[args.length-3]); int cbits=Integer.parseInt(args[args.length-2]); int k=Integer.parseInt(args[args.length-1]); KCountArray2 count=null; if(fileIO.FileFormat.hasFastaExtension(fname1)){ FastaReadInputStream.MIN_READ_LEN=k; } count=countFastq(fname1, fname2, indexbits, cbits, k); t.stop(); System.out.println("Finished counting; time = "+t); long[] freq=count.transformToFrequency(); // System.out.println(count+"\n"); // System.out.println(Arrays.toString(freq)+"\n"); long sum=sum(freq); System.out.println("Kmer fraction:"); int lim1=8, lim2=16; for(int i=0; i<lim1; i++){ String prefix=i+""; while(prefix.length()<8){prefix=prefix+" ";} System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]); } while(lim1<=freq.length){ int x=0; for(int i=lim1; i<lim2; i++){ x+=freq[i]; } String prefix=lim1+"-"+(lim2-1); if(lim2>=freq.length){prefix=lim1+"+";} while(prefix.length()<8){prefix=prefix+" ";} System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x); lim1*=2; lim2=min(lim2*2, freq.length); } long estKmers=load+min(actualCollisions, (long)expectedCollisions); long sum2=sum-freq[0]; long x=freq[1]; System.out.println(); System.out.println("Keys Counted: \t \t"+keysCounted); System.out.println("Unique: \t \t"+sum2); System.out.println("probCollisions:\t \t"+(long)probNewKeyCollisions); System.out.println("EstimateP: \t \t"+(sum2+(long)probNewKeyCollisions)); System.out.println("expectedColl: \t \t"+(long)expectedCollisions); System.out.println("actualColl: \t \t"+(long)actualCollisions); System.out.println("estimateKmers: \t \t"+estKmers); System.out.println(); System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); x=sum2-x; System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); } public static KCountArray2 countFastq(String reads1, String reads2, int indexbits, int cbits, int k){ assert(indexbits>=1 && indexbits<40); final long cells=1L<<indexbits; final int kbits=ROTATE_DIST*k; final int xorShift=kbits%64; final long[] rotMasks=makeRotMasks(xorShift); final int[] buffer=new int[k]; if(verbose){System.err.println("k="+k+", kbits="+kbits+", indexbits="+indexbits+", cells="+cells+", cbits="+cbits);} if(verbose){System.err.println("xorShift="+xorShift+", rotMasks[3]="+Long.toHexString(rotMasks[3]));} final KCountArray2 count=new KCountArray2(cells, cbits); load=0; probNewKeyCollisions=0; invCells=1d/cells; invKmerSpace=Math.pow(0.5, 2*k); if(cells>=Math.pow(4, k)){invCells=0;} final ConcurrentReadInputStream cris; { FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); if(verbose){System.err.println("Started cris");} cris.start(); //4567 } boolean paired=cris.paired(); if(verbose){System.err.println("Paired: "+paired);} long kmer=0; //current kmer int len=0; //distance since last contig start or ambiguous base { ListNum<Read> ln=cris.nextList(); ArrayList<Read> reads=(ln!=null ? ln.list : null); if(reads!=null && !reads.isEmpty()){ Read r=reads.get(0); assert(paired==(r.mate!=null)); } while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning //System.err.println("reads.size()="+reads.size()); for(Read r : reads){ readsProcessed++; len=0; kmer=0; byte[] bases=r.bases; byte[] quals=r.quality; for(int i=0; i<bases.length; i++){ byte b=bases[i]; int x=AminoAcid.baseToNumber[b]; int x2=buffer[len%buffer.length]; buffer[len%buffer.length]=x; if(x<0){ len=0; kmer=0; }else{ kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x); len++; if(len>=k){ keysCounted++; if(len>k){kmer=kmer^rotMasks[x2];} long hashcode=kmer&0x7fffffffffffffffL; // hashcode=randy.nextLong()&~((-1L)<<(2*k)); long code1=hashcode%(cells-3); // long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); int value=count.increment2(code1, 1); double probCollision=load*invCells; // expectedCollisions+=probCollision; expectedCollisions+=probCollision*(1-(load+min(expectedCollisions, actualCollisions))*invKmerSpace); if(value==0){load++;} else{ actualCollisions++; double probNewKey=(load*invCells)*expectedCollisions/(min(expectedCollisions, actualCollisions)); double estKeys=load+probNewKeyCollisions; double probOldKey=estKeys*invKmerSpace; probNewKeyCollisions+=probNewKey*(1-probOldKey); // double estKmers=load+min(actualCollisions, expectedCollisions); // double probOldKmer=estKmers*invKmerSpace; // probNewKeyCollisions+=(prob*(1-prob2)); } //// probCollisions+=(load*invCells); // if(value==0){load++;} // else{ //// long load2=keysCounted-load; // double prob=Math.sqrt(load*invCells); // double estKmers=load+probNewKeyCollisions; // double prob2=estKmers*invKmerSpace; //// probCollisions+=(prob*(1-prob2)); //// probCollisions+=Math.sqrt(prob*(1-prob2)); // probNewKeyCollisions+=Math.sqrt(prob*(1-prob2)); //// probCollisions+=min(prob, 1-prob2); //// probCollisions+=(load*invCells); // } } } } if(r.mate!=null){ len=0; kmer=0; bases=r.mate.bases; quals=r.mate.quality; for(int i=0; i<bases.length; i++){ byte b=bases[i]; int x=AminoAcid.baseToNumber[b]; int x2=buffer[len%buffer.length]; buffer[len%buffer.length]=x; if(x<0){ len=0; kmer=0; }else{ kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x); len++; if(len>=k){ keysCounted++; if(len>k){kmer=kmer^rotMasks[x2];} long hashcode=kmer&0x7fffffffffffffffL; // hashcode=randy.nextLong()&~((-1L)<<(2*k)); long code1=hashcode%(cells-3); // long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); int value=count.increment2(code1, 1); double probCollision=load*invCells; // expectedCollisions+=probCollision; expectedCollisions+=probCollision*(1-(load+min(expectedCollisions, actualCollisions))*invKmerSpace); if(value==0){load++;} else{ actualCollisions++; double probNewKey=(load*invCells)*expectedCollisions/(min(expectedCollisions, actualCollisions)); double estKeys=load+probNewKeyCollisions; double probOldKey=estKeys*invKmerSpace; probNewKeyCollisions+=probNewKey*(1-probOldKey); // double estKmers=load+min(actualCollisions, expectedCollisions); // double probOldKmer=estKmers*invKmerSpace; // probNewKeyCollisions+=(prob*(1-prob2)); } //// probCollisions+=(load*invCells); // if(value==0){load++;} // else{ //// long load2=keysCounted-load; // double prob=Math.sqrt(load*invCells); // double estKmers=load+probNewKeyCollisions; // double prob2=estKmers*invKmerSpace; //// probCollisions+=(prob*(1-prob2)); //// probCollisions+=Math.sqrt(prob*(1-prob2)); // probNewKeyCollisions+=Math.sqrt(prob*(1-prob2)); //// probCollisions+=min(prob, 1-prob2); //// probCollisions+=(load*invCells); // } } } } } } //System.err.println("returning list"); cris.returnList(ln); //System.err.println("fetching list"); ln=cris.nextList(); reads=(ln!=null ? ln.list : null); } System.err.println("Finished reading"); cris.returnList(ln); System.err.println("Returned list"); ReadWrite.closeStream(cris); System.err.println("Closed stream"); System.err.println("Processed "+readsProcessed+" reads."); } return count; } public static final long[] makeRotMasks(int rotDist){ long[] masks=new long[4]; for(long i=0; i<4; i++){ masks[(int)i]=Long.rotateLeft(i, rotDist); } return masks; } public static long[] transformToFrequency(int[] count){ long[] freq=new long[2000]; int max=freq.length-1; for(int i=0; i<count.length; i++){ int x=count[i]; x=min(x, max); freq[x]++; } return freq; } public static long sum(int[] array){ long x=0; for(int y : array){x+=y;} return x; } public static long sum(long[] array){ long x=0; for(long y : array){x+=y;} return x; } public static final int min(int x, int y){return x<y ? x : y;} public static final int max(int x, int y){return x>y ? x : y;} public static final long min(long x, long y){return x<y ? x : y;} public static final long max(long x, long y){return x>y ? x : y;} public static final double min(double x, double y){return x<y ? x : y;} public static final double max(double x, double y){return x>y ? x : y;} public static boolean verbose=true; public static byte minQuality=-5; public static long readsProcessed=0; public static long maxReads=10000000L; public static final int ROTATE_DIST=2; /** Non-empty cells in hash table */ public static long load; /** Number of expected collisions */ public static double expectedCollisions; /** Number of actual collisions (possibly by same value) */ public static long actualCollisions; /** Number of probable collisions caused by new keys */ public static double probNewKeyCollisions; /** Inverse of hash table size */ public static double invCells; /** Inverse of number of potential kmers */ public static double invKmerSpace; /** Inverse of number of potential kmers */ public static long keysCounted; public static final Random randy=new Random(1); }