Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/KmerCount4.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/KmerCount4.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,358 @@ +package bloom; + +import java.util.ArrayList; +import java.util.Locale; + +import dna.AminoAcid; +import fileIO.FileFormat; +import shared.Timer; +import stream.ConcurrentReadInputStream; +import stream.FastaReadInputStream; +import stream.Read; +import structures.ListNum; + +/** + * @author Brian Bushnell + * @date Jul 5, 2012 + * + */ +public class KmerCount4 extends KmerCountAbstract { + + public static void main(String[] args){ + + Timer t=new Timer(); + + String fname1=args[0]; + String fname2=(args.length>3 || args[1].contains(".") ? args[1] : null); + int k=14; + int cbits=16; + int gap=0; + + for(int i=(fname2==null ? 1 : 2); i<args.length; i++){ + final String arg=args[i]; + final String[] split=arg.split("="); + String a=split[0].toLowerCase(); + String b=split.length>1 ? split[1] : null; + + if(a.equals("k") || a.equals("kmer")){ + k=Integer.parseInt(b); + }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ + cbits=Integer.parseInt(b); + }else if(a.startsWith("gap")){ + gap=Integer.parseInt(b); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + KCountArray2 count=null; + + if(fileIO.FileFormat.hasFastaExtension(fname1)){ + assert(!FastaReadInputStream.SPLIT_READS); + FastaReadInputStream.MIN_READ_LEN=k; + } + + if(gap==0){ + count=count(fname1, fname2, k, cbits, true); + }else{ + count=countFastqSplit(fname1, fname2, (k+1)/2, k/2, gap, cbits, true, null); + } + + + t.stop(); + System.out.println("Finished counting; time = "+t); + + printStatistics(count); + + } + + public static void printStatistics(KCountArray2 count){ + long[] freq=count.transformToFrequency(); + +// System.out.println(count+"\n"); +// System.out.println(Arrays.toString(freq)+"\n"); + + long sum=sum(freq); + System.out.println("Kmer fraction:"); + int lim1=8, lim2=16; + for(int i=0; i<lim1; i++){ + String prefix=i+""; + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]); + } + while(lim1<=freq.length){ + int x=0; + for(int i=lim1; i<lim2; i++){ + x+=freq[i]; + } + String prefix=lim1+"-"+(lim2-1); + if(lim2>=freq.length){prefix=lim1+"+";} + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x); + lim1*=2; + lim2=min(lim2*2, freq.length); + } + + long sum2=sum-freq[0]; + long x=freq[1]; + System.out.println(); + System.out.println("Keys Counted: \t \t"+keysCounted); + System.out.println("Unique: \t \t"+sum2); + System.out.println("Avg Sites/Key: \t \t"+String.format(Locale.ROOT, "%.3f ",(keysCounted*1d/sum2))); + System.out.println(); + System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + x=sum2-x; + System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + } + + public static KCountArray2 count(String reads1, String reads2, int k, int cbits, boolean rcomp){ + return count(reads1, reads2, k, cbits, rcomp, null); + } + + public static KCountArray2 count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray2 count){ + assert(k>=1 && k<20); + final int kbits=2*k; + final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); + + if(count==null){ + final long cells=1L<<kbits; + if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} + count=new KCountArray2(cells, cbits); + } + + final ConcurrentReadInputStream cris; + { + FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); + cris.start(); //4567 + } + + assert(cris!=null) : reads1; + System.err.println("Started cris"); + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + ListNum<Read> ln=cris.nextList(); + ArrayList<Read> reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + + addRead(r, count, k, mask, rcomp); + if(r.mate!=null){ + addRead(r.mate, count, k, mask, rcomp); + } + + } + //System.err.println("returning list"); + cris.returnList(ln); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln); + if(verbose){System.err.println("Returned list");} + cris.close(); + if(verbose){System.err.println("Closed stream");} + if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} + + + return count; + } + + public static KCountArray2 countFastqSplit(String reads1, String reads2, int k1, int k2, int gap, int cbits, boolean rcomp, KCountArray2 count){ + assert(k1+k2>=1 && k1+k2<20); + assert(gap>=0); + final int kbits1=2*k1; + final int kbits2=2*k2; + final long mask1=~((-1L)<<(kbits1)); + final long mask2=~((-1L)<<(kbits2)); + + if(count==null){ + final long cells=1L<<(kbits1+kbits2); + if(verbose){System.err.println("k1="+k1+", k2="+k2+", kbits1="+kbits1+", kbits2="+kbits2+", cells="+cells+ + ", mask1="+Long.toHexString(mask1)+", mask2="+Long.toHexString(mask2));} + count=new KCountArray2(cells, cbits, gap); + } + assert(count.gap==gap); + + final ConcurrentReadInputStream cris; + { + FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); + cris.start(); //4567 + } + + assert(cris!=null) : reads1; + System.err.println("Started cris"); + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + ListNum<Read> ln=cris.nextList(); + ArrayList<Read> reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + + addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp); + if(r.mate!=null){ + addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp); + } + + } + //System.err.println("returning list"); + cris.returnList(ln); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln); + if(verbose){System.err.println("Returned list");} + cris.close(); + if(verbose){System.err.println("Closed stream");} + if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} + + + return count; + } + + public static void addRead(final Read r, final KCountArray2 count, final int k, final long mask, boolean rcomp){ + int len=0; + long kmer=0; + byte[] bases=r.bases; + byte[] quals=r.quality; + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + int x=AminoAcid.baseToNumber[b]; + if(x<0 || (quals!=null && quals[i]<minQuality)){ + len=0; + kmer=0; + }else{ + kmer=((kmer<<2)|x)&mask; + len++; + if(len>=k){ + keysCounted++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + count.increment(kmer, 1); +// System.out.println(" -> "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + r.reverseComplement(); + addRead(r, count, k, mask, false); + } + } + + public static void addReadSplit(final Read r, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ + int len=0; + int shift=k2*2; + long kmer1=0; + long kmer2=0; + byte[] bases=r.bases; + byte[] quals=r.quality; + + assert(kmer1>=kmer2); + +// assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; + + for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){ + int x1=AminoAcid.baseToNumber[bases[i]]; + int x2=AminoAcid.baseToNumber[bases[j]]; + if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){ + len=0; + kmer1=0; + kmer2=0; + }else{ + kmer1=((kmer1<<2)|x1)&mask1; + kmer2=((kmer2<<2)|x2)&mask2; + len++; + if(len>=k1){ + keysCounted++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + + long key=(kmer1<<shift)|kmer2; +// System.err.println(Long.toHexString(key)); + count.increment(key, 1); +// System.out.println(" -> "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + r.reverseComplement(); + addReadSplit(r, count, k1, k2, mask1, mask2, gap, false); + } + } + + public static void addReadSplit(final byte[] bases, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ + int len=0; + int shift=k2*2; + long kmer1=0; + long kmer2=0; + byte[] quals=null; + + assert(kmer1>=kmer2); + +// assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; + + for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){ + int x1=AminoAcid.baseToNumber[bases[i]]; + int x2=AminoAcid.baseToNumber[bases[j]]; + if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){ + len=0; + kmer1=0; + kmer2=0; + }else{ + kmer1=((kmer1<<2)|x1)&mask1; + kmer2=((kmer2<<2)|x2)&mask2; + len++; + if(len>=k1){ + keysCounted++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + + long key=(kmer1<<shift)|kmer2; + System.out.println(Long.toHexString(kmer1)); + System.out.println(Long.toHexString(kmer2)); + System.out.println(Long.toHexString(key)); + count.increment(key, 1); +// System.out.println(" -> "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + AminoAcid.reverseComplementBasesInPlace(bases); + addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false); + } + } + +}