comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/KmerCount5.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package bloom;
2
3 import java.util.ArrayList;
4 import java.util.BitSet;
5 import java.util.Locale;
6
7 import dna.AminoAcid;
8 import fileIO.FileFormat;
9 import shared.Timer;
10 import stream.ConcurrentReadInputStream;
11 import stream.FastaReadInputStream;
12 import stream.Read;
13 import structures.ListNum;
14
15 /**
16 * @author Brian Bushnell
17 * @date Jul 5, 2012
18 *
19 */
20 public class KmerCount5 extends KmerCountAbstract {
21
22 public static void main(String[] args){
23
24 Timer t=new Timer();
25
26 String fname1=args[0];
27 String fname2=(args.length>1 ? args[1] : null);
28 int k=14;
29 int cbits=16;
30 int gap=0;
31
32 for(int i=2; i<args.length; i++){
33 final String arg=args[i];
34 final String[] split=arg.split("=");
35 String a=split[0].toLowerCase();
36 String b=split.length>1 ? split[1] : null;
37
38 if(a.equals("k") || a.equals("kmer")){
39 k=Integer.parseInt(b);
40 }else if(a.startsWith("cbits") || a.startsWith("cellbits")){
41 cbits=Integer.parseInt(b);
42 }else if(a.startsWith("gap")){
43 gap=Integer.parseInt(b);
44 }else{
45 throw new RuntimeException("Unknown parameter "+args[i]);
46 }
47 }
48
49 KCountArray count=null;
50
51 if(fileIO.FileFormat.hasFastaExtension(fname1)){
52 assert(!FastaReadInputStream.SPLIT_READS);
53 FastaReadInputStream.MIN_READ_LEN=k;
54 }
55
56 if(gap==0){
57 count=count(fname1, fname2, k, cbits, true);
58 }else{
59 count=countFastqSplit(fname1, fname2, (k+1)/2, k/2, gap, cbits, true, null);
60 }
61
62
63 t.stop();
64 System.out.println("Finished counting; time = "+t);
65
66 printStatistics(count);
67
68 }
69
70 public static void printStatistics(KCountArray count){
71 long[] freq=count.transformToFrequency();
72
73 // System.out.println(count+"\n");
74 // System.out.println(Arrays.toString(freq)+"\n");
75
76 long sum=sum(freq);
77 System.out.println("Kmer fraction:");
78 int lim1=8, lim2=16;
79 for(int i=0; i<lim1; i++){
80 String prefix=i+"";
81 while(prefix.length()<8){prefix=prefix+" ";}
82 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
83 }
84 while(lim1<=freq.length){
85 int x=0;
86 for(int i=lim1; i<lim2; i++){
87 x+=freq[i];
88 }
89 String prefix=lim1+"-"+(lim2-1);
90 if(lim2>=freq.length){prefix=lim1+"+";}
91 while(prefix.length()<8){prefix=prefix+" ";}
92 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x);
93 lim1*=2;
94 lim2=min(lim2*2, freq.length);
95 }
96
97 long sum2=sum-freq[0];
98 long x=freq[1];
99 System.out.println();
100 System.out.println("Keys Counted: \t \t"+keysCounted);
101 System.out.println("Unique: \t \t"+sum2);
102 System.out.println("Avg Sites/Key: \t \t"+String.format(Locale.ROOT, "%.3f ",(keysCounted*1d/sum2)));
103 System.out.println();
104 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
105 x=sum2-x;
106 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
107 }
108
109 public static KCountArray count(String reads1, String reads2, int k, int cbits, boolean rcomp){
110 return count(reads1, reads2, k, cbits, rcomp, null);
111 }
112
113 public static KCountArray count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray count){
114 assert(k<32 && k>=1 && (count!=null || k<20));
115 final int kbits=2*k;
116 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
117
118 if(count==null){
119 final long cells=1L<<kbits;
120 if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
121 count=KCountArray.makeNew(cells, cbits);
122 }
123
124 final ConcurrentReadInputStream cris;
125 {
126 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
127 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
128 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
129 cris.start(); //4567
130 }
131
132 assert(cris!=null) : reads1;
133 System.err.println("Started cris");
134 boolean paired=cris.paired();
135 if(verbose){System.err.println("Paired: "+paired);}
136
137 ListNum<Read> ln=cris.nextList();
138 ArrayList<Read> reads=(ln!=null ? ln.list : null);
139
140 if(reads!=null && !reads.isEmpty()){
141 Read r=reads.get(0);
142 assert(paired==(r.mate!=null));
143 }
144
145 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
146 //System.err.println("reads.size()="+reads.size());
147 for(Read r : reads){
148 readsProcessed++;
149
150 addRead(r, count, k, mask, rcomp);
151 if(r.mate!=null){
152 addRead(r.mate, count, k, mask, rcomp);
153 }
154
155 }
156 //System.err.println("returning list");
157 cris.returnList(ln);
158 //System.err.println("fetching list");
159 ln=cris.nextList();
160 reads=(ln!=null ? ln.list : null);
161 }
162 if(verbose){System.err.println("Finished reading");}
163 cris.returnList(ln);
164 if(verbose){System.err.println("Returned list");}
165 cris.close();
166 if(verbose){System.err.println("Closed stream");}
167 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
168
169
170 return count;
171 }
172
173
174
175
176 public static KCountArray count(final String reads1, final String reads2, final int k, final int cbits, final boolean rcomp,
177 KCountArray counts, final KCountArray trusted, final long maxReads, final int thresh, final int detectStepsize, final boolean conservative){
178
179 assert(k<32 && k>=1 && (counts!=null || k<20));
180 final int kbits=2*k;
181 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
182
183 // System.out.println("k="+k+", kbits="+kbits+", mask="+Long.toHexString(mask)+", thresh="+thresh);
184 // System.out.println("\ntrusted=\n"+trusted);
185 // System.out.println("\ncount=\n"+count);
186
187 if(counts==null){
188 final long cells=1L<<kbits;
189 if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
190 counts=KCountArray.makeNew(cells, cbits);
191 }
192
193 final ConcurrentReadInputStream cris;
194 {
195 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
196 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
197 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
198 cris.start(); //4567
199 }
200
201 assert(cris!=null) : reads1;
202 System.err.println("Started cris");
203 boolean paired=cris.paired();
204 if(verbose){System.err.println("Paired: "+paired);}
205
206 ListNum<Read> ln=cris.nextList();
207 ArrayList<Read> reads=(ln!=null ? ln.list : null);
208
209 if(reads!=null && !reads.isEmpty()){
210 Read r=reads.get(0);
211 assert(paired==(r.mate!=null));
212 }
213
214
215 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
216 //System.err.println("reads.size()="+reads.size());
217 for(Read r : reads){
218
219 Read r2=r.mate;
220 {
221 if(trusted!=null){
222 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) :
223 ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize));
224 // System.out.println("\n"+toString(bs, r.length()));
225 // System.out.println(new String(r.bases));
226 for(int i=bs.nextClearBit(0); i<r.length(); i=bs.nextClearBit(i+1)){
227 r.bases[i]='N';
228 r.quality[i]=0;
229 }
230 // System.out.println(new String(r.bases));
231 // System.out.println("used = "+String.format(Locale.ROOT, "%.3f%%",count.usedFraction()*100));
232 // System.out.println("used = "+((KCountArray4)count).cellsUsed());
233 // if(bs.length()<r.length()){r=null;}
234 }
235 // if(r!=null){KmerCount5.addRead(r, count, k, mask, rcomp);}
236 KmerCount5.addRead(r, counts, k, mask, rcomp);
237 }
238 if(r2!=null){
239 if(trusted!=null){
240 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) :
241 ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize));
242 for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){
243 r2.bases[i]='N';
244 r2.quality[i]=0;
245 }
246 }
247 KmerCount5.addRead(r2, counts, k, mask, rcomp);
248 }
249
250 }
251 //System.err.println("returning list");
252 cris.returnList(ln);
253 //System.err.println("fetching list");
254 ln=cris.nextList();
255 reads=(ln!=null ? ln.list : null);
256 }
257
258 if(verbose){System.err.println("Finished reading");}
259 cris.returnList(ln);
260 if(verbose){System.err.println("Returned list");}
261 cris.close();
262 if(verbose){System.err.println("Closed stream");}
263
264 // System.out.println("*** after ***");
265 // System.out.println("\ntrusted=\n"+trusted);
266 // System.out.println("\ncount=\n"+count);
267
268 return counts;
269 }
270
271
272 public static KCountArray countFastqSplit(String reads1, String reads2, int k1, int k2, int gap, int cbits, boolean rcomp, KCountArray counts){
273 int k=k1+k2;
274 assert(k<32 && k>=1 && (counts!=null || k<20));
275 assert(gap>=0);
276 final int kbits1=2*k1;
277 final int kbits2=2*k2;
278 final long mask1=~((-1L)<<(kbits1));
279 final long mask2=~((-1L)<<(kbits2));
280
281 if(counts==null){
282 final long cells=1L<<(kbits1+kbits2);
283 if(verbose){System.err.println("k1="+k1+", k2="+k2+", kbits1="+kbits1+", kbits2="+kbits2+", cells="+cells+
284 ", mask1="+Long.toHexString(mask1)+", mask2="+Long.toHexString(mask2));}
285 counts=KCountArray.makeNew(cells, cbits);
286 }
287
288 final ConcurrentReadInputStream cris;
289 {
290 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
291 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
292 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
293 cris.start(); //4567
294 }
295
296 assert(cris!=null) : reads1;
297 System.err.println("Started cris");
298 boolean paired=cris.paired();
299 if(verbose){System.err.println("Paired: "+paired);}
300
301 ListNum<Read> ln=cris.nextList();
302 ArrayList<Read> reads=(ln!=null ? ln.list : null);
303
304 if(reads!=null && !reads.isEmpty()){
305 Read r=reads.get(0);
306 assert(paired==(r.mate!=null));
307 }
308
309 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
310 //System.err.println("reads.size()="+reads.size());
311 for(Read r : reads){
312 readsProcessed++;
313
314 addReadSplit(r, counts, k1, k2, mask1, mask2, gap, rcomp);
315 if(r.mate!=null){
316 addReadSplit(r.mate, counts, k1, k2, mask1, mask2, gap, rcomp);
317 }
318
319 }
320 //System.err.println("returning list");
321 cris.returnList(ln);
322 //System.err.println("fetching list");
323 ln=cris.nextList();
324 reads=(ln!=null ? ln.list : null);
325 }
326 if(verbose){System.err.println("Finished reading");}
327 cris.returnList(ln);
328 if(verbose){System.err.println("Returned list");}
329 cris.close();
330 if(verbose){System.err.println("Closed stream");}
331 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
332
333
334 return counts;
335 }
336
337 public static void addRead(final Read r, final KCountArray count, final int k, final long mask, boolean rcomp){
338 int len=0;
339 long kmer=0;
340 byte[] bases=r.bases;
341 byte[] quals=r.quality;
342 for(int i=0; i<bases.length; i++){
343 byte b=bases[i];
344 int x=AminoAcid.baseToNumber[b];
345 if(x<0 || (quals!=null && quals[i]<minQuality)){
346 len=0;
347 kmer=0;
348 }else{
349 kmer=((kmer<<2)|x)&mask;
350 len++;
351 if(len>=k){
352 keysCounted++;
353 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
354 count.increment(kmer);
355 // System.out.println(" -> "+count.read(kmer));
356 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
357 // array[(int)kmer]++;
358 // System.out.println(" -> "+array[(int)kmer]+"\n");
359 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
360 }
361 }
362 }
363 if(rcomp){
364 r.reverseComplement();
365 addRead(r, count, k, mask, false);
366 }
367 }
368
369 public static void addReadSplit(final Read r, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
370 int len=0;
371 int shift=k2*2;
372 long kmer1=0;
373 long kmer2=0;
374 byte[] bases=r.bases;
375 byte[] quals=r.quality;
376
377 assert(kmer1>=kmer2);
378
379 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
380
381 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
382 int x1=AminoAcid.baseToNumber[bases[i]];
383 int x2=AminoAcid.baseToNumber[bases[j]];
384 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
385 len=0;
386 kmer1=0;
387 kmer2=0;
388 }else{
389 kmer1=((kmer1<<2)|x1)&mask1;
390 kmer2=((kmer2<<2)|x2)&mask2;
391 len++;
392 if(len>=k1){
393 keysCounted++;
394 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
395
396 long key=(kmer1<<shift)|kmer2;
397 // System.err.println(Long.toHexString(key));
398 count.increment(key);
399 // System.out.println(" -> "+count.read(kmer));
400 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
401 // array[(int)kmer]++;
402 // System.out.println(" -> "+array[(int)kmer]+"\n");
403 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
404 }
405 }
406 }
407 if(rcomp){
408 r.reverseComplement();
409 addReadSplit(r, count, k1, k2, mask1, mask2, gap, false);
410 }
411 }
412
413 public static void addReadSplit(final byte[] bases, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
414 int len=0;
415 int shift=k2*2;
416 long kmer1=0;
417 long kmer2=0;
418 byte[] quals=null;
419
420 assert(kmer1>=kmer2);
421
422 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
423
424 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
425 int x1=AminoAcid.baseToNumber[bases[i]];
426 int x2=AminoAcid.baseToNumber[bases[j]];
427 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
428 len=0;
429 kmer1=0;
430 kmer2=0;
431 }else{
432 kmer1=((kmer1<<2)|x1)&mask1;
433 kmer2=((kmer2<<2)|x2)&mask2;
434 len++;
435 if(len>=k1){
436 keysCounted++;
437 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
438
439 long key=(kmer1<<shift)|kmer2;
440 System.out.println(Long.toHexString(kmer1));
441 System.out.println(Long.toHexString(kmer2));
442 System.out.println(Long.toHexString(key));
443 count.increment(key);
444 // System.out.println(" -> "+count.read(kmer));
445 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
446 // array[(int)kmer]++;
447 // System.out.println(" -> "+array[(int)kmer]+"\n");
448 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
449 }
450 }
451 }
452 if(rcomp){
453 AminoAcid.reverseComplementBasesInPlace(bases);
454 addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false);
455 }
456 }
457
458 }