comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/KmerCount6.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package bloom;
2
3 import java.util.ArrayList;
4 import java.util.BitSet;
5 import java.util.Locale;
6
7 import dna.AminoAcid;
8 import fileIO.FileFormat;
9 import shared.Timer;
10 import stream.ConcurrentReadInputStream;
11 import stream.FastaReadInputStream;
12 import stream.Read;
13 import structures.ListNum;
14
15 /**
16 * @author Brian Bushnell
17 * @date Jul 5, 2012
18 *
19 */
20 public class KmerCount6 extends KmerCountAbstract {
21
22 public static void main(String[] args){
23
24 Timer t=new Timer();
25
26 String fname1=args[0];
27 String fname2=(args.length>1 ? args[1] : null);
28 int k=14;
29 int cbits=16;
30 int gap=0;
31
32 for(int i=2; i<args.length; i++){
33 final String arg=args[i];
34 final String[] split=arg.split("=");
35 String a=split[0].toLowerCase();
36 String b=split.length>1 ? split[1] : null;
37
38 if(a.equals("k") || a.equals("kmer")){
39 k=Integer.parseInt(b);
40 }else if(a.startsWith("cbits") || a.startsWith("cellbits")){
41 cbits=Integer.parseInt(b);
42 }else if(a.startsWith("gap")){
43 gap=Integer.parseInt(b);
44 }else{
45 throw new RuntimeException("Unknown parameter "+args[i]);
46 }
47 }
48
49 KCountArray count=null;
50
51 if(fileIO.FileFormat.hasFastaExtension(fname1)){
52 assert(!FastaReadInputStream.SPLIT_READS);
53 FastaReadInputStream.MIN_READ_LEN=k;
54 }
55
56 count=count(fname1, fname2, k, cbits, true, null);
57
58
59 t.stop();
60 System.out.println("Finished counting; time = "+t);
61
62 printStatistics(count);
63
64 }
65
66 public static void printStatistics(KCountArray count){
67 long[] freq=count.transformToFrequency();
68
69 // System.out.println(count+"\n");
70 // System.out.println(Arrays.toString(freq)+"\n");
71
72 long sum=sum(freq);
73 System.out.println("Kmer fraction:");
74 int lim1=8, lim2=16;
75 for(int i=0; i<lim1; i++){
76 String prefix=i+"";
77 while(prefix.length()<8){prefix=prefix+" ";}
78 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
79 }
80 while(lim1<=freq.length){
81 int x=0;
82 for(int i=lim1; i<lim2; i++){
83 x+=freq[i];
84 }
85 String prefix=lim1+"-"+(lim2-1);
86 if(lim2>=freq.length){prefix=lim1+"+";}
87 while(prefix.length()<8){prefix=prefix+" ";}
88 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x);
89 lim1*=2;
90 lim2=min(lim2*2, freq.length);
91 }
92
93 long sum2=sum-freq[0];
94 long x=freq[1];
95 System.out.println();
96 System.out.println("Keys Counted: \t \t"+keysCounted);
97 System.out.println("Unique: \t \t"+sum2);
98 System.out.println("Avg Sites/Key: \t \t"+String.format(Locale.ROOT, "%.3f ",(keysCounted*1d/sum2)));
99 System.out.println();
100 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
101 x=sum2-x;
102 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
103 }
104
105 public static KCountArray count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray count){
106 assert(k<32 && k>=1 && (count!=null || k<20));
107 final int kbits=2*k;
108 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
109
110 if(count==null){
111 final long cells=1L<<kbits;
112 if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
113 count=KCountArray.makeNew(cells, cbits);
114 }
115
116 final ConcurrentReadInputStream cris;
117 {
118 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
119 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
120 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
121 cris.start(); //4567
122 }
123
124 assert(cris!=null) : reads1;
125 System.err.println("Started cris");
126 boolean paired=cris.paired();
127 if(verbose){System.err.println("Paired: "+paired);}
128
129 count(cris, k, rcomp, count);
130
131 cris.close();
132 if(verbose){System.err.println("Closed stream");}
133 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
134
135
136 return count;
137 }
138
139
140 public static void count(ConcurrentReadInputStream cris, int k, boolean rcomp, KCountArray count){
141 assert(k<32 && k>=1 && (count!=null || k<20));
142
143 assert(count!=null);
144
145 ListNum<Read> ln=cris.nextList();
146 ArrayList<Read> reads=(ln!=null ? ln.list : null);
147
148
149 if(true /*count.gap==0*/){
150 final int kbits=2*k;
151 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
152
153
154 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
155 //System.err.println("reads.size()="+reads.size());
156 for(Read r : reads){
157 readsProcessed++;
158
159 addRead(r, count, k, mask, rcomp);
160 if(r.mate!=null){
161 addRead(r.mate, count, k, mask, rcomp);
162 }
163
164 }
165 //System.err.println("returning list");
166 cris.returnList(ln);
167 //System.err.println("fetching list");
168 ln=cris.nextList();
169 reads=(ln!=null ? ln.list : null);
170 }
171 }else{
172 final int k1=(k+1)/2;
173 final int k2=k/2;
174 final int kbits1=2*k1;
175 final int kbits2=2*k2;
176 // final int gap=count.gap;
177 int gap=0; assert(false);
178 final long mask1=~((-1L)<<(kbits1));
179 final long mask2=~((-1L)<<(kbits2));
180 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
181 //System.err.println("reads.size()="+reads.size());
182 for(Read r : reads){
183 readsProcessed++;
184
185 addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp);
186 if(r.mate!=null){
187 addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp);
188 }
189
190 }
191 //System.err.println("returning list");
192 cris.returnList(ln);
193 //System.err.println("fetching list");
194 ln=cris.nextList();
195 reads=(ln!=null ? ln.list : null);
196 }
197 }
198
199 if(verbose){System.err.println("Finished reading");}
200 cris.returnList(ln);
201 if(verbose){System.err.println("Returned list");}
202 }
203
204
205
206
207 public static KCountArray count(final String reads1, final String reads2, final int k, final int cbits, final boolean rcomp,
208 KCountArray count, final KCountArray trusted, final long maxReads, final int thresh, final int detectStepsize, final boolean conservative){
209
210 assert(k<32 && k>=1 && (count!=null || k<20));
211 final int kbits=2*k;
212 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
213
214 // System.out.println("k="+k+", kbits="+kbits+", mask="+Long.toHexString(mask)+", thresh="+thresh);
215 // System.out.println("\ntrusted=\n"+trusted);
216 // System.out.println("\ncount=\n"+count);
217
218 if(count==null){
219 final long cells=1L<<kbits;
220 if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
221 count=KCountArray.makeNew(cells, cbits);
222 }
223
224 final ConcurrentReadInputStream cris;
225 {
226 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
227 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
228 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
229 cris.start(); //4567
230 }
231
232 assert(cris!=null) : reads1;
233 System.err.println("Started cris");
234 boolean paired=cris.paired();
235 if(verbose){System.err.println("Paired: "+paired);}
236
237 count(cris, k, rcomp, count, trusted, thresh, detectStepsize, conservative);
238
239 cris.close();
240 if(verbose){System.err.println("Closed stream");}
241
242 // System.out.println("*** after ***");
243 // System.out.println("\ntrusted=\n"+trusted);
244 // System.out.println("\ncount=\n"+count);
245
246 return count;
247 }
248
249
250
251
252 public static void count(final ConcurrentReadInputStream cris, final int k, final boolean rcomp,
253 final KCountArray count, final KCountArray trusted, final int thresh, final int detectStepsize, final boolean conservative){
254
255 assert(k<32 && k>=1 && (count!=null || k<20));
256 final int kbits=2*k;
257 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
258
259 ListNum<Read> ln=cris.nextList();
260 ArrayList<Read> reads=(ln!=null ? ln.list : null);
261
262 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
263 //System.err.println("reads.size()="+reads.size());
264 for(Read r : reads){
265
266 Read r2=r.mate;
267 {
268 if(trusted!=null){
269 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) :
270 ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize));
271 // System.out.println("\n"+toString(bs, r.length()));
272 // System.out.println(new String(r.bases));
273 for(int i=bs.nextClearBit(0); i<r.length(); i=bs.nextClearBit(i+1)){
274 r.bases[i]='N';
275 r.quality[i]=0;
276 }
277 // System.out.println(new String(r.bases));
278 // System.out.println("used = "+String.format(Locale.ROOT, "%.3f%%",count.usedFraction()*100));
279 // System.out.println("used = "+((KCountArray4)count).cellsUsed());
280 // if(bs.length()<r.length()){r=null;}
281 }
282 // if(r!=null){addRead(r, count, k, mask, rcomp);}
283 addRead(r, count, k, mask, rcomp);
284 }
285 if(r2!=null){
286 if(trusted!=null){
287 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) :
288 ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize));
289 for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){
290 r2.bases[i]='N';
291 r2.quality[i]=0;
292 }
293 }
294 addRead(r2, count, k, mask, rcomp);
295 }
296
297 }
298 //System.err.println("returning list");
299 cris.returnList(ln);
300 //System.err.println("fetching list");
301 ln=cris.nextList();
302 reads=(ln!=null ? ln.list : null);
303 }
304
305 if(verbose){System.err.println("Finished reading");}
306 cris.returnList(ln);
307 if(verbose){System.err.println("Returned list");}
308 }
309
310
311
312 public static void addRead(final Read r, final KCountArray count, final int k, final long mask, boolean rcomp){
313 int len=0;
314 long kmer=0;
315 byte[] bases=r.bases;
316 byte[] quals=r.quality;
317 for(int i=0; i<bases.length; i++){
318 byte b=bases[i];
319 int x=AminoAcid.baseToNumber[b];
320 if(x<0 || (quals!=null && quals[i]<minQuality)){
321 len=0;
322 kmer=0;
323 }else{
324 kmer=((kmer<<2)|x)&mask;
325 len++;
326 if(len>=k){
327 keysCounted++;
328 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
329 count.increment(kmer);
330 // System.out.println(" -> "+count.read(kmer));
331 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
332 // array[(int)kmer]++;
333 // System.out.println(" -> "+array[(int)kmer]+"\n");
334 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
335 }
336 }
337 }
338 if(rcomp){
339 r.reverseComplement();
340 addRead(r, count, k, mask, false);
341 }
342 }
343
344 public static void addReadSplit(final Read r, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
345 int len=0;
346 int shift=k2*2;
347 long kmer1=0;
348 long kmer2=0;
349 byte[] bases=r.bases;
350 byte[] quals=r.quality;
351
352 assert(kmer1>=kmer2);
353
354 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
355
356 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
357 int x1=AminoAcid.baseToNumber[bases[i]];
358 int x2=AminoAcid.baseToNumber[bases[j]];
359 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
360 len=0;
361 kmer1=0;
362 kmer2=0;
363 }else{
364 kmer1=((kmer1<<2)|x1)&mask1;
365 kmer2=((kmer2<<2)|x2)&mask2;
366 len++;
367 if(len>=k1){
368 keysCounted++;
369 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
370
371 long key=(kmer1<<shift)|kmer2;
372 // System.err.println(Long.toHexString(key));
373 count.increment(key);
374 // System.out.println(" -> "+count.read(kmer));
375 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
376 // array[(int)kmer]++;
377 // System.out.println(" -> "+array[(int)kmer]+"\n");
378 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
379 }
380 }
381 }
382 if(rcomp){
383 r.reverseComplement();
384 addReadSplit(r, count, k1, k2, mask1, mask2, gap, false);
385 }
386 }
387
388 public static void addReadSplit(final byte[] bases, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
389 int len=0;
390 int shift=k2*2;
391 long kmer1=0;
392 long kmer2=0;
393 byte[] quals=null;
394
395 assert(kmer1>=kmer2);
396
397 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
398
399 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
400 int x1=AminoAcid.baseToNumber[bases[i]];
401 int x2=AminoAcid.baseToNumber[bases[j]];
402 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
403 len=0;
404 kmer1=0;
405 kmer2=0;
406 }else{
407 kmer1=((kmer1<<2)|x1)&mask1;
408 kmer2=((kmer2<<2)|x2)&mask2;
409 len++;
410 if(len>=k1){
411 keysCounted++;
412 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
413
414 long key=(kmer1<<shift)|kmer2;
415 System.out.println(Long.toHexString(kmer1));
416 System.out.println(Long.toHexString(kmer2));
417 System.out.println(Long.toHexString(key));
418 count.increment(key);
419 // System.out.println(" -> "+count.read(kmer));
420 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
421 // array[(int)kmer]++;
422 // System.out.println(" -> "+array[(int)kmer]+"\n");
423 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
424 }
425 }
426 }
427 if(rcomp){
428 AminoAcid.reverseComplementBasesInPlace(bases);
429 addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false);
430 }
431 }
432
433 }