Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/KmerCount5.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package bloom; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.BitSet; | |
5 import java.util.Locale; | |
6 | |
7 import dna.AminoAcid; | |
8 import fileIO.FileFormat; | |
9 import shared.Timer; | |
10 import stream.ConcurrentReadInputStream; | |
11 import stream.FastaReadInputStream; | |
12 import stream.Read; | |
13 import structures.ListNum; | |
14 | |
15 /** | |
16 * @author Brian Bushnell | |
17 * @date Jul 5, 2012 | |
18 * | |
19 */ | |
20 public class KmerCount5 extends KmerCountAbstract { | |
21 | |
22 public static void main(String[] args){ | |
23 | |
24 Timer t=new Timer(); | |
25 | |
26 String fname1=args[0]; | |
27 String fname2=(args.length>1 ? args[1] : null); | |
28 int k=14; | |
29 int cbits=16; | |
30 int gap=0; | |
31 | |
32 for(int i=2; i<args.length; i++){ | |
33 final String arg=args[i]; | |
34 final String[] split=arg.split("="); | |
35 String a=split[0].toLowerCase(); | |
36 String b=split.length>1 ? split[1] : null; | |
37 | |
38 if(a.equals("k") || a.equals("kmer")){ | |
39 k=Integer.parseInt(b); | |
40 }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ | |
41 cbits=Integer.parseInt(b); | |
42 }else if(a.startsWith("gap")){ | |
43 gap=Integer.parseInt(b); | |
44 }else{ | |
45 throw new RuntimeException("Unknown parameter "+args[i]); | |
46 } | |
47 } | |
48 | |
49 KCountArray count=null; | |
50 | |
51 if(fileIO.FileFormat.hasFastaExtension(fname1)){ | |
52 assert(!FastaReadInputStream.SPLIT_READS); | |
53 FastaReadInputStream.MIN_READ_LEN=k; | |
54 } | |
55 | |
56 if(gap==0){ | |
57 count=count(fname1, fname2, k, cbits, true); | |
58 }else{ | |
59 count=countFastqSplit(fname1, fname2, (k+1)/2, k/2, gap, cbits, true, null); | |
60 } | |
61 | |
62 | |
63 t.stop(); | |
64 System.out.println("Finished counting; time = "+t); | |
65 | |
66 printStatistics(count); | |
67 | |
68 } | |
69 | |
70 public static void printStatistics(KCountArray count){ | |
71 long[] freq=count.transformToFrequency(); | |
72 | |
73 // System.out.println(count+"\n"); | |
74 // System.out.println(Arrays.toString(freq)+"\n"); | |
75 | |
76 long sum=sum(freq); | |
77 System.out.println("Kmer fraction:"); | |
78 int lim1=8, lim2=16; | |
79 for(int i=0; i<lim1; i++){ | |
80 String prefix=i+""; | |
81 while(prefix.length()<8){prefix=prefix+" ";} | |
82 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]); | |
83 } | |
84 while(lim1<=freq.length){ | |
85 int x=0; | |
86 for(int i=lim1; i<lim2; i++){ | |
87 x+=freq[i]; | |
88 } | |
89 String prefix=lim1+"-"+(lim2-1); | |
90 if(lim2>=freq.length){prefix=lim1+"+";} | |
91 while(prefix.length()<8){prefix=prefix+" ";} | |
92 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x); | |
93 lim1*=2; | |
94 lim2=min(lim2*2, freq.length); | |
95 } | |
96 | |
97 long sum2=sum-freq[0]; | |
98 long x=freq[1]; | |
99 System.out.println(); | |
100 System.out.println("Keys Counted: \t \t"+keysCounted); | |
101 System.out.println("Unique: \t \t"+sum2); | |
102 System.out.println("Avg Sites/Key: \t \t"+String.format(Locale.ROOT, "%.3f ",(keysCounted*1d/sum2))); | |
103 System.out.println(); | |
104 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); | |
105 x=sum2-x; | |
106 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); | |
107 } | |
108 | |
109 public static KCountArray count(String reads1, String reads2, int k, int cbits, boolean rcomp){ | |
110 return count(reads1, reads2, k, cbits, rcomp, null); | |
111 } | |
112 | |
113 public static KCountArray count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray count){ | |
114 assert(k<32 && k>=1 && (count!=null || k<20)); | |
115 final int kbits=2*k; | |
116 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); | |
117 | |
118 if(count==null){ | |
119 final long cells=1L<<kbits; | |
120 if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} | |
121 count=KCountArray.makeNew(cells, cbits); | |
122 } | |
123 | |
124 final ConcurrentReadInputStream cris; | |
125 { | |
126 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); | |
127 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); | |
128 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); | |
129 cris.start(); //4567 | |
130 } | |
131 | |
132 assert(cris!=null) : reads1; | |
133 System.err.println("Started cris"); | |
134 boolean paired=cris.paired(); | |
135 if(verbose){System.err.println("Paired: "+paired);} | |
136 | |
137 ListNum<Read> ln=cris.nextList(); | |
138 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
139 | |
140 if(reads!=null && !reads.isEmpty()){ | |
141 Read r=reads.get(0); | |
142 assert(paired==(r.mate!=null)); | |
143 } | |
144 | |
145 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
146 //System.err.println("reads.size()="+reads.size()); | |
147 for(Read r : reads){ | |
148 readsProcessed++; | |
149 | |
150 addRead(r, count, k, mask, rcomp); | |
151 if(r.mate!=null){ | |
152 addRead(r.mate, count, k, mask, rcomp); | |
153 } | |
154 | |
155 } | |
156 //System.err.println("returning list"); | |
157 cris.returnList(ln); | |
158 //System.err.println("fetching list"); | |
159 ln=cris.nextList(); | |
160 reads=(ln!=null ? ln.list : null); | |
161 } | |
162 if(verbose){System.err.println("Finished reading");} | |
163 cris.returnList(ln); | |
164 if(verbose){System.err.println("Returned list");} | |
165 cris.close(); | |
166 if(verbose){System.err.println("Closed stream");} | |
167 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} | |
168 | |
169 | |
170 return count; | |
171 } | |
172 | |
173 | |
174 | |
175 | |
176 public static KCountArray count(final String reads1, final String reads2, final int k, final int cbits, final boolean rcomp, | |
177 KCountArray counts, final KCountArray trusted, final long maxReads, final int thresh, final int detectStepsize, final boolean conservative){ | |
178 | |
179 assert(k<32 && k>=1 && (counts!=null || k<20)); | |
180 final int kbits=2*k; | |
181 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); | |
182 | |
183 // System.out.println("k="+k+", kbits="+kbits+", mask="+Long.toHexString(mask)+", thresh="+thresh); | |
184 // System.out.println("\ntrusted=\n"+trusted); | |
185 // System.out.println("\ncount=\n"+count); | |
186 | |
187 if(counts==null){ | |
188 final long cells=1L<<kbits; | |
189 if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} | |
190 counts=KCountArray.makeNew(cells, cbits); | |
191 } | |
192 | |
193 final ConcurrentReadInputStream cris; | |
194 { | |
195 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); | |
196 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); | |
197 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); | |
198 cris.start(); //4567 | |
199 } | |
200 | |
201 assert(cris!=null) : reads1; | |
202 System.err.println("Started cris"); | |
203 boolean paired=cris.paired(); | |
204 if(verbose){System.err.println("Paired: "+paired);} | |
205 | |
206 ListNum<Read> ln=cris.nextList(); | |
207 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
208 | |
209 if(reads!=null && !reads.isEmpty()){ | |
210 Read r=reads.get(0); | |
211 assert(paired==(r.mate!=null)); | |
212 } | |
213 | |
214 | |
215 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
216 //System.err.println("reads.size()="+reads.size()); | |
217 for(Read r : reads){ | |
218 | |
219 Read r2=r.mate; | |
220 { | |
221 if(trusted!=null){ | |
222 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) : | |
223 ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize)); | |
224 // System.out.println("\n"+toString(bs, r.length())); | |
225 // System.out.println(new String(r.bases)); | |
226 for(int i=bs.nextClearBit(0); i<r.length(); i=bs.nextClearBit(i+1)){ | |
227 r.bases[i]='N'; | |
228 r.quality[i]=0; | |
229 } | |
230 // System.out.println(new String(r.bases)); | |
231 // System.out.println("used = "+String.format(Locale.ROOT, "%.3f%%",count.usedFraction()*100)); | |
232 // System.out.println("used = "+((KCountArray4)count).cellsUsed()); | |
233 // if(bs.length()<r.length()){r=null;} | |
234 } | |
235 // if(r!=null){KmerCount5.addRead(r, count, k, mask, rcomp);} | |
236 KmerCount5.addRead(r, counts, k, mask, rcomp); | |
237 } | |
238 if(r2!=null){ | |
239 if(trusted!=null){ | |
240 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) : | |
241 ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize)); | |
242 for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){ | |
243 r2.bases[i]='N'; | |
244 r2.quality[i]=0; | |
245 } | |
246 } | |
247 KmerCount5.addRead(r2, counts, k, mask, rcomp); | |
248 } | |
249 | |
250 } | |
251 //System.err.println("returning list"); | |
252 cris.returnList(ln); | |
253 //System.err.println("fetching list"); | |
254 ln=cris.nextList(); | |
255 reads=(ln!=null ? ln.list : null); | |
256 } | |
257 | |
258 if(verbose){System.err.println("Finished reading");} | |
259 cris.returnList(ln); | |
260 if(verbose){System.err.println("Returned list");} | |
261 cris.close(); | |
262 if(verbose){System.err.println("Closed stream");} | |
263 | |
264 // System.out.println("*** after ***"); | |
265 // System.out.println("\ntrusted=\n"+trusted); | |
266 // System.out.println("\ncount=\n"+count); | |
267 | |
268 return counts; | |
269 } | |
270 | |
271 | |
272 public static KCountArray countFastqSplit(String reads1, String reads2, int k1, int k2, int gap, int cbits, boolean rcomp, KCountArray counts){ | |
273 int k=k1+k2; | |
274 assert(k<32 && k>=1 && (counts!=null || k<20)); | |
275 assert(gap>=0); | |
276 final int kbits1=2*k1; | |
277 final int kbits2=2*k2; | |
278 final long mask1=~((-1L)<<(kbits1)); | |
279 final long mask2=~((-1L)<<(kbits2)); | |
280 | |
281 if(counts==null){ | |
282 final long cells=1L<<(kbits1+kbits2); | |
283 if(verbose){System.err.println("k1="+k1+", k2="+k2+", kbits1="+kbits1+", kbits2="+kbits2+", cells="+cells+ | |
284 ", mask1="+Long.toHexString(mask1)+", mask2="+Long.toHexString(mask2));} | |
285 counts=KCountArray.makeNew(cells, cbits); | |
286 } | |
287 | |
288 final ConcurrentReadInputStream cris; | |
289 { | |
290 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); | |
291 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); | |
292 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); | |
293 cris.start(); //4567 | |
294 } | |
295 | |
296 assert(cris!=null) : reads1; | |
297 System.err.println("Started cris"); | |
298 boolean paired=cris.paired(); | |
299 if(verbose){System.err.println("Paired: "+paired);} | |
300 | |
301 ListNum<Read> ln=cris.nextList(); | |
302 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
303 | |
304 if(reads!=null && !reads.isEmpty()){ | |
305 Read r=reads.get(0); | |
306 assert(paired==(r.mate!=null)); | |
307 } | |
308 | |
309 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
310 //System.err.println("reads.size()="+reads.size()); | |
311 for(Read r : reads){ | |
312 readsProcessed++; | |
313 | |
314 addReadSplit(r, counts, k1, k2, mask1, mask2, gap, rcomp); | |
315 if(r.mate!=null){ | |
316 addReadSplit(r.mate, counts, k1, k2, mask1, mask2, gap, rcomp); | |
317 } | |
318 | |
319 } | |
320 //System.err.println("returning list"); | |
321 cris.returnList(ln); | |
322 //System.err.println("fetching list"); | |
323 ln=cris.nextList(); | |
324 reads=(ln!=null ? ln.list : null); | |
325 } | |
326 if(verbose){System.err.println("Finished reading");} | |
327 cris.returnList(ln); | |
328 if(verbose){System.err.println("Returned list");} | |
329 cris.close(); | |
330 if(verbose){System.err.println("Closed stream");} | |
331 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} | |
332 | |
333 | |
334 return counts; | |
335 } | |
336 | |
337 public static void addRead(final Read r, final KCountArray count, final int k, final long mask, boolean rcomp){ | |
338 int len=0; | |
339 long kmer=0; | |
340 byte[] bases=r.bases; | |
341 byte[] quals=r.quality; | |
342 for(int i=0; i<bases.length; i++){ | |
343 byte b=bases[i]; | |
344 int x=AminoAcid.baseToNumber[b]; | |
345 if(x<0 || (quals!=null && quals[i]<minQuality)){ | |
346 len=0; | |
347 kmer=0; | |
348 }else{ | |
349 kmer=((kmer<<2)|x)&mask; | |
350 len++; | |
351 if(len>=k){ | |
352 keysCounted++; | |
353 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); | |
354 count.increment(kmer); | |
355 // System.out.println(" -> "+count.read(kmer)); | |
356 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); | |
357 // array[(int)kmer]++; | |
358 // System.out.println(" -> "+array[(int)kmer]+"\n"); | |
359 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); | |
360 } | |
361 } | |
362 } | |
363 if(rcomp){ | |
364 r.reverseComplement(); | |
365 addRead(r, count, k, mask, false); | |
366 } | |
367 } | |
368 | |
369 public static void addReadSplit(final Read r, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ | |
370 int len=0; | |
371 int shift=k2*2; | |
372 long kmer1=0; | |
373 long kmer2=0; | |
374 byte[] bases=r.bases; | |
375 byte[] quals=r.quality; | |
376 | |
377 assert(kmer1>=kmer2); | |
378 | |
379 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; | |
380 | |
381 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){ | |
382 int x1=AminoAcid.baseToNumber[bases[i]]; | |
383 int x2=AminoAcid.baseToNumber[bases[j]]; | |
384 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){ | |
385 len=0; | |
386 kmer1=0; | |
387 kmer2=0; | |
388 }else{ | |
389 kmer1=((kmer1<<2)|x1)&mask1; | |
390 kmer2=((kmer2<<2)|x2)&mask2; | |
391 len++; | |
392 if(len>=k1){ | |
393 keysCounted++; | |
394 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); | |
395 | |
396 long key=(kmer1<<shift)|kmer2; | |
397 // System.err.println(Long.toHexString(key)); | |
398 count.increment(key); | |
399 // System.out.println(" -> "+count.read(kmer)); | |
400 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); | |
401 // array[(int)kmer]++; | |
402 // System.out.println(" -> "+array[(int)kmer]+"\n"); | |
403 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); | |
404 } | |
405 } | |
406 } | |
407 if(rcomp){ | |
408 r.reverseComplement(); | |
409 addReadSplit(r, count, k1, k2, mask1, mask2, gap, false); | |
410 } | |
411 } | |
412 | |
413 public static void addReadSplit(final byte[] bases, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ | |
414 int len=0; | |
415 int shift=k2*2; | |
416 long kmer1=0; | |
417 long kmer2=0; | |
418 byte[] quals=null; | |
419 | |
420 assert(kmer1>=kmer2); | |
421 | |
422 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; | |
423 | |
424 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){ | |
425 int x1=AminoAcid.baseToNumber[bases[i]]; | |
426 int x2=AminoAcid.baseToNumber[bases[j]]; | |
427 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){ | |
428 len=0; | |
429 kmer1=0; | |
430 kmer2=0; | |
431 }else{ | |
432 kmer1=((kmer1<<2)|x1)&mask1; | |
433 kmer2=((kmer2<<2)|x2)&mask2; | |
434 len++; | |
435 if(len>=k1){ | |
436 keysCounted++; | |
437 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); | |
438 | |
439 long key=(kmer1<<shift)|kmer2; | |
440 System.out.println(Long.toHexString(kmer1)); | |
441 System.out.println(Long.toHexString(kmer2)); | |
442 System.out.println(Long.toHexString(key)); | |
443 count.increment(key); | |
444 // System.out.println(" -> "+count.read(kmer)); | |
445 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); | |
446 // array[(int)kmer]++; | |
447 // System.out.println(" -> "+array[(int)kmer]+"\n"); | |
448 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); | |
449 } | |
450 } | |
451 } | |
452 if(rcomp){ | |
453 AminoAcid.reverseComplementBasesInPlace(bases); | |
454 addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false); | |
455 } | |
456 } | |
457 | |
458 } |