Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/KmerCount6.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package bloom; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.BitSet; | |
5 import java.util.Locale; | |
6 | |
7 import dna.AminoAcid; | |
8 import fileIO.FileFormat; | |
9 import shared.Timer; | |
10 import stream.ConcurrentReadInputStream; | |
11 import stream.FastaReadInputStream; | |
12 import stream.Read; | |
13 import structures.ListNum; | |
14 | |
15 /** | |
16 * @author Brian Bushnell | |
17 * @date Jul 5, 2012 | |
18 * | |
19 */ | |
20 public class KmerCount6 extends KmerCountAbstract { | |
21 | |
22 public static void main(String[] args){ | |
23 | |
24 Timer t=new Timer(); | |
25 | |
26 String fname1=args[0]; | |
27 String fname2=(args.length>1 ? args[1] : null); | |
28 int k=14; | |
29 int cbits=16; | |
30 int gap=0; | |
31 | |
32 for(int i=2; i<args.length; i++){ | |
33 final String arg=args[i]; | |
34 final String[] split=arg.split("="); | |
35 String a=split[0].toLowerCase(); | |
36 String b=split.length>1 ? split[1] : null; | |
37 | |
38 if(a.equals("k") || a.equals("kmer")){ | |
39 k=Integer.parseInt(b); | |
40 }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ | |
41 cbits=Integer.parseInt(b); | |
42 }else if(a.startsWith("gap")){ | |
43 gap=Integer.parseInt(b); | |
44 }else{ | |
45 throw new RuntimeException("Unknown parameter "+args[i]); | |
46 } | |
47 } | |
48 | |
49 KCountArray count=null; | |
50 | |
51 if(fileIO.FileFormat.hasFastaExtension(fname1)){ | |
52 assert(!FastaReadInputStream.SPLIT_READS); | |
53 FastaReadInputStream.MIN_READ_LEN=k; | |
54 } | |
55 | |
56 count=count(fname1, fname2, k, cbits, true, null); | |
57 | |
58 | |
59 t.stop(); | |
60 System.out.println("Finished counting; time = "+t); | |
61 | |
62 printStatistics(count); | |
63 | |
64 } | |
65 | |
66 public static void printStatistics(KCountArray count){ | |
67 long[] freq=count.transformToFrequency(); | |
68 | |
69 // System.out.println(count+"\n"); | |
70 // System.out.println(Arrays.toString(freq)+"\n"); | |
71 | |
72 long sum=sum(freq); | |
73 System.out.println("Kmer fraction:"); | |
74 int lim1=8, lim2=16; | |
75 for(int i=0; i<lim1; i++){ | |
76 String prefix=i+""; | |
77 while(prefix.length()<8){prefix=prefix+" ";} | |
78 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]); | |
79 } | |
80 while(lim1<=freq.length){ | |
81 int x=0; | |
82 for(int i=lim1; i<lim2; i++){ | |
83 x+=freq[i]; | |
84 } | |
85 String prefix=lim1+"-"+(lim2-1); | |
86 if(lim2>=freq.length){prefix=lim1+"+";} | |
87 while(prefix.length()<8){prefix=prefix+" ";} | |
88 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x); | |
89 lim1*=2; | |
90 lim2=min(lim2*2, freq.length); | |
91 } | |
92 | |
93 long sum2=sum-freq[0]; | |
94 long x=freq[1]; | |
95 System.out.println(); | |
96 System.out.println("Keys Counted: \t \t"+keysCounted); | |
97 System.out.println("Unique: \t \t"+sum2); | |
98 System.out.println("Avg Sites/Key: \t \t"+String.format(Locale.ROOT, "%.3f ",(keysCounted*1d/sum2))); | |
99 System.out.println(); | |
100 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); | |
101 x=sum2-x; | |
102 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); | |
103 } | |
104 | |
105 public static KCountArray count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray count){ | |
106 assert(k<32 && k>=1 && (count!=null || k<20)); | |
107 final int kbits=2*k; | |
108 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); | |
109 | |
110 if(count==null){ | |
111 final long cells=1L<<kbits; | |
112 if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} | |
113 count=KCountArray.makeNew(cells, cbits); | |
114 } | |
115 | |
116 final ConcurrentReadInputStream cris; | |
117 { | |
118 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); | |
119 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); | |
120 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); | |
121 cris.start(); //4567 | |
122 } | |
123 | |
124 assert(cris!=null) : reads1; | |
125 System.err.println("Started cris"); | |
126 boolean paired=cris.paired(); | |
127 if(verbose){System.err.println("Paired: "+paired);} | |
128 | |
129 count(cris, k, rcomp, count); | |
130 | |
131 cris.close(); | |
132 if(verbose){System.err.println("Closed stream");} | |
133 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} | |
134 | |
135 | |
136 return count; | |
137 } | |
138 | |
139 | |
140 public static void count(ConcurrentReadInputStream cris, int k, boolean rcomp, KCountArray count){ | |
141 assert(k<32 && k>=1 && (count!=null || k<20)); | |
142 | |
143 assert(count!=null); | |
144 | |
145 ListNum<Read> ln=cris.nextList(); | |
146 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
147 | |
148 | |
149 if(true /*count.gap==0*/){ | |
150 final int kbits=2*k; | |
151 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); | |
152 | |
153 | |
154 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
155 //System.err.println("reads.size()="+reads.size()); | |
156 for(Read r : reads){ | |
157 readsProcessed++; | |
158 | |
159 addRead(r, count, k, mask, rcomp); | |
160 if(r.mate!=null){ | |
161 addRead(r.mate, count, k, mask, rcomp); | |
162 } | |
163 | |
164 } | |
165 //System.err.println("returning list"); | |
166 cris.returnList(ln); | |
167 //System.err.println("fetching list"); | |
168 ln=cris.nextList(); | |
169 reads=(ln!=null ? ln.list : null); | |
170 } | |
171 }else{ | |
172 final int k1=(k+1)/2; | |
173 final int k2=k/2; | |
174 final int kbits1=2*k1; | |
175 final int kbits2=2*k2; | |
176 // final int gap=count.gap; | |
177 int gap=0; assert(false); | |
178 final long mask1=~((-1L)<<(kbits1)); | |
179 final long mask2=~((-1L)<<(kbits2)); | |
180 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
181 //System.err.println("reads.size()="+reads.size()); | |
182 for(Read r : reads){ | |
183 readsProcessed++; | |
184 | |
185 addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp); | |
186 if(r.mate!=null){ | |
187 addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp); | |
188 } | |
189 | |
190 } | |
191 //System.err.println("returning list"); | |
192 cris.returnList(ln); | |
193 //System.err.println("fetching list"); | |
194 ln=cris.nextList(); | |
195 reads=(ln!=null ? ln.list : null); | |
196 } | |
197 } | |
198 | |
199 if(verbose){System.err.println("Finished reading");} | |
200 cris.returnList(ln); | |
201 if(verbose){System.err.println("Returned list");} | |
202 } | |
203 | |
204 | |
205 | |
206 | |
207 public static KCountArray count(final String reads1, final String reads2, final int k, final int cbits, final boolean rcomp, | |
208 KCountArray count, final KCountArray trusted, final long maxReads, final int thresh, final int detectStepsize, final boolean conservative){ | |
209 | |
210 assert(k<32 && k>=1 && (count!=null || k<20)); | |
211 final int kbits=2*k; | |
212 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); | |
213 | |
214 // System.out.println("k="+k+", kbits="+kbits+", mask="+Long.toHexString(mask)+", thresh="+thresh); | |
215 // System.out.println("\ntrusted=\n"+trusted); | |
216 // System.out.println("\ncount=\n"+count); | |
217 | |
218 if(count==null){ | |
219 final long cells=1L<<kbits; | |
220 if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} | |
221 count=KCountArray.makeNew(cells, cbits); | |
222 } | |
223 | |
224 final ConcurrentReadInputStream cris; | |
225 { | |
226 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); | |
227 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); | |
228 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); | |
229 cris.start(); //4567 | |
230 } | |
231 | |
232 assert(cris!=null) : reads1; | |
233 System.err.println("Started cris"); | |
234 boolean paired=cris.paired(); | |
235 if(verbose){System.err.println("Paired: "+paired);} | |
236 | |
237 count(cris, k, rcomp, count, trusted, thresh, detectStepsize, conservative); | |
238 | |
239 cris.close(); | |
240 if(verbose){System.err.println("Closed stream");} | |
241 | |
242 // System.out.println("*** after ***"); | |
243 // System.out.println("\ntrusted=\n"+trusted); | |
244 // System.out.println("\ncount=\n"+count); | |
245 | |
246 return count; | |
247 } | |
248 | |
249 | |
250 | |
251 | |
252 public static void count(final ConcurrentReadInputStream cris, final int k, final boolean rcomp, | |
253 final KCountArray count, final KCountArray trusted, final int thresh, final int detectStepsize, final boolean conservative){ | |
254 | |
255 assert(k<32 && k>=1 && (count!=null || k<20)); | |
256 final int kbits=2*k; | |
257 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); | |
258 | |
259 ListNum<Read> ln=cris.nextList(); | |
260 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
261 | |
262 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
263 //System.err.println("reads.size()="+reads.size()); | |
264 for(Read r : reads){ | |
265 | |
266 Read r2=r.mate; | |
267 { | |
268 if(trusted!=null){ | |
269 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) : | |
270 ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize)); | |
271 // System.out.println("\n"+toString(bs, r.length())); | |
272 // System.out.println(new String(r.bases)); | |
273 for(int i=bs.nextClearBit(0); i<r.length(); i=bs.nextClearBit(i+1)){ | |
274 r.bases[i]='N'; | |
275 r.quality[i]=0; | |
276 } | |
277 // System.out.println(new String(r.bases)); | |
278 // System.out.println("used = "+String.format(Locale.ROOT, "%.3f%%",count.usedFraction()*100)); | |
279 // System.out.println("used = "+((KCountArray4)count).cellsUsed()); | |
280 // if(bs.length()<r.length()){r=null;} | |
281 } | |
282 // if(r!=null){addRead(r, count, k, mask, rcomp);} | |
283 addRead(r, count, k, mask, rcomp); | |
284 } | |
285 if(r2!=null){ | |
286 if(trusted!=null){ | |
287 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) : | |
288 ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize)); | |
289 for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){ | |
290 r2.bases[i]='N'; | |
291 r2.quality[i]=0; | |
292 } | |
293 } | |
294 addRead(r2, count, k, mask, rcomp); | |
295 } | |
296 | |
297 } | |
298 //System.err.println("returning list"); | |
299 cris.returnList(ln); | |
300 //System.err.println("fetching list"); | |
301 ln=cris.nextList(); | |
302 reads=(ln!=null ? ln.list : null); | |
303 } | |
304 | |
305 if(verbose){System.err.println("Finished reading");} | |
306 cris.returnList(ln); | |
307 if(verbose){System.err.println("Returned list");} | |
308 } | |
309 | |
310 | |
311 | |
312 public static void addRead(final Read r, final KCountArray count, final int k, final long mask, boolean rcomp){ | |
313 int len=0; | |
314 long kmer=0; | |
315 byte[] bases=r.bases; | |
316 byte[] quals=r.quality; | |
317 for(int i=0; i<bases.length; i++){ | |
318 byte b=bases[i]; | |
319 int x=AminoAcid.baseToNumber[b]; | |
320 if(x<0 || (quals!=null && quals[i]<minQuality)){ | |
321 len=0; | |
322 kmer=0; | |
323 }else{ | |
324 kmer=((kmer<<2)|x)&mask; | |
325 len++; | |
326 if(len>=k){ | |
327 keysCounted++; | |
328 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); | |
329 count.increment(kmer); | |
330 // System.out.println(" -> "+count.read(kmer)); | |
331 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); | |
332 // array[(int)kmer]++; | |
333 // System.out.println(" -> "+array[(int)kmer]+"\n"); | |
334 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); | |
335 } | |
336 } | |
337 } | |
338 if(rcomp){ | |
339 r.reverseComplement(); | |
340 addRead(r, count, k, mask, false); | |
341 } | |
342 } | |
343 | |
344 public static void addReadSplit(final Read r, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ | |
345 int len=0; | |
346 int shift=k2*2; | |
347 long kmer1=0; | |
348 long kmer2=0; | |
349 byte[] bases=r.bases; | |
350 byte[] quals=r.quality; | |
351 | |
352 assert(kmer1>=kmer2); | |
353 | |
354 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; | |
355 | |
356 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){ | |
357 int x1=AminoAcid.baseToNumber[bases[i]]; | |
358 int x2=AminoAcid.baseToNumber[bases[j]]; | |
359 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){ | |
360 len=0; | |
361 kmer1=0; | |
362 kmer2=0; | |
363 }else{ | |
364 kmer1=((kmer1<<2)|x1)&mask1; | |
365 kmer2=((kmer2<<2)|x2)&mask2; | |
366 len++; | |
367 if(len>=k1){ | |
368 keysCounted++; | |
369 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); | |
370 | |
371 long key=(kmer1<<shift)|kmer2; | |
372 // System.err.println(Long.toHexString(key)); | |
373 count.increment(key); | |
374 // System.out.println(" -> "+count.read(kmer)); | |
375 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); | |
376 // array[(int)kmer]++; | |
377 // System.out.println(" -> "+array[(int)kmer]+"\n"); | |
378 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); | |
379 } | |
380 } | |
381 } | |
382 if(rcomp){ | |
383 r.reverseComplement(); | |
384 addReadSplit(r, count, k1, k2, mask1, mask2, gap, false); | |
385 } | |
386 } | |
387 | |
388 public static void addReadSplit(final byte[] bases, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ | |
389 int len=0; | |
390 int shift=k2*2; | |
391 long kmer1=0; | |
392 long kmer2=0; | |
393 byte[] quals=null; | |
394 | |
395 assert(kmer1>=kmer2); | |
396 | |
397 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; | |
398 | |
399 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){ | |
400 int x1=AminoAcid.baseToNumber[bases[i]]; | |
401 int x2=AminoAcid.baseToNumber[bases[j]]; | |
402 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){ | |
403 len=0; | |
404 kmer1=0; | |
405 kmer2=0; | |
406 }else{ | |
407 kmer1=((kmer1<<2)|x1)&mask1; | |
408 kmer2=((kmer2<<2)|x2)&mask2; | |
409 len++; | |
410 if(len>=k1){ | |
411 keysCounted++; | |
412 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); | |
413 | |
414 long key=(kmer1<<shift)|kmer2; | |
415 System.out.println(Long.toHexString(kmer1)); | |
416 System.out.println(Long.toHexString(kmer2)); | |
417 System.out.println(Long.toHexString(key)); | |
418 count.increment(key); | |
419 // System.out.println(" -> "+count.read(kmer)); | |
420 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); | |
421 // array[(int)kmer]++; | |
422 // System.out.println(" -> "+array[(int)kmer]+"\n"); | |
423 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); | |
424 } | |
425 } | |
426 } | |
427 if(rcomp){ | |
428 AminoAcid.reverseComplementBasesInPlace(bases); | |
429 addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false); | |
430 } | |
431 } | |
432 | |
433 } |