Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/ReadCounter.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package bloom; | |
2 | |
3 import java.io.File; | |
4 import java.lang.Thread.State; | |
5 import java.util.ArrayList; | |
6 import java.util.Arrays; | |
7 import java.util.BitSet; | |
8 import java.util.Locale; | |
9 import dna.AminoAcid; | |
10 import fileIO.FileFormat; | |
11 import fileIO.ReadWrite; | |
12 import jgi.BBMerge; | |
13 import kmer.KmerTableSet; | |
14 import shared.Parse; | |
15 import shared.Parser; | |
16 import shared.PreParser; | |
17 import shared.Shared; | |
18 import shared.Timer; | |
19 import shared.Tools; | |
20 import sketch.SketchObject; | |
21 import stream.ConcurrentReadInputStream; | |
22 import stream.FastaReadInputStream; | |
23 import stream.Read; | |
24 import structures.ListNum; | |
25 import structures.LongList; | |
26 import ukmer.Kmer; | |
27 | |
28 /** | |
29 * @author Brian Bushnell | |
30 * @date Jul 5, 2012 | |
31 * | |
32 */ | |
33 public class ReadCounter extends KmerCountAbstract { | |
34 | |
35 public static void main(String[] args){ | |
36 | |
37 {//Preparse block for help, config files, and outstream | |
38 PreParser pp=new PreParser(args, new Object() { }.getClass().getEnclosingClass(), false); | |
39 args=pp.args; | |
40 // outstream=pp.outstream; | |
41 } | |
42 | |
43 Timer t=new Timer(); | |
44 | |
45 String fname1=args[0]; | |
46 String fname2=(args.length>1 ? args[1] : null); | |
47 int k=14; | |
48 int cbits=16; | |
49 int matrixbits=-1; | |
50 int hashes=1; | |
51 | |
52 for(int i=2; i<args.length; i++){ | |
53 final String arg=args[i]; | |
54 final String[] split=arg.split("="); | |
55 String a=split[0].toLowerCase(); | |
56 String b=split.length>1 ? split[1] : null; | |
57 | |
58 if(a.equals("k") || a.equals("kmer")){ | |
59 k=Integer.parseInt(b); | |
60 }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ | |
61 cbits=Integer.parseInt(b); | |
62 }else if(a.startsWith("reads") || a.startsWith("maxreads")){ | |
63 maxReads=Parse.parseKMG(b); | |
64 }else if(a.startsWith("matrixbits")){ | |
65 matrixbits=Integer.parseInt(b); | |
66 }else if(a.startsWith("hashes")){ | |
67 hashes=Integer.parseInt(b); | |
68 }else if(a.equals("canonical")){ | |
69 CANONICAL=Parse.parseBoolean(b); | |
70 }else{ | |
71 throw new RuntimeException("Unknown parameter "+args[i]); | |
72 } | |
73 } | |
74 | |
75 {//Process parser fields | |
76 Parser.processQuality(); | |
77 } | |
78 | |
79 int kbits=Tools.min(2*k, 62); | |
80 if(matrixbits<0){ | |
81 matrixbits=kbits; | |
82 } | |
83 matrixbits=Tools.min(kbits, matrixbits); | |
84 | |
85 if(fileIO.FileFormat.hasFastaExtension(fname1)){ | |
86 assert(!FastaReadInputStream.SPLIT_READS); | |
87 FastaReadInputStream.MIN_READ_LEN=k; | |
88 } | |
89 | |
90 KCountArray counts=KCountArray.makeNew(1L<<matrixbits, cbits, hashes); | |
91 ReadCounter rc=new ReadCounter(k, true, false, false, false); | |
92 try { | |
93 rc.count(fname1, fname2, counts); | |
94 } catch (Exception e) { | |
95 // TODO Auto-generated catch block | |
96 e.printStackTrace(); | |
97 } | |
98 counts.shutdown(); | |
99 | |
100 // verbose=true; | |
101 | |
102 t.stop(); | |
103 System.out.println("Finished counting; time = "+t); | |
104 | |
105 rc.printStatistics(counts); | |
106 | |
107 } | |
108 | |
109 /** Defaults for nucleotides. */ | |
110 public ReadCounter(final int k_){this(k_, true, false, false, false);} | |
111 | |
112 public ReadCounter(final int k_, final boolean rcomp_, | |
113 final boolean ecco_, final boolean merge_, final boolean amino_){ | |
114 k=k_; | |
115 rcomp=rcomp_; | |
116 ecco=ecco_; | |
117 merge=merge_; | |
118 amino=amino_; | |
119 | |
120 final int bitsPerChar=(amino ? AminoAcid.AMINO_SHIFT : 2); | |
121 aminoShift=AminoAcid.AMINO_SHIFT; | |
122 shift=bitsPerChar*k; | |
123 shift2=shift-bitsPerChar; | |
124 mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32 | |
125 | |
126 assert(!amino || k*bitsPerChar<64); | |
127 assert(!amino || !rcomp); | |
128 assert(k>0); | |
129 } | |
130 | |
131 public void printStatistics(KCountArray counts){ | |
132 long[] freq=counts.transformToFrequency(); | |
133 | |
134 // System.out.println(count+"\n"); | |
135 // System.out.println(Arrays.toString(freq)+"\n"); | |
136 | |
137 long sum=sum(freq); | |
138 System.out.println("Kmer fraction:"); | |
139 int lim1=8, lim2=16; | |
140 for(int i=0; i<lim1; i++){ | |
141 String prefix=i+""; | |
142 while(prefix.length()<8){prefix=prefix+" ";} | |
143 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]); | |
144 } | |
145 while(lim1<=freq.length){ | |
146 int x=0; | |
147 for(int i=lim1; i<lim2; i++){ | |
148 x+=freq[i]; | |
149 } | |
150 String prefix=lim1+"-"+(lim2-1); | |
151 if(lim2>=freq.length){prefix=lim1+"+";} | |
152 while(prefix.length()<8){prefix=prefix+" ";} | |
153 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x); | |
154 lim1*=2; | |
155 lim2=min(lim2*2, freq.length); | |
156 } | |
157 | |
158 long sum2=sum-freq[0]; | |
159 long x=freq[1]; | |
160 System.out.println(); | |
161 System.out.println("Keys Counted: \t \t"+keysCounted); | |
162 System.out.println("Unique: \t \t"+sum2); | |
163 System.out.println("Avg Sites/Key: \t \t"+String.format(Locale.ROOT, "%.3f ",(keysCounted*1d/sum2))); | |
164 System.out.println(); | |
165 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); | |
166 x=sum2-x; | |
167 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); | |
168 } | |
169 | |
170 public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, int cbits){ | |
171 return makeKca(fname1, fname2, extraFiles, cbits, Tools.min(2*k, 35), 1, minQuality, | |
172 maxReads, 1, 1, 1, 2); | |
173 } | |
174 | |
175 public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, | |
176 int cbits, int matrixbits, int hashes, int minqual, long maxreads){ | |
177 assert(matrixbits<63); | |
178 return makeKca(fname1, fname2, extraFiles, cbits, matrixbits, hashes, minqual, | |
179 maxreads, 1, 1, 1, 2); | |
180 } | |
181 | |
182 public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, | |
183 int cbits, int matrixbits, int hashes, int minqual, | |
184 long maxreads, int passes, int stepsize, int thresh1, int thresh2){ | |
185 assert(matrixbits<63); | |
186 return makeKca(fname1, fname2, extraFiles, | |
187 cbits, 1L<<matrixbits, hashes, minqual, | |
188 maxreads, passes, stepsize, thresh1, thresh2, null, 0); | |
189 } | |
190 | |
191 public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, | |
192 int cbits, long cells, int hashes, int minqual, | |
193 long maxreads, int passes, int stepsize, int thresh1, int thresh2){ | |
194 return makeKca(fname1, fname2, extraFiles, | |
195 cbits, cells, hashes, minqual, | |
196 maxreads, passes, stepsize, thresh1, thresh2, null, 0); | |
197 } | |
198 | |
199 public KCountArray makeKca_als(ArrayList<String> fname1, ArrayList<String> fname2, Iterable<String> extraFiles, | |
200 int cbits, long cells, int hashes, int minqual, | |
201 long maxreads, int passes, int stepsize, int thresh1, int thresh2, | |
202 KCountArray prefilter, int prefilterLimit_){ | |
203 String a=null, b=null; | |
204 ArrayList<String> list=new ArrayList<String>(); | |
205 if(fname1!=null){ | |
206 for(int i=0; i<fname1.size(); i++){ | |
207 if(i==0){a=fname1.get(i);} | |
208 else{list.add(fname1.get(i));} | |
209 } | |
210 } | |
211 if(fname2!=null){ | |
212 for(int i=0; i<fname2.size(); i++){ | |
213 if(i==0){b=fname2.get(i);} | |
214 else{list.add(fname2.get(i));} | |
215 } | |
216 } | |
217 if(extraFiles!=null){ | |
218 for(String s : extraFiles){ | |
219 list.add(s); | |
220 } | |
221 } | |
222 return makeKca(a, b, list.isEmpty() ? null : list, cbits, cells, hashes, minqual, | |
223 maxreads, passes, stepsize, thresh1, thresh2, | |
224 prefilter, prefilterLimit_); | |
225 } | |
226 | |
227 public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, | |
228 int cbits, long cells, int hashes, int minqual, | |
229 long maxreads, int passes, int stepsize, int thresh1, int thresh2, | |
230 KCountArray prefilter, int prefilterLimit_){ | |
231 // verbose=true; | |
232 if(verbose){System.err.println("Making kca from ("+fname1+", "+fname2+")\nk="+k+", cells="+Tools.toKMG(cells)+", cbits="+cbits);} | |
233 | |
234 if(fname1==null && fname2==null && extraFiles==null){ | |
235 return KCountArray.makeNew(cells, cbits, hashes, prefilter, prefilterLimit_); | |
236 } | |
237 | |
238 boolean oldsplit=FastaReadInputStream.SPLIT_READS; | |
239 long oldmax=maxReads; | |
240 byte oldq=minQuality; | |
241 maxReads=maxreads; | |
242 minQuality=(byte)minqual; | |
243 // System.out.println("kbits="+(kbits)+" -> "+(1L<<kbits)+", matrixbits="+(matrixbits)+" -> "+(1L<<matrixbits)+", cbits="+cbits+", gap="+gap+", hashes="+hashes); | |
244 KCountArray kca=KCountArray.makeNew(cells, cbits, hashes, prefilter, prefilterLimit_); | |
245 | |
246 // System.out.println("a"); | |
247 {//For processing input lists | |
248 ArrayList<String> extra2=null; | |
249 if(fname1!=null && fname1.contains(",")){ | |
250 String[] s=fname1.split(","); | |
251 if(extra2==null){extra2=new ArrayList<String>();} | |
252 for(int i=1; i<s.length; i++){extra2.add(s[i]);} | |
253 fname1=s[0]; | |
254 } | |
255 if(fname2!=null && fname2.contains(",")){ | |
256 String[] s=fname2.split(","); | |
257 if(extra2==null){extra2=new ArrayList<String>();} | |
258 for(int i=1; i<s.length; i++){extra2.add(s[i]);} | |
259 fname2=s[0]; | |
260 } | |
261 if(extra2!=null){ | |
262 if(extraFiles!=null){ | |
263 for(String s : extraFiles){ | |
264 extra2.add(s); | |
265 } | |
266 } | |
267 extraFiles=extra2; | |
268 } | |
269 } | |
270 // System.out.println("b"); | |
271 | |
272 if(extraFiles!=null){ | |
273 for(String s : extraFiles){ | |
274 if(fileIO.FileFormat.hasFastaExtension(s)){ | |
275 assert(!FastaReadInputStream.SPLIT_READS); | |
276 } | |
277 } | |
278 } | |
279 | |
280 // System.out.println("c"); | |
281 | |
282 if(passes==1){ | |
283 // System.out.println("c1"); | |
284 if(fname1!=null){ | |
285 try { | |
286 count(fname1, fname2, kca); | |
287 } catch (Exception e) { | |
288 // TODO Auto-generated catch block | |
289 e.printStackTrace(); | |
290 } | |
291 } | |
292 if(extraFiles!=null){ | |
293 maxReads=-1; | |
294 for(String s : extraFiles){ | |
295 try { | |
296 count(s, null, kca); | |
297 } catch (Exception e) { | |
298 // TODO Auto-generated catch block | |
299 e.printStackTrace(); | |
300 } | |
301 } | |
302 } | |
303 kca.shutdown(); | |
304 | |
305 }else{ | |
306 // System.out.println("c2"); | |
307 assert(passes>1); | |
308 KCountArray trusted=null; | |
309 for(int i=1; i<passes; i++){ | |
310 boolean conservative=i>2;// /*or, alternately, (trusted==null || trusted.capacity()>0.3) | |
311 int step=(stepsize==1 ? 1 : stepsize+i%2); | |
312 // if(!conservative){step=(step+3)/4;} | |
313 if(!conservative){step=Tools.min(3, (step+3)/4);} | |
314 | |
315 try { | |
316 count(fname1, fname2, cbits, kca, trusted, maxreads, thresh1, step, conservative); | |
317 } catch (Exception e) { | |
318 // TODO Auto-generated catch block | |
319 e.printStackTrace(); | |
320 } | |
321 if(extraFiles!=null){ | |
322 maxReads=-1; | |
323 for(String s : extraFiles){ | |
324 try { | |
325 count(s, null, cbits, kca, trusted, maxreads, thresh1, step, conservative); | |
326 } catch (Exception e) { | |
327 // TODO Auto-generated catch block | |
328 e.printStackTrace(); | |
329 } | |
330 } | |
331 } | |
332 kca.shutdown(); | |
333 | |
334 System.out.println("Trusted: \t"+kca.toShortString()); | |
335 trusted=kca; | |
336 kca=KCountArray.makeNew(cells, cbits, hashes, prefilter, prefilterLimit_); | |
337 | |
338 } | |
339 | |
340 try { | |
341 count(fname1, fname2, cbits, kca, trusted, maxreads, thresh2, stepsize, true); | |
342 } catch (Exception e) { | |
343 // TODO Auto-generated catch block | |
344 e.printStackTrace(); | |
345 } | |
346 if(extraFiles!=null){ | |
347 maxReads=-1; | |
348 for(String s : extraFiles){ | |
349 try { | |
350 count(s, null, cbits, kca, trusted, maxreads, thresh2, stepsize, true); | |
351 } catch (Exception e) { | |
352 // TODO Auto-generated catch block | |
353 e.printStackTrace(); | |
354 } | |
355 } | |
356 } | |
357 kca.shutdown(); | |
358 } | |
359 // System.out.println("d"); | |
360 minQuality=oldq; | |
361 maxReads=oldmax; | |
362 FastaReadInputStream.SPLIT_READS=oldsplit; | |
363 | |
364 | |
365 return kca; | |
366 } | |
367 | |
368 public KCountArray count(String reads1, String reads2, KCountArray counts) throws Exception{ | |
369 // System.err.println("countFastq... making a new cris"); | |
370 assert(counts!=null); | |
371 | |
372 { | |
373 int pound=reads1.lastIndexOf('#'); | |
374 if(pound>=0 && reads2==null && !new File(reads1).exists()){ | |
375 String a=reads1.substring(0, pound); | |
376 String b=reads1.substring(pound+1); | |
377 reads1=a+1+b; | |
378 reads2=a+2+b; | |
379 } | |
380 } | |
381 | |
382 final ConcurrentReadInputStream cris; | |
383 { | |
384 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); | |
385 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); | |
386 if(ff2==null){ff1.preferShreds=true;} | |
387 // if(ff2!=null){ //TODO - interleaved flag | |
388 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); | |
389 cris.start(); //4567 | |
390 } | |
391 | |
392 assert(cris!=null) : reads1; | |
393 if(verbose){System.err.println("Started cris");} | |
394 boolean paired=cris.paired(); | |
395 if(verbose){System.err.println("Paired: "+paired);} | |
396 | |
397 // countFastq(cris, count); | |
398 // assert(false) : THREADS; | |
399 CountThread[] cta=new CountThread[Shared.threads()]; | |
400 for(int i=0; i<cta.length; i++){ | |
401 cta[i]=new CountThread(cris, counts); | |
402 cta[i].start(); | |
403 } | |
404 // System.out.println("~1"); | |
405 for(int i=0; i<cta.length; i++){ | |
406 // System.out.println("~2"); | |
407 CountThread ct=cta[i]; | |
408 synchronized(ct){ | |
409 // System.out.println("~3"); | |
410 while(ct.getState()!=State.TERMINATED){ | |
411 // System.out.println("~4"); | |
412 try { | |
413 ct.join(2000); | |
414 } catch (InterruptedException e) { | |
415 // TODO Auto-generated catch block | |
416 e.printStackTrace(); | |
417 } | |
418 // System.out.println("~5"); | |
419 } | |
420 } | |
421 } | |
422 // System.out.println("~6"); | |
423 | |
424 ReadWrite.closeStream(cris); | |
425 if(verbose){System.err.println("Closed stream");} | |
426 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} | |
427 | |
428 | |
429 return counts; | |
430 } | |
431 | |
432 public KCountArray count(String reads1, String reads2, final int cbits, | |
433 KCountArray counts, final KCountArray trusted, final long maxReads, final int thresh, | |
434 final int detectStepsize, final boolean conservative) | |
435 throws Exception{ | |
436 | |
437 assert(counts!=null); | |
438 | |
439 { | |
440 int pound=reads1.lastIndexOf('#'); | |
441 if(pound>=0 && reads2==null && !new File(reads1).exists()){ | |
442 String a=reads1.substring(0, pound); | |
443 String b=reads1.substring(pound+1); | |
444 reads1=a+1+b; | |
445 reads2=a+2+b; | |
446 } | |
447 } | |
448 | |
449 final ConcurrentReadInputStream cris; | |
450 { | |
451 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); | |
452 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); | |
453 if(ff2==null){ff1.preferShreds=true;} | |
454 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); | |
455 cris.start(); //4567 | |
456 } | |
457 | |
458 assert(cris!=null) : reads1; | |
459 if(verbose){System.err.println("Started cris");} | |
460 boolean paired=cris.paired(); | |
461 if(verbose){System.err.println("Paired: "+paired);} | |
462 | |
463 | |
464 // countFastq(cris, count, trusted, thresh, detectStepsize, conservative); | |
465 | |
466 // assert(false) : THREADS; | |
467 CountThread[] cta=new CountThread[Shared.threads()]; | |
468 for(int i=0; i<cta.length; i++){ | |
469 cta[i]=new CountThread(cris, counts, trusted, thresh, detectStepsize, conservative); | |
470 cta[i].start(); | |
471 } | |
472 | |
473 for(int i=0; i<cta.length; i++){ | |
474 CountThread ct=cta[i]; | |
475 synchronized(ct){ | |
476 while(ct.isAlive()){ | |
477 try { | |
478 ct.join(1000); | |
479 } catch (InterruptedException e) { | |
480 // TODO Auto-generated catch block | |
481 e.printStackTrace(); | |
482 } | |
483 } | |
484 } | |
485 } | |
486 | |
487 cris.close(); | |
488 if(verbose){System.err.println("Closed stream");} | |
489 | |
490 // System.out.println("*** after ***"); | |
491 // System.out.println("\ntrusted=\n"+trusted); | |
492 // System.out.println("\ncount=\n"+count); | |
493 | |
494 return counts; | |
495 } | |
496 | |
497 private final int findOverlap(Read r1, Read r2, boolean ecc){ | |
498 if(vstrict){ | |
499 return BBMerge.findOverlapVStrict(r1, r2, ecc); | |
500 }else{ | |
501 return BBMerge.findOverlapStrict(r1, r2, ecc); | |
502 } | |
503 } | |
504 | |
505 private class CountThread extends Thread{ | |
506 | |
507 CountThread(final ConcurrentReadInputStream cris_, final KCountArray counts_){ | |
508 this(cris_, counts_, null, 2, 1, true); | |
509 } | |
510 | |
511 CountThread(final ConcurrentReadInputStream cris_, | |
512 final KCountArray counts_, final KCountArray trusted_, final int thresh_, | |
513 final int detectStepsize_, final boolean conservative_){ | |
514 cris=cris_; | |
515 counts=counts_; | |
516 trusted=trusted_; | |
517 thresh=thresh_; | |
518 detectStepsize=detectStepsize_; | |
519 conservative=conservative_; | |
520 } | |
521 | |
522 @Override | |
523 public void run(){ | |
524 // System.out.println("Running"); | |
525 if(trusted==null){ | |
526 count(cris); | |
527 }else{ | |
528 count(cris, thresh, detectStepsize, conservative); | |
529 } | |
530 // System.out.println("Finished: "+readsProcessedLocal); | |
531 | |
532 if(BUFFERED){dumpBuffer();} | |
533 | |
534 synchronized(getClass()){ | |
535 keysCounted+=keysCountedLocal; | |
536 increments+=incrementsLocal; | |
537 readsProcessed+=readsProcessedLocal; | |
538 | |
539 if(verbose){System.err.println(keysCounted+", "+keysCountedLocal);} | |
540 if(verbose){System.err.println(readsProcessed+", "+readsProcessedLocal);} | |
541 } | |
542 } | |
543 | |
544 private void increment(final long key){ | |
545 if(BUFFERED){ | |
546 buffer.add(key); | |
547 if(buffer.size>=BUFFERLEN){ | |
548 dumpBuffer(); | |
549 } | |
550 }else{ | |
551 incrementByAmount(key, 1); | |
552 } | |
553 } | |
554 | |
555 | |
556 //TODO: All this 'sum' nonsense is just to count kmers added. Could be derived from buffer length. | |
557 private void dumpBuffer(){ | |
558 final int lim=buffer.size; | |
559 if(lim<1){return;} | |
560 final long[] array=buffer.array; | |
561 // if(SORT_SERIAL){ | |
562 // buffer.sortSerial(); | |
563 // }else{ | |
564 // buffer.sort(); | |
565 // } | |
566 buffer.sort();//Can be disabled via parallelsort flag but only affects arrays>10k | |
567 long kmer=array[0]-1;//Ensures a nonmatch | |
568 int streak=0; | |
569 for(int i=0; i<lim; i++){ | |
570 long x=array[i]; | |
571 if(x==kmer){streak++;} | |
572 else{ | |
573 if(streak>0){incrementByAmount(kmer, streak);} | |
574 kmer=x; | |
575 streak=1; | |
576 } | |
577 } | |
578 assert(streak>0); | |
579 incrementByAmount(kmer, streak); | |
580 buffer.clear(); | |
581 } | |
582 | |
583 private void incrementByAmount(final long key, int amt){ | |
584 if(SKETCH_MODE){ | |
585 final long code=SketchObject.hash(key); | |
586 if(code<minHashValue){return;} | |
587 counts.increment(STORE_HASHED ? code : key, amt); | |
588 }else{ | |
589 counts.increment(key, amt); | |
590 } | |
591 keysCountedLocal+=amt; | |
592 incrementsLocal++; | |
593 } | |
594 | |
595 private final void count(ConcurrentReadInputStream cris){ | |
596 assert(k>=1 && counts!=null); | |
597 // System.out.println("Waiting for list"); | |
598 ListNum<Read> ln=cris.nextList(); | |
599 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
600 // System.out.println("Got list: "+(ln==null ? "null" : ln.id)+", "+(ln==null || ln.list==null ? "null" : ln.list.size())); | |
601 | |
602 long[] array=null; | |
603 final Kmer kmer=new Kmer(k); | |
604 | |
605 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
606 //System.err.println("reads.size()="+reads.size()); | |
607 for(Read r1 : reads){ | |
608 | |
609 Read r2=r1.mate; | |
610 if((ecco || merge) && r2!=null){ | |
611 if(merge){ | |
612 final int insert=findOverlap(r1, r2, false); | |
613 if(insert>0){ | |
614 r2.reverseComplement(); | |
615 r1=r1.joinRead(insert); | |
616 r2=null; | |
617 } | |
618 }else if(ecco){ | |
619 findOverlap(r1, r2, true); | |
620 } | |
621 } | |
622 readsProcessedLocal++; | |
623 | |
624 if(k<=maxShortKmerLength){ | |
625 array=addRead_Advanced(r1, array); | |
626 }else{ | |
627 addReadBig(r1, kmer); | |
628 addReadBig(r1.mate, kmer); | |
629 } | |
630 // System.out.println(r); | |
631 // System.out.println("kmers hashed: "+keysCountedLocal); | |
632 } | |
633 //System.err.println("returning list"); | |
634 cris.returnList(ln); | |
635 //System.err.println("fetching list"); | |
636 ln=cris.nextList(); | |
637 reads=(ln!=null ? ln.list : null); | |
638 } | |
639 | |
640 | |
641 if(verbose){System.err.println("Finished reading");} | |
642 cris.returnList(ln); | |
643 if(verbose){System.err.println("Returned list");} | |
644 } | |
645 | |
646 | |
647 | |
648 | |
649 private final void count(final ConcurrentReadInputStream cris, final int thresh, | |
650 final int detectStepsize, final boolean conservative){ | |
651 | |
652 ListNum<Read> ln=cris.nextList(); | |
653 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
654 | |
655 long[] array=null; | |
656 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
657 //System.err.println("reads.size()="+reads.size()); | |
658 for(Read r1 : reads){ | |
659 | |
660 Read r2=r1.mate; | |
661 if((ecco || merge) && r2!=null){ | |
662 if(merge){ | |
663 final int insert=findOverlap(r1, r2, false); | |
664 if(insert>0){ | |
665 r2.reverseComplement(); | |
666 r1=r1.joinRead(insert); | |
667 r2=null; | |
668 } | |
669 }else if(ecco){ | |
670 findOverlap(r1, r2, true); | |
671 } | |
672 } | |
673 { | |
674 if(trusted!=null){ | |
675 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r1, trusted, k, thresh, detectStepsize) : | |
676 ErrorCorrect.detectTrusted(r1, trusted, k, thresh, detectStepsize)); | |
677 // System.out.println("\n"+toString(bs, r.length())); | |
678 // System.out.println(new String(r.bases)); | |
679 if(bs!=null){ | |
680 for(int i=bs.nextClearBit(0); i<r1.length(); i=bs.nextClearBit(i+1)){ | |
681 r1.bases[i]='N'; | |
682 if(r1.quality!=null){r1.quality[i]=0;} | |
683 } | |
684 } | |
685 // System.out.println(new String(r.bases)); | |
686 // System.out.println("used = "+String.format(Locale.ROOT, "%.3f%%",count.usedFraction()*100)); | |
687 // System.out.println("used = "+((KCountArray4)count).cellsUsed()); | |
688 // if(bs.length()<r.length()){r=null;} | |
689 } | |
690 // if(r!=null){addRead(r, count, rcomp);} | |
691 } | |
692 if(r2!=null){ | |
693 if(trusted!=null){ | |
694 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) : | |
695 ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize)); | |
696 if(bs!=null){ | |
697 for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){ | |
698 r2.bases[i]='N'; | |
699 if(r2.quality!=null){r2.quality[i]=0;} | |
700 } | |
701 } | |
702 } | |
703 } | |
704 array=addRead_Advanced(r1, array); | |
705 | |
706 } | |
707 //System.err.println("returning list"); | |
708 cris.returnList(ln); | |
709 //System.err.println("fetching list"); | |
710 ln=cris.nextList(); | |
711 reads=(ln!=null ? ln.list : null); | |
712 } | |
713 | |
714 if(verbose){System.err.println("Finished reading");} | |
715 cris.returnList(ln); | |
716 if(verbose){System.err.println("Returned list");} | |
717 } | |
718 | |
719 | |
720 | |
721 /** | |
722 * Hash a read's kmers into the KCountArray. | |
723 * Advanced mode processes paired reads together and sorts kmers to eliminate spurious duplicates. | |
724 * @param r1 | |
725 * @param counts | |
726 * @param k | |
727 * @param mask | |
728 * @param rcomp | |
729 */ | |
730 private final long[] addRead_Advanced(Read r1, long[] array){ | |
731 if(PREJOIN && r1.mate!=null && r1.insert()>0){ | |
732 r1.mate.reverseComplement(); | |
733 r1=r1.joinRead(); | |
734 } | |
735 Read r2=r1.mate; | |
736 final int len1=Tools.max(0, r1.length()-k+1); | |
737 final int len2=(r2==null || r2.bases==null) ? 0 : Tools.max(0, r2.length()-k+1); | |
738 final int len=len1+len2; | |
739 if(len<1){return array;} | |
740 | |
741 if(!KEEP_DUPLICATE_KMERS){ | |
742 if(array==null || array.length!=len){array=new long[len];} | |
743 Arrays.fill(array, -1); | |
744 fillKmerArray(r1, array, 0, len1); | |
745 if(r2!=null){fillKmerArray(r2, array, len1, len);} | |
746 | |
747 Arrays.sort(array); | |
748 long prev=-1; | |
749 for(int i=0; i<array.length; i++){ | |
750 long kmer=array[i]; | |
751 if(kmer!=prev){ | |
752 increment(kmer); | |
753 prev=kmer; | |
754 } | |
755 } | |
756 }else{ | |
757 if(len1>0){addRead(r1);} | |
758 if(len2>0){addRead(r2);} | |
759 } | |
760 return array; | |
761 } | |
762 | |
763 private final void addReadBig(Read r, Kmer kmer){ | |
764 if(r==null || r.bases==null){return;} | |
765 final byte[] bases=r.bases; | |
766 final byte[] quals=r.quality; | |
767 int len=0; | |
768 | |
769 if(bases==null || bases.length<k){return;} | |
770 kmer.clear(); | |
771 | |
772 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ | |
773 float prob=1; | |
774 for(int i=0; i<bases.length; i++){ | |
775 final byte b=bases[i]; | |
776 final long x=AminoAcid.baseToNumber[b]; | |
777 | |
778 //Update kmers | |
779 kmer.addRight(b); | |
780 | |
781 if(minProb>0 && quals!=null){//Update probability | |
782 prob=prob*KmerTableSet.PROB_CORRECT[quals[i]]; | |
783 if(len>k){ | |
784 byte oldq=quals[i-k]; | |
785 prob=prob*KmerTableSet.PROB_CORRECT_INVERSE[oldq]; | |
786 } | |
787 } | |
788 | |
789 //Handle Ns | |
790 if(x<0){ | |
791 len=0; | |
792 prob=1; | |
793 }else{len++;} | |
794 | |
795 assert(len==kmer.len); | |
796 | |
797 if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+"\t"+new String(bases, Tools.max(0, i-k), Tools.min(i+1, k)));} | |
798 if(len>=k && prob>=minProb){ | |
799 // System.err.println("Incrementing xor()="+kmer.xor()); | |
800 increment(kmer.xor()); | |
801 // counts.incrementAndReturnUnincremented(kmer.xor(), 1); | |
802 // keysCountedLocal++; | |
803 } | |
804 } | |
805 } | |
806 | |
807 private final void fillKmerArray(Read r, final long[] array, final int start, final int stop){ | |
808 if(amino){ | |
809 fillKmerArrayAmino(r, array, start, stop); | |
810 return; | |
811 } | |
812 if(k>maxShortKmerLength){ | |
813 fillKmerArrayLong(r, array, start, stop); | |
814 return; | |
815 } | |
816 assert(k<=maxShortKmerLength); | |
817 assert(!PREJOIN || r.mate==null); | |
818 assert(CANONICAL); | |
819 assert(array!=null); | |
820 | |
821 final byte[] bases=r.bases; | |
822 final byte[] quals=r.quality; | |
823 | |
824 if(bases==null || bases.length<k){return;} | |
825 | |
826 final int passes=(rcomp ? 2 : 1); | |
827 for(int pass=0; pass<passes; pass++){ | |
828 int len=0; | |
829 int idx=(pass==0 ? start-k+1 : stop+k-2); | |
830 long kmer=0; | |
831 float prob=1; | |
832 for(int i=0; i<bases.length; i++){ | |
833 byte b=bases[i]; | |
834 assert(b>=0) : r.id+", "+bases.length+", "+i+", "+b+"\n"+(bases.length<20000 ? r.toFasta().toString() : ""); | |
835 int x=AminoAcid.baseToNumber[b]; | |
836 | |
837 // int x=AminoAcid.baseToNumber[b<0 ? 'N' : b]; | |
838 | |
839 byte q; | |
840 if(quals==null){ | |
841 q=50; | |
842 }else{ | |
843 q=quals[i]; | |
844 prob=prob*align2.QualityTools.PROB_CORRECT[q]; | |
845 if(len>k){ | |
846 byte oldq=quals[i-k]; | |
847 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; | |
848 } | |
849 } | |
850 | |
851 if(x<0 || q<minQuality){ | |
852 len=0; | |
853 kmer=0; | |
854 prob=1; | |
855 }else{ | |
856 kmer=((kmer<<2)|x)&mask; | |
857 len++; | |
858 if(len>=k && prob>=minProb){ | |
859 array[idx]=Tools.max(array[idx], kmer); | |
860 } | |
861 } | |
862 if(pass==0){idx++;}else{idx--;} | |
863 } | |
864 // System.out.println(Arrays.toString(array)); | |
865 r.reverseComplement(); | |
866 } | |
867 } | |
868 | |
869 private final void fillKmerArrayAmino(Read r, final long[] array, final int start, final int stop){ | |
870 assert(false) : "TODO"; //TODO | |
871 if(k>maxShortKmerLength){ | |
872 fillKmerArrayLong(r, array, start, stop); | |
873 return; | |
874 } | |
875 assert(k<=maxShortKmerLength); | |
876 assert(!PREJOIN || r.mate==null); | |
877 assert(CANONICAL); | |
878 assert(array!=null); | |
879 | |
880 final byte[] bases=r.bases; | |
881 final byte[] quals=r.quality; | |
882 | |
883 if(bases==null || bases.length<k){return;} | |
884 | |
885 for(int pass=0; pass<2; pass++){ | |
886 int len=0; | |
887 int idx=(pass==0 ? start-k+1 : stop+k-2); | |
888 long kmer=0; | |
889 float prob=1; | |
890 for(int i=0; i<bases.length; i++){ | |
891 byte b=bases[i]; | |
892 assert(b>=0) : r.id+", "+bases.length+", "+i+", "+b+"\n"+(bases.length<20000 ? r.toFasta().toString() : ""); | |
893 int x=AminoAcid.baseToNumber[b]; | |
894 | |
895 // int x=AminoAcid.baseToNumber[b<0 ? 'N' : b]; | |
896 | |
897 byte q; | |
898 if(quals==null){ | |
899 q=50; | |
900 }else{ | |
901 q=quals[i]; | |
902 prob=prob*align2.QualityTools.PROB_CORRECT[q]; | |
903 if(len>k){ | |
904 byte oldq=quals[i-k]; | |
905 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; | |
906 } | |
907 } | |
908 | |
909 if(x<0 || q<minQuality){ | |
910 len=0; | |
911 kmer=0; | |
912 prob=1; | |
913 }else{ | |
914 kmer=((kmer<<2)|x)&mask; | |
915 len++; | |
916 if(len>=k && prob>=minProb){ | |
917 array[idx]=Tools.max(array[idx], kmer); | |
918 } | |
919 } | |
920 if(pass==0){idx++;}else{idx--;} | |
921 } | |
922 // System.out.println(Arrays.toString(array)); | |
923 r.reverseComplement(); | |
924 } | |
925 } | |
926 | |
927 private final void addRead(Read r){ | |
928 if(amino){ | |
929 addReadAmino(r); | |
930 return; | |
931 } | |
932 assert(k<=maxShortKmerLength); | |
933 assert(!PREJOIN || r.mate==null); | |
934 assert(CANONICAL); | |
935 | |
936 final byte[] bases=r.bases; | |
937 final byte[] quals=r.quality; | |
938 | |
939 if(bases==null || bases.length<k){return;} | |
940 | |
941 long kmer=0; | |
942 long rkmer=0; | |
943 int len=0; | |
944 float prob=1; | |
945 | |
946 for(int i=0; i<bases.length; i++){ | |
947 final byte b=bases[i]; | |
948 long x=AminoAcid.baseToNumber[b]; | |
949 long x2=AminoAcid.baseToComplementNumber[b]; | |
950 kmer=((kmer<<2)|x)&mask; | |
951 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; | |
952 | |
953 final byte q; | |
954 if(quals==null){ | |
955 q=50; | |
956 }else{ | |
957 q=quals[i]; | |
958 prob=prob*align2.QualityTools.PROB_CORRECT[q]; | |
959 if(len>k){ | |
960 byte oldq=quals[i-k]; | |
961 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; | |
962 } | |
963 } | |
964 | |
965 if(x<0 || q<minQuality){ | |
966 len=0; | |
967 kmer=rkmer=0; | |
968 prob=1; | |
969 }else{ | |
970 len++; | |
971 if(len>=k && prob>=minProb){ | |
972 long key=(rcomp ? Tools.max(kmer, rkmer) : kmer); | |
973 increment(key); | |
974 } | |
975 } | |
976 } | |
977 } | |
978 | |
979 private final void addReadAmino(Read r){ | |
980 assert(!PREJOIN || r.mate==null); | |
981 | |
982 final byte[] bases=r.bases; | |
983 | |
984 if(bases==null || bases.length<k){return;} | |
985 | |
986 long kmer=0; | |
987 int len=0; | |
988 | |
989 for(int i=0; i<bases.length; i++){ | |
990 final byte b=bases[i]; | |
991 long x=AminoAcid.acidToNumber[b]; | |
992 kmer=((kmer<<aminoShift)|x)&mask; | |
993 | |
994 if(x<0){ | |
995 len=0; | |
996 kmer=0; | |
997 }else{ | |
998 len++; | |
999 if(len>=k){ | |
1000 increment(kmer); | |
1001 } | |
1002 } | |
1003 } | |
1004 } | |
1005 | |
1006 private final void fillKmerArrayLong(Read r, final long[] array, final int start, final int stop){ | |
1007 assert(k>maxShortKmerLength) : k; | |
1008 assert(!PREJOIN || r.mate==null); | |
1009 assert(CANONICAL); | |
1010 assert(array!=null); | |
1011 Kmer kmer=new Kmer(k); | |
1012 | |
1013 float prob=1; | |
1014 byte[] bases=r.bases; | |
1015 byte[] quals=r.quality; | |
1016 | |
1017 kmer.clear(); | |
1018 | |
1019 for(int i=0, idx=start-k+1; i<bases.length; i++, idx++){ | |
1020 byte b=bases[i]; | |
1021 kmer.addRight(b); | |
1022 | |
1023 byte q; | |
1024 if(quals==null){ | |
1025 q=50; | |
1026 }else{ | |
1027 q=quals[i]; | |
1028 prob=prob*align2.QualityTools.PROB_CORRECT[q]; | |
1029 if(kmer.len>k){ | |
1030 byte oldq=quals[i-k]; | |
1031 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; | |
1032 } | |
1033 } | |
1034 | |
1035 if(!AminoAcid.isFullyDefined(b) || q<minQuality){ | |
1036 kmer.clear(); | |
1037 prob=1; | |
1038 } | |
1039 if(kmer.len>=k && prob>=minProb){ | |
1040 array[idx]=kmer.xor(); | |
1041 } | |
1042 } | |
1043 } | |
1044 | |
1045 private final ConcurrentReadInputStream cris; | |
1046 | |
1047 private final KCountArray counts; | |
1048 private final KCountArray trusted; | |
1049 private final int thresh; | |
1050 private final int detectStepsize; | |
1051 private final boolean conservative; | |
1052 private long keysCountedLocal=0; | |
1053 private long incrementsLocal=0; | |
1054 private long readsProcessedLocal=0; | |
1055 private final long minHashValue=SketchObject.minHashValue; | |
1056 private final LongList buffer=(BUFFERED ? new LongList(BUFFERLEN) : null); | |
1057 } | |
1058 | |
1059 public static boolean vstrict=false; | |
1060 | |
1061 private final int k; | |
1062 private final int aminoShift; | |
1063 private final int shift; | |
1064 private final int shift2; | |
1065 private final long mask; | |
1066 private final boolean rcomp; | |
1067 private final boolean ecco; | |
1068 private final boolean merge; | |
1069 private final boolean amino; | |
1070 | |
1071 } |