comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/ReadCounter.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package bloom;
2
3 import java.io.File;
4 import java.lang.Thread.State;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.BitSet;
8 import java.util.Locale;
9 import dna.AminoAcid;
10 import fileIO.FileFormat;
11 import fileIO.ReadWrite;
12 import jgi.BBMerge;
13 import kmer.KmerTableSet;
14 import shared.Parse;
15 import shared.Parser;
16 import shared.PreParser;
17 import shared.Shared;
18 import shared.Timer;
19 import shared.Tools;
20 import sketch.SketchObject;
21 import stream.ConcurrentReadInputStream;
22 import stream.FastaReadInputStream;
23 import stream.Read;
24 import structures.ListNum;
25 import structures.LongList;
26 import ukmer.Kmer;
27
28 /**
29 * @author Brian Bushnell
30 * @date Jul 5, 2012
31 *
32 */
33 public class ReadCounter extends KmerCountAbstract {
34
35 public static void main(String[] args){
36
37 {//Preparse block for help, config files, and outstream
38 PreParser pp=new PreParser(args, new Object() { }.getClass().getEnclosingClass(), false);
39 args=pp.args;
40 // outstream=pp.outstream;
41 }
42
43 Timer t=new Timer();
44
45 String fname1=args[0];
46 String fname2=(args.length>1 ? args[1] : null);
47 int k=14;
48 int cbits=16;
49 int matrixbits=-1;
50 int hashes=1;
51
52 for(int i=2; i<args.length; i++){
53 final String arg=args[i];
54 final String[] split=arg.split("=");
55 String a=split[0].toLowerCase();
56 String b=split.length>1 ? split[1] : null;
57
58 if(a.equals("k") || a.equals("kmer")){
59 k=Integer.parseInt(b);
60 }else if(a.startsWith("cbits") || a.startsWith("cellbits")){
61 cbits=Integer.parseInt(b);
62 }else if(a.startsWith("reads") || a.startsWith("maxreads")){
63 maxReads=Parse.parseKMG(b);
64 }else if(a.startsWith("matrixbits")){
65 matrixbits=Integer.parseInt(b);
66 }else if(a.startsWith("hashes")){
67 hashes=Integer.parseInt(b);
68 }else if(a.equals("canonical")){
69 CANONICAL=Parse.parseBoolean(b);
70 }else{
71 throw new RuntimeException("Unknown parameter "+args[i]);
72 }
73 }
74
75 {//Process parser fields
76 Parser.processQuality();
77 }
78
79 int kbits=Tools.min(2*k, 62);
80 if(matrixbits<0){
81 matrixbits=kbits;
82 }
83 matrixbits=Tools.min(kbits, matrixbits);
84
85 if(fileIO.FileFormat.hasFastaExtension(fname1)){
86 assert(!FastaReadInputStream.SPLIT_READS);
87 FastaReadInputStream.MIN_READ_LEN=k;
88 }
89
90 KCountArray counts=KCountArray.makeNew(1L<<matrixbits, cbits, hashes);
91 ReadCounter rc=new ReadCounter(k, true, false, false, false);
92 try {
93 rc.count(fname1, fname2, counts);
94 } catch (Exception e) {
95 // TODO Auto-generated catch block
96 e.printStackTrace();
97 }
98 counts.shutdown();
99
100 // verbose=true;
101
102 t.stop();
103 System.out.println("Finished counting; time = "+t);
104
105 rc.printStatistics(counts);
106
107 }
108
109 /** Defaults for nucleotides. */
110 public ReadCounter(final int k_){this(k_, true, false, false, false);}
111
112 public ReadCounter(final int k_, final boolean rcomp_,
113 final boolean ecco_, final boolean merge_, final boolean amino_){
114 k=k_;
115 rcomp=rcomp_;
116 ecco=ecco_;
117 merge=merge_;
118 amino=amino_;
119
120 final int bitsPerChar=(amino ? AminoAcid.AMINO_SHIFT : 2);
121 aminoShift=AminoAcid.AMINO_SHIFT;
122 shift=bitsPerChar*k;
123 shift2=shift-bitsPerChar;
124 mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32
125
126 assert(!amino || k*bitsPerChar<64);
127 assert(!amino || !rcomp);
128 assert(k>0);
129 }
130
131 public void printStatistics(KCountArray counts){
132 long[] freq=counts.transformToFrequency();
133
134 // System.out.println(count+"\n");
135 // System.out.println(Arrays.toString(freq)+"\n");
136
137 long sum=sum(freq);
138 System.out.println("Kmer fraction:");
139 int lim1=8, lim2=16;
140 for(int i=0; i<lim1; i++){
141 String prefix=i+"";
142 while(prefix.length()<8){prefix=prefix+" ";}
143 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
144 }
145 while(lim1<=freq.length){
146 int x=0;
147 for(int i=lim1; i<lim2; i++){
148 x+=freq[i];
149 }
150 String prefix=lim1+"-"+(lim2-1);
151 if(lim2>=freq.length){prefix=lim1+"+";}
152 while(prefix.length()<8){prefix=prefix+" ";}
153 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x);
154 lim1*=2;
155 lim2=min(lim2*2, freq.length);
156 }
157
158 long sum2=sum-freq[0];
159 long x=freq[1];
160 System.out.println();
161 System.out.println("Keys Counted: \t \t"+keysCounted);
162 System.out.println("Unique: \t \t"+sum2);
163 System.out.println("Avg Sites/Key: \t \t"+String.format(Locale.ROOT, "%.3f ",(keysCounted*1d/sum2)));
164 System.out.println();
165 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
166 x=sum2-x;
167 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
168 }
169
170 public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, int cbits){
171 return makeKca(fname1, fname2, extraFiles, cbits, Tools.min(2*k, 35), 1, minQuality,
172 maxReads, 1, 1, 1, 2);
173 }
174
175 public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles,
176 int cbits, int matrixbits, int hashes, int minqual, long maxreads){
177 assert(matrixbits<63);
178 return makeKca(fname1, fname2, extraFiles, cbits, matrixbits, hashes, minqual,
179 maxreads, 1, 1, 1, 2);
180 }
181
182 public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles,
183 int cbits, int matrixbits, int hashes, int minqual,
184 long maxreads, int passes, int stepsize, int thresh1, int thresh2){
185 assert(matrixbits<63);
186 return makeKca(fname1, fname2, extraFiles,
187 cbits, 1L<<matrixbits, hashes, minqual,
188 maxreads, passes, stepsize, thresh1, thresh2, null, 0);
189 }
190
191 public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles,
192 int cbits, long cells, int hashes, int minqual,
193 long maxreads, int passes, int stepsize, int thresh1, int thresh2){
194 return makeKca(fname1, fname2, extraFiles,
195 cbits, cells, hashes, minqual,
196 maxreads, passes, stepsize, thresh1, thresh2, null, 0);
197 }
198
199 public KCountArray makeKca_als(ArrayList<String> fname1, ArrayList<String> fname2, Iterable<String> extraFiles,
200 int cbits, long cells, int hashes, int minqual,
201 long maxreads, int passes, int stepsize, int thresh1, int thresh2,
202 KCountArray prefilter, int prefilterLimit_){
203 String a=null, b=null;
204 ArrayList<String> list=new ArrayList<String>();
205 if(fname1!=null){
206 for(int i=0; i<fname1.size(); i++){
207 if(i==0){a=fname1.get(i);}
208 else{list.add(fname1.get(i));}
209 }
210 }
211 if(fname2!=null){
212 for(int i=0; i<fname2.size(); i++){
213 if(i==0){b=fname2.get(i);}
214 else{list.add(fname2.get(i));}
215 }
216 }
217 if(extraFiles!=null){
218 for(String s : extraFiles){
219 list.add(s);
220 }
221 }
222 return makeKca(a, b, list.isEmpty() ? null : list, cbits, cells, hashes, minqual,
223 maxreads, passes, stepsize, thresh1, thresh2,
224 prefilter, prefilterLimit_);
225 }
226
227 public KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles,
228 int cbits, long cells, int hashes, int minqual,
229 long maxreads, int passes, int stepsize, int thresh1, int thresh2,
230 KCountArray prefilter, int prefilterLimit_){
231 // verbose=true;
232 if(verbose){System.err.println("Making kca from ("+fname1+", "+fname2+")\nk="+k+", cells="+Tools.toKMG(cells)+", cbits="+cbits);}
233
234 if(fname1==null && fname2==null && extraFiles==null){
235 return KCountArray.makeNew(cells, cbits, hashes, prefilter, prefilterLimit_);
236 }
237
238 boolean oldsplit=FastaReadInputStream.SPLIT_READS;
239 long oldmax=maxReads;
240 byte oldq=minQuality;
241 maxReads=maxreads;
242 minQuality=(byte)minqual;
243 // System.out.println("kbits="+(kbits)+" -> "+(1L<<kbits)+", matrixbits="+(matrixbits)+" -> "+(1L<<matrixbits)+", cbits="+cbits+", gap="+gap+", hashes="+hashes);
244 KCountArray kca=KCountArray.makeNew(cells, cbits, hashes, prefilter, prefilterLimit_);
245
246 // System.out.println("a");
247 {//For processing input lists
248 ArrayList<String> extra2=null;
249 if(fname1!=null && fname1.contains(",")){
250 String[] s=fname1.split(",");
251 if(extra2==null){extra2=new ArrayList<String>();}
252 for(int i=1; i<s.length; i++){extra2.add(s[i]);}
253 fname1=s[0];
254 }
255 if(fname2!=null && fname2.contains(",")){
256 String[] s=fname2.split(",");
257 if(extra2==null){extra2=new ArrayList<String>();}
258 for(int i=1; i<s.length; i++){extra2.add(s[i]);}
259 fname2=s[0];
260 }
261 if(extra2!=null){
262 if(extraFiles!=null){
263 for(String s : extraFiles){
264 extra2.add(s);
265 }
266 }
267 extraFiles=extra2;
268 }
269 }
270 // System.out.println("b");
271
272 if(extraFiles!=null){
273 for(String s : extraFiles){
274 if(fileIO.FileFormat.hasFastaExtension(s)){
275 assert(!FastaReadInputStream.SPLIT_READS);
276 }
277 }
278 }
279
280 // System.out.println("c");
281
282 if(passes==1){
283 // System.out.println("c1");
284 if(fname1!=null){
285 try {
286 count(fname1, fname2, kca);
287 } catch (Exception e) {
288 // TODO Auto-generated catch block
289 e.printStackTrace();
290 }
291 }
292 if(extraFiles!=null){
293 maxReads=-1;
294 for(String s : extraFiles){
295 try {
296 count(s, null, kca);
297 } catch (Exception e) {
298 // TODO Auto-generated catch block
299 e.printStackTrace();
300 }
301 }
302 }
303 kca.shutdown();
304
305 }else{
306 // System.out.println("c2");
307 assert(passes>1);
308 KCountArray trusted=null;
309 for(int i=1; i<passes; i++){
310 boolean conservative=i>2;// /*or, alternately, (trusted==null || trusted.capacity()>0.3)
311 int step=(stepsize==1 ? 1 : stepsize+i%2);
312 // if(!conservative){step=(step+3)/4;}
313 if(!conservative){step=Tools.min(3, (step+3)/4);}
314
315 try {
316 count(fname1, fname2, cbits, kca, trusted, maxreads, thresh1, step, conservative);
317 } catch (Exception e) {
318 // TODO Auto-generated catch block
319 e.printStackTrace();
320 }
321 if(extraFiles!=null){
322 maxReads=-1;
323 for(String s : extraFiles){
324 try {
325 count(s, null, cbits, kca, trusted, maxreads, thresh1, step, conservative);
326 } catch (Exception e) {
327 // TODO Auto-generated catch block
328 e.printStackTrace();
329 }
330 }
331 }
332 kca.shutdown();
333
334 System.out.println("Trusted: \t"+kca.toShortString());
335 trusted=kca;
336 kca=KCountArray.makeNew(cells, cbits, hashes, prefilter, prefilterLimit_);
337
338 }
339
340 try {
341 count(fname1, fname2, cbits, kca, trusted, maxreads, thresh2, stepsize, true);
342 } catch (Exception e) {
343 // TODO Auto-generated catch block
344 e.printStackTrace();
345 }
346 if(extraFiles!=null){
347 maxReads=-1;
348 for(String s : extraFiles){
349 try {
350 count(s, null, cbits, kca, trusted, maxreads, thresh2, stepsize, true);
351 } catch (Exception e) {
352 // TODO Auto-generated catch block
353 e.printStackTrace();
354 }
355 }
356 }
357 kca.shutdown();
358 }
359 // System.out.println("d");
360 minQuality=oldq;
361 maxReads=oldmax;
362 FastaReadInputStream.SPLIT_READS=oldsplit;
363
364
365 return kca;
366 }
367
368 public KCountArray count(String reads1, String reads2, KCountArray counts) throws Exception{
369 // System.err.println("countFastq... making a new cris");
370 assert(counts!=null);
371
372 {
373 int pound=reads1.lastIndexOf('#');
374 if(pound>=0 && reads2==null && !new File(reads1).exists()){
375 String a=reads1.substring(0, pound);
376 String b=reads1.substring(pound+1);
377 reads1=a+1+b;
378 reads2=a+2+b;
379 }
380 }
381
382 final ConcurrentReadInputStream cris;
383 {
384 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
385 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
386 if(ff2==null){ff1.preferShreds=true;}
387 // if(ff2!=null){ //TODO - interleaved flag
388 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
389 cris.start(); //4567
390 }
391
392 assert(cris!=null) : reads1;
393 if(verbose){System.err.println("Started cris");}
394 boolean paired=cris.paired();
395 if(verbose){System.err.println("Paired: "+paired);}
396
397 // countFastq(cris, count);
398 // assert(false) : THREADS;
399 CountThread[] cta=new CountThread[Shared.threads()];
400 for(int i=0; i<cta.length; i++){
401 cta[i]=new CountThread(cris, counts);
402 cta[i].start();
403 }
404 // System.out.println("~1");
405 for(int i=0; i<cta.length; i++){
406 // System.out.println("~2");
407 CountThread ct=cta[i];
408 synchronized(ct){
409 // System.out.println("~3");
410 while(ct.getState()!=State.TERMINATED){
411 // System.out.println("~4");
412 try {
413 ct.join(2000);
414 } catch (InterruptedException e) {
415 // TODO Auto-generated catch block
416 e.printStackTrace();
417 }
418 // System.out.println("~5");
419 }
420 }
421 }
422 // System.out.println("~6");
423
424 ReadWrite.closeStream(cris);
425 if(verbose){System.err.println("Closed stream");}
426 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
427
428
429 return counts;
430 }
431
432 public KCountArray count(String reads1, String reads2, final int cbits,
433 KCountArray counts, final KCountArray trusted, final long maxReads, final int thresh,
434 final int detectStepsize, final boolean conservative)
435 throws Exception{
436
437 assert(counts!=null);
438
439 {
440 int pound=reads1.lastIndexOf('#');
441 if(pound>=0 && reads2==null && !new File(reads1).exists()){
442 String a=reads1.substring(0, pound);
443 String b=reads1.substring(pound+1);
444 reads1=a+1+b;
445 reads2=a+2+b;
446 }
447 }
448
449 final ConcurrentReadInputStream cris;
450 {
451 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
452 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
453 if(ff2==null){ff1.preferShreds=true;}
454 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
455 cris.start(); //4567
456 }
457
458 assert(cris!=null) : reads1;
459 if(verbose){System.err.println("Started cris");}
460 boolean paired=cris.paired();
461 if(verbose){System.err.println("Paired: "+paired);}
462
463
464 // countFastq(cris, count, trusted, thresh, detectStepsize, conservative);
465
466 // assert(false) : THREADS;
467 CountThread[] cta=new CountThread[Shared.threads()];
468 for(int i=0; i<cta.length; i++){
469 cta[i]=new CountThread(cris, counts, trusted, thresh, detectStepsize, conservative);
470 cta[i].start();
471 }
472
473 for(int i=0; i<cta.length; i++){
474 CountThread ct=cta[i];
475 synchronized(ct){
476 while(ct.isAlive()){
477 try {
478 ct.join(1000);
479 } catch (InterruptedException e) {
480 // TODO Auto-generated catch block
481 e.printStackTrace();
482 }
483 }
484 }
485 }
486
487 cris.close();
488 if(verbose){System.err.println("Closed stream");}
489
490 // System.out.println("*** after ***");
491 // System.out.println("\ntrusted=\n"+trusted);
492 // System.out.println("\ncount=\n"+count);
493
494 return counts;
495 }
496
497 private final int findOverlap(Read r1, Read r2, boolean ecc){
498 if(vstrict){
499 return BBMerge.findOverlapVStrict(r1, r2, ecc);
500 }else{
501 return BBMerge.findOverlapStrict(r1, r2, ecc);
502 }
503 }
504
505 private class CountThread extends Thread{
506
507 CountThread(final ConcurrentReadInputStream cris_, final KCountArray counts_){
508 this(cris_, counts_, null, 2, 1, true);
509 }
510
511 CountThread(final ConcurrentReadInputStream cris_,
512 final KCountArray counts_, final KCountArray trusted_, final int thresh_,
513 final int detectStepsize_, final boolean conservative_){
514 cris=cris_;
515 counts=counts_;
516 trusted=trusted_;
517 thresh=thresh_;
518 detectStepsize=detectStepsize_;
519 conservative=conservative_;
520 }
521
522 @Override
523 public void run(){
524 // System.out.println("Running");
525 if(trusted==null){
526 count(cris);
527 }else{
528 count(cris, thresh, detectStepsize, conservative);
529 }
530 // System.out.println("Finished: "+readsProcessedLocal);
531
532 if(BUFFERED){dumpBuffer();}
533
534 synchronized(getClass()){
535 keysCounted+=keysCountedLocal;
536 increments+=incrementsLocal;
537 readsProcessed+=readsProcessedLocal;
538
539 if(verbose){System.err.println(keysCounted+", "+keysCountedLocal);}
540 if(verbose){System.err.println(readsProcessed+", "+readsProcessedLocal);}
541 }
542 }
543
544 private void increment(final long key){
545 if(BUFFERED){
546 buffer.add(key);
547 if(buffer.size>=BUFFERLEN){
548 dumpBuffer();
549 }
550 }else{
551 incrementByAmount(key, 1);
552 }
553 }
554
555
556 //TODO: All this 'sum' nonsense is just to count kmers added. Could be derived from buffer length.
557 private void dumpBuffer(){
558 final int lim=buffer.size;
559 if(lim<1){return;}
560 final long[] array=buffer.array;
561 // if(SORT_SERIAL){
562 // buffer.sortSerial();
563 // }else{
564 // buffer.sort();
565 // }
566 buffer.sort();//Can be disabled via parallelsort flag but only affects arrays>10k
567 long kmer=array[0]-1;//Ensures a nonmatch
568 int streak=0;
569 for(int i=0; i<lim; i++){
570 long x=array[i];
571 if(x==kmer){streak++;}
572 else{
573 if(streak>0){incrementByAmount(kmer, streak);}
574 kmer=x;
575 streak=1;
576 }
577 }
578 assert(streak>0);
579 incrementByAmount(kmer, streak);
580 buffer.clear();
581 }
582
583 private void incrementByAmount(final long key, int amt){
584 if(SKETCH_MODE){
585 final long code=SketchObject.hash(key);
586 if(code<minHashValue){return;}
587 counts.increment(STORE_HASHED ? code : key, amt);
588 }else{
589 counts.increment(key, amt);
590 }
591 keysCountedLocal+=amt;
592 incrementsLocal++;
593 }
594
595 private final void count(ConcurrentReadInputStream cris){
596 assert(k>=1 && counts!=null);
597 // System.out.println("Waiting for list");
598 ListNum<Read> ln=cris.nextList();
599 ArrayList<Read> reads=(ln!=null ? ln.list : null);
600 // System.out.println("Got list: "+(ln==null ? "null" : ln.id)+", "+(ln==null || ln.list==null ? "null" : ln.list.size()));
601
602 long[] array=null;
603 final Kmer kmer=new Kmer(k);
604
605 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
606 //System.err.println("reads.size()="+reads.size());
607 for(Read r1 : reads){
608
609 Read r2=r1.mate;
610 if((ecco || merge) && r2!=null){
611 if(merge){
612 final int insert=findOverlap(r1, r2, false);
613 if(insert>0){
614 r2.reverseComplement();
615 r1=r1.joinRead(insert);
616 r2=null;
617 }
618 }else if(ecco){
619 findOverlap(r1, r2, true);
620 }
621 }
622 readsProcessedLocal++;
623
624 if(k<=maxShortKmerLength){
625 array=addRead_Advanced(r1, array);
626 }else{
627 addReadBig(r1, kmer);
628 addReadBig(r1.mate, kmer);
629 }
630 // System.out.println(r);
631 // System.out.println("kmers hashed: "+keysCountedLocal);
632 }
633 //System.err.println("returning list");
634 cris.returnList(ln);
635 //System.err.println("fetching list");
636 ln=cris.nextList();
637 reads=(ln!=null ? ln.list : null);
638 }
639
640
641 if(verbose){System.err.println("Finished reading");}
642 cris.returnList(ln);
643 if(verbose){System.err.println("Returned list");}
644 }
645
646
647
648
649 private final void count(final ConcurrentReadInputStream cris, final int thresh,
650 final int detectStepsize, final boolean conservative){
651
652 ListNum<Read> ln=cris.nextList();
653 ArrayList<Read> reads=(ln!=null ? ln.list : null);
654
655 long[] array=null;
656 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
657 //System.err.println("reads.size()="+reads.size());
658 for(Read r1 : reads){
659
660 Read r2=r1.mate;
661 if((ecco || merge) && r2!=null){
662 if(merge){
663 final int insert=findOverlap(r1, r2, false);
664 if(insert>0){
665 r2.reverseComplement();
666 r1=r1.joinRead(insert);
667 r2=null;
668 }
669 }else if(ecco){
670 findOverlap(r1, r2, true);
671 }
672 }
673 {
674 if(trusted!=null){
675 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r1, trusted, k, thresh, detectStepsize) :
676 ErrorCorrect.detectTrusted(r1, trusted, k, thresh, detectStepsize));
677 // System.out.println("\n"+toString(bs, r.length()));
678 // System.out.println(new String(r.bases));
679 if(bs!=null){
680 for(int i=bs.nextClearBit(0); i<r1.length(); i=bs.nextClearBit(i+1)){
681 r1.bases[i]='N';
682 if(r1.quality!=null){r1.quality[i]=0;}
683 }
684 }
685 // System.out.println(new String(r.bases));
686 // System.out.println("used = "+String.format(Locale.ROOT, "%.3f%%",count.usedFraction()*100));
687 // System.out.println("used = "+((KCountArray4)count).cellsUsed());
688 // if(bs.length()<r.length()){r=null;}
689 }
690 // if(r!=null){addRead(r, count, rcomp);}
691 }
692 if(r2!=null){
693 if(trusted!=null){
694 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) :
695 ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize));
696 if(bs!=null){
697 for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){
698 r2.bases[i]='N';
699 if(r2.quality!=null){r2.quality[i]=0;}
700 }
701 }
702 }
703 }
704 array=addRead_Advanced(r1, array);
705
706 }
707 //System.err.println("returning list");
708 cris.returnList(ln);
709 //System.err.println("fetching list");
710 ln=cris.nextList();
711 reads=(ln!=null ? ln.list : null);
712 }
713
714 if(verbose){System.err.println("Finished reading");}
715 cris.returnList(ln);
716 if(verbose){System.err.println("Returned list");}
717 }
718
719
720
721 /**
722 * Hash a read's kmers into the KCountArray.
723 * Advanced mode processes paired reads together and sorts kmers to eliminate spurious duplicates.
724 * @param r1
725 * @param counts
726 * @param k
727 * @param mask
728 * @param rcomp
729 */
730 private final long[] addRead_Advanced(Read r1, long[] array){
731 if(PREJOIN && r1.mate!=null && r1.insert()>0){
732 r1.mate.reverseComplement();
733 r1=r1.joinRead();
734 }
735 Read r2=r1.mate;
736 final int len1=Tools.max(0, r1.length()-k+1);
737 final int len2=(r2==null || r2.bases==null) ? 0 : Tools.max(0, r2.length()-k+1);
738 final int len=len1+len2;
739 if(len<1){return array;}
740
741 if(!KEEP_DUPLICATE_KMERS){
742 if(array==null || array.length!=len){array=new long[len];}
743 Arrays.fill(array, -1);
744 fillKmerArray(r1, array, 0, len1);
745 if(r2!=null){fillKmerArray(r2, array, len1, len);}
746
747 Arrays.sort(array);
748 long prev=-1;
749 for(int i=0; i<array.length; i++){
750 long kmer=array[i];
751 if(kmer!=prev){
752 increment(kmer);
753 prev=kmer;
754 }
755 }
756 }else{
757 if(len1>0){addRead(r1);}
758 if(len2>0){addRead(r2);}
759 }
760 return array;
761 }
762
763 private final void addReadBig(Read r, Kmer kmer){
764 if(r==null || r.bases==null){return;}
765 final byte[] bases=r.bases;
766 final byte[] quals=r.quality;
767 int len=0;
768
769 if(bases==null || bases.length<k){return;}
770 kmer.clear();
771
772 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
773 float prob=1;
774 for(int i=0; i<bases.length; i++){
775 final byte b=bases[i];
776 final long x=AminoAcid.baseToNumber[b];
777
778 //Update kmers
779 kmer.addRight(b);
780
781 if(minProb>0 && quals!=null){//Update probability
782 prob=prob*KmerTableSet.PROB_CORRECT[quals[i]];
783 if(len>k){
784 byte oldq=quals[i-k];
785 prob=prob*KmerTableSet.PROB_CORRECT_INVERSE[oldq];
786 }
787 }
788
789 //Handle Ns
790 if(x<0){
791 len=0;
792 prob=1;
793 }else{len++;}
794
795 assert(len==kmer.len);
796
797 if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+"\t"+new String(bases, Tools.max(0, i-k), Tools.min(i+1, k)));}
798 if(len>=k && prob>=minProb){
799 // System.err.println("Incrementing xor()="+kmer.xor());
800 increment(kmer.xor());
801 // counts.incrementAndReturnUnincremented(kmer.xor(), 1);
802 // keysCountedLocal++;
803 }
804 }
805 }
806
807 private final void fillKmerArray(Read r, final long[] array, final int start, final int stop){
808 if(amino){
809 fillKmerArrayAmino(r, array, start, stop);
810 return;
811 }
812 if(k>maxShortKmerLength){
813 fillKmerArrayLong(r, array, start, stop);
814 return;
815 }
816 assert(k<=maxShortKmerLength);
817 assert(!PREJOIN || r.mate==null);
818 assert(CANONICAL);
819 assert(array!=null);
820
821 final byte[] bases=r.bases;
822 final byte[] quals=r.quality;
823
824 if(bases==null || bases.length<k){return;}
825
826 final int passes=(rcomp ? 2 : 1);
827 for(int pass=0; pass<passes; pass++){
828 int len=0;
829 int idx=(pass==0 ? start-k+1 : stop+k-2);
830 long kmer=0;
831 float prob=1;
832 for(int i=0; i<bases.length; i++){
833 byte b=bases[i];
834 assert(b>=0) : r.id+", "+bases.length+", "+i+", "+b+"\n"+(bases.length<20000 ? r.toFasta().toString() : "");
835 int x=AminoAcid.baseToNumber[b];
836
837 // int x=AminoAcid.baseToNumber[b<0 ? 'N' : b];
838
839 byte q;
840 if(quals==null){
841 q=50;
842 }else{
843 q=quals[i];
844 prob=prob*align2.QualityTools.PROB_CORRECT[q];
845 if(len>k){
846 byte oldq=quals[i-k];
847 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq];
848 }
849 }
850
851 if(x<0 || q<minQuality){
852 len=0;
853 kmer=0;
854 prob=1;
855 }else{
856 kmer=((kmer<<2)|x)&mask;
857 len++;
858 if(len>=k && prob>=minProb){
859 array[idx]=Tools.max(array[idx], kmer);
860 }
861 }
862 if(pass==0){idx++;}else{idx--;}
863 }
864 // System.out.println(Arrays.toString(array));
865 r.reverseComplement();
866 }
867 }
868
869 private final void fillKmerArrayAmino(Read r, final long[] array, final int start, final int stop){
870 assert(false) : "TODO"; //TODO
871 if(k>maxShortKmerLength){
872 fillKmerArrayLong(r, array, start, stop);
873 return;
874 }
875 assert(k<=maxShortKmerLength);
876 assert(!PREJOIN || r.mate==null);
877 assert(CANONICAL);
878 assert(array!=null);
879
880 final byte[] bases=r.bases;
881 final byte[] quals=r.quality;
882
883 if(bases==null || bases.length<k){return;}
884
885 for(int pass=0; pass<2; pass++){
886 int len=0;
887 int idx=(pass==0 ? start-k+1 : stop+k-2);
888 long kmer=0;
889 float prob=1;
890 for(int i=0; i<bases.length; i++){
891 byte b=bases[i];
892 assert(b>=0) : r.id+", "+bases.length+", "+i+", "+b+"\n"+(bases.length<20000 ? r.toFasta().toString() : "");
893 int x=AminoAcid.baseToNumber[b];
894
895 // int x=AminoAcid.baseToNumber[b<0 ? 'N' : b];
896
897 byte q;
898 if(quals==null){
899 q=50;
900 }else{
901 q=quals[i];
902 prob=prob*align2.QualityTools.PROB_CORRECT[q];
903 if(len>k){
904 byte oldq=quals[i-k];
905 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq];
906 }
907 }
908
909 if(x<0 || q<minQuality){
910 len=0;
911 kmer=0;
912 prob=1;
913 }else{
914 kmer=((kmer<<2)|x)&mask;
915 len++;
916 if(len>=k && prob>=minProb){
917 array[idx]=Tools.max(array[idx], kmer);
918 }
919 }
920 if(pass==0){idx++;}else{idx--;}
921 }
922 // System.out.println(Arrays.toString(array));
923 r.reverseComplement();
924 }
925 }
926
927 private final void addRead(Read r){
928 if(amino){
929 addReadAmino(r);
930 return;
931 }
932 assert(k<=maxShortKmerLength);
933 assert(!PREJOIN || r.mate==null);
934 assert(CANONICAL);
935
936 final byte[] bases=r.bases;
937 final byte[] quals=r.quality;
938
939 if(bases==null || bases.length<k){return;}
940
941 long kmer=0;
942 long rkmer=0;
943 int len=0;
944 float prob=1;
945
946 for(int i=0; i<bases.length; i++){
947 final byte b=bases[i];
948 long x=AminoAcid.baseToNumber[b];
949 long x2=AminoAcid.baseToComplementNumber[b];
950 kmer=((kmer<<2)|x)&mask;
951 rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
952
953 final byte q;
954 if(quals==null){
955 q=50;
956 }else{
957 q=quals[i];
958 prob=prob*align2.QualityTools.PROB_CORRECT[q];
959 if(len>k){
960 byte oldq=quals[i-k];
961 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq];
962 }
963 }
964
965 if(x<0 || q<minQuality){
966 len=0;
967 kmer=rkmer=0;
968 prob=1;
969 }else{
970 len++;
971 if(len>=k && prob>=minProb){
972 long key=(rcomp ? Tools.max(kmer, rkmer) : kmer);
973 increment(key);
974 }
975 }
976 }
977 }
978
979 private final void addReadAmino(Read r){
980 assert(!PREJOIN || r.mate==null);
981
982 final byte[] bases=r.bases;
983
984 if(bases==null || bases.length<k){return;}
985
986 long kmer=0;
987 int len=0;
988
989 for(int i=0; i<bases.length; i++){
990 final byte b=bases[i];
991 long x=AminoAcid.acidToNumber[b];
992 kmer=((kmer<<aminoShift)|x)&mask;
993
994 if(x<0){
995 len=0;
996 kmer=0;
997 }else{
998 len++;
999 if(len>=k){
1000 increment(kmer);
1001 }
1002 }
1003 }
1004 }
1005
1006 private final void fillKmerArrayLong(Read r, final long[] array, final int start, final int stop){
1007 assert(k>maxShortKmerLength) : k;
1008 assert(!PREJOIN || r.mate==null);
1009 assert(CANONICAL);
1010 assert(array!=null);
1011 Kmer kmer=new Kmer(k);
1012
1013 float prob=1;
1014 byte[] bases=r.bases;
1015 byte[] quals=r.quality;
1016
1017 kmer.clear();
1018
1019 for(int i=0, idx=start-k+1; i<bases.length; i++, idx++){
1020 byte b=bases[i];
1021 kmer.addRight(b);
1022
1023 byte q;
1024 if(quals==null){
1025 q=50;
1026 }else{
1027 q=quals[i];
1028 prob=prob*align2.QualityTools.PROB_CORRECT[q];
1029 if(kmer.len>k){
1030 byte oldq=quals[i-k];
1031 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq];
1032 }
1033 }
1034
1035 if(!AminoAcid.isFullyDefined(b) || q<minQuality){
1036 kmer.clear();
1037 prob=1;
1038 }
1039 if(kmer.len>=k && prob>=minProb){
1040 array[idx]=kmer.xor();
1041 }
1042 }
1043 }
1044
1045 private final ConcurrentReadInputStream cris;
1046
1047 private final KCountArray counts;
1048 private final KCountArray trusted;
1049 private final int thresh;
1050 private final int detectStepsize;
1051 private final boolean conservative;
1052 private long keysCountedLocal=0;
1053 private long incrementsLocal=0;
1054 private long readsProcessedLocal=0;
1055 private final long minHashValue=SketchObject.minHashValue;
1056 private final LongList buffer=(BUFFERED ? new LongList(BUFFERLEN) : null);
1057 }
1058
1059 public static boolean vstrict=false;
1060
1061 private final int k;
1062 private final int aminoShift;
1063 private final int shift;
1064 private final int shift2;
1065 private final long mask;
1066 private final boolean rcomp;
1067 private final boolean ecco;
1068 private final boolean merge;
1069 private final boolean amino;
1070
1071 }